diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 7853dae7c11b..7056cb3daac6 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -1255,6 +1255,8 @@
       title: Importing Utilities
     - local: internal/time_series_utils
       title: Utilities for Time Series
+    - local: internal/rope_utils
+      title: Rotary Embeddings Utilities
     title: Internal helpers
 - sections:
   - local: reference/environment_variables
diff --git a/docs/source/en/internal/rope_utils.md b/docs/source/en/internal/rope_utils.md
new file mode 100644
index 000000000000..36f600bfe911
--- /dev/null
+++ b/docs/source/en/internal/rope_utils.md
@@ -0,0 +1,89 @@
+
+
+# Utilities for Rotary Embedding
+
+This page explains how the Rotary Embedding is computed and applied in Transformers and what types of RoPE are supported.
+
+
+## Overview
+
+Rotary Position Embeddings are a technique used to inject positional information into attention mechanisms without relying on explicit position encodings.
+Instead of adding position vectors to token embeddings, RoPE rotates query and key vectors in the complex plane according to their positions, enabling relative positional awareness and better extrapolation to unseen sequence lengths.
+
+The Transformers library provides a flexible and extensible implementation of various RoPE types defined in [`~modeling_rope_utils.ROPE_VALIDATION_FUNCTIONS`], including both the default and scaled variants:
+
+| Rope Type | Description |
+|------------|-------------|
+| `"default"` | Standard rotary embedding as in LLaMA. |
+| `"linear"` | Linear-scaled RoPE which allows longer context windows. |
+| `"dynamic"` | NTK-aware scaling computed by rescaling frequency base (`ΞΈ`) for longer context. |
+| `"yarn"` | YaRN scaling variant providing smoother extrapolation and stability. |
+| `"longrope"` | [LongRoPE](https://github.com/microsoft/LongRoPE) scaling as in Phi-2 model series. |
+| `"llama3"` | RoPE scaling as in Llama3.1. |
+
+
+# Configuration in Model Configs
+
+To enable and customize rotary embeddings, add a `rope_parameters` field to your model’s configuration file (`config.json`). This field controls the RoPE behavior across model layers. Note that each RoPE variant defines its own set of expected keys, and missing keys will raise an error. See the example below, which creates a Llama config with default RoPE parameters:
+
+
+```python
+from transformers import LlamaConfig
+
+config = LlamaConfig()
+config.rope_parameters = {
+    "rope_type": "default", # type of RoPE to use
+    "rope_theta": 10000.0 # base frequency parameter
+}
+
+# If we want to apply a scaled RoPE type, we need to pass extra parameters
+config.rope_parameters = {
+    "rope_type": "linear",
+    "rope_theta": 10000.0,
+    "factor": 8.0 # scale factor for context extension
+}
+```
+
+## Per-Layer-Type RoPE Configuration
+
+Some models, such as Gemma-3, use different layer types with different attention mechanisms, i.e. "full attention" in some blocks and "sliding-window attention" in others. Transformers supports specifying distinct RoPE parameters per layer type for these models. In this case, `rope_parameters` should be a nested dictionary, where top-level keys correspond to `config.layer_types` and values are per-type RoPE parameters. During model initialization, each decoder layer will automatically look up the matching RoPE configuration based on its declared layer type.
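As a quick illustration of how such a nested mapping can be consumed, here is a minimal sketch (not part of the Transformers API; the helper name `rope_params_for_layer` is hypothetical) that resolves the RoPE parameters for a single decoder layer, assuming only the two attributes described above, `config.layer_types` and `config.rope_parameters`:

```python
def rope_params_for_layer(config, layer_idx: int) -> dict:
    """Return the RoPE parameter dict that applies to decoder layer `layer_idx`."""
    rope_parameters = config.rope_parameters
    layer_types = getattr(config, "layer_types", None)

    # Flat configuration: a single dict with a top-level "rope_type" applies to every layer.
    if layer_types is None or "rope_type" in rope_parameters:
        return rope_parameters

    # Nested configuration: pick the entry matching this layer's declared type.
    return rope_parameters[layer_types[layer_idx]]
```

The Gemma-3 configuration example below shows the nested `rope_parameters` layout this sketch expects.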
+ + +```python +from transformers import Gemma3Config + +config = Gemma3Config() +config.rope_parameters = { + "full_attention": { + "rope_type": "dynamic", + "rope_theta": 1000000.0, + "factor": 8.0, + "original_max_position_embeddings": 8096, + }, + "sliding_attention": { + "rope_type": "default", + "rope_theta": 10000.0, + } +} +``` + +# Utilities + +[[autodoc]] RopeParameters + - __call__ + + diff --git a/docs/source/en/modular_transformers.md b/docs/source/en/modular_transformers.md index a341f1b5d799..aa0dd5d05283 100644 --- a/docs/source/en/modular_transformers.md +++ b/docs/source/en/modular_transformers.md @@ -288,7 +288,7 @@ class Olmo2DecoderLayer(OlmoDecoderLayer): output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: residual = hidden_states diff --git a/examples/modular-transformers/configuration_duplicated_method.py b/examples/modular-transformers/configuration_duplicated_method.py index 534c9fd6616b..e0cf091716b2 100644 --- a/examples/modular-transformers/configuration_duplicated_method.py +++ b/examples/modular-transformers/configuration_duplicated_method.py @@ -5,8 +5,10 @@ # modular_duplicated_method.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class DuplicatedMethodConfig(PreTrainedConfig): @@ -65,45 +67,10 @@ class DuplicatedMethodConfig(PreTrainedConfig): results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'duplicated_method3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'duplicated_method3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. 
Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'duplicated_method3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'duplicated_method3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -146,28 +113,27 @@ class DuplicatedMethodConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - head_dim=None, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + pretraining_tp: Optional[int] = 1, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + head_dim: Optional[int] = None, **kwargs, ): self.vocab_size = vocab_size @@ -187,16 +153,17 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.pretraining_tp = pretraining_tp self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else 
self.hidden_size // self.num_attention_heads + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/examples/modular-transformers/configuration_my_new_model.py b/examples/modular-transformers/configuration_my_new_model.py index 924e0cccadd7..d34088c42799 100644 --- a/examples/modular-transformers/configuration_my_new_model.py +++ b/examples/modular-transformers/configuration_my_new_model.py @@ -5,8 +5,10 @@ # modular_my_new_model.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class MyNewModelConfig(PreTrainedConfig): @@ -147,38 +149,30 @@ class MyNewModelConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + pretraining_tp: Optional[int] = 1, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, mlp_bias=True, - head_dim=None, + head_dim: Optional[int] = None, new_param=0, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -196,15 +190,24 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.pretraining_tp = pretraining_tp self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Try to set `rope_scaling` if available, otherwise use 
`rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) self.new_param = new_param diff --git a/examples/modular-transformers/configuration_my_new_model2.py b/examples/modular-transformers/configuration_my_new_model2.py index f8e219f11eb6..8a1415696508 100644 --- a/examples/modular-transformers/configuration_my_new_model2.py +++ b/examples/modular-transformers/configuration_my_new_model2.py @@ -4,9 +4,10 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_my_new_model2.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class MyNewModel2Config(PreTrainedConfig): @@ -51,28 +52,27 @@ class MyNewModel2Config(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - head_dim=None, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + pretraining_tp: Optional[int] = 1, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + head_dim: Optional[int] = None, **kwargs, ): self.vocab_size = vocab_size @@ -92,16 +92,17 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.pretraining_tp = pretraining_tp self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = 
kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/examples/modular-transformers/modeling_my_new_model2.py b/examples/modular-transformers/modeling_my_new_model2.py index eccb55dd128b..0dd5efe4e89b 100644 --- a/examples/modular-transformers/modeling_my_new_model2.py +++ b/examples/modular-transformers/modeling_my_new_model2.py @@ -156,8 +156,8 @@ def __init__(self, config: MyNewModel2Config, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -207,7 +207,6 @@ def __init__(self, config: MyNewModel2Config, layer_idx: int): self.mlp = MyNewModel2MLP(config) self.input_layernorm = MyNewModel2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = MyNewModel2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.attention_type = config.layer_types[layer_idx] def forward( self, @@ -217,7 +216,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states diff --git a/examples/modular-transformers/modeling_super.py b/examples/modular-transformers/modeling_super.py index 2f074fb12702..f349978133db 100644 --- a/examples/modular-transformers/modeling_super.py +++ b/examples/modular-transformers/modeling_super.py @@ -51,8 +51,8 @@ class SuperRotaryEmbedding(nn.Module): def __init__(self, config: SuperConfig, device=None): super().__init__() # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + if hasattr(config, "rope_parameters") and isinstance(config.rope_parameters, dict): + self.rope_type = config.rope_parameters.get("rope_type", config.rope_parameters.get("type")) else: self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings @@ -258,7 +258,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 0a44dc6d0c3d..404b46b65b3f 100755 --- a/src/transformers/__init__.py +++ 
b/src/transformers/__init__.py
@@ -440,7 +440,7 @@
     _import_structure["modeling_flash_attention_utils"] = []
     _import_structure["modeling_layers"] = ["GradientCheckpointingLayer"]
     _import_structure["modeling_outputs"] = []
-    _import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS", "dynamic_rope_update"]
+    _import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS", "dynamic_rope_update", "RopeParameters"]
     _import_structure["modeling_utils"] = ["PreTrainedModel", "AttentionInterface"]
     _import_structure["masking_utils"] = ["AttentionMaskInterface"]
     _import_structure["optimization"] = [
@@ -619,6 +619,7 @@
     from .modelcard import ModelCard as ModelCard
     from .modeling_layers import GradientCheckpointingLayer as GradientCheckpointingLayer
     from .modeling_rope_utils import ROPE_INIT_FUNCTIONS as ROPE_INIT_FUNCTIONS
+    from .modeling_rope_utils import RopeParameters as RopeParameters
     from .modeling_rope_utils import dynamic_rope_update as dynamic_rope_update
     from .modeling_utils import AttentionInterface as AttentionInterface
     from .modeling_utils import PreTrainedModel as PreTrainedModel
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 7f186a32437b..fecf9b46bc71 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -422,6 +422,14 @@ def torch_dtype(self, value):
         logger.warning_once("`torch_dtype` is deprecated! Use `dtype` instead!")
         self.dtype = value

+    @property
+    def rope_scaling(self):
+        return self.rope_parameters
+
+    @rope_scaling.setter
+    def rope_scaling(self, value):
+        self.rope_parameters = value
+
     def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
         """
         Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the
diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py
index 4e24fffe5fd8..927b437c1bc3 100644
--- a/src/transformers/modeling_rope_utils.py
+++ b/src/transformers/modeling_rope_utils.py
@@ -14,7 +14,7 @@
 import math
 from functools import wraps
-from typing import Optional
+from typing import Optional, TypedDict, Union

 from .configuration_utils import PreTrainedConfig
 from .utils import is_torch_available, logging
@@ -27,6 +27,57 @@
     import torch


+def standardize_rope_params(config, rope_theta: Optional[Union[float, dict[str, float]]] = None):
+    """
+    Helper to standardize the config's rope params field by ensuring the params are defined for each
+    layer type. For old models, the function will duplicate a single rope param in each layer type (backward compatibility).
+    """
+    rope_parameters = getattr(config, "rope_parameters", None)
+    layer_types = getattr(config, "layer_types", None)
+    if rope_theta is None:
+        rope_theta = getattr(config, "rope_theta", None)
+
+    # Case 1: one RoPE theta = one RoPE param per model without nesting
+    if not isinstance(rope_theta, dict):
+        if rope_parameters is None:
+            rope_parameters = {"rope_type": "default", "rope_theta": rope_theta}
+        else:
+            # BC: if there is a 'type' field, copy it to 'rope_type'.
+ rope_type = rope_parameters.get("rope_type", rope_parameters.get("type", "default")) + rope_theta = rope_parameters.get("rope_theta") or rope_theta + rope_parameters.update({"rope_theta": rope_theta, "rope_type": rope_type}) + config.rope_parameters = rope_parameters + + # Case 2: different RoPE for each layer as nested dict + else: + rope_parameters_per_layer_type = {} + for layer_type in layer_types: + if rope_parameters is None: + rope_parameters_per_layer_type[layer_type] = { + "rope_type": "default", + "rope_theta": rope_theta[layer_type], + } + else: + is_field_in_new_format = any(layer_type in rope_parameters for layer_type in layer_types) + if not is_field_in_new_format: + curr_rope_type = rope_parameters.get("rope_type", rope_parameters.get("type")) + rope_parameters_per_layer_type[layer_type] = { + **rope_parameters, + "rope_type": curr_rope_type, + "rope_theta": rope_theta[layer_type], + } + else: + curr_rope_type = rope_parameters[layer_type].get( + "rope_type", rope_parameters[layer_type].get("type") + ) + rope_parameters_per_layer_type[layer_type] = { + **rope_parameters[layer_type], + "rope_type": curr_rope_type, + "rope_theta": rope_theta[layer_type], + } + config.rope_parameters = rope_parameters_per_layer_type + + def dynamic_rope_update(rope_forward): """ Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE @@ -40,62 +91,98 @@ def dynamic_rope_update(rope_forward): The decorated forward pass. """ - def longrope_frequency_update(self, position_ids, device): + def longrope_frequency_update(self, position_ids, device, layer_type=None): """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise.""" seq_len = torch.max(position_ids) + 1 - if hasattr(self.config, "original_max_position_embeddings"): - original_max_position_embeddings = self.config.original_max_position_embeddings + original_max_position_embeddings = getattr( + self.config, "original_max_position_embeddings", self.config.max_position_embeddings + ) + if layer_type is None: + rope_type = self.rope_type + original_inv_freq = self.original_inv_freq + prefix = "" else: - original_max_position_embeddings = self.config.max_position_embeddings + rope_type = self.rope_type[layer_type] + original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq") + prefix = f"{layer_type}_" + if seq_len > original_max_position_embeddings: - if not hasattr(self, "long_inv_freq"): - self.long_inv_freq, _ = self.rope_init_fn( - self.config, device, seq_len=original_max_position_embeddings + 1 + if not hasattr(self, f"{layer_type}_long_inv_freq"): + rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type] + long_inv_freq, _ = rope_init_fn( + self.config, + device, + seq_len=original_max_position_embeddings + 1, + layer_type=layer_type, ) - self.register_buffer("inv_freq", self.long_inv_freq, persistent=False) + self.register_buffer(f"{prefix}inv_freq", long_inv_freq, persistent=False) + setattr(self, f"{prefix}long_inv_freq", long_inv_freq) else: # This .to() is needed if the model has been moved to a device after being initialized (because # the buffer is automatically moved, but not the original copy) - self.original_inv_freq = self.original_inv_freq.to(device) - self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + original_inv_freq = original_inv_freq.to(device) + self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False) + setattr(self, f"{prefix}original_inv_freq", original_inv_freq) - def 
dynamic_frequency_update(self, position_ids, device): + def dynamic_frequency_update(self, position_ids, device, layer_type=None): """ dynamic RoPE layers should recompute `inv_freq` in the following situations: 1 - growing beyond the cached sequence length (allow scaling) 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) """ seq_len = torch.max(position_ids) + 1 - if seq_len > self.max_seq_len_cached: # growth - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len) - self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation - self.max_seq_len_cached = seq_len + if layer_type is None: + rope_type = self.rope_type + max_seq_len_cached = self.max_seq_len_cached + original_inv_freq = self.original_inv_freq + prefix = "" + else: + rope_type = self.rope_type[layer_type] + max_seq_len_cached = getattr(self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached) + original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq") + prefix = f"{layer_type}_" + + if seq_len > max_seq_len_cached: # growth + rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type] + inv_freq, self.attention_scaling = rope_init_fn( + self.config, + device, + seq_len=seq_len, + layer_type=layer_type, + ) + # TODO joao: may break with compilation + self.register_buffer(f"{prefix}inv_freq", inv_freq, persistent=False) + setattr(self, f"{layer_type}_max_seq_len_cached", seq_len) - if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + if seq_len < self.original_max_seq_len and max_seq_len_cached > self.original_max_seq_len: # reset # This .to() is needed if the model has been moved to a device after being initialized (because # the buffer is automatically moved, but not the original copy) - self.original_inv_freq = self.original_inv_freq.to(device) - self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) - self.max_seq_len_cached = self.original_max_seq_len + original_inv_freq = original_inv_freq.to(device) + self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False) + setattr(self, f"{prefix}original_inv_freq", original_inv_freq) + setattr(self, f"{layer_type}_max_seq_len_cached", self.original_max_seq_len) @wraps(rope_forward) - def wrapper(self, x, position_ids): - if "dynamic" in self.rope_type: - dynamic_frequency_update(self, position_ids, device=x.device) - elif self.rope_type == "longrope": - longrope_frequency_update(self, position_ids, device=x.device) - return rope_forward(self, x, position_ids) + def wrapper(self, x, position_ids, layer_type=None): + rope_type = self.rope_type if layer_type is None else self.rope_type[layer_type] + kwargs = {"layer_type": layer_type} if layer_type is not None else {} + if "dynamic" in rope_type: + dynamic_frequency_update(self, position_ids, device=x.device, **kwargs) + elif rope_type == "longrope": + longrope_frequency_update(self, position_ids, device=x.device, **kwargs) + return rope_forward(self, x, position_ids, **kwargs) return wrapper -def _compute_default_rope_parameters( +def _compute_linear_scaling_rope_parameters( config: Optional[PreTrainedConfig] = None, device: Optional["torch.device"] = None, seq_len: Optional[int] = None, + layer_type: Optional[str] = None, ) -> tuple["torch.Tensor", float]: """ - Computes the inverse frequencies according to the original RoPE implementation + Computes the inverse frequencies with linear scaling. 
Credits to the Reddit user /u/kaiokendev Args: config ([`~transformers.PreTrainedConfig`]): The model configuration. This function assumes that the config will provide at least the following @@ -120,53 +207,20 @@ def _compute_default_rope_parameters( Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ - base = config.rope_theta + # For backward compatibility standardize the `rope_parameters_dict` if it uses old format + standardize_rope_params(config) + rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters + factor = rope_parameters_dict["factor"] + + # Gets the default RoPE parameters + base = rope_parameters_dict["rope_theta"] partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) - attention_factor = 1.0 # Unused in this type of RoPE # Compute the inverse frequencies inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)) - return inv_freq, attention_factor - - -def _compute_linear_scaling_rope_parameters( - config: Optional[PreTrainedConfig] = None, - device: Optional["torch.device"] = None, - seq_len: Optional[int] = None, -) -> tuple["torch.Tensor", float]: - """ - Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev - Args: - config ([`~transformers.PreTrainedConfig`]): - The model configuration. This function assumes that the config will provide at least the following - properties: - - * rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived. - * hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly. - * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly. - - Additionally, this function will make use of the following properties if they are found in the config: - - * head_dim (`int`, *optional*): The size of the key-value heads in the model. If None, this value will be - derived as hidden_size // num_attention_heads. - * partial_rotary_factor (`float`, *optional*): If less than 1.0, inverse frequencies will be returned for - the first fraction of the head_dim. Defaults to 1.0. - device (`torch.device`): - The device to use for initialization of the inverse frequencies. - seq_len (`int`, *optional*): - The current sequence length. Unused for this type of RoPE. - - Returns: - Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the - post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). - """ - factor = config.rope_scaling["factor"] - - # Gets the default RoPE parameters - inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len) # Then applies linear scaling to the frequencies. # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so @@ -179,6 +233,7 @@ def _compute_dynamic_ntk_parameters( config: Optional[PreTrainedConfig] = None, device: Optional["torch.device"] = None, seq_len: Optional[int] = None, + layer_type: Optional[str] = None, ) -> tuple["torch.Tensor", float]: """ Computes the inverse frequencies with NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla @@ -193,7 +248,7 @@ def _compute_dynamic_ntk_parameters( * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly. * max_position_embeddings (`int`): The default sequence length used to update the dynamic RoPE at inference time - * rope_scaling (`dict[str, float]`): The standard RoPE scaling parameters, from which `factor` + * rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which `factor` will be accessed. The value of `factor` is used to determine the new base frequency, along with the current sequence length (seq_len), the maximum positional embeddings (max_position_embeddings), and the computed dimensionality (dim) of the rotary embeddings. If seq_len <= max_position_embeddings, this @@ -216,14 +271,17 @@ def _compute_dynamic_ntk_parameters( Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ - # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling - base = config.rope_theta - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + # TODO (joao): use the new `original_max_position_embeddings` from rope_parameters + # For backward compatibility standardize the `rope_parameters_dict` if it uses old format + standardize_rope_params(config) + rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters + + base = rope_parameters_dict["rope_theta"] + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) max_position_embeddings = config.max_position_embeddings - factor = config.rope_scaling["factor"] - + factor = rope_parameters_dict["factor"] attention_factor = 1.0 # Unused in this type of RoPE # seq_len: default to max_position_embeddings, e.g. at init time @@ -244,7 +302,10 @@ def _compute_dynamic_ntk_parameters( def _compute_yarn_parameters( - config: PreTrainedConfig, device: "torch.device", seq_len: Optional[int] = None + config: PreTrainedConfig, + device: "torch.device", + seq_len: Optional[int] = None, + layer_type: Optional[str] = None, ) -> tuple["torch.Tensor", float]: """ Computes the inverse frequencies with NTK scaling. Please refer to the @@ -259,7 +320,7 @@ def _compute_yarn_parameters( * hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly. * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly. * max_position_embeddings (`int`): The maximum length of the positional embeddings. - * rope_scaling (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following + * rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following keys will be accessed: * `attention_factor` (`float`, *optional*): The scaling factor to be applied to the computed cos/sin. If None, the value is inferred from `factor`, `mscale`, and `mscale_all_dim` as avaialble. @@ -298,18 +359,28 @@ def _compute_yarn_parameters( Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin. 
""" + # For backward compatibility standardize the `rope_parameters_dict` if it uses old format + standardize_rope_params(config) + rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters - base = config.rope_theta - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + base = rope_parameters_dict["rope_theta"] + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) - factor = config.rope_scaling["factor"] - attention_factor = config.rope_scaling.get("attention_factor") - mscale = config.rope_scaling.get("mscale") - mscale_all_dim = config.rope_scaling.get("mscale_all_dim") - original_max_position_embeddings = ( - config.rope_scaling.get("original_max_position_embeddings") or config.max_position_embeddings - ) + + factor = rope_parameters_dict["factor"] + attention_factor = rope_parameters_dict.get("attention_factor") + mscale = rope_parameters_dict.get("mscale") + mscale_all_dim = rope_parameters_dict.get("mscale_all_dim") + + # NOTE: DeekSeek-V3 (and potentially other models) modify `max_position_embeddings` and have a + # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two + # values to compute the default attention scaling factor, instead of using `factor`. + if "original_max_position_embeddings" in rope_parameters_dict: + original_max_position_embeddings = rope_parameters_dict["original_max_position_embeddings"] + factor = config.max_position_embeddings / original_max_position_embeddings + else: + original_max_position_embeddings = config.max_position_embeddings def get_mscale(scale, mscale=1): if scale <= 1: @@ -324,9 +395,9 @@ def get_mscale(scale, mscale=1): attention_factor = get_mscale(factor) # Optional config options - # beta_fast/beta_slow: as suggested in the paper, default to 32 and 1 respectively - beta_fast = config.rope_scaling.get("beta_fast") or 32 - beta_slow = config.rope_scaling.get("beta_slow") or 1 + # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly) + beta_fast = rope_parameters_dict.get("beta_fast") or 32 + beta_slow = rope_parameters_dict.get("beta_slow") or 1 # Compute the inverse frequencies def find_correction_dim(num_rotations, dim, base, max_position_embeddings): @@ -356,7 +427,7 @@ def linear_ramp_factor(min, max, dim): inv_freq_extrapolation = 1.0 / pos_freqs inv_freq_interpolation = 1.0 / (factor * pos_freqs) - truncate = config.rope_scaling.get("truncate", True) + truncate = config.rope_parameters.get("truncate", True) low, high = find_correction_range(beta_fast, beta_slow, dim, base, original_max_position_embeddings, truncate) # Get n-dimensional rotational scaling corrected for extrapolation @@ -369,7 +440,10 @@ def linear_ramp_factor(min, max, dim): def _compute_longrope_parameters( - config: PreTrainedConfig, device: "torch.device", seq_len: Optional[int] = None + config: PreTrainedConfig, + device: "torch.device", + seq_len: Optional[int] = None, + layer_type: Optional[str] = None, ) -> tuple["torch.Tensor", float]: """ Computes the inverse frequencies with LongRoPE scaling. Please refer to the @@ -386,7 +460,7 @@ def _compute_longrope_parameters( * max_position_embeddings (`int`): The maximum length of the positional embeddings. 
* original_max_position_embeddings (`int`, *optional*): The original max position embeddings used during pretraining. If not provided, defaults to `max_position_embeddings`. - * rope_scaling (`dict[str, float]`): The standard RoPE scaling parameters, from which the following keys + * rope_parameters (`dict[str, float]`): The standard RoPE scaling parameters, from which the following keys will be accessed: * `attention_factor` (`float`, *optional*): The scaling factor to be applied on the attention computation. If unspecified, it defaults to value recommended by the implementation, inferred from @@ -414,15 +488,20 @@ def _compute_longrope_parameters( Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin. """ - # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling - base = config.rope_theta - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + # TODO (joao): use the new `original_max_position_embeddings` from rope_parameters + # For backward compatibility standardize the `rope_parameters_dict` if it uses old format + standardize_rope_params(config) + rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters + + base = rope_parameters_dict["rope_theta"] + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) - long_factor = config.rope_scaling["long_factor"] - short_factor = config.rope_scaling["short_factor"] - factor = config.rope_scaling.get("factor") - attention_factor = config.rope_scaling.get("attention_factor") + + long_factor = rope_parameters_dict["long_factor"] + short_factor = rope_parameters_dict["short_factor"] + factor = rope_parameters_dict.get("factor") + attention_factor = rope_parameters_dict.get("attention_factor") # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two @@ -451,7 +530,10 @@ def _compute_longrope_parameters( def _compute_llama3_parameters( - config: PreTrainedConfig, device: "torch.device", seq_len: Optional[int] = None + config: PreTrainedConfig, + device: "torch.device", + seq_len: Optional[int] = None, + layer_type: Optional[str] = None, ) -> tuple["torch.Tensor", float]: """ Computes the inverse frequencies for llama 3.1. @@ -464,7 +546,7 @@ def _compute_llama3_parameters( * rope_theta (`float`): The base wavelength from which the inverse frequencies will be derived. * hidden_size (`int`): The numerator when deriving a head_dim, if not provided directly. * num_attention_heads (`int`): The denominator when deriving a head_dim, if not provided directly. 
- * rope_scaling (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following + * rope_parameters (`dict[str, float | int]`): The standard RoPE scaling parameters, from which the following keys will be accessed: * `factor` (`float`, *optional*): The scaling factor applied to the inverse frequencies when 1) the wavelength is greater than `low_freq_wavelen` prior to smoothing, and 2) to all inverse frequencies @@ -491,13 +573,24 @@ def _compute_llama3_parameters( Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the post-processing scaling factor applied to the computed cos/sin. """ + # For backward compatibility standardize the `rope_parameters_dict` if it uses old format + standardize_rope_params(config) + rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters + # Gets the default RoPE parameters - inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len) + base = rope_parameters_dict["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)) - factor = config.rope_scaling["factor"] # `8` in the original implementation - low_freq_factor = config.rope_scaling["low_freq_factor"] # `1` in the original implementation - high_freq_factor = config.rope_scaling["high_freq_factor"] # `4` in the original implementation - old_context_len = config.rope_scaling["original_max_position_embeddings"] # `8192` in the original implementation + factor = rope_parameters_dict["factor"] # `8` in the original implementation + low_freq_factor = rope_parameters_dict["low_freq_factor"] # `1` in the original implementation + high_freq_factor = rope_parameters_dict["high_freq_factor"] # `4` in the original implementation + old_context_len = rope_parameters_dict["original_max_position_embeddings"] # `8192` in the original implementation low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor @@ -516,10 +609,9 @@ def _compute_llama3_parameters( # This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters -# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE +# from the model config. You can append new {'rope_type': callable} pairs to this rope_parameters to enable custom RoPE # parameterizations, as long as the callable has the same signature. 
ROPE_INIT_FUNCTIONS = { - "default": _compute_default_rope_parameters, "linear": _compute_linear_scaling_rope_parameters, "dynamic": _compute_dynamic_ntk_parameters, "yarn": _compute_yarn_parameters, @@ -535,7 +627,7 @@ def _check_received_keys( optional_keys: Optional[set] = None, ignore_keys: Optional[set] = None, ): - """Compare the received keys in `config.rope_scaling` against the expected and optional keys""" + """Compare the received keys in `config.rope_parameters` against the expected and optional keys""" # BC: "rope_type" was originally "type" -- let's check for "rope_type" when "type" is present if "type" in received_keys: received_keys -= {"type"} @@ -547,54 +639,57 @@ def _check_received_keys( missing_keys = required_keys - received_keys if missing_keys: - raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}") + raise KeyError(f"Missing required keys in `rope_parameters` for 'rope_type'='{rope_type}': {missing_keys}") if optional_keys is not None: unused_keys = received_keys - required_keys - optional_keys else: unused_keys = received_keys - required_keys if unused_keys: - logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}") + logger.warning(f"Unrecognized keys in `rope_parameters` for 'rope_type'='{rope_type}': {unused_keys}") -def _validate_default_rope_parameters(config: PreTrainedConfig, ignore_keys: Optional[set] = None): - rope_scaling = config.rope_scaling - rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" - required_keys = {"rope_type"} - received_keys = set(rope_scaling.keys()) +def _validate_default_rope_parameters( + rope_parameters: dict, config: Optional[PreTrainedConfig] = None, ignore_keys: Optional[set] = None +): + required_keys = {"rope_type", "rope_theta"} + received_keys = set(rope_parameters.keys()) + rope_type = rope_parameters["rope_type"] _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) -def _validate_linear_scaling_rope_parameters(config: PreTrainedConfig, ignore_keys: Optional[set] = None): - rope_scaling = config.rope_scaling - rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" - required_keys = {"rope_type", "factor"} - received_keys = set(rope_scaling.keys()) +def _validate_linear_scaling_rope_parameters( + rope_parameters: dict, config: Optional[PreTrainedConfig] = None, ignore_keys: Optional[set] = None +): + required_keys = {"rope_type", "factor", "rope_theta"} + received_keys = set(rope_parameters.keys()) + rope_type = rope_parameters["rope_type"] _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) - factor = rope_scaling["factor"] + factor = rope_parameters["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: - logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") -def _validate_dynamic_scaling_rope_parameters(config: PreTrainedConfig, ignore_keys: Optional[set] = None): - rope_scaling = config.rope_scaling - rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" - required_keys = {"rope_type", "factor"} +def _validate_dynamic_scaling_rope_parameters( + rope_parameters: dict, config: Optional[PreTrainedConfig] = None, ignore_keys: Optional[set] = None 
+): # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` optional_keys = {"original_max_position_embeddings"} - received_keys = set(rope_scaling.keys()) + required_keys = {"rope_type", "factor"} + received_keys = set(rope_parameters.keys()) + rope_type = rope_parameters["rope_type"] _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) - factor = rope_scaling["factor"] + factor = rope_parameters["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: - logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") -def _validate_yarn_parameters(config: PreTrainedConfig, ignore_keys: Optional[set] = None): - rope_scaling = config.rope_scaling - rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" - required_keys = {"rope_type", "factor"} +def _validate_yarn_parameters( + rope_parameters: dict, config: Optional[PreTrainedConfig] = None, ignore_keys: Optional[set] = None +): + required_keys = {"rope_type", "factor", "rope_theta"} optional_keys = { "attention_factor", "beta_fast", @@ -602,143 +697,148 @@ def _validate_yarn_parameters(config: PreTrainedConfig, ignore_keys: Optional[se "original_max_position_embeddings", "mscale", "mscale_all_dim", - "truncate", } - received_keys = set(rope_scaling.keys()) + received_keys = set(rope_parameters.keys()) + rope_type = rope_parameters["rope_type"] _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) - factor = rope_scaling["factor"] + factor = rope_parameters["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: - logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") - attention_factor = rope_scaling.get("attention_factor") + attention_factor = rope_parameters.get("attention_factor") if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0): logger.warning( - f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" + f"`rope_parameters`'s attention_factor field must be a float greater than 0, got {attention_factor}" ) - beta_fast = rope_scaling.get("beta_fast") + beta_fast = rope_parameters.get("beta_fast") if beta_fast is not None and not isinstance(beta_fast, float): - logger.warning(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}") - beta_slow = rope_scaling.get("beta_slow") + logger.warning(f"`rope_parameters`'s beta_fast field must be a float, got {beta_fast}") + beta_slow = rope_parameters.get("beta_slow") if beta_slow is not None and not isinstance(beta_slow, float): - logger.warning(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}") + logger.warning(f"`rope_parameters`'s beta_slow field must be a float, got {beta_slow}") if (beta_fast or 32) < (beta_slow or 1): logger.warning( - f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} " + f"`rope_parameters`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} " f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)" ) - # Models should set `config.rope_scaling["original_max_position_embeddings"]` to their original (pre-yarn) context + 
# Models should set `config.rope_parameters["original_max_position_embeddings"]` to their original (pre-yarn) context # length, with `config.max_position_embeddings` corresponding to their post-yarn context length. # However, for BC purposes, we allow the former to be unset. - original_max_position_embeddings = config.rope_scaling.get("original_max_position_embeddings") + original_max_position_embeddings = config.rope_parameters.get("original_max_position_embeddings") if original_max_position_embeddings is not None: # Double-check: `factor` should be the ratio between the pre-yarn and post-yarn context lengths. implicit_factor = config.max_position_embeddings / original_max_position_embeddings if implicit_factor != factor: logger.warning_once( - f"The explicitly set RoPE scaling factor (config.rope_scaling['factor'] = {factor}) does not match " + f"The explicitly set RoPE scaling factor (config.rope_parameters['factor'] = {factor}) does not match " "the ratio implicitly set by other parameters (implicit factor = " "post-yarn context length / pre-yarn context length = " - "config.max_position_embeddings / config.rope_scaling['original_max_position_embeddings'] = " + "config.max_position_embeddings / config.rope_parameters['original_max_position_embeddings'] = " f"{implicit_factor}). Using the explicit factor ({factor}) in YaRN. This may cause unexpected " "behaviour in model usage, please correct the 'max_position_embeddings' fields in the model config." ) - # No `config.rope_scaling["original_max_position_embeddings"]`. Is `config.max_position_embeddings` the + # No `config.rope_parameters["original_max_position_embeddings"]`. Is `config.max_position_embeddings` the # pre-yarn or the post-yarn context length? # BC: we assume it is the pre-yarn context length. else: logger.warning_once( - "config.rope_scaling['original_max_position_embeddings'], the pre-yarn context length, is unset. We will " + "config.rope_parameters['original_max_position_embeddings'], the pre-yarn context length, is unset. We will " "**assume** config.max_position_embeddings holds the pre-yarn context length. Some use cases may expect " "config.max_position_embeddings to hold the post-yarn context length (pre-yarn context length * " "factor) -- we recommend updating both fields for optimal downstream model usage." 
) -def _validate_longrope_parameters(config: PreTrainedConfig, ignore_keys: Optional[set] = None): - rope_scaling = config.rope_scaling - rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" - required_keys = {"rope_type", "short_factor", "long_factor"} +def _validate_longrope_parameters(rope_parameters: dict, config: PreTrainedConfig, ignore_keys: Optional[set] = None): + required_keys = {"rope_type", "short_factor", "long_factor", "rope_theta"} # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"} - received_keys = set(rope_scaling.keys()) + received_keys = set(rope_parameters.keys()) + rope_type = rope_parameters["rope_type"] _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) - short_factor = rope_scaling.get("short_factor") + short_factor = rope_parameters.get("short_factor") if not isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor): - logger.warning(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}") + logger.warning(f"`rope_parameters`'s short_factor field must be a list of numbers, got {short_factor}") if len(short_factor) != dim // 2: - logger.warning(f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}") + logger.warning(f"`rope_parameters`'s short_factor field must have length {dim // 2}, got {len(short_factor)}") - long_factor = rope_scaling.get("long_factor") + long_factor = rope_parameters.get("long_factor") if not isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor): - logger.warning(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}") + logger.warning(f"`rope_parameters`'s long_factor field must be a list of numbers, got {long_factor}") if len(long_factor) != dim // 2: - logger.warning(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}") + logger.warning(f"`rope_parameters`'s long_factor field must have length {dim // 2}, got {len(long_factor)}") # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over - # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is + # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_parameters` and is # unique to longrope (= undesirable) if hasattr(config, "original_max_position_embeddings"): logger.warning_once( "This model has set a `original_max_position_embeddings` field, to be used together with " - "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`" + "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_parameters`" "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, " "as it is compatible with most model architectures." 
) else: - factor = rope_scaling.get("factor") + factor = rope_parameters.get("factor") if factor is None: - logger.warning("Missing required keys in `rope_scaling`: 'factor'") + logger.warning("Missing required keys in `rope_parameters`: 'factor'") elif not isinstance(factor, float) or factor < 1.0: - logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") - attention_factor = rope_scaling.get("attention_factor") + attention_factor = rope_parameters.get("attention_factor") if attention_factor is not None: if not isinstance(attention_factor, float) or attention_factor < 0.0: logger.warning( - f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" + f"`rope_parameters`'s attention_factor field must be a float greater than 0, got {attention_factor}" ) -def _validate_llama3_parameters(config: PreTrainedConfig, ignore_keys: Optional[set] = None): - rope_scaling = config.rope_scaling - rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" - required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"} - received_keys = set(rope_scaling.keys()) +def _validate_llama3_parameters(rope_parameters: dict, config: PreTrainedConfig, ignore_keys: Optional[set] = None): + required_keys = { + "rope_type", + "factor", + "original_max_position_embeddings", + "low_freq_factor", + "high_freq_factor", + "rope_theta", + } + rope_type = rope_parameters["rope_type"] + received_keys = set(rope_parameters.keys()) _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) - factor = rope_scaling["factor"] + factor = rope_parameters["factor"] if factor is None or not isinstance(factor, float) or factor < 1.0: - logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") - low_freq_factor = rope_scaling["low_freq_factor"] - high_freq_factor = rope_scaling["high_freq_factor"] + low_freq_factor = rope_parameters["low_freq_factor"] + high_freq_factor = rope_parameters["high_freq_factor"] if low_freq_factor is None or not isinstance(low_freq_factor, float): - logger.warning(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}") + logger.warning(f"`rope_parameters`'s low_freq_factor field must be a float, got {low_freq_factor}") if high_freq_factor is None or not isinstance(high_freq_factor, float): - logger.warning(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}") + logger.warning(f"`rope_parameters`'s high_freq_factor field must be a float, got {high_freq_factor}") if high_freq_factor <= low_freq_factor: logger.warning( - "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=" + "`rope_parameters`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=" f"{high_freq_factor} and low_freq_factor={low_freq_factor}" ) - original_max_position_embeddings = rope_scaling["original_max_position_embeddings"] + original_max_position_embeddings = rope_parameters["original_max_position_embeddings"] if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int): logger.warning( - "`rope_scaling`'s original_max_position_embeddings field must be an integer, got 
" + "`rope_parameters`'s original_max_position_embeddings field must be an integer, got " f"{original_max_position_embeddings}" ) if original_max_position_embeddings >= config.max_position_embeddings: logger.warning( - "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got " + "`rope_parameters`'s original_max_position_embeddings field must be less than max_position_embeddings, got " f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}" ) @@ -758,16 +858,80 @@ def rope_config_validation(config: PreTrainedConfig, ignore_keys: Optional[set] """ Validate the RoPE config arguments, given a `PreTrainedConfig` object """ - rope_scaling = getattr(config, "rope_scaling", None) # not a default parameter in `PreTrainedConfig` - if rope_scaling is None: + rope_parameters_dict = getattr(config, "rope_parameters", None) # not a default parameter in `PreTrainedConfig` + if rope_parameters_dict is None: return - # BC: "rope_type" was originally "type" - rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default")) - validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type) - if validation_fn is not None: - validation_fn(config, ignore_keys=ignore_keys) + if getattr(config, "layer_types", None) is not None and all( + key in config.layer_types for key in rope_parameters_dict.keys() + ): + pass else: - logger.warning( - f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" - ) + rope_parameters_dict = {"full_attention": rope_parameters_dict} + + for rope_parameters in rope_parameters_dict.values(): + rope_type = rope_parameters.get("rope_type", rope_parameters.get("type", "default")) + validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type) + + rope_parameters["rope_type"] = rope_type + # BC: "rope_theta" was originally saved in config + rope_parameters["rope_theta"] = rope_parameters.get("rope_theta", getattr(config, "rope_theta", None)) + + if validation_fn is not None: + validation_fn(rope_parameters, config=config, ignore_keys=ignore_keys) + else: + logger.warning( + f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" + ) + + +class RopeParameters(TypedDict): + """ + Args: + rope_theta (`float`): + The base period of the RoPE embeddings. + rope_type (`str`, *optional*, defaults to "default"): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + factor (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + original_max_position_embeddings (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + attention_factor (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + beta_fast (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + beta_slow (`float`, *optional*): + Only used with 'yarn'. 
Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + short_factor (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + long_factor (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + low_freq_factor (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + high_freq_factor (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + """ + + rope_theta: float + rope_type: Optional[str] + factor: Optional[float] + original_max_position_embeddings: Optional[int] + attention_factor: Optional[float] + beta_fast: Optional[float] + beta_slow: Optional[float] + short_factor: Optional[list[float]] + long_factor: Optional[list[float]] + low_freq_factor: Optional[float] + high_freq_factor: Optional[float] diff --git a/src/transformers/models/apertus/configuration_apertus.py b/src/transformers/models/apertus/configuration_apertus.py index 3a61b780c80d..92eef2778134 100644 --- a/src/transformers/models/apertus/configuration_apertus.py +++ b/src/transformers/models/apertus/configuration_apertus.py @@ -19,9 +19,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class ApertusConfig(PreTrainedConfig): @@ -74,45 +75,10 @@ class ApertusConfig(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 12000000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. 
If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -150,31 +116,31 @@ class ApertusConfig(PreTrainedConfig): def __init__( self, - vocab_size=131072, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="xielu", - max_position_embeddings=65536, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=3, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=12000000.0, - rope_scaling={ + vocab_size: Optional[int] = 131072, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 14336, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "xielu", + max_position_embeddings: Optional[int] = 65536, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 3, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters] = { "rope_type": "llama3", + "rope_theta": 12000000.0, "factor": 8.0, "original_max_position_embeddings": 8192, "low_freq_factor": 1.0, "high_freq_factor": 4.0, }, - attention_bias=False, - attention_dropout=0.0, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, **kwargs, ): self.vocab_size = vocab_size @@ -193,14 +159,15 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 12000000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/apertus/modeling_apertus.py b/src/transformers/models/apertus/modeling_apertus.py index 46bb78e58b1a..e92e87a3c280 100644 --- a/src/transformers/models/apertus/modeling_apertus.py +++ b/src/transformers/models/apertus/modeling_apertus.py @@ -80,20 +80,49 @@ class ApertusRotaryEmbedding(nn.Module): def __init__(self, config: ApertusConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[ApertusConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -378,16 +407,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/apertus/modular_apertus.py b/src/transformers/models/apertus/modular_apertus.py index 37cd61da3843..5f5ba8ca4a4e 100644 --- a/src/transformers/models/apertus/modular_apertus.py +++ b/src/transformers/models/apertus/modular_apertus.py @@ -20,6 +20,7 @@ from torch import nn from ...cache_utils import Cache +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -92,45 +93,10 @@ class ApertusConfig(LlamaConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 12000000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -162,31 +128,31 @@ class ApertusConfig(LlamaConfig): def __init__( self, - vocab_size=131072, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="xielu", - max_position_embeddings=65536, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=3, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=12000000.0, - rope_scaling={ + vocab_size: Optional[int] = 131072, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 14336, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "xielu", + max_position_embeddings: Optional[int] = 65536, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 3, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters] = { "rope_type": "llama3", + "rope_theta": 12000000.0, "factor": 8.0, "original_max_position_embeddings": 8192, "low_freq_factor": 1.0, "high_freq_factor": 4.0, }, - attention_bias=False, - attention_dropout=0.0, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, **kwargs, ): super().__init__( @@ -205,8 +171,7 @@ def __init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, attention_bias=attention_bias, attention_dropout=attention_dropout, **kwargs, @@ -215,6 +180,11 @@ def __init__( del self.mlp_bias del self.head_dim + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 12000000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + class ApertusMLP(NemotronMLP): def __init__(self, config): diff --git a/src/transformers/models/arcee/configuration_arcee.py b/src/transformers/models/arcee/configuration_arcee.py index 57c745629d66..b4e23ffb3b8f 100644 --- a/src/transformers/models/arcee/configuration_arcee.py +++ b/src/transformers/models/arcee/configuration_arcee.py @@ -19,8 +19,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class ArceeConfig(PreTrainedConfig): @@ -75,30 +77,10 @@ class ArceeConfig(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'yarn'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'yarn'. The original max position embeddings used during pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn'. The scaling factor to be applied on the attention computation. If unspecified, - it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -139,27 +121,26 @@ class ArceeConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=2560, - intermediate_size=18432, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="relu2", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=None, - bos_token_id=128000, - eos_token_id=128001, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - head_dim=None, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 2560, + intermediate_size: Optional[int] = 18432, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "relu2", + max_position_embeddings: Optional[int] = 4096, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 128000, + eos_token_id: Optional[int] = 128001, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + head_dim: Optional[int] = None, **kwargs, ): self.vocab_size = vocab_size @@ -178,16 +159,17 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/arcee/modeling_arcee.py b/src/transformers/models/arcee/modeling_arcee.py index c87a2f0fcadc..619e72b7a11b 100644 --- a/src/transformers/models/arcee/modeling_arcee.py +++ b/src/transformers/models/arcee/modeling_arcee.py @@ -87,20 +87,49 @@ class ArceeRotaryEmbedding(nn.Module): def __init__(self, config: ArceeConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[ArceeConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -219,8 +248,8 @@ def __init__(self, config: ArceeConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -279,7 +308,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -383,16 +412,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/arcee/modular_arcee.py b/src/transformers/models/arcee/modular_arcee.py index 3fb38fcb4ce1..cb75888957d8 100644 --- a/src/transformers/models/arcee/modular_arcee.py +++ b/src/transformers/models/arcee/modular_arcee.py @@ -14,8 +14,11 @@ # limitations under the License. """PyTorch Arcee model.""" +from typing import Optional + from transformers.utils import auto_docstring, logging +from ...modeling_rope_utils import RopeParameters from ..llama.configuration_llama import LlamaConfig from ..llama.modeling_llama import ( LlamaForCausalLM, @@ -81,30 +84,10 @@ class ArceeConfig(LlamaConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'yarn'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'yarn'. The original max position embeddings used during pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn'. The scaling factor to be applied on the attention computation. If unspecified, - it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. 
Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -139,27 +122,26 @@ class ArceeConfig(LlamaConfig): def __init__( self, - vocab_size=32000, - hidden_size=2560, - intermediate_size=18432, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="relu2", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=None, - bos_token_id=128000, - eos_token_id=128001, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - head_dim=None, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 2560, + intermediate_size: Optional[int] = 18432, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "relu2", + max_position_embeddings: Optional[int] = 4096, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 128000, + eos_token_id: Optional[int] = 128001, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + head_dim: Optional[int] = None, **kwargs, ): super().__init__( @@ -178,8 +160,7 @@ def __init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, attention_bias=attention_bias, attention_dropout=attention_dropout, mlp_bias=mlp_bias, diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py index 8aed0d4a812d..78669c78bcbb 100644 --- a/src/transformers/models/aria/configuration_aria.py +++ b/src/transformers/models/aria/configuration_aria.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ..auto import CONFIG_MAPPING, AutoConfig @@ -77,45 +77,10 @@ class AriaTextConfig(PreTrainedConfig): results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. 
- rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -153,28 +118,27 @@ class AriaTextConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, intermediate_size: int = 4096, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, pad_token_id=2, - bos_token_id=1, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - head_dim=None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + pretraining_tp: Optional[int] = 1, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + head_dim: Optional[int] = None, moe_num_experts: int = 8, moe_topk: int = 2, moe_num_shared_experts: int = 2, @@ -201,16 +165,17 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.pretraining_tp = pretraining_tp self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py index bea7748e7bb1..a6547a6aef5a 100644 --- a/src/transformers/models/aria/modeling_aria.py +++ b/src/transformers/models/aria/modeling_aria.py @@ -471,8 +471,8 @@ def __init__(self, config: AriaTextConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -542,7 +542,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -618,20 +618,49 @@ class AriaTextRotaryEmbedding(nn.Module): def __init__(self, config: AriaTextConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[AriaTextConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -708,16 +737,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 4853ef361eb8..042f157cbee4 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -147,45 +147,10 @@ class AriaTextConfig(LlamaConfig): results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): diff --git a/src/transformers/models/bamba/configuration_bamba.py b/src/transformers/models/bamba/configuration_bamba.py index dd31dba59498..07fd3eaa1aab 100644 --- a/src/transformers/models/bamba/configuration_bamba.py +++ b/src/transformers/models/bamba/configuration_bamba.py @@ -14,7 +14,10 @@ # limitations under the License. """Bamba model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -102,7 +105,10 @@ class BambaConfig(PreTrainedConfig): Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block z_loss_coefficient (`float`, *optional*, defaults to 0.0): Coefficient for auxiliary z-loss used to control logit growth during training - + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. 
""" model_type = "bamba" @@ -110,34 +116,35 @@ class BambaConfig(PreTrainedConfig): def __init__( self, - vocab_size=128000, - tie_word_embeddings=False, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - hidden_act="silu", - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - num_logits_to_keep=1, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - max_position_embeddings=262144, - attention_dropout=0.0, - attn_layer_indices=None, - mamba_n_heads=128, - mamba_d_head="auto", - mamba_n_groups=1, - mamba_d_state=256, - mamba_d_conv=4, - mamba_expand=2, - mamba_chunk_size=256, - mamba_conv_bias=True, - mamba_proj_bias=False, - z_loss_coefficient=0.0, + vocab_size: Optional[int] = 128000, + tie_word_embeddings: Optional[bool] = False, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 14336, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 8, + hidden_act: Optional[str] = "silu", + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-5, + use_cache: Optional[bool] = True, + num_logits_to_keep: Optional[int] = 1, + pad_token_id: Optional[int] = 0, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + max_position_embeddings: Optional[int] = 262144, + attention_dropout: Optional[float] = 0.0, + attn_layer_indices: Optional[list[int]] = None, + mamba_n_heads: Optional[int] = 128, + mamba_d_head: Optional[str] = "auto", + mamba_n_groups: Optional[int] = 1, + mamba_d_state: Optional[int] = 256, + mamba_d_conv: Optional[int] = 4, + mamba_expand: Optional[int] = 2, + mamba_chunk_size: Optional[int] = 256, + mamba_conv_bias: Optional[bool] = True, + mamba_proj_bias: Optional[bool] = False, + z_loss_coefficient: Optional[float] = 0.0, + rope_parameters: Optional[RopeParameters] = None, **kwargs, ): self.vocab_size = vocab_size @@ -164,9 +171,15 @@ def __init__( self.num_logits_to_keep = num_logits_to_keep self.attn_layer_indices = attn_layer_indices - self.rope_theta = 10000.0 - self.rope_scaling = None + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` self.partial_rotary_factor = 0.5 + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) mamba_intermediate = mamba_expand * hidden_size diff --git a/src/transformers/models/bamba/modeling_bamba.py b/src/transformers/models/bamba/modeling_bamba.py index 2d7979dbd732..eddbe7763e66 100644 --- a/src/transformers/models/bamba/modeling_bamba.py +++ b/src/transformers/models/bamba/modeling_bamba.py @@ -198,20 +198,49 @@ class BambaRotaryEmbedding(nn.Module): def __init__(self, config: BambaConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + 
self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[BambaConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -344,8 +373,8 @@ def __init__(self, config: BambaConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -1015,7 +1044,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[BambaFlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -1174,9 +1203,7 @@ def forward( attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions ) mamba_mask = self._update_mamba_mask(attention_mask, cache_position) - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None diff --git a/src/transformers/models/bamba/modular_bamba.py b/src/transformers/models/bamba/modular_bamba.py index 85b6fed82efb..b578eb1ce220 100644 --- a/src/transformers/models/bamba/modular_bamba.py +++ b/src/transformers/models/bamba/modular_bamba.py @@ -729,7 +729,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, 
cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[BambaFlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -888,9 +888,7 @@ def forward( attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions ) mamba_mask = self._update_mamba_mask(attention_mask, cache_position) - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None diff --git a/src/transformers/models/bitnet/configuration_bitnet.py b/src/transformers/models/bitnet/configuration_bitnet.py index 5e467443f28c..1bcf84d0c6c4 100644 --- a/src/transformers/models/bitnet/configuration_bitnet.py +++ b/src/transformers/models/bitnet/configuration_bitnet.py @@ -13,7 +13,10 @@ # See the License for the specific language governing permissions and """BitNet model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -70,12 +73,14 @@ class BitNetConfig(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 500000.0): - The base period of the RoPE embeddings. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. 
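Editor's note: the `compute_default_rope_parameters` staticmethods added to the rotary embedding classes in this patch implement the standard inverse-frequency formula `inv_freq_i = rope_theta ** (-2i / dim)`. A toy re-derivation with made-up sizes (not the library code) is shown below.

```python
import torch

# Toy re-derivation of the default RoPE inverse frequencies (illustrative sizes only).
base, dim = 10000.0, 8  # `rope_theta` and the per-head dimension
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
# tensor([1.0000e+00, 1.0000e-01, 1.0000e-02, 1.0000e-03]) -> one frequency per channel pair

# cos/sin tables for a handful of positions; attention_scaling is 1.0 for the "default" type.
positions = torch.arange(4).float()
freqs = torch.outer(positions, inv_freq)  # (seq_len, dim // 2)
cos, sin = freqs.cos(), freqs.sin()
```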
```python >>> from transformers import BitNetModel, BitNetConfig @@ -95,24 +100,24 @@ class BitNetConfig(PreTrainedConfig): def __init__( self, - vocab_size=128256, - hidden_size=2560, - intermediate_size=6912, - num_hidden_layers=30, - num_attention_heads=20, - num_key_value_heads=5, - hidden_act="relu2", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=None, - bos_token_id=128000, - eos_token_id=128001, - tie_word_embeddings=False, - rope_theta=500000.0, - attention_bias=False, - attention_dropout=0.0, + vocab_size: Optional[int] = 128256, + hidden_size: Optional[int] = 2560, + intermediate_size: Optional[int] = 6912, + num_hidden_layers: Optional[int] = 30, + num_attention_heads: Optional[int] = 20, + num_key_value_heads: Optional[int] = 5, + hidden_act: Optional[str] = "relu2", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 128000, + eos_token_id: Optional[int] = 128001, + tie_word_embeddings: Optional[bool] = False, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, **kwargs, ): self.vocab_size = vocab_size @@ -131,9 +136,16 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 500000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/bitnet/modeling_bitnet.py b/src/transformers/models/bitnet/modeling_bitnet.py index 56497a892bdc..d3972946a203 100644 --- a/src/transformers/models/bitnet/modeling_bitnet.py +++ b/src/transformers/models/bitnet/modeling_bitnet.py @@ -242,7 +242,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -273,20 +273,49 @@ class BitNetRotaryEmbedding(nn.Module): def __init__(self, config: BitNetConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if 
self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[BitNetConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -382,16 +411,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/blt/configuration_blt.py b/src/transformers/models/blt/configuration_blt.py index 35c087cc82ba..b20ae8c6dad3 100644 --- a/src/transformers/models/blt/configuration_blt.py +++ b/src/transformers/models/blt/configuration_blt.py @@ -14,7 +14,10 @@ # limitations under the License. 
"""Blt model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -30,22 +33,21 @@ class BltLocalEncoderConfig(PreTrainedConfig): def __init__( self, - vocab_size=260, - cross_attn_all_layers=False, - cross_attn_k=2, - hidden_size_global=2048, - hidden_size=1024, - num_attention_heads=16, - num_key_value_heads=None, - num_hidden_layers=1, - rms_norm_eps=1e-5, - dropout=0.0, - max_position_embeddings=24576, - rope_theta=500000.0, - rope_scaling=None, - hidden_act="silu", - intermediate_size=2816, - initializer_range=0.02, + vocab_size: Optional[int] = 260, + cross_attn_all_layers: Optional[bool] = False, + cross_attn_k: Optional[int] = 2, + hidden_size_global: Optional[int] = 2048, + hidden_size: Optional[int] = 1024, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = None, + num_hidden_layers: Optional[int] = 1, + rms_norm_eps: Optional[float] = 1e-5, + dropout: Optional[float] = 0.0, + max_position_embeddings: Optional[int] = 24576, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + hidden_act: Optional[str] = "silu", + intermediate_size: Optional[int] = 2816, + initializer_range: Optional[float] = 0.02, **kwargs, ): self.vocab_size = vocab_size @@ -61,10 +63,16 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.dropout = dropout self.max_position_embeddings = max_position_embeddings - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.hidden_act = hidden_act self.initializer_range = initializer_range + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 500000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) @@ -80,22 +88,21 @@ class BltLocalDecoderConfig(PreTrainedConfig): def __init__( self, - vocab_size=260, - cross_attn_all_layers=True, - cross_attn_k=2, - hidden_size_global=2048, - hidden_size=1024, - num_attention_heads=16, - num_key_value_heads=None, - num_hidden_layers=9, - rms_norm_eps=1e-5, - dropout=0.0, - max_position_embeddings=24576, - rope_theta=500000.0, - rope_scaling=None, - hidden_act="silu", - intermediate_size=2816, - initializer_range=0.02, + vocab_size: Optional[int] = 260, + cross_attn_all_layers: Optional[bool] = True, + cross_attn_k: Optional[int] = 2, + hidden_size_global: Optional[int] = 2048, + hidden_size: Optional[int] = 1024, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = None, + num_hidden_layers: Optional[int] = 9, + rms_norm_eps: Optional[float] = 1e-5, + dropout: Optional[float] = 0.0, + max_position_embeddings: Optional[int] = 24576, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + hidden_act: Optional[str] = "silu", + intermediate_size: Optional[int] = 2816, + initializer_range: Optional[float] = 0.02, **kwargs, ): self.vocab_size = vocab_size @@ -111,10 +118,16 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.dropout = dropout self.max_position_embeddings = max_position_embeddings - self.rope_theta = rope_theta - self.rope_scaling = 
rope_scaling self.hidden_act = hidden_act self.initializer_range = initializer_range + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 500000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) @@ -130,18 +143,17 @@ class BltGlobalTransformerConfig(PreTrainedConfig): def __init__( self, - hidden_size=2048, - num_attention_heads=16, - num_key_value_heads=None, - num_hidden_layers=25, - rms_norm_eps=1e-5, - dropout=0.0, - max_position_embeddings=4096, - rope_theta=500000.0, - rope_scaling=None, - hidden_act="silu", - intermediate_size=5632, - initializer_range=0.02, + hidden_size: Optional[int] = 2048, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = None, + num_hidden_layers: Optional[int] = 25, + rms_norm_eps: Optional[float] = 1e-5, + dropout: Optional[float] = 0.0, + max_position_embeddings: Optional[int] = 4096, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + hidden_act: Optional[str] = "silu", + intermediate_size: Optional[int] = 5632, + initializer_range: Optional[float] = 0.02, **kwargs, ): self.hidden_size = hidden_size @@ -153,10 +165,15 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.dropout = dropout self.max_position_embeddings = max_position_embeddings - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.hidden_act = hidden_act self.initializer_range = initializer_range + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 500000.0) + standardize_rope_params(self, rope_theta=rope_theta) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) @@ -168,55 +185,54 @@ class BltPatcherConfig(PreTrainedConfig): Configuration class for the Blt Patcher/Entropy model component. Args: - vocab_size (`int`, *optional*, defaults to 260): - Vocabulary size of the Blt patcher model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling the patcher model. - hidden_size (`int`, *optional*, defaults to 768): - Dimension of the hidden representations. - num_hidden_layers (`int`, *optional*, defaults to 14): - Number of hidden layers in the Transformer decoder. - num_attention_heads (`int`, *optional*, defaults to 12): - Number of attention heads for each attention layer in the Transformer decoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. 
For more details, check out [this - paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to - `num_attention_heads`. - max_position_embeddings (`int`, *optional*, defaults to 8192): - The maximum sequence length that this model might ever be used with. - rms_norm_eps (`float`, *optional*, defaults to 1e-05): - The epsilon used by the rms normalization layers. - dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - intermediate_size (`int`, *optional*, defaults to 2048): - Dimension of the MLP representations. - rope_scaling (`dict`, *optional*): - Dictionary containing the RoPE scaling configuration. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + vocab_size (`int`, *optional*, defaults to 260): + Vocabulary size of the Blt patcher model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling the patcher model. + hidden_size (`int`, *optional*, defaults to 768): + Dimension of the hidden representations. + num_hidden_layers (`int`, *optional*, defaults to 14): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to + `num_attention_heads`. + max_position_embeddings (`int`, *optional*, defaults to 8192): + The maximum sequence length that this model might ever be used with. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + intermediate_size (`int`, *optional*, defaults to 2048): + Dimension of the MLP representations. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
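Editor's note: since each BLT sub-config above now owns its RoPE settings, a serialized checkpoint carries one `rope_parameters` block per component. The fragment below is a hypothetical sketch; only RoPE-related keys are shown, and the values simply mirror the per-component defaults used in this patch.

```python
# Hypothetical shape of a BLT config after this change (RoPE-related keys only).
blt_config_fragment = {
    "rope_parameters": {"rope_type": "default", "rope_theta": 500000.0},
    "patcher_config": {"rope_parameters": {"rope_type": "default", "rope_theta": 10000.0}},
    "encoder_config": {"rope_parameters": {"rope_type": "default", "rope_theta": 500000.0}},
    "decoder_config": {"rope_parameters": {"rope_type": "default", "rope_theta": 500000.0}},
    "global_config": {"rope_parameters": {"rope_type": "default", "rope_theta": 500000.0}},
}
```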
""" model_type = "blt_patcher" def __init__( self, - vocab_size=260, - hidden_size=768, - num_hidden_layers=14, - num_attention_heads=12, - num_key_value_heads=None, - max_position_embeddings=8192, - rms_norm_eps=1e-5, - dropout=0.0, - rope_theta=10000.0, - intermediate_size=2048, - rope_scaling=None, - initializer_range=0.02, + vocab_size: Optional[int] = 260, + hidden_size: Optional[int] = 768, + num_hidden_layers: Optional[int] = 14, + num_attention_heads: Optional[int] = 12, + num_key_value_heads: Optional[int] = None, + max_position_embeddings: Optional[int] = 8192, + rms_norm_eps: Optional[float] = 1e-5, + dropout: Optional[float] = 0.0, + intermediate_size: Optional[int] = 2048, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + initializer_range: Optional[float] = 0.02, **kwargs, ): self.vocab_size = vocab_size @@ -228,11 +244,16 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.rms_norm_eps = rms_norm_eps self.dropout = dropout - self.rope_theta = rope_theta self.hidden_act = "silu" # Blt uses silu activation self.intermediate_size = intermediate_size or int(8 * self.hidden_size / 3) - self.rope_scaling = rope_scaling self.initializer_range = initializer_range + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) @@ -248,47 +269,47 @@ class BltConfig(PreTrainedConfig): documentation from [`PreTrainedConfig`] for more information. Args: - vocab_size (`int`, *optional*, defaults to 260): - Vocabulary size of the Blt model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`BltModel`]. - max_position_embeddings (`int`, *optional*, defaults to 4096): - The maximum sequence length that this model might ever be used with. - patch_in_forward (`bool`, *optional*, defaults to `True`): - Whether to perform patching during the forward pass. - patch_size (`int`, *optional*, defaults to 4): - Size of the patches used in the patching mechanism. - patching_mode (`str`, *optional*, defaults to `"entropy"`): - The mode used for patching, such as entropy-based patching. - patching_threshold (`float`, *optional*, defaults to 1.34): - Threshold value used for determining when to apply patches. - patching_batch_size (`int`, *optional*, defaults to 1): - Batch size used during the patching process. - max_patch_length (`int`, *optional*): - Maximum length of patches that can be generated. - cross_attn_k (`int`, *optional*, defaults to 2): - Number of cross-attention heads used in the model. - encoder_hash_byte_group_size (`list`, *optional*): - List of byte group sizes used in the encoder hash function. - encoder_hash_byte_group_vocab (`int`, *optional*, defaults to 500002): - Vocabulary size for the encoder hash byte groups. - encoder_hash_byte_group_nb_functions (`int`, *optional*, defaults to 1): - Number of hash functions used in the encoder byte grouping. - patcher_config (`BltPatcherConfig`, *optional*): - Configuration for the patcher component of the model. - encoder_config (`BltLocalEncoderConfig`, *optional*): - Configuration for the local encoder component of the model. 
- decoder_config (`BltLocalDecoderConfig`, *optional*): - Configuration for the local decoder component of the model. - global_config (`BltGlobalTransformerConfig`, *optional*): - Configuration for the global transformer component of the model. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rope_theta (`float`, *optional*, defaults to 500000.0): - The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): - Dictionary containing the RoPE scaling configuration. + vocab_size (`int`, *optional*, defaults to 260): + Vocabulary size of the Blt model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BltModel`]. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. + patch_in_forward (`bool`, *optional*, defaults to `True`): + Whether to perform patching during the forward pass. + patch_size (`int`, *optional*, defaults to 4): + Size of the patches used in the patching mechanism. + patching_mode (`str`, *optional*, defaults to `"entropy"`): + The mode used for patching, such as entropy-based patching. + patching_threshold (`float`, *optional*, defaults to 1.34): + Threshold value used for determining when to apply patches. + patching_batch_size (`int`, *optional*, defaults to 1): + Batch size used during the patching process. + max_patch_length (`int`, *optional*): + Maximum length of patches that can be generated. + cross_attn_k (`int`, *optional*, defaults to 2): + Number of cross-attention heads used in the model. + encoder_hash_byte_group_size (`list`, *optional*): + List of byte group sizes used in the encoder hash function. + encoder_hash_byte_group_vocab (`int`, *optional*, defaults to 500002): + Vocabulary size for the encoder hash byte groups. + encoder_hash_byte_group_nb_functions (`int`, *optional*, defaults to 1): + Number of hash functions used in the encoder byte grouping. + patcher_config (`BltPatcherConfig`, *optional*): + Configuration for the patcher component of the model. + encoder_config (`BltLocalEncoderConfig`, *optional*): + Configuration for the local encoder component of the model. + decoder_config (`BltLocalDecoderConfig`, *optional*): + Configuration for the local decoder component of the model. + global_config (`BltGlobalTransformerConfig`, *optional*): + Configuration for the global transformer component of the model. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. 
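Editor's note: for users, the change documented above collapses the `rope_theta`/`rope_scaling` argument pair into a single `rope_parameters` dict, while the legacy spelling keeps working through `**kwargs`. A sketch, assuming a Transformers build that includes this patch:

```python
from transformers import BltConfig  # assumes this patch is installed

# legacy spelling, still routed through **kwargs and standardized internally
legacy = BltConfig(rope_theta=500000.0, rope_scaling={"rope_type": "default"})

# new spelling
current = BltConfig(rope_parameters={"rope_type": "default", "rope_theta": 500000.0})
```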
```python >>> from transformers import BltModel, BltConfig @@ -317,34 +338,31 @@ class BltConfig(PreTrainedConfig): def __init__( self, - vocab_size=260, - max_position_embeddings=4096, - patch_in_forward=True, - patch_size=4, - patching_mode="entropy", - patching_threshold=1.335442066192627, - patching_batch_size=1, - max_patch_length=None, - cross_attn_k=2, - encoder_hash_byte_group_size=None, - encoder_hash_byte_group_vocab=500002, - encoder_hash_byte_group_nb_functions=1, - patcher_config=None, - encoder_config=None, - decoder_config=None, - global_config=None, - tie_word_embeddings=False, - initializer_range=0.02, - rope_theta=500000.0, - rope_scaling=None, + vocab_size: Optional[int] = 260, + max_position_embeddings: Optional[int] = 4096, + patch_in_forward: Optional[bool] = True, + patch_size: Optional[int] = 4, + patching_mode: Optional[str] = "entropy", + patching_threshold: Optional[float] = 1.335442066192627, + patching_batch_size: Optional[int] = 1, + max_patch_length: Optional[int] = None, + cross_attn_k: Optional[int] = 2, + encoder_hash_byte_group_size: Optional[int] = None, + encoder_hash_byte_group_vocab: Optional[int] = 500002, + encoder_hash_byte_group_nb_functions: Optional[int] = 1, + patcher_config: Optional[dict] = None, + encoder_config: Optional[dict] = None, + decoder_config: Optional[dict] = None, + global_config: Optional[dict] = None, + tie_word_embeddings: Optional[bool] = False, + initializer_range: Optional[float] = 0.02, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, **kwargs, ): # Basic model configuration self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.initializer_range = initializer_range - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling # Patching configuration self.patch_in_forward = patch_in_forward @@ -357,6 +375,13 @@ def __init__( self.realtime_patching = kwargs.get("realtime_patching", True) self.patching_threshold_add = kwargs.get("patching_threshold_add") self.monotonicity = kwargs.get("monotonicity", False) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 500000.0) + standardize_rope_params(self, rope_theta=rope_theta) # Cross attention configurations self.cross_attn_k = cross_attn_k diff --git a/src/transformers/models/blt/convert_blt_weights_to_hf.py b/src/transformers/models/blt/convert_blt_weights_to_hf.py index f9decff3a1f8..8325b6e1db1f 100644 --- a/src/transformers/models/blt/convert_blt_weights_to_hf.py +++ b/src/transformers/models/blt/convert_blt_weights_to_hf.py @@ -87,7 +87,7 @@ def merge_configurations(config_path: str, entropy_params_path: str) -> dict[str "max_position_embeddings": unified_config.get("max_encoder_seq_length") or unified_config.get("max_seqlen", 1024), "rope_theta": unified_config.get("rope_theta", 10000.0), - "rope_scaling": {"rope_type": "default"}, + "rope_parameters": {"rope_type": "default"}, "hidden_act": unified_config.get("hidden_act", "silu"), "_attn_implementation": unified_config.get("_attn_implementation", "sdpa"), "intermediate_size": encoder_intermediate_size, @@ -114,7 +114,7 @@ def merge_configurations(config_path: str, entropy_params_path: str) -> dict[str "max_position_embeddings": unified_config.get("max_encoder_seq_length") or unified_config.get("max_seqlen", 1024), 
"rope_theta": unified_config.get("rope_theta", 10000.0), - "rope_scaling": {"rope_type": "default"}, + "rope_parameters": {"rope_type": "default"}, "hidden_act": unified_config.get("hidden_act", "silu"), "_attn_implementation": unified_config.get("_attn_implementation", "sdpa"), "intermediate_size": decoder_intermediate_size, @@ -136,7 +136,7 @@ def merge_configurations(config_path: str, entropy_params_path: str) -> dict[str "dropout": unified_config.get("dropout", 0.0), "max_position_embeddings": unified_config.get("max_seqlen", 1024), "rope_theta": unified_config.get("rope_theta", 10000.0), - "rope_scaling": {"rope_type": "default"}, + "rope_parameters": {"rope_type": "default"}, "hidden_act": unified_config.get("hidden_act", "silu"), "_attn_implementation": unified_config.get("_attn_implementation", "sdpa"), "intermediate_size": global_intermediate_size, diff --git a/src/transformers/models/blt/modeling_blt.py b/src/transformers/models/blt/modeling_blt.py index 054f3ff26f19..e7ecb1ae6389 100644 --- a/src/transformers/models/blt/modeling_blt.py +++ b/src/transformers/models/blt/modeling_blt.py @@ -90,20 +90,49 @@ class BltRotaryEmbedding(nn.Module): def __init__(self, config: BltConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[BltConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -145,7 +174,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -278,7 +307,7 @@ def __init__(self, config: BltConfig, layer_idx: int): self.head_dim = config.hidden_size // self.num_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.scaling = self.head_dim**-0.5 - self.rope_theta = config.rope_theta + self.layer_idx = layer_idx self.is_causal = True diff --git a/src/transformers/models/blt/modular_blt.py b/src/transformers/models/blt/modular_blt.py index 70ab95ad8bc7..f25380d7417c 100644 --- a/src/transformers/models/blt/modular_blt.py +++ b/src/transformers/models/blt/modular_blt.py @@ -25,14 +25,13 @@ from ...cache_utils import Cache, DynamicCache from ...masking_utils import create_causal_mask from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, logging from ...utils.generic import OutputRecorder, check_model_inputs -from ..cohere2.modeling_cohere2 import ( - Cohere2RotaryEmbedding, - rotate_half, # noqa: F401 -) +from ..cohere2.modeling_cohere2 import rotate_half # noqa: F401 +from ..llama.modeling_llama import LlamaRotaryEmbedding from ..mllama.modeling_mllama import ( MllamaForCausalLM, MllamaPreTrainedModel, @@ -270,8 +269,21 @@ class BltRMSNorm(MllamaTextRMSNorm): pass -class BltRotaryEmbedding(Cohere2RotaryEmbedding): - pass +class BltRotaryEmbedding(LlamaRotaryEmbedding): + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.repeat_interleave(freqs, 2, dim=-1) # diff from Llama: we interleave() instead of cat() + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) class BltTransformerLayer(MllamaSelfAttentionDecoderLayer): diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index dc75c1730434..72e6eccca2a3 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -17,6 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, standardize_rope_params from ...utils import logging @@ -147,16 +148,10 @@ class ChameleonConfig(PreTrainedConfig): End of stream token id. 
tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/Localchameleon/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -193,30 +188,29 @@ class ChameleonConfig(PreTrainedConfig): def __init__( self, - vocab_size=65536, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-05, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - model_parallel_size=1, - swin_norm=False, - vq_config=None, - vocabulary_map=None, - mlp_bias=False, + vocab_size: Optional[int] = 65536, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 32, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 4096, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-05, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + model_parallel_size: Optional[int] = 1, + swin_norm: Optional[bool] = False, + vq_config: Optional[dict] = None, + vocabulary_map: Optional[dict] = None, + mlp_bias: Optional[bool] = False, **kwargs, ): self.vocab_size = vocab_size @@ -232,13 +226,17 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self._rope_scaling_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.model_parallel_size = model_parallel_size self.swin_norm = swin_norm + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + 
self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) if vq_config is None: vq_config = {} @@ -257,26 +255,5 @@ def __init__( **kwargs, ) - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. - """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") - __all__ = ["ChameleonConfig", "ChameleonVQVAEConfig"] diff --git a/src/transformers/models/chameleon/modeling_chameleon.py b/src/transformers/models/chameleon/modeling_chameleon.py index 845e170325ce..3555ab593cfd 100644 --- a/src/transformers/models/chameleon/modeling_chameleon.py +++ b/src/transformers/models/chameleon/modeling_chameleon.py @@ -29,6 +29,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import ( @@ -64,70 +65,70 @@ def extra_repr(self): return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" -# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon -# TODO(joao): add me back asap :) +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon class ChameleonRotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + def __init__(self, config: ChameleonConfig, device=None): super().__init__() - self.scaling_factor = scaling_factor - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[ChameleonConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE 
implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies inv_freq = 1.0 / ( - self.base - ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / self.dim) + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) ) - self.register_buffer("inv_freq", inv_freq, persistent=False) - # For BC we register cos and sin cached - self.max_seq_len_cached = max_position_embeddings + return inv_freq, attention_factor @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) def forward(self, x, position_ids): - # x: [bs, num_attention_heads, seq_len, head_size] - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) position_ids_expanded = position_ids[:, None, :].float() - # Force float32 since bfloat16 loses precision on long contexts - # See https://github.com/huggingface/transformers/pull/29285 - device_type = x.device.type - device_type = device_type if device_type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - -class ChameleonLinearScalingRotaryEmbedding(ChameleonRotaryEmbedding): - """ChameleonRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling - def forward(self, x, position_ids): - # difference to the original RoPE: a scaling factor is applied to the position ids - position_ids = position_ids.float() / self.scaling_factor - cos, sin = super().forward(x, position_ids) - return cos, sin - - -class ChameleonDynamicNTKScalingRotaryEmbedding(ChameleonRotaryEmbedding): - """ChameleonRotaryEmbedding extended with Dynamic NTK scaling. 
Credits to the Reddit users /u/bloc97 and /u/emozilla""" - - def forward(self, x, position_ids): - # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length - seq_len = torch.max(position_ids) + 1 - if seq_len > self.max_position_embeddings: - base = self.base * ( - (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) - ) ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / ( - base - ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=x.device, dtype=torch.float) / self.dim) - ) - self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: this may break with compilation - - cos, sin = super().forward(x, position_ids) - return cos, sin + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) # Copied from transformers.models.llama.modeling_llama.rotate_half @@ -263,7 +264,7 @@ def __init__(self, config: ChameleonConfig, layer_idx: Optional[int] = None): self.num_key_value_heads = config.num_key_value_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta + self.is_causal = True self.model_parallel_size = config.model_parallel_size self.scaling = self.head_dim**-0.5 @@ -280,36 +281,6 @@ def __init__(self, config: ChameleonConfig, layer_idx: Optional[int] = None): self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias) self.q_norm = ChameleonLayerNorm((self.num_heads, self.head_dim)) self.k_norm = ChameleonLayerNorm((self.num_key_value_heads, self.head_dim)) - self._init_rope() - - # copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Chameleon - # TODO(joao): add me back asap :) - def _init_rope(self): - if self.config.rope_scaling is None: - self.rotary_emb = ChameleonRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - base=self.rope_theta, - ) - else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] - if scaling_type == "linear": - self.rotary_emb = ChameleonLinearScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - elif scaling_type == "dynamic": - self.rotary_emb = ChameleonDynamicNTKScalingRotaryEmbedding( - self.head_dim, - max_position_embeddings=self.max_position_embeddings, - scaling_factor=scaling_factor, - base=self.rope_theta, - ) - else: - raise ValueError(f"Unknown RoPE scaling type {scaling_type}") def forward( self, @@ -320,6 +291,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[torch.Tensor] = None, **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -338,7 +310,7 @@ def forward( key_states = key_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_values is not None: @@ -388,6 +360,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, 
cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[torch.Tensor] = None, **kwargs, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -422,6 +395,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) hidden_states = residual + hidden_states @@ -460,6 +434,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[torch.Tensor] = None, **kwargs, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -493,6 +468,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) hidden_states = self.input_layernorm(hidden_states) @@ -855,6 +831,7 @@ def __init__(self, config: ChameleonConfig): ) self.norm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.vqmodel = ChameleonVQVAE._from_config(config.vq_config) + self.rotary_emb = ChameleonRotaryEmbedding(config=config) self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -979,6 +956,7 @@ def forward( # embed positions hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) # decoder layers all_hidden_states = () if output_hidden_states else None @@ -996,6 +974,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py index 00e97e425332..ac75ea93c864 100644 --- a/src/transformers/models/cohere/configuration_cohere.py +++ b/src/transformers/models/cohere/configuration_cohere.py @@ -19,8 +19,10 @@ # limitations under the License. """Cohere model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, standardize_rope_params from ...utils import logging @@ -78,45 +80,10 @@ class CohereConfig(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. 
- `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -156,27 +123,26 @@ class CohereConfig(PreTrainedConfig): def __init__( self, - vocab_size=256000, - hidden_size=8192, - intermediate_size=22528, - logit_scale=0.0625, - num_hidden_layers=40, - num_attention_heads=64, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=8192, - initializer_range=0.02, - layer_norm_eps=1e-5, - use_cache=True, - pad_token_id=0, - bos_token_id=5, - eos_token_id=255001, - tie_word_embeddings=True, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - use_qk_norm=False, + vocab_size: Optional[int] = 256000, + hidden_size: Optional[int] = 8192, + intermediate_size: Optional[int] = 22528, + logit_scale: Optional[float] = 0.0625, + num_hidden_layers: Optional[int] = 40, + num_attention_heads: Optional[int] = 64, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 8192, + initializer_range: Optional[float] = 0.02, + layer_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + bos_token_id: Optional[int] = 5, + eos_token_id: Optional[int] = 255001, + tie_word_embeddings: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + use_qk_norm: Optional[bool] = False, **kwargs, ): self.vocab_size = vocab_size @@ -196,14 +162,16 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.use_qk_norm = use_qk_norm + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters # Validate the correctness of rotary position embeddings parameters - rope_config_validation(self) + rope_theta = kwargs.get("rope_theta", 500000.0) + standardize_rope_params(self, rope_theta=rope_theta) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 8b1e753659ca..71eb4870fbf2 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -70,20 +70,49 @@ class CohereRotaryEmbedding(nn.Module): def __init__(self, config: CohereConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - 
self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[CohereConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -296,7 +325,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -415,16 +444,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/cohere/modular_cohere.py b/src/transformers/models/cohere/modular_cohere.py index 73a119a715c2..5147e5638eb2 100644 --- a/src/transformers/models/cohere/modular_cohere.py +++ b/src/transformers/models/cohere/modular_cohere.py @@ -213,7 +213,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -260,7 +260,6 @@ def __init__(self, config: CohereConfig): self.layers = nn.ModuleList( [CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) - self.rotary_emb = CohereRotaryEmbedding(config=config) self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps) diff --git a/src/transformers/models/cohere2/configuration_cohere2.py 
b/src/transformers/models/cohere2/configuration_cohere2.py index bfd00c5e8530..7bf87307ee1d 100644 --- a/src/transformers/models/cohere2/configuration_cohere2.py +++ b/src/transformers/models/cohere2/configuration_cohere2.py @@ -19,8 +19,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Cohere2Config(PreTrainedConfig): @@ -74,45 +76,10 @@ class Cohere2Config(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. 
The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -155,28 +122,27 @@ class Cohere2Config(PreTrainedConfig): def __init__( self, - vocab_size=256000, - hidden_size=8192, - intermediate_size=22528, - logit_scale=0.0625, - num_hidden_layers=40, - num_attention_heads=64, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=8192, - initializer_range=0.02, - layer_norm_eps=1e-5, - use_cache=True, - pad_token_id=0, - bos_token_id=5, - eos_token_id=255001, - tie_word_embeddings=True, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - sliding_window=4096, - layer_types=None, + vocab_size: Optional[int] = 256000, + hidden_size: Optional[int] = 8192, + intermediate_size: Optional[int] = 22528, + logit_scale: Optional[float] = 0.0625, + num_hidden_layers: Optional[int] = 40, + num_attention_heads: Optional[int] = 64, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 8192, + initializer_range: Optional[float] = 0.02, + layer_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[int] = True, + pad_token_id: Optional[int] = 0, + bos_token_id: Optional[int] = 5, + eos_token_id: Optional[int] = 255001, + tie_word_embeddings: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + sliding_window: Optional[int] = 4096, + layer_types: Optional[list[str]] = None, **kwargs, ): self.vocab_size = vocab_size @@ -196,18 +162,16 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.sliding_window = sliding_window self.layer_types = layer_types + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters # Need to specify head_dim in the config so it can be used in the attention forward functions self.head_dim = hidden_size // num_attention_heads - # Validate the correctness of rotary position embeddings parameters - rope_config_validation(self) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -228,5 +192,10 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + __all__ = ["Cohere2Config"] diff --git a/src/transformers/models/cohere2/modeling_cohere2.py b/src/transformers/models/cohere2/modeling_cohere2.py index 0ed644d97080..8a9929dc3ff2 100644 --- a/src/transformers/models/cohere2/modeling_cohere2.py +++ b/src/transformers/models/cohere2/modeling_cohere2.py @@ -45,20 +45,49 @@ class Cohere2RotaryEmbedding(nn.Module): def __init__(self, config: Cohere2Config, device=None): super().__init__() - # BC: 
"rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Cohere2Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -181,7 +210,8 @@ def __init__(self, config: Cohere2Config, layer_idx: Optional[int] = None): self.scaling = self.head_dim**-0.5 self.attention_dropout = config.attention_dropout self.is_causal = True - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None + self.sliding_window = config.sliding_window if layer_type == "sliding_attention" else None self.q_proj = nn.Linear( config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias @@ -269,7 +299,7 @@ def __init__(self, config: Cohere2Config, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, @@ -343,7 +373,7 @@ def __init__(self, config: Cohere2Config): [Cohere2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = Cohere2LayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps) - self.rotary_emb = Cohere2RotaryEmbedding(config=config) + self.rotary_emb = Cohere2RotaryEmbedding(config) self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -399,11 +429,12 @@ def forward( for decoder_layer in self.layers: hidden_states = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, + position_ids=position_ids, **kwargs, ) diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py index 3b4f605d64a5..dab998730c77 100644 --- a/src/transformers/models/cohere2/modular_cohere2.py +++ b/src/transformers/models/cohere2/modular_cohere2.py @@ -24,7 +24,12 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import ( + RopeParameters, + dynamic_rope_update, + rope_config_validation, + standardize_rope_params, +) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -95,45 +100,10 @@ class Cohere2Config(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. 
The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
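The attention changes for Cohere2 earlier in this patch make the sliding window depend on the layer's declared type instead of assuming `config.layer_types` exists. The selection itself is a one-liner; a standalone sketch with an illustrative `layer_types` list:

```python
# Standalone sketch of the per-layer window selection used by the attention modules above.
layer_types = ["sliding_attention", "sliding_attention", "full_attention"]  # illustrative
sliding_window = 4096

windows = [sliding_window if layer_type == "sliding_attention" else None for layer_type in layer_types]
print(windows)  # [4096, 4096, None]
```

Layers whose window is `None` fall back to full causal attention.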
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -176,28 +146,27 @@ class Cohere2Config(PreTrainedConfig): def __init__( self, - vocab_size=256000, - hidden_size=8192, - intermediate_size=22528, - logit_scale=0.0625, - num_hidden_layers=40, - num_attention_heads=64, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=8192, - initializer_range=0.02, - layer_norm_eps=1e-5, - use_cache=True, - pad_token_id=0, - bos_token_id=5, - eos_token_id=255001, - tie_word_embeddings=True, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - sliding_window=4096, - layer_types=None, + vocab_size: Optional[int] = 256000, + hidden_size: Optional[int] = 8192, + intermediate_size: Optional[int] = 22528, + logit_scale: Optional[float] = 0.0625, + num_hidden_layers: Optional[int] = 40, + num_attention_heads: Optional[int] = 64, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 8192, + initializer_range: Optional[float] = 0.02, + layer_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[int] = True, + pad_token_id: Optional[int] = 0, + bos_token_id: Optional[int] = 5, + eos_token_id: Optional[int] = 255001, + tie_word_embeddings: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + sliding_window: Optional[int] = 4096, + layer_types: Optional[list[str]] = None, **kwargs, ): self.vocab_size = vocab_size @@ -217,18 +186,16 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.sliding_window = sliding_window self.layer_types = layer_types + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters # Need to specify head_dim in the config so it can be used in the attention forward functions self.head_dim = hidden_size // num_attention_heads - # Validate the correctness of rotary position embeddings parameters - rope_config_validation(self) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -249,9 +216,27 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + class Cohere2RotaryEmbedding(CohereRotaryEmbedding): - pass + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.repeat_interleave(freqs, 2, dim=-1) # diff from Llama: we interleave() instead of cat() + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) class Cohere2LayerNorm(CohereLayerNorm): @@ -270,7 +255,8 @@ def __init__(self, config: Cohere2Config, layer_idx: Optional[int] = None): self.scaling = self.head_dim**-0.5 self.attention_dropout = config.attention_dropout self.is_causal = True - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None + self.sliding_window = config.sliding_window if layer_type == "sliding_attention" else None self.q_proj = nn.Linear( config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias @@ -338,7 +324,7 @@ def __init__(self, config: Cohere2Config, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, @@ -370,7 +356,6 @@ class Cohere2Model(Gemma2Model): def __init__(self, config: Cohere2Config): super().__init__(config) self.norm = Cohere2LayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps) - self.rotary_emb = Cohere2RotaryEmbedding(config=config) def forward( self, @@ -420,11 +405,12 @@ def forward( for decoder_layer in self.layers: hidden_states = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, + position_ids=position_ids, **kwargs, ) diff --git a/src/transformers/models/csm/configuration_csm.py b/src/transformers/models/csm/configuration_csm.py index 8604951436c2..227609c2f1aa 100644 --- a/src/transformers/models/csm/configuration_csm.py +++ b/src/transformers/models/csm/configuration_csm.py @@ -13,8 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -74,45 +76,10 @@ class CsmDepthDecoderConfig(PreTrainedConfig): Beginning of stream token id. eos_token_id (`int`, *optional*): End of stream token id. - rope_theta (`float`, *optional*, defaults to 500000): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. 
NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -139,28 +106,27 @@ class CsmDepthDecoderConfig(PreTrainedConfig): def __init__( self, - num_codebooks=32, - backbone_hidden_size=2048, - vocab_size=2051, - hidden_size=1024, - intermediate_size=8192, - num_hidden_layers=4, - num_attention_heads=8, - num_key_value_heads=2, - hidden_act="silu", - max_position_embeddings=33, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=None, - bos_token_id=None, - eos_token_id=None, - rope_theta=500000, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - head_dim=None, + num_codebooks: Optional[int] = 32, + backbone_hidden_size: Optional[int] = 2048, + vocab_size: Optional[int] = 2051, + hidden_size: Optional[int] = 1024, + intermediate_size: Optional[int] = 8192, + num_hidden_layers: Optional[int] = 4, + num_attention_heads: Optional[int] = 8, + num_key_value_heads: Optional[int] = 2, + hidden_act: Optional[int] = "silu", + max_position_embeddings: Optional[int] = 33, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + head_dim: Optional[int] = None, **kwargs, ): if kwargs.pop("tie_word_embeddings", False): @@ -191,16 +157,17 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 500000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) @@ -262,45 +229,10 @@ class CsmConfig(PreTrainedConfig): Audio token id in the text input. audio_eos_token_id (`int`, *optional*, defaults to 128003): End of stream token id for audio in the text input. - rope_theta (`float`, *optional*, defaults to 500000): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*, defaults to `{'factor': 32.0, 'high_freq_factor': 0.5, 'low_freq_factor': 0.125, 'original_max_position_embeddings': 1024, 'rope_type': 'llama3'}`): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. 
- `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
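Several constructors in this patch follow the same pattern: store `rope_scaling or rope_parameters`, then call `standardize_rope_params(self, rope_theta=...)` before validation. A rough, illustrative approximation of the contract those call sites appear to rely on; the real helper lives in `modeling_rope_utils` and may differ in details:

```python
def standardize_rope_params_sketch(rope_parameters, rope_theta):
    """Approximation only: guarantee `rope_type` and `rope_theta` keys are present."""
    params = dict(rope_parameters or {})
    if "type" in params:  # assumed handling of the legacy key name
        params.setdefault("rope_type", params.pop("type"))
    params.setdefault("rope_type", "default")
    params.setdefault("rope_theta", rope_theta)
    return params

print(standardize_rope_params_sketch(None, 500000.0))
# {'rope_type': 'default', 'rope_theta': 500000.0}
```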
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -339,35 +271,34 @@ class CsmConfig(PreTrainedConfig): def __init__( self, - num_codebooks=32, - vocab_size=2051, - text_vocab_size=128256, - hidden_size=2048, - intermediate_size=8192, - num_hidden_layers=16, - num_attention_heads=32, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=128002, - codebook_pad_token_id=2050, - codebook_eos_token_id=0, - bos_token_id=128000, - eos_token_id=None, - audio_token_id=128002, - audio_eos_token_id=128003, - rope_theta=500000, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - head_dim=None, - tie_codebooks_embeddings=True, - depth_decoder_config=None, - codec_config=None, + num_codebooks: Optional[int] = 32, + vocab_size: Optional[int] = 2051, + text_vocab_size: Optional[int] = 128256, + hidden_size: Optional[int] = 2048, + intermediate_size: Optional[int] = 8192, + num_hidden_layers: Optional[int] = 16, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 8, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 128002, + codebook_pad_token_id: Optional[int] = 2050, + codebook_eos_token_id: Optional[int] = 0, + bos_token_id: Optional[int] = 128000, + eos_token_id: Optional[int] = None, + audio_token_id: Optional[int] = 128002, + audio_eos_token_id: Optional[int] = 128003, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + head_dim: Optional[int] = None, + tie_codebooks_embeddings: Optional[bool] = True, + depth_decoder_config: Optional[dict] = None, + codec_config: Optional[dict] = None, **kwargs, ): if kwargs.pop("tie_word_embeddings", False): @@ -413,16 +344,17 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 500000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/csm/convert_csm.py b/src/transformers/models/csm/convert_csm.py index 28fbc9fe490d..ddbe154af70b 100644 --- a/src/transformers/models/csm/convert_csm.py +++ b/src/transformers/models/csm/convert_csm.py @@ -92,7 +92,7 @@ def write_model( # prepare rope scaling args: the model uses originally # 1 - for the depth decoder # rope_theta=500000, - # rope_scaling={ + # rope_parameters={ # "factor": 32.0, # "high_freq_factor": 4.0, # "low_freq_factor": 1.0, @@ -101,7 +101,7 @@ def write_model( # }, # 2 - for the backbone # rope_theta=500000, - # rope_scaling={ + # rope_parameters={ # "factor": 32.0, # "high_freq_factor": 4.0, # "low_freq_factor": 1.0, @@ -114,7 +114,7 @@ def write_model( # Therefore, we convert values to equivalent ones depth_decoder_config = CsmDepthDecoderConfig( - rope_scaling={ + rope_parameters={ "factor": 32.0, "high_freq_factor": 0.0078125, "low_freq_factor": 0.001953125, @@ -126,7 +126,7 @@ def write_model( config = CsmConfig( codec_config=codec_model.config, depth_decoder_config=depth_decoder_config, - rope_scaling={ + rope_parameters={ "factor": 32.0, "high_freq_factor": 0.5, "low_freq_factor": 0.125, diff --git a/src/transformers/models/csm/modeling_csm.py b/src/transformers/models/csm/modeling_csm.py index 13eff55f772e..0820423d30bc 100644 --- a/src/transformers/models/csm/modeling_csm.py +++ b/src/transformers/models/csm/modeling_csm.py @@ -121,20 +121,49 @@ class CsmRotaryEmbedding(nn.Module): def __init__(self, config: CsmConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[CsmConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -269,8 +298,8 @@ def __init__(self, config: CsmConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -329,7 +358,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -473,7 +502,7 @@ def forward( # create position embeddings to be shared across the decoder layers position_ids = cache_position.unsqueeze(0) - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( @@ -713,16 +742,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/csm/modular_csm.py b/src/transformers/models/csm/modular_csm.py index 9967a1d97287..20aeb0540f43 100644 --- a/src/transformers/models/csm/modular_csm.py +++ b/src/transformers/models/csm/modular_csm.py @@ -222,7 +222,7 @@ def forward( # create position embeddings to be shared across the decoder layers position_ids = cache_position.unsqueeze(0) - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( diff --git a/src/transformers/models/cwm/configuration_cwm.py b/src/transformers/models/cwm/configuration_cwm.py index 573956a5bca0..765f7f713247 100644 --- a/src/transformers/models/cwm/configuration_cwm.py +++ b/src/transformers/models/cwm/configuration_cwm.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import rope_config_validation, standardize_rope_params class CwmConfig(PreTrainedConfig): @@ -71,8 +71,6 
@@ class CwmConfig(PreTrainedConfig): The id of the *beginning-of-sequence* token. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. pretraining_tp (`int`, *optional*, defaults to 1): @@ -81,8 +79,10 @@ class CwmConfig(PreTrainedConfig): issue](https://github.com/pytorch/pytorch/issues/76232). mlp_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. sliding_window (`int`, *optional*, defaults to 8192): Sliding window attention window size. layer_types (`List[str]`, *optional*): @@ -126,18 +126,18 @@ def __init__( eos_token_id=[128001, 128008, 128009], bos_token_id: int = 128000, tie_word_embeddings: bool = False, - rope_theta: float = 1_000_000.0, attention_dropout: float = 0.0, pretraining_tp: int = 1, mlp_bias: bool = False, - rope_scaling: Optional[dict] = None, + rope_parameters: Optional[dict] = None, # CWM interleaved sliding window fields sliding_window: int = 8192, layer_types: Optional[list[str]] = None, # ["full_attention"|"sliding_attention"] per layer **kwargs, ): - if rope_scaling is None: - rope_scaling = { + if rope_parameters is None: + rope_parameters = { + "rope_theta": 1_000_000.0, "factor": 16.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, @@ -154,6 +154,9 @@ def __init__( ] else: layer_type_validation(layer_types, num_hidden_layers) + + self.sliding_window = int(sliding_window) if sliding_window else None + self.layer_types = list(layer_types) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -171,15 +174,16 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.pretraining_tp = pretraining_tp self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 1_000_000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( @@ -190,8 +194,5 @@ def __init__( **kwargs, ) - self.sliding_window = int(sliding_window) if sliding_window else None - self.layer_types = list(layer_types) - __all__ = ["CwmConfig"] diff --git a/src/transformers/models/cwm/modeling_cwm.py b/src/transformers/models/cwm/modeling_cwm.py index 5ab732e7d650..cf4d996b0c49 100644 --- a/src/transformers/models/cwm/modeling_cwm.py +++ b/src/transformers/models/cwm/modeling_cwm.py @@ -41,6 +41,71 @@ from .configuration_cwm import CwmConfig +class CwmRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: CwmConfig, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[CwmConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -118,6 +183,7 @@ class CwmAttention(nn.Module): def __init__(self, config: CwmConfig, layer_idx: int): super().__init__() + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) @@ -129,7 +195,7 @@ def __init__(self, config: CwmConfig, layer_idx: int): self.k_proj = torch.nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False) self.v_proj = torch.nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False) self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False) - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None def forward( self, @@ -232,7 +298,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -281,42 +347,6 @@ class CwmModelOutputWithPast(BaseModelOutputWithPast): pass -class CwmRotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: CwmConfig, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - @auto_docstring class CwmModel(CwmPreTrainedModel): config_class = CwmConfig diff --git a/src/transformers/models/cwm/modular_cwm.py b/src/transformers/models/cwm/modular_cwm.py index 022fe9c21c19..df2a003438a8 100644 --- a/src/transformers/models/cwm/modular_cwm.py +++ b/src/transformers/models/cwm/modular_cwm.py @@ -21,6 +21,7 @@ from ...configuration_utils import layer_type_validation from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast +from ...modeling_rope_utils import standardize_rope_params from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging from ..llama.configuration_llama import LlamaConfig @@ -30,7 +31,7 @@ LlamaModel, LlamaPreTrainedModel, ) -from ..qwen2.modeling_qwen2 import Qwen2Attention +from ..qwen2.modeling_qwen2 import Qwen2Attention, Qwen2RotaryEmbedding logger = logging.get_logger(__name__) @@ -82,8 +83,6 @@ class CwmConfig(LlamaConfig): The id of the *beginning-of-sequence* token. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. pretraining_tp (`int`, *optional*, defaults to 1): @@ -92,8 +91,10 @@ class CwmConfig(LlamaConfig): issue](https://github.com/pytorch/pytorch/issues/76232). mlp_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. sliding_window (`int`, *optional*, defaults to 8192): Sliding window attention window size. 
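Two rotary `forward` implementations appear in this patch: the Llama-style one used for CWM concatenates the frequency vector with itself, while the Cohere variants repeat each frequency in place via `repeat_interleave`. A tiny sketch of the resulting channel layouts:

```python
import torch

freqs = torch.tensor([[0.1, 0.2, 0.3]])

# Llama/CWM layout: the frequency vector is concatenated with itself.
print(torch.cat((freqs, freqs), dim=-1))          # [[0.1, 0.2, 0.3, 0.1, 0.2, 0.3]]

# Cohere layout: each frequency is repeated in place (interleaved).
print(torch.repeat_interleave(freqs, 2, dim=-1))  # [[0.1, 0.1, 0.2, 0.2, 0.3, 0.3]]
```

The two layouts must match the corresponding `rotate_half`-style pairing, which is why the forward is overridden per model family rather than shared.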
layer_types (`List[str]`, *optional*): @@ -121,18 +122,18 @@ def __init__( eos_token_id=[128001, 128008, 128009], bos_token_id: int = 128000, tie_word_embeddings: bool = False, - rope_theta: float = 1_000_000.0, attention_dropout: float = 0.0, pretraining_tp: int = 1, mlp_bias: bool = False, - rope_scaling: Optional[dict] = None, + rope_parameters: Optional[dict] = None, # CWM interleaved sliding window fields sliding_window: int = 8192, layer_types: Optional[list[str]] = None, # ["full_attention"|"sliding_attention"] per layer **kwargs, ): - if rope_scaling is None: - rope_scaling = { + if rope_parameters is None: + rope_parameters = { + "rope_theta": 1_000_000.0, "factor": 16.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, @@ -150,6 +151,9 @@ def __init__( else: layer_type_validation(layer_types, num_hidden_layers) + self.sliding_window = int(sliding_window) if sliding_window else None + self.layer_types = list(layer_types) + super().__init__( vocab_size=vocab_size, hidden_size=hidden_size, @@ -167,10 +171,9 @@ def __init__( eos_token_id=list(eos_token_id), bos_token_id=bos_token_id, tie_word_embeddings=tie_word_embeddings, - rope_theta=rope_theta, attention_bias=False, attention_dropout=attention_dropout, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, pretraining_tp=pretraining_tp, mlp_bias=mlp_bias, **kwargs, @@ -179,8 +182,13 @@ def __init__( # CWM models don't use attention bias, remove it from config del self.attention_bias - self.sliding_window = int(sliding_window) if sliding_window else None - self.layer_types = list(layer_types) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 1_000_000.0) + standardize_rope_params(self, rope_theta=rope_theta) + + +class CwmRotaryEmbedding(Qwen2RotaryEmbedding): + pass class CwmAttention(Qwen2Attention): diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index bb4888f3ee8c..987cb8a8ac06 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -17,6 +17,7 @@ from typing import Any, Optional from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -37,8 +38,8 @@ class DbrxAttentionConfig(PreTrainedConfig): The dropout probability for the attention layers. clip_qkv (`float`, *optional*): If set, clip the queries, keys, and values in the attention layer to this value. - kv_n_heads (`int`, *optional*, defaults to 1): For grouped_query_attention only, allow user to specify number of kv heads. - rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope. + kv_n_heads (`int`, *optional*, defaults to 1): + For grouped_query_attention only, allow user to specify number of kv heads. 
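The CwmConfig and DbrxConfig hunks above both follow the same migration recipe: a legacy `rope_scaling` kwarg is popped and takes precedence over an explicit `rope_parameters` argument, the result is stored on `config.rope_parameters`, and `standardize_rope_params` back-fills `rope_theta` before `rope_config_validation` runs. The sketch below approximates that standardization step on a plain dict; the helper name `sketch_standardize_rope_params` and its exact behavior are assumptions for illustration only, not the library's `standardize_rope_params` implementation.

```python
from typing import Optional


def sketch_standardize_rope_params(
    rope_parameters: Optional[dict], rope_scaling: Optional[dict], default_rope_theta: float
) -> dict:
    """Rough approximation of the standardization step used by the configs above."""
    # Legacy `rope_scaling` (if present) wins over an explicit `rope_parameters` dict.
    params = dict(rope_scaling or rope_parameters or {})
    # Very old checkpoints used `type` instead of `rope_type`.
    if "type" in params and "rope_type" not in params:
        params["rope_type"] = params.pop("type")
    # Fill in the pieces that used to live as top-level config attributes.
    params.setdefault("rope_type", "default")
    params.setdefault("rope_theta", default_rope_theta)
    return params


# An old-style llama3-scaled config still standardizes cleanly:
legacy = {"type": "llama3", "factor": 16.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0}
print(sketch_standardize_rope_params(None, legacy, default_rope_theta=1_000_000.0))
# {'factor': 16.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0,
#  'rope_type': 'llama3', 'rope_theta': 1000000.0}
```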
""" base_config_key = "attn_config" @@ -176,18 +177,19 @@ class DbrxConfig(PreTrainedConfig): def __init__( self, - d_model: int = 2048, - n_heads: int = 16, - n_layers: int = 24, - max_seq_len: int = 2048, - vocab_size: int = 32000, - resid_pdrop: float = 0.0, - emb_pdrop: float = 0.0, + d_model: Optional[int] = 2048, + n_heads: Optional[int] = 16, + n_layers: Optional[int] = 24, + max_seq_len: Optional[int] = 2048, + vocab_size: Optional[int] = 32000, + resid_pdrop: Optional[float] = 0.0, + emb_pdrop: Optional[float] = 0.0, attn_config: Optional[DbrxAttentionConfig] = None, ffn_config: Optional[DbrxFFNConfig] = None, - use_cache: bool = True, - initializer_range: float = 0.02, - output_router_logits: bool = False, + use_cache: Optional[bool] = True, + initializer_range: Optional[float] = 0.02, + output_router_logits: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, **kwargs: Any, ): if attn_config is None: @@ -215,11 +217,18 @@ def __init__( self.initializer_range = initializer_range self.output_router_logits = output_router_logits self.num_key_value_heads = self.attn_config.kv_n_heads - self.rope_theta: float = 10000.0 tie_word_embeddings = kwargs.pop("tie_word_embeddings", False) if tie_word_embeddings: raise ValueError("tie_word_embeddings is not supported for DBRX models.") + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + standardize_rope_params(self, rope_theta=10000.0) + rope_config_validation(self) + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 9cf603f3f973..a3f995d35b95 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -44,20 +44,49 @@ class DbrxRotaryEmbedding(nn.Module): def __init__(self, config: DbrxConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[DbrxConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. 
+ seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) diff --git a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py index 1d803891d128..7e5a8c93feec 100644 --- a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py @@ -19,9 +19,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class DeepseekV2Config(PreTrainedConfig): @@ -66,10 +67,10 @@ class DeepseekV2Config(PreTrainedConfig): End-of-sequence token ID. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie input and output embeddings. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the Rotary Position Embeddings (RoPE). - rope_scaling (`Dict`, *optional*): - Configuration for scaling RoPE embeddings. Supports `linear` and `dynamic` scaling strategies. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value, and output projection layers during self-attention. 
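The `compute_default_rope_parameters` helper added to each rotary module in this diff boils down to one formula: inverse frequencies `1 / rope_theta ** (2i / dim)` for `i = 0 .. dim/2 - 1`, which the module's `forward` then combines with position ids into cos/sin tables. A minimal self-contained sketch of those two steps, with toy shapes chosen purely for illustration:

```python
import torch

base, dim = 10000.0, 8                     # rope_theta and head_dim (toy values)
batch, seq_len = 2, 5

# Inverse frequencies, as in the helpers above: 1 / base^(2i / dim)
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))    # (dim/2,)

position_ids = torch.arange(seq_len).expand(batch, -1)                           # (batch, seq)

# Outer product of positions and frequencies, duplicated to cover the full head_dim
freqs = position_ids[:, :, None].float() * inv_freq[None, None, :]               # (batch, seq, dim/2)
emb = torch.cat((freqs, freqs), dim=-1)                                          # (batch, seq, dim)
cos, sin = emb.cos(), emb.sin()   # scaled by attention_scaling (1.0 for the "default" type)

print(inv_freq.shape, cos.shape, sin.shape)
# torch.Size([4]) torch.Size([2, 5, 8]) torch.Size([2, 5, 8])
```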
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -138,40 +139,39 @@ class DeepseekV2Config(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - first_k_dense_replace=0, - kv_lora_rank=512, - q_lora_rank=1536, - n_group=None, - n_routed_experts=64, - n_shared_experts=2, - qk_nope_head_dim=128, - qk_rope_head_dim=64, - routed_scaling_factor=1.0, - topk_group=None, - topk_method="greedy", - v_head_dim=128, - num_experts_per_tok=None, - moe_intermediate_size=1407, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + first_k_dense_replace: Optional[int] = 0, + kv_lora_rank: Optional[int] = 512, + q_lora_rank: Optional[int] = 1536, + n_group: Optional[int] = None, + n_routed_experts: Optional[int] = 64, + n_shared_experts: Optional[int] = 2, + qk_nope_head_dim: Optional[int] = 128, + qk_rope_head_dim: Optional[int] = 64, + routed_scaling_factor: Optional[float] = 1.0, + topk_group: Optional[int] = None, + topk_method: Optional[str] = "greedy", + v_head_dim: Optional[int] = 128, + num_experts_per_tok: Optional[int] = None, + moe_intermediate_size: Optional[int] = 1407, **kwargs, ): self.first_k_dense_replace = first_k_dense_replace @@ -204,17 +204,18 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = qk_rope_head_dim + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py b/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py index 7a0f4ac288a9..0731480c0371 100644 --- a/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/modeling_deepseek_v2.py @@ -167,22 +167,49 @@ class DeepseekV2RotaryEmbedding(nn.Module): def __init__(self, config: DeepseekV2Config, device=None): super().__init__() - # BC: "rope_type" was originally "type" - self.rope_type = ( - config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - if config.rope_scaling is not None - else "default" - ) - self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[DeepseekV2Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -265,7 +292,7 @@ def __init__(self, config: DeepseekV2Config, layer_idx: Optional[int] = None): self.num_heads = config.num_attention_heads self.head_dim = config.head_dim self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta + self.q_lora_rank = config.q_lora_rank self.qk_rope_head_dim = config.qk_rope_head_dim self.kv_lora_rank = config.kv_lora_rank @@ -390,7 +417,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -498,16 +525,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/deepseek_v2/modular_deepseek_v2.py b/src/transformers/models/deepseek_v2/modular_deepseek_v2.py index 1d5f9e7f8117..90b6eb77e5e0 100644 --- a/src/transformers/models/deepseek_v2/modular_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/modular_deepseek_v2.py @@ -22,10 +22,9 @@ from torch import nn from ...cache_utils import Cache +from ...modeling_rope_utils import RopeParameters, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel -from ...utils import ( - logging, -) +from ...utils import logging from ..llama.configuration_llama import LlamaConfig from ..llama.modeling_llama import ( LlamaDecoderLayer, @@ -35,9 +34,9 @@ LlamaModel, LlamaPreTrainedModel, LlamaRMSNorm, + LlamaRotaryEmbedding, eager_attention_forward, ) -from ..llama4.modeling_llama4 import Llama4TextRotaryEmbedding from ..qwen2_moe.modeling_qwen2_moe import Qwen2MoeExperts @@ -86,10 +85,10 @@ class DeepseekV2Config(LlamaConfig): End-of-sequence token ID. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie input and output embeddings. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the Rotary Position Embeddings (RoPE). - rope_scaling (`Dict`, *optional*): - Configuration for scaling RoPE embeddings. Supports `linear` and `dynamic` scaling strategies. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value, and output projection layers during self-attention. 
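A second pattern repeated in the modeling hunks above: the rotary module is owned by the top-level model, cos/sin are computed once per forward from `position_ids`, and the same `position_embeddings` tuple is passed explicitly to every decoder layer (hence the dropped `# necessary, but kept here for BC` comments). The toy modules below sketch that call flow; `ToyModel`, `ToyDecoderLayer` and `toy_cos_sin` are invented names for illustration, not Transformers classes.

```python
import torch
from torch import nn


def toy_cos_sin(position_ids: torch.Tensor, dim: int, base: float = 10000.0):
    """Toy cos/sin generator following the default formula used above."""
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
    freqs = position_ids[:, :, None].float() * inv_freq
    emb = torch.cat((freqs, freqs), dim=-1)
    return emb.cos(), emb.sin()


class ToyDecoderLayer(nn.Module):
    def forward(self, hidden_states, position_embeddings):
        cos, sin = position_embeddings
        # A real layer would rotate q/k with cos/sin inside its attention block;
        # here we only check that every layer sees the same precomputed tables.
        assert cos.shape == sin.shape == hidden_states.shape
        return hidden_states


class ToyModel(nn.Module):
    def __init__(self, num_layers: int = 3):
        super().__init__()
        self.layers = nn.ModuleList(ToyDecoderLayer() for _ in range(num_layers))

    def forward(self, hidden_states, position_ids):
        # Computed once per forward, then handed to every layer (the pattern above).
        position_embeddings = toy_cos_sin(position_ids, dim=hidden_states.shape[-1])
        for layer in self.layers:
            hidden_states = layer(hidden_states, position_embeddings=position_embeddings)
        return hidden_states


out = ToyModel()(torch.randn(2, 5, 8), torch.arange(5).expand(2, -1))
print(out.shape)  # torch.Size([2, 5, 8])
```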
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -153,40 +152,39 @@ class DeepseekV2Config(LlamaConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - first_k_dense_replace=0, - kv_lora_rank=512, - q_lora_rank=1536, - n_group=None, - n_routed_experts=64, - n_shared_experts=2, - qk_nope_head_dim=128, - qk_rope_head_dim=64, - routed_scaling_factor=1.0, - topk_group=None, - topk_method="greedy", - v_head_dim=128, - num_experts_per_tok=None, - moe_intermediate_size=1407, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + first_k_dense_replace: Optional[int] = 0, + kv_lora_rank: Optional[int] = 512, + q_lora_rank: Optional[int] = 1536, + n_group: Optional[int] = None, + n_routed_experts: Optional[int] = 64, + n_shared_experts: Optional[int] = 2, + qk_nope_head_dim: Optional[int] = 128, + qk_rope_head_dim: Optional[int] = 64, + routed_scaling_factor: Optional[float] = 1.0, + topk_group: Optional[int] = None, + topk_method: Optional[str] = "greedy", + v_head_dim: Optional[int] = 128, + num_experts_per_tok: Optional[int] = None, + moe_intermediate_size: Optional[int] = 1407, **kwargs, ): self.first_k_dense_replace = first_k_dense_replace @@ -294,15 +292,20 @@ class DeepseekV2RMSNorm(LlamaRMSNorm): pass -class DeepseekV2RotaryEmbedding(Llama4TextRotaryEmbedding): - def __init__(self, config: DeepseekV2Config, device=None): - super().__init__(config=config, device=device) - # BC: "rope_type" was originally "type" - self.rope_type = ( - config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - if config.rope_scaling is not None - else "default" - ) +class DeepseekV2RotaryEmbedding(LlamaRotaryEmbedding): + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.to(x.device) @ position_ids_expanded).transpose(1, 2) + freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # Convert to complex representation + freqs_cis = freqs_cis * self.attention_scaling + + return freqs_cis class DeepseekV2Attention(nn.Module): @@ -317,7 +320,7 @@ def __init__(self, config: DeepseekV2Config, layer_idx: Optional[int] = None): self.num_heads = config.num_attention_heads self.head_dim = config.head_dim self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta + self.q_lora_rank = config.q_lora_rank self.qk_rope_head_dim = config.qk_rope_head_dim self.kv_lora_rank = config.kv_lora_rank diff --git a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py index 7c2ae71dca55..eed1ea34def4 100644 --- a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py @@ -16,8 +16,10 @@ # limitations under the License. """DeepSeekV3 model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {} @@ -106,13 +108,10 @@ class DeepseekV3Config(PreTrainedConfig): issue](https://github.com/pytorch/pytorch/issues/76232). tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. rope_interleave (`bool`, *optional*, defaults to `True`): Whether to interleave the rotary position embeddings. 
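Unlike the cos/sin modules elsewhere in this diff, the `DeepseekV2RotaryEmbedding.forward` above returns complex phases built with `torch.polar`, i.e. `cos θ + i·sin θ` per position and frequency. The snippet below is a generic illustration of how such complex phases rotate a query or key tensor; the pairing convention (adjacent channels here) and the helper name `toy_complex_rope` are assumptions for illustration and may differ from the model's actual apply function (DeepSeek V3, for instance, exposes a `rope_interleave` flag for exactly this choice).

```python
import torch


def toy_complex_rope(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    """Rotate pairs of channels of `x` by the complex phases in `freqs_cis`.

    x:         (batch, seq, heads, head_dim) with an even head_dim
    freqs_cis: (batch, seq, head_dim // 2), complex dtype
    """
    # View the last dimension as head_dim // 2 complex numbers ...
    x_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
    # ... rotate them, broadcasting the phases over the heads dimension ...
    x_rotated = x_complex * freqs_cis.unsqueeze(2)
    # ... and flatten back to real pairs.
    return torch.view_as_real(x_rotated).flatten(-2).type_as(x)


batch, seq, heads, head_dim = 2, 5, 4, 8
x = torch.randn(batch, seq, heads, head_dim)

# Build phases the same way the forward above does: polar(1, freqs) == cos + i*sin.
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2, dtype=torch.float) / head_dim))
freqs = torch.arange(seq).float()[None, :, None] * inv_freq        # (1, seq, head_dim // 2)
freqs_cis = torch.polar(torch.ones_like(freqs), freqs).expand(batch, -1, -1)

print(toy_complex_rope(x, freqs_cis).shape)  # torch.Size([2, 5, 4, 8])
```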
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): @@ -157,41 +156,40 @@ class DeepseekV3Config(PreTrainedConfig): def __init__( self, - vocab_size=129280, - hidden_size=7168, - intermediate_size=18432, - moe_intermediate_size=2048, - num_hidden_layers=61, - num_attention_heads=128, - num_key_value_heads=128, - n_shared_experts=1, - n_routed_experts=256, - routed_scaling_factor=2.5, - kv_lora_rank=512, - q_lora_rank=1536, - qk_rope_head_dim=64, - v_head_dim=128, - qk_nope_head_dim=128, - n_group=8, - topk_group=4, - num_experts_per_tok=8, - first_k_dense_replace=3, - norm_topk_prob=True, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=0, - eos_token_id=1, - pretraining_tp=1, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - rope_interleave=True, - attention_bias=False, - attention_dropout=0.0, + vocab_size: Optional[int] = 129280, + hidden_size: Optional[int] = 7168, + intermediate_size: Optional[int] = 18432, + moe_intermediate_size: Optional[int] = 2048, + num_hidden_layers: Optional[int] = 61, + num_attention_heads: Optional[int] = 128, + num_key_value_heads: Optional[int] = 128, + n_shared_experts: Optional[int] = 1, + n_routed_experts: Optional[int] = 256, + routed_scaling_factor: Optional[float] = 2.5, + kv_lora_rank: Optional[int] = 512, + q_lora_rank: Optional[int] = 1536, + qk_rope_head_dim: Optional[int] = 64, + v_head_dim: Optional[int] = 128, + qk_nope_head_dim: Optional[int] = 128, + n_group: Optional[int] = 8, + topk_group: Optional[int] = 4, + num_experts_per_tok: Optional[int] = 8, + first_k_dense_replace: Optional[int] = 3, + norm_topk_prob: Optional[bool] = True, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 4096, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 1, + pretraining_tp: Optional[int] = 1, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_interleave: Optional[bool] = True, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, **kwargs, ): self.vocab_size = vocab_size @@ -228,19 +226,19 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.pretraining_tp = pretraining_tp self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) - if self.rope_scaling is not None: - for key in ["beta_fast", "beta_slow", "factor"]: - if key in self.rope_scaling: - self.rope_scaling[key] = float(self.rope_scaling[key]) + for key in ["beta_fast", "beta_slow", "factor"]: + if key in self.rope_parameters: + self.rope_parameters[key] = float(self.rope_parameters[key]) rope_config_validation(self) diff --git a/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py b/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py index 7ddd1f638b0a..248be6193439 100644 --- a/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/modeling_deepseek_v3.py @@ -58,20 +58,49 @@ class DeepseekV3RotaryEmbedding(nn.Module): def __init__(self, config: DeepseekV3Config, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[DeepseekV3Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -334,7 +363,7 @@ def __init__(self, config: DeepseekV3Config, layer_idx: int): self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads self.attention_dropout = config.attention_dropout self.num_heads = config.num_attention_heads - self.rope_theta = config.rope_theta + self.q_lora_rank = config.q_lora_rank self.qk_rope_head_dim = config.qk_rope_head_dim self.kv_lora_rank = config.kv_lora_rank @@ -369,9 +398,9 @@ def __init__(self, config: DeepseekV3Config, layer_idx: int): ) self.scaling = self.qk_head_dim ** (-0.5) - if self.config.rope_scaling is not None: - mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0) - scaling_factor = self.config.rope_scaling["factor"] + if self.config.rope_parameters.get("rope_type", "default") != "default": + mscale_all_dim = self.config.rope_parameters.get("mscale_all_dim", 0) + scaling_factor = self.config.rope_parameters["factor"] if mscale_all_dim: mscale = yarn_get_mscale(scaling_factor, mscale_all_dim) self.scaling = self.scaling * mscale * mscale @@ -468,7 +497,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -578,16 +607,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/deepseek_v3/modular_deepseek_v3.py b/src/transformers/models/deepseek_v3/modular_deepseek_v3.py index e9a6a7adab31..0eb25eafb955 100644 --- a/src/transformers/models/deepseek_v3/modular_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/modular_deepseek_v3.py @@ -174,7 +174,7 @@ def __init__(self, config: DeepseekV3Config, layer_idx: int): self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads self.attention_dropout = config.attention_dropout self.num_heads = config.num_attention_heads - self.rope_theta = config.rope_theta + self.q_lora_rank = config.q_lora_rank self.qk_rope_head_dim = config.qk_rope_head_dim self.kv_lora_rank = config.kv_lora_rank @@ -209,9 +209,9 @@ def __init__(self, config: DeepseekV3Config, layer_idx: int): ) self.scaling = self.qk_head_dim ** (-0.5) - if self.config.rope_scaling is not None: - mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0) - scaling_factor = self.config.rope_scaling["factor"] + if self.config.rope_parameters.get("rope_type", "default") != "default": + mscale_all_dim = self.config.rope_parameters.get("mscale_all_dim", 0) + scaling_factor = self.config.rope_parameters["factor"] if mscale_all_dim: mscale = yarn_get_mscale(scaling_factor, mscale_all_dim) self.scaling = self.scaling * mscale * mscale diff --git a/src/transformers/models/deprecated/open_llama/configuration_open_llama.py b/src/transformers/models/deprecated/open_llama/configuration_open_llama.py index 
32827d007751..64545d7abcf6 100644 --- a/src/transformers/models/deprecated/open_llama/configuration_open_llama.py +++ b/src/transformers/models/deprecated/open_llama/configuration_open_llama.py @@ -65,14 +65,10 @@ class OpenLlamaConfig(PreTrainedConfig): Whether to tie weight embeddings rope_theta (`float`, *optional*, defaults to 10000.0): The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. Example: @@ -113,7 +109,7 @@ def __init__( use_stable_embedding=True, shared_input_output_embedding=True, rope_theta=10000.0, - rope_scaling=None, + rope_parameters=None, **kwargs, ): self.vocab_size = vocab_size @@ -134,8 +130,10 @@ def __init__( self.use_stable_embedding = use_stable_embedding self.shared_input_output_embedding = shared_input_output_embedding self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self._rope_scaling_validation() + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + self._rope_parameters_validation() super().__init__( pad_token_id=pad_token_id, @@ -145,25 +143,29 @@ def __init__( **kwargs, ) - def _rope_scaling_validation(self): + def _rope_parameters_validation(self): """ - Validate the `rope_scaling` configuration. + Validate the `rope_parameters` configuration. 
""" - if self.rope_scaling is None: + if self.rope_parameters is None: return - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + if not isinstance(self.rope_parameters, dict) or len(self.rope_parameters) != 2: raise ValueError( - f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}" + f"`rope_parameters` must be a dictionary with two fields, `type` and `factor`, got {self.rope_parameters}" ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + rope_parameters_type = self.rope_parameters.get("type", None) + rope_parameters_factor = self.rope_parameters.get("factor", None) + if rope_parameters_type is None or rope_parameters_type not in ["linear", "dynamic"]: raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + f"`rope_parameters`'s type field must be one of ['linear', 'dynamic'], got {rope_parameters_type}" ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") + if ( + rope_parameters_factor is None + or not isinstance(rope_parameters_factor, float) + or rope_parameters_factor <= 1.0 + ): + raise ValueError(f"`rope_parameters`'s factor field must be a float > 1, got {rope_parameters_factor}") __all__ = ["OpenLlamaConfig"] diff --git a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py index 2f5eaf532459..bf39cfca912a 100644 --- a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py +++ b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py @@ -224,7 +224,6 @@ def __init__(self, config: OpenLlamaConfig): self.head_dim = self.hidden_size // self.num_heads self.max_position_embeddings = config.max_position_embeddings self.dropout_prob = config.attention_dropout_prob - self.rope_theta = config.rope_theta if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( @@ -238,15 +237,15 @@ def __init__(self, config: OpenLlamaConfig): self._init_rope() def _init_rope(self): - if self.config.rope_scaling is None: + if self.config.rope_parameters is None: self.rotary_emb = OpenLlamaRotaryEmbedding( self.head_dim, max_position_embeddings=self.max_position_embeddings, base=self.rope_theta, ) else: - scaling_type = self.config.rope_scaling["type"] - scaling_factor = self.config.rope_scaling["factor"] + scaling_type = self.config.rope_parameters["type"] + scaling_factor = self.config.rope_parameters["factor"] if scaling_type == "linear": self.rotary_emb = OpenLlamaLinearScalingRotaryEmbedding( self.head_dim, diff --git a/src/transformers/models/dia/configuration_dia.py b/src/transformers/models/dia/configuration_dia.py index c487781fc94c..b54b5620e524 100644 --- a/src/transformers/models/dia/configuration_dia.py +++ b/src/transformers/models/dia/configuration_dia.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -55,45 +55,10 @@ class DiaEncoderConfig(PreTrainedConfig): hidden_act (`str` or 
`function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"swish"` and `"gelu_new"` are supported. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
""" @@ -112,8 +77,7 @@ def __init__( norm_eps: float = 1e-5, vocab_size: int = 256, hidden_act: str = "silu", - rope_theta: float = 10000.0, - rope_scaling: Optional[dict] = None, + rope_parameters: Optional[RopeParameters] = None, initializer_range: float = 0.02, **kwargs, ): @@ -127,14 +91,15 @@ def __init__( self.vocab_size = vocab_size self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + self.initializer_range = initializer_range + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) - self.initializer_range = initializer_range super().__init__(**kwargs) @@ -179,45 +144,10 @@ class DiaDecoderConfig(PreTrainedConfig): `"swish"` and `"gelu_new"` are supported. num_channels (`int`, *optional*, defaults to 9): Number of channels for the Dia decoder. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. use_cache (`bool`, *optional*, defaults to `True`): @@ -245,8 +175,7 @@ def __init__( vocab_size: int = 1028, hidden_act: str = "silu", num_channels: int = 9, - rope_theta: float = 10000.0, - rope_scaling: Optional[dict] = None, + rope_parameters: Optional[RopeParameters] = None, initializer_range: float = 0.02, use_cache: bool = True, is_encoder_decoder: bool = True, @@ -267,15 +196,16 @@ def __init__( self.vocab_size = vocab_size self.hidden_act = hidden_act self.num_channels = num_channels - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] - rope_config_validation(self) self.initializer_range = initializer_range self.use_cache = use_cache + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) diff --git a/src/transformers/models/dia/modeling_dia.py b/src/transformers/models/dia/modeling_dia.py index d10867e68175..65acb4a4ca76 100644 --- a/src/transformers/models/dia/modeling_dia.py +++ b/src/transformers/models/dia/modeling_dia.py @@ -133,20 +133,49 @@ class DiaRotaryEmbedding(nn.Module): def __init__(self, config: DiaConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + 
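For Dia, both sub-configs above now accept a single `rope_parameters` dict in place of the removed `rope_theta`/`rope_scaling` pair, while still popping a legacy `rope_scaling` kwarg for old checkpoints. A hypothetical usage sketch, assuming the constructors behave exactly as in these hunks and that `standardize_rope_params` fills in the default `rope_theta` when only scaling keys are given:

```python
from transformers.models.dia.configuration_dia import DiaDecoderConfig, DiaEncoderConfig

# Explicit new-style parameters on each sub-config.
enc = DiaEncoderConfig(rope_parameters={"rope_type": "default", "rope_theta": 10000.0})
dec = DiaDecoderConfig(rope_parameters={"rope_type": "default", "rope_theta": 10000.0})
print(enc.rope_parameters["rope_theta"], dec.rope_parameters["rope_theta"])

# Legacy kwarg: `rope_scaling` is popped and takes precedence over `rope_parameters`,
# then (presumably) gets standardized with the 10000.0 default theta.
legacy = DiaEncoderConfig(rope_scaling={"rope_type": "linear", "factor": 2.0})
print(legacy.rope_parameters)
```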
@staticmethod + def compute_default_rope_parameters( + config: Optional[DiaConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -260,8 +289,8 @@ def __init__(self, config: Union[DiaEncoderConfig, DiaDecoderConfig], layer_idx: def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -386,7 +415,7 @@ def __init__(self, config: DiaEncoderConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -418,7 +447,7 @@ def __init__(self, config: DiaEncoderConfig): [DiaEncoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) - self.rotary_embeddings = DiaRotaryEmbedding(config) + self.rotary_emb = DiaRotaryEmbedding(config=config) @auto_docstring @can_return_tuple @@ -436,13 +465,13 @@ def forward( # Note: We expect right padding and hence always generate # the position ids on the fly to reduce preparation overhead position_ids = torch.arange(input_ids.shape[-1], device=input_ids.device)[None, :] - position_embeddings = self.rotary_embeddings(hidden_states, position_ids) attention_mask = create_bidirectional_mask( config=self.config, input_embeds=hidden_states, attention_mask=attention_mask, ) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -453,8 +482,9 @@ def forward( layer_outputs = encoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=attention_mask, + position_ids=position_ids, + position_embeddings=position_embeddings, **kwargs, ) hidden_states = layer_outputs[0] @@ -486,12 +516,13 @@ def __init__(self, config: DiaDecoderConfig, 
layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[EncoderDecoderCache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: self_attn_cache = past_key_values @@ -539,11 +570,11 @@ def __init__(self, config: DiaDecoderConfig): self.num_channels = config.num_channels self.vocab_size = config.vocab_size self.embeddings = DiaMultiChannelEmbedding(config) - self.rotary_embeddings = DiaRotaryEmbedding(config) self.layers = nn.ModuleList( [DiaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + self.rotary_emb = DiaRotaryEmbedding(config=config) @auto_docstring @can_return_tuple @@ -578,7 +609,6 @@ def forward( # RoPE hidden_states = self.embeddings(input_ids) - position_embeddings = self.rotary_embeddings(hidden_states, position_ids) if attention_mask is None and not is_torchdynamo_compiling(): # required mask seq length can be calculated via length of past cache @@ -598,6 +628,7 @@ def forward( attention_mask=encoder_attention_mask, encoder_hidden_states=encoder_hidden_states, ) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None @@ -609,12 +640,15 @@ def forward( layer_outputs = layer( hidden_states, + # Needs to be an arg in order to function properly + # on inplace operations to be carried (e.g. 
compile) position_embeddings, attention_mask, encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, past_key_values=past_key_values, cache_position=cache_position, + position_ids=position_ids, **kwargs, ) hidden_states = layer_outputs[0] diff --git a/src/transformers/models/dia/modular_dia.py b/src/transformers/models/dia/modular_dia.py index 8f20ce3dbd8e..3ea05a88050e 100644 --- a/src/transformers/models/dia/modular_dia.py +++ b/src/transformers/models/dia/modular_dia.py @@ -207,7 +207,7 @@ def __init__(self, config: DiaEncoderConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: @@ -239,7 +239,7 @@ def __init__(self, config: DiaEncoderConfig): [DiaEncoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) - self.rotary_embeddings = DiaRotaryEmbedding(config) + self.rotary_emb = DiaRotaryEmbedding(config=config) @auto_docstring @can_return_tuple @@ -257,13 +257,13 @@ def forward( # Note: We expect right padding and hence always generate # the position ids on the fly to reduce preparation overhead position_ids = torch.arange(input_ids.shape[-1], device=input_ids.device)[None, :] - position_embeddings = self.rotary_embeddings(hidden_states, position_ids) attention_mask = create_bidirectional_mask( config=self.config, input_embeds=hidden_states, attention_mask=attention_mask, ) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -274,8 +274,9 @@ def forward( layer_outputs = encoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=attention_mask, + position_ids=position_ids, + position_embeddings=position_embeddings, **kwargs, ) hidden_states = layer_outputs[0] @@ -307,12 +308,13 @@ def __init__(self, config: DiaDecoderConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[EncoderDecoderCache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: self_attn_cache = past_key_values @@ -360,11 +362,11 @@ def __init__(self, config: DiaDecoderConfig): self.num_channels = config.num_channels self.vocab_size = config.vocab_size self.embeddings = DiaMultiChannelEmbedding(config) - self.rotary_embeddings = DiaRotaryEmbedding(config) self.layers = nn.ModuleList( [DiaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps) + self.rotary_emb = DiaRotaryEmbedding(config=config) @auto_docstring @can_return_tuple @@ -399,7 +401,6 @@ def forward( # RoPE hidden_states = self.embeddings(input_ids) - 
position_embeddings = self.rotary_embeddings(hidden_states, position_ids) if attention_mask is None and not is_torchdynamo_compiling(): # required mask seq length can be calculated via length of past cache @@ -419,6 +420,7 @@ def forward( attention_mask=encoder_attention_mask, encoder_hidden_states=encoder_hidden_states, ) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None @@ -430,12 +432,15 @@ def forward( layer_outputs = layer( hidden_states, + # Needs to be an arg in order to function properly + # on inplace operations to be carried (e.g. compile) position_embeddings, attention_mask, encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, past_key_values=past_key_values, cache_position=cache_position, + position_ids=position_ids, **kwargs, ) hidden_states = layer_outputs[0] diff --git a/src/transformers/models/diffllama/configuration_diffllama.py b/src/transformers/models/diffllama/configuration_diffllama.py index 26902fcc2784..0eac1f506c72 100644 --- a/src/transformers/models/diffllama/configuration_diffllama.py +++ b/src/transformers/models/diffllama/configuration_diffllama.py @@ -17,8 +17,10 @@ # limitations under the License. """DiffLlama model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class DiffLlamaConfig(PreTrainedConfig): @@ -70,45 +72,10 @@ class DiffLlamaConfig(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'diffllama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'diffllama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. 
The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'diffllama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'diffllama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -136,27 +103,26 @@ class DiffLlamaConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=2048, - intermediate_size=8192, - num_hidden_layers=16, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - lambda_std_dev=0.1, - head_dim=None, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 2048, + intermediate_size: Optional[int] = 8192, + num_hidden_layers: Optional[int] = 16, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + lambda_std_dev: Optional[float] = 0.1, + head_dim: Optional[int] = None, **kwargs, ): self.vocab_size = vocab_size @@ -175,16 +141,17 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.lambda_std_dev = lambda_std_dev self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
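To make the backward-compatibility path above concrete (the constructor pops a legacy `rope_scaling` kwarg, prefers it over `rope_parameters`, then calls `standardize_rope_params` with the legacy `rope_theta` default), here is a minimal hypothetical sketch of that folding step. The helper name `fold_legacy_rope_kwargs` and the exact handling of the old `"type"` key are illustrative assumptions, not the real `standardize_rope_params` implementation:

```python
# Hypothetical sketch only: shows how a legacy `rope_scaling` dict plus a `rope_theta`
# kwarg could end up as one standardized `rope_parameters` dict, as the new config
# constructors in this diff expect. Not the actual `standardize_rope_params` code.
def fold_legacy_rope_kwargs(rope_parameters=None, rope_scaling=None, rope_theta=10000.0):
    params = dict(rope_scaling or rope_parameters or {})
    if "type" in params:  # older configs used "type" instead of "rope_type"
        params["rope_type"] = params.pop("type")
    params.setdefault("rope_type", "default")
    params.setdefault("rope_theta", rope_theta)
    return params

print(fold_legacy_rope_kwargs(rope_scaling={"type": "linear", "factor": 8.0}))
# {'factor': 8.0, 'rope_type': 'linear', 'rope_theta': 10000.0}
```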
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/diffllama/modeling_diffllama.py b/src/transformers/models/diffllama/modeling_diffllama.py index 58212c9841ef..d82430b623e1 100644 --- a/src/transformers/models/diffllama/modeling_diffllama.py +++ b/src/transformers/models/diffllama/modeling_diffllama.py @@ -22,6 +22,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import math +from collections.abc import Callable from typing import Optional, Union import torch @@ -67,6 +68,71 @@ def forward(self, x): return down_proj +class DiffLlamaRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: DiffLlamaConfig, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[DiffLlamaConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -139,7 +205,6 @@ def __init__(self, config: DiffLlamaConfig, layer_idx: Optional[int] = None): self.num_key_value_groups = self.num_heads // self.num_key_value_heads # under this are not used self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta self.is_causal = True self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) @@ -261,16 +326,7 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - if position_embeddings is None: - logger.warning_once( - "The attention layers in this model are transitioning from computing the RoPE embeddings internally " - "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " - "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " - "removed and `position_embeddings` will be mandatory." 
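With the fallback above removed, the attention layers rely entirely on the externally computed `(cos, sin)` tuple. As a self-contained sketch of how that tuple is consumed, the following reuses the `rotate_half` helper shown in this hunk; `apply_rotary_pos_emb_sketch` and the `unsqueeze(1)` head-dimension handling are illustrative assumptions rather than the library's `apply_rotary_pos_emb`:

```python
import torch

def rotate_half(x):
    # Same helper as in the modeling file above.
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb_sketch(q, k, cos, sin):
    # Broadcast cos/sin over the head dimension, then rotate queries and keys.
    cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin

batch, heads, seq, head_dim = 1, 2, 8, 64
q = torch.randn(batch, heads, seq, head_dim)
k = torch.randn(batch, heads, seq, head_dim)
cos = torch.randn(batch, seq, head_dim)  # stand-ins for the rotary module's outputs
sin = torch.randn(batch, seq, head_dim)
q_rot, k_rot = apply_rotary_pos_emb_sketch(q, k, cos, sin)
print(q_rot.shape, k_rot.shape)  # both torch.Size([1, 2, 8, 64])
```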
- ) - cos, sin = self.rotary_emb(value_states, position_ids) - else: - cos, sin = position_embeddings + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_values is not None: @@ -496,7 +552,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -549,42 +605,6 @@ def _init_weights(self, module): module.lambda_k2.data.normal_(0, self.config.lambda_std_dev) -class DiffLlamaRotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: DiffLlamaConfig, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - @auto_docstring class DiffLlamaModel(DiffLlamaPreTrainedModel): def __init__(self, config: DiffLlamaConfig): @@ -644,16 +664,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/diffllama/modular_diffllama.py b/src/transformers/models/diffllama/modular_diffllama.py index aadd28fed687..331c7327b681 100644 --- a/src/transformers/models/diffllama/modular_diffllama.py +++ b/src/transformers/models/diffllama/modular_diffllama.py @@ -33,6 +33,7 @@ LlamaForTokenClassification, LlamaModel, LlamaPreTrainedModel, + LlamaRotaryEmbedding, apply_rotary_pos_emb, repeat_kv, ) @@ -54,6 +55,10 @@ def lambda_init_fn(layer_idx): return 0.8 - 0.6 * math.exp(-0.3 * 
layer_idx) +class DiffLlamaRotaryEmbedding(LlamaRotaryEmbedding): + pass + + class DiffLlamaAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -76,7 +81,6 @@ def __init__(self, config: DiffLlamaConfig, layer_idx: Optional[int] = None): self.num_key_value_groups = self.num_heads // self.num_key_value_heads # under this are not used self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta self.is_causal = True self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) @@ -198,16 +202,7 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - if position_embeddings is None: - logger.warning_once( - "The attention layers in this model are transitioning from computing the RoPE embeddings internally " - "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " - "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " - "removed and `position_embeddings` will be mandatory." - ) - cos, sin = self.rotary_emb(value_states, position_ids) - else: - cos, sin = position_embeddings + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_values is not None: diff --git a/src/transformers/models/doge/configuration_doge.py b/src/transformers/models/doge/configuration_doge.py index 4cbc2b866c1a..844b9519b45a 100644 --- a/src/transformers/models/doge/configuration_doge.py +++ b/src/transformers/models/doge/configuration_doge.py @@ -20,8 +20,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class DogeConfig(PreTrainedConfig): @@ -56,41 +58,10 @@ class DogeConfig(PreTrainedConfig): Whether the model's input and output word embeddings should be tied. max_position_embeddings (`int`, *optional*, defaults to 2048): The maximum sequence length that this model might ever be used with. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. - NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly. - Doge family of small models use `{ 'rope_type': 'dynamic', 'factor': 4.0, 'original_max_position_embeddings': 2048 }` as the default value. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. - In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length. 
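As a quick sanity check of the `factor` semantics in the docstring being replaced here, using the Doge defaults it mentions (`factor=4.0`, `original_max_position_embeddings=2048`):

```python
# Illustrative arithmetic only: a scaling factor of x is meant to extend the usable
# context to roughly x times the original pre-trained length.
original_max_position_embeddings = 2048
factor = 4.0
print(int(original_max_position_embeddings * factor))  # 8192
```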
- `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. - The original max position embeddings used during pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. - If unspecified, it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`). - Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (<`original_max_position_embeddings`). - Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. num_attention_heads (`int`, *optional*, defaults to 8): Number of attention heads for each attention layer in the Transformer decoder. 
num_key_value_heads (`int`, *optional*): @@ -166,32 +137,31 @@ class DogeConfig(PreTrainedConfig): def __init__( self, - vocab_size=32768, - hidden_size=1024, - intermediate_size=2048, - num_hidden_layers=32, - hidden_dropout=0.0, - hidden_act="silu", - initializer_range=0.02, - rms_norm_eps=1e-06, - use_cache=True, - tie_word_embeddings=False, - max_position_embeddings=2048, - rope_theta=10000.0, - rope_scaling=None, - num_attention_heads=8, - num_key_value_heads=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - sliding_window=None, - keep_window_size=2048, - is_moe=False, - num_experts=16384, - num_experts_per_tok=64, - norm_topk_prob=False, - output_router_logits=False, - router_aux_loss_coef=0.001, + vocab_size: Optional[int] = 32768, + hidden_size: Optional[int] = 1024, + intermediate_size: Optional[int] = 2048, + num_hidden_layers: Optional[int] = 32, + hidden_dropout: Optional[float] = 0.0, + hidden_act: Optional[str] = "silu", + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-06, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + max_position_embeddings: Optional[int] = 2048, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + num_attention_heads: Optional[int] = 8, + num_key_value_heads: Optional[int] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + sliding_window: Optional[int] = None, + keep_window_size: Optional[int] = 2048, + is_moe: Optional[bool] = False, + num_experts: Optional[int] = 16384, + num_experts_per_tok: Optional[int] = 64, + norm_topk_prob: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, **kwargs, ): self.vocab_size = vocab_size @@ -206,8 +176,6 @@ def __init__( self.use_cache = use_cache self.max_position_embeddings = max_position_embeddings - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.num_attention_heads = num_attention_heads self.num_key_value_heads = num_key_value_heads self.attention_bias = attention_bias @@ -221,11 +189,13 @@ def __init__( self.norm_topk_prob = norm_topk_prob self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) # for backward compatibility diff --git a/src/transformers/models/doge/modeling_doge.py b/src/transformers/models/doge/modeling_doge.py index 15d3eb9608cd..1ced8dbbdd63 100644 --- a/src/transformers/models/doge/modeling_doge.py +++ b/src/transformers/models/doge/modeling_doge.py @@ -75,20 +75,49 @@ class DogeRotaryEmbedding(nn.Module): def __init__(self, config: DogeConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[DogeConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -266,6 +295,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: input_shape = hidden_states.shape[:-1] @@ -442,7 +472,7 @@ def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -566,19 +596,17 @@ def forward( ) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=causal_mask, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/doge/modular_doge.py b/src/transformers/models/doge/modular_doge.py index 92ffecc535d8..52603d99dcd4 100644 --- a/src/transformers/models/doge/modular_doge.py +++ b/src/transformers/models/doge/modular_doge.py @@ -30,10 +30,10 @@ from ...integrations.flex_attention import compile_friendly_flex_attention from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...modeling_utils import AttentionInterface, PreTrainedModel from ...processing_utils import Unpack -from ...utils import TransformersKwargs, is_torch_flex_attn_available +from ...utils import TransformersKwargs, is_torch_flex_attn_available, logging from ...utils.generic import OutputRecorder from ..llama.modeling_llama import ( LlamaForSequenceClassification, @@ -48,6 +48,8 @@ from ..mixtral.modeling_mixtral import MixtralForCausalLM, MixtralModel +logger = logging.get_logger(__name__) + if is_torch_flex_attn_available(): from torch.nn.attention.flex_attention import BlockMask @@ -84,41 +86,10 @@ class DogeConfig(PreTrainedConfig): Whether the model's input and output word embeddings should be tied. max_position_embeddings (`int`, *optional*, defaults to 2048): The maximum sequence length that this model might ever be used with. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. - NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly. - Doge family of small models use `{ 'rope_type': 'dynamic', 'factor': 4.0, 'original_max_position_embeddings': 2048 }` as the default value. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. 
Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. - In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. - The original max position embeddings used during pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. - If unspecified, it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`). - Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (<`original_max_position_embeddings`). - Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. num_attention_heads (`int`, *optional*, defaults to 8): Number of attention heads for each attention layer in the Transformer decoder. 
num_key_value_heads (`int`, *optional*): @@ -194,32 +165,31 @@ class DogeConfig(PreTrainedConfig): def __init__( self, - vocab_size=32768, - hidden_size=1024, - intermediate_size=2048, - num_hidden_layers=32, - hidden_dropout=0.0, - hidden_act="silu", - initializer_range=0.02, - rms_norm_eps=1e-06, - use_cache=True, - tie_word_embeddings=False, - max_position_embeddings=2048, - rope_theta=10000.0, - rope_scaling=None, - num_attention_heads=8, - num_key_value_heads=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - sliding_window=None, - keep_window_size=2048, - is_moe=False, - num_experts=16384, - num_experts_per_tok=64, - norm_topk_prob=False, - output_router_logits=False, - router_aux_loss_coef=0.001, + vocab_size: Optional[int] = 32768, + hidden_size: Optional[int] = 1024, + intermediate_size: Optional[int] = 2048, + num_hidden_layers: Optional[int] = 32, + hidden_dropout: Optional[float] = 0.0, + hidden_act: Optional[str] = "silu", + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-06, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + max_position_embeddings: Optional[int] = 2048, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + num_attention_heads: Optional[int] = 8, + num_key_value_heads: Optional[int] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + sliding_window: Optional[int] = None, + keep_window_size: Optional[int] = 2048, + is_moe: Optional[bool] = False, + num_experts: Optional[int] = 16384, + num_experts_per_tok: Optional[int] = 64, + norm_topk_prob: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, **kwargs, ): self.vocab_size = vocab_size @@ -234,8 +204,6 @@ def __init__( self.use_cache = use_cache self.max_position_embeddings = max_position_embeddings - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.num_attention_heads = num_attention_heads self.num_key_value_heads = num_key_value_heads self.attention_bias = attention_bias @@ -249,11 +217,13 @@ def __init__( self.norm_topk_prob = norm_topk_prob self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) # for backward compatibility @@ -362,6 +332,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: input_shape = hidden_states.shape[:-1] @@ -526,7 +497,7 @@ def __init__(self, config: DogeConfig, layer_idx: Optional[int] = None): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, diff --git a/src/transformers/models/dots1/configuration_dots1.py b/src/transformers/models/dots1/configuration_dots1.py index 092662e004fd..db524dd5789c 100644 --- a/src/transformers/models/dots1/configuration_dots1.py +++ b/src/transformers/models/dots1/configuration_dots1.py @@ -12,7 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -73,10 +76,10 @@ class Dots1Config(PreTrainedConfig): Whether or not the model should return the last key/values attentions. Only relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie the input and output word embeddings. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): - Dictionary for scaling RoPE embeddings. Supports `{"type": strategy name, "factor": scaling factor}`. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the self-attention projections. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -136,34 +139,33 @@ class Dots1Config(PreTrainedConfig): def __init__( self, - vocab_size=152064, - hidden_size=4608, - intermediate_size=10944, - moe_intermediate_size=1408, - num_hidden_layers=62, - num_attention_heads=32, - num_key_value_heads=32, - n_shared_experts=None, - n_routed_experts=None, - n_group=1, - topk_group=1, - num_experts_per_tok=None, - first_k_dense_replace=0, - norm_topk_prob=False, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - routed_scaling_factor=1.0, - sliding_window=4096, - max_window_layers=62, - layer_types=None, + vocab_size: Optional[int] = 152064, + hidden_size: Optional[int] = 4608, + intermediate_size: Optional[int] = 10944, + moe_intermediate_size: Optional[int] = 1408, + num_hidden_layers: Optional[int] = 62, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 32, + n_shared_experts: Optional[int] = None, + n_routed_experts: Optional[int] = None, + n_group: Optional[int] = 1, + topk_group: Optional[int] = 1, + num_experts_per_tok: Optional[int] = None, + first_k_dense_replace: Optional[int] = 0, + norm_topk_prob: Optional[bool] = False, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + routed_scaling_factor: Optional[float] = 1.0, + sliding_window: Optional[int] = 4096, + max_window_layers: Optional[int] = 62, + layer_types: Optional[list[str]] = None, **kwargs, ): self.vocab_size = vocab_size @@ -187,14 +189,16 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.routed_scaling_factor = routed_scaling_factor self.sliding_window = sliding_window self.max_window_layers = max_window_layers + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + self.layer_types = layer_types if self.layer_types is None: self.layer_types = [ @@ -205,6 +209,11 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + super().__init__( tie_word_embeddings=tie_word_embeddings, **kwargs, diff --git a/src/transformers/models/dots1/modeling_dots1.py b/src/transformers/models/dots1/modeling_dots1.py index 1eb4c456cdc7..5c6c6ea450e0 100644 --- a/src/transformers/models/dots1/modeling_dots1.py +++ b/src/transformers/models/dots1/modeling_dots1.py @@ -67,20 +67,49 @@ class Dots1RotaryEmbedding(nn.Module): def __init__(self, config: Dots1Config, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and 
isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Dots1Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -175,6 +204,7 @@ class Dots1Attention(nn.Module): def __init__(self, config: Dots1Config, layer_idx: int): super().__init__() + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) @@ -197,7 +227,7 @@ def __init__(self, config: Dots1Config, layer_idx: int): ) self.q_norm = Dots1RMSNorm(self.head_dim, eps=config.rms_norm_eps) # unlike olmo, only on the head dim! 
self.k_norm = Dots1RMSNorm(self.head_dim, eps=config.rms_norm_eps) # thus post q_norm does not need reshape - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None def forward( self, @@ -387,7 +417,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -506,19 +536,17 @@ def forward( causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/efficientloftr/configuration_efficientloftr.py b/src/transformers/models/efficientloftr/configuration_efficientloftr.py index 09d7a7b9a07a..8b57c903dde8 100644 --- a/src/transformers/models/efficientloftr/configuration_efficientloftr.py +++ b/src/transformers/models/efficientloftr/configuration_efficientloftr.py @@ -14,7 +14,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import rope_config_validation, standardize_rope_params class EfficientLoFTRConfig(PreTrainedConfig): @@ -68,20 +68,13 @@ class EfficientLoFTRConfig(PreTrainedConfig): Kernel size used for the fine feature matching batch_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the batch normalization layers. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. partial_rotary_factor (`float`, *optional*, defaults to 4.0): Dim factor for the RoPE embeddings, in EfficientLoFTR, frequencies should be generated for the whole hidden_size, so this factor is used to compensate. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3', '2d'], with 'default' being the original RoPE implementation. - `dim` (`int`): The dimension of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. 
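EfficientLoFTR is the one model in this diff whose `partial_rotary_factor` (4.0 by default, per the docstring above) enlarges rather than shrinks the rotary dimension; the rewritten `compute_default_rope_parameters` below derives `dim = int(head_dim * partial_rotary_factor)` before building the inverse frequencies. A small numeric sketch, with made-up `hidden_size`/`num_attention_heads` values:

```python
import torch

# Illustrative values only; EfficientLoFTR's real config sizes may differ.
hidden_size, num_attention_heads = 256, 8
partial_rotary_factor = 4.0                    # default mentioned in the docstring above
head_dim = hidden_size // num_attention_heads  # 32
dim = int(head_dim * partial_rotary_factor)    # 128 rotary dims instead of the per-head 32

base = 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
print(inv_freq.shape)  # torch.Size([64])
```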
fine_matching_slice_dim (`int`, *optional*, defaults to 8): The size of the slice used to divide the fine features for the first and second fine matching stages. fine_matching_regress_temperature (`float`, *optional*, defaults to 10.0): @@ -128,9 +121,8 @@ def __init__( coarse_matching_border_removal: int = 2, fine_kernel_size: int = 8, batch_norm_eps: float = 1e-5, - rope_theta: float = 10000.0, partial_rotary_factor: float = 4.0, - rope_scaling: Optional[dict] = None, + rope_parameters: Optional[dict] = None, fine_matching_slice_dim: int = 8, fine_matching_regress_temperature: float = 10.0, initializer_range: float = 0.02, @@ -184,14 +176,16 @@ def __init__( self.fine_matching_regress_temperature = fine_matching_regress_temperature self.num_key_value_heads = num_attention_heads - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling if rope_scaling is not None else {"rope_type": "default"} - - # for compatibility with "default" rope type self.partial_rotary_factor = partial_rotary_factor - rope_config_validation(self) - self.initializer_range = initializer_range + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__(**kwargs) diff --git a/src/transformers/models/efficientloftr/modeling_efficientloftr.py b/src/transformers/models/efficientloftr/modeling_efficientloftr.py index f47b7bc92971..0de81a3c6c67 100644 --- a/src/transformers/models/efficientloftr/modeling_efficientloftr.py +++ b/src/transformers/models/efficientloftr/modeling_efficientloftr.py @@ -84,23 +84,62 @@ def compute_embeddings(inv_freq: torch.Tensor, embed_height: int, embed_width: i return emb +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->EfficientLoFTR class EfficientLoFTRRotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` + # Ignore copy def __init__(self, config: EfficientLoFTRConfig, device=None): super().__init__() + self.config = config - self.rope_type = config.rope_scaling["rope_type"] - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, _ = self.rope_init_fn(self.config, device) - inv_freq_expanded = inv_freq[None, None, None, :].float().expand(1, 1, 1, -1) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + # Ignore copy + def compute_default_rope_parameters( + config: Optional[EfficientLoFTRConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. 
+ Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE - self.register_buffer("inv_freq", inv_freq_expanded, persistent=False) + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + # Ignore copy @torch.no_grad() def forward( - self, x: torch.Tensor, position_ids: Optional[tuple[torch.LongTensor, torch.LongTensor]] = None + self, x: torch.Tensor, position_ids: Optional[torch.LongTensor] = None, layer_type=None ) -> tuple[torch.Tensor, torch.Tensor]: feats_height, feats_width = x.shape[-2:] embed_height = (feats_height - self.config.q_aggregation_kernel_size) // self.config.q_aggregation_stride + 1 @@ -368,9 +407,7 @@ def forward( query_states = self.q_proj(hidden_states).view(batch_size, seq_len, -1, dim) - is_cross_attention = encoder_hidden_states is not None - current_states = encoder_hidden_states if is_cross_attention else hidden_states - + current_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states key_states = self.k_proj(current_states).view(batch_size, seq_len, -1, dim) value_states = self.v_proj(current_states).view(batch_size, seq_len, -1, self.head_dim).transpose(1, 2) @@ -480,7 +517,7 @@ def __init__(self, config: EfficientLoFTRConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: batch_size, _, embed_dim, height, width = hidden_states.shape @@ -515,7 +552,7 @@ def __init__(self, config: EfficientLoFTRConfig): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: for layer in self.layers: @@ -739,7 +776,6 @@ def forward( coarse_features = self.local_feature_transformer( coarse_features, position_embeddings=position_embeddings, **kwargs ) - features = (coarse_features,) + tuple(residual_features) return BackboneOutput(feature_maps=features) diff --git a/src/transformers/models/emu3/configuration_emu3.py b/src/transformers/models/emu3/configuration_emu3.py index 3d737806ee18..634efd227f9e 100644 --- a/src/transformers/models/emu3/configuration_emu3.py +++ b/src/transformers/models/emu3/configuration_emu3.py @@ -14,10 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Optional, Union +from typing import Optional, Union from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Emu3VQVAEConfig(PreTrainedConfig): @@ -158,45 +158,10 @@ class Emu3TextConfig(PreTrainedConfig): End of stream token id. 
tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. mlp_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. 
attention_bias (`bool`, *optional*, defaults to `False`): @@ -240,8 +205,7 @@ def __init__( bos_token_id: int = 151849, eos_token_id: int = 151850, tie_word_embeddings: bool = False, - rope_theta: float = 1000000.0, - rope_scaling: Optional[dict[str, Any]] = None, + rope_parameters: Optional[RopeParameters] = None, mlp_bias=False, attention_bias=False, attention_dropout: float = 0.1, @@ -258,14 +222,18 @@ def __init__( self.hidden_act = hidden_act self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.mlp_bias = mlp_bias self.attention_bias = attention_bias self.initializer_range = initializer_range - rope_config_validation(self) - self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 1000000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/emu3/convert_emu3_weights_to_hf.py b/src/transformers/models/emu3/convert_emu3_weights_to_hf.py index 1427288878be..f61cd6a749c6 100644 --- a/src/transformers/models/emu3/convert_emu3_weights_to_hf.py +++ b/src/transformers/models/emu3/convert_emu3_weights_to_hf.py @@ -284,7 +284,7 @@ def convert_model(vq_model_id, llm_model_id, output_dir, hub_model_id=None, test text_config = Emu3TextConfig( max_position_embeddings=model_llm.config.max_position_embeddings, - rope_scaling={"rope_type": "default"}, + rope_parameters={"rope_type": "default"}, ) config = Emu3Config(text_config=text_config, vocabulary_map=vocabulary_map) diff --git a/src/transformers/models/emu3/modeling_emu3.py b/src/transformers/models/emu3/modeling_emu3.py index 007484748621..0f2cfec345f7 100644 --- a/src/transformers/models/emu3/modeling_emu3.py +++ b/src/transformers/models/emu3/modeling_emu3.py @@ -145,8 +145,8 @@ def __init__(self, config: Emu3Config, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -1110,20 +1110,49 @@ class Emu3RotaryEmbedding(nn.Module): def __init__(self, config: Emu3Config, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + 
self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Emu3Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -1205,16 +1234,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/ernie4_5/configuration_ernie4_5.py b/src/transformers/models/ernie4_5/configuration_ernie4_5.py index c8539c1adf97..03aefe766cf6 100644 --- a/src/transformers/models/ernie4_5/configuration_ernie4_5.py +++ b/src/transformers/models/ernie4_5/configuration_ernie4_5.py @@ -13,8 +13,10 @@ # limitations under the License. """Ernie 4.5 model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Ernie4_5Config(PreTrainedConfig): @@ -66,45 +68,10 @@ class Ernie4_5Config(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 500000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. 
In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. use_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in any of the projections including mlp and attention for example.
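As with Emu3 above, Ernie 4.5 now takes a single `rope_parameters` dict. Below is a hedged sketch of enabling a scaled RoPE variant, using the `factor` semantics from the removed docstring; the concrete values are illustrative only.

```python
from transformers.models.ernie4_5.configuration_ernie4_5 import Ernie4_5Config

# Linear position scaling expressed through the consolidated `rope_parameters` dict.
# Per the removed `factor` documentation above, a factor of 4.0 targets roughly
# 4x the original pre-trained context length.
config = Ernie4_5Config(
    rope_parameters={"rope_type": "linear", "rope_theta": 500000.0, "factor": 4.0},
)
print(config.rope_parameters["rope_type"], config.rope_parameters["factor"])
```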
head_dim (`int`, *optional*, defaults to 128): @@ -143,25 +110,24 @@ class Ernie4_5Config(PreTrainedConfig): def __init__( self, - vocab_size=103424, - hidden_size=1024, - intermediate_size=3072, - num_hidden_layers=18, - num_attention_heads=16, - num_key_value_heads=2, - hidden_act="silu", - max_position_embeddings=131072, - initializer_range=0.02, - rms_norm_eps=1e-05, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=True, - rope_theta=500000.0, - rope_scaling=None, - use_bias=False, - head_dim=128, + vocab_size: Optional[int] = 103424, + hidden_size: Optional[int] = 1024, + intermediate_size: Optional[int] = 3072, + num_hidden_layers: Optional[int] = 18, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = 2, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 131072, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-05, + use_cache: Optional[int] = True, + pad_token_id: Optional[int] = 0, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + use_bias: Optional[bool] = False, + head_dim: Optional[int] = 128, **kwargs, ): self.vocab_size = vocab_size @@ -180,14 +146,15 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.use_bias = use_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 500000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/ernie4_5/modeling_ernie4_5.py b/src/transformers/models/ernie4_5/modeling_ernie4_5.py index 5ddac1de0b27..5658c7691c3c 100644 --- a/src/transformers/models/ernie4_5/modeling_ernie4_5.py +++ b/src/transformers/models/ernie4_5/modeling_ernie4_5.py @@ -44,20 +44,49 @@ class Ernie4_5RotaryEmbedding(nn.Module): def __init__(self, config: Ernie4_5Config, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Ernie4_5Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -196,8 +225,8 @@ def __init__(self, config: Ernie4_5Config, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -277,7 +306,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -381,16 +410,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/ernie4_5/modular_ernie4_5.py b/src/transformers/models/ernie4_5/modular_ernie4_5.py index 7cec0232ca68..780b07164ec0 100644 --- a/src/transformers/models/ernie4_5/modular_ernie4_5.py +++ b/src/transformers/models/ernie4_5/modular_ernie4_5.py @@ -23,12 +23,12 @@ LlamaAttention, LlamaForCausalLM, LlamaMLP, - LlamaRotaryEmbedding, ) +from ..olmo.modeling_olmo import OlmoRotaryEmbedding from .configuration_ernie4_5 import Ernie4_5Config -class Ernie4_5RotaryEmbedding(LlamaRotaryEmbedding): +class Ernie4_5RotaryEmbedding(OlmoRotaryEmbedding): @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) def forward(self, x, position_ids): diff --git a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py index e7ebe24e0e87..0fd108a28b40 100644 --- a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +++ b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py @@ -13,8 +13,10 @@ # limitations under the License. """Ernie 4.5 MoE model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -69,45 +71,10 @@ class Ernie4_5_MoeConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 500000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. 
- Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. use_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in any of the projections including mlp and attention for example.
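For the MoE variant the same consolidation applies. The sketch below contrasts the new-style dict with legacy kwargs; it assumes `standardize_rope_params` fills in `rope_type` and `rope_theta` when only legacy kwargs (or nothing) are provided, which is implied but not shown by the hunks here.

```python
from transformers.models.ernie4_5_moe.configuration_ernie4_5_moe import Ernie4_5_MoeConfig

# New-style dict vs. legacy kwargs; the __init__ hunk below pops `rope_scaling` and
# falls back to a default `rope_theta` of 500000.0 when only legacy kwargs are given
new_style = Ernie4_5_MoeConfig(rope_parameters={"rope_type": "default", "rope_theta": 500000.0})
old_style = Ernie4_5_MoeConfig(rope_theta=500000.0)

# Both are expected to end up with an equivalent standardized `rope_parameters` dict
print(new_style.rope_parameters)
print(old_style.rope_parameters)
```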
moe_intermediate_size (`int`, *optional*, defaults to 1536): @@ -179,34 +146,33 @@ class Ernie4_5_MoeConfig(PreTrainedConfig): def __init__( self, - vocab_size=103424, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - hidden_size=2560, - intermediate_size=12288, - num_hidden_layers=28, - num_attention_heads=20, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=131072, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - tie_word_embeddings=True, - rope_theta=500000.0, - rope_scaling=None, - use_bias=False, - moe_intermediate_size=1536, - moe_k=6, - moe_num_experts=64, - moe_num_shared_experts=2, - moe_layer_start_index=1, - moe_layer_end_index=-1, - moe_layer_interval=1, - moe_norm_min=1e-12, - output_router_logits=False, - router_aux_loss_coef=0.001, + vocab_size: Optional[int] = 103424, + pad_token_id: Optional[int] = 0, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + hidden_size: Optional[int] = 2560, + intermediate_size: Optional[int] = 12288, + num_hidden_layers: Optional[int] = 28, + num_attention_heads: Optional[int] = 20, + num_key_value_heads: Optional[int] = 4, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 131072, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + use_bias: Optional[int] = False, + moe_intermediate_size: Optional[int] = 1536, + moe_k: Optional[int] = 6, + moe_num_experts: Optional[int] = 64, + moe_num_shared_experts: Optional[int] = 2, + moe_layer_start_index: Optional[int] = 1, + moe_layer_end_index: Optional[int] = -1, + moe_layer_interval: Optional[int] = 1, + moe_norm_min: Optional[int] = 1e-12, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, **kwargs, ): self.vocab_size = vocab_size @@ -221,13 +187,13 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.use_bias = use_bias + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 500000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) # MoE arguments diff --git a/src/transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py b/src/transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py index 6657ad1edd08..c2dbd8d436d8 100644 --- a/src/transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +++ b/src/transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py @@ -83,20 +83,49 @@ class Ernie4_5_MoeRotaryEmbedding(nn.Module): def __init__(self, config: Ernie4_5_MoeConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Ernie4_5_MoeConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -218,8 +247,8 @@ def __init__(self, config: Ernie4_5_MoeConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -386,7 +415,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states diff --git a/src/transformers/models/evolla/configuration_evolla.py b/src/transformers/models/evolla/configuration_evolla.py index 01d95d1c5862..218bc50ad964 100644 --- a/src/transformers/models/evolla/configuration_evolla.py +++ b/src/transformers/models/evolla/configuration_evolla.py @@ -14,8 +14,10 @@ # limitations under the License. """Evolla model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -132,9 +134,7 @@ class EvollaConfig(PreTrainedConfig): just in case (e.g., 512 or 1024 or 2048). rms_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon value for the RMS-norm layer in the llama model. - rope_theta (`float`, *optional*, defaults to 500000.0): - The threshold value for the RoPE layer in the llama model. - rope_scaling (`float`, *optional*): + rope_parameters (`float`, *optional*): The scaling factor for the RoPE layer in the llama model. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use bias in the attention layer. 
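Each modeling file above gains a per-class `compute_default_rope_parameters` static method, which can be called directly to inspect the buffers a model will register. Here is a sketch for the Ernie 4.5 MoE rotary embedding, assuming a freshly constructed config already carries a standardized `rope_parameters` dict.

```python
from transformers.models.ernie4_5_moe.configuration_ernie4_5_moe import Ernie4_5_MoeConfig
from transformers.models.ernie4_5_moe.modeling_ernie4_5_moe import Ernie4_5_MoeRotaryEmbedding

config = Ernie4_5_MoeConfig()
inv_freq, attention_scaling = Ernie4_5_MoeRotaryEmbedding.compute_default_rope_parameters(config)

# For the "default" rope type the post-processing factor is unused and stays at 1.0;
# inv_freq has head_dim // 2 entries
print(inv_freq.shape, attention_scaling)
```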
@@ -193,36 +193,35 @@ class EvollaConfig(PreTrainedConfig): def __init__( self, - protein_encoder_config=None, - vocab_size=128256, # llama vocab size - hidden_size=4096, # llama hidden size - intermediate_size=14336, # llama intermediate size - num_hidden_layers=32, # llama num layers - num_attention_heads=32, # llama num heads - num_key_value_heads=8, # llama num key-value heads - hidden_act="silu", # llama activation function - max_position_embeddings=8192, # llama rope max length - rms_norm_eps=1e-05, - rope_theta=500000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - aligner_ffn_mult=4, - aligner_enable_bias=True, - aligner_attention_probs_dropout_prob=0.1, - aligner_num_add_layers=8, - resampler_depth=6, - resampler_dim_head=64, - resampler_heads=8, - resampler_num_latents=64, - resampler_ff_mult=4, - initializer_range=0.02, - pad_token_id=None, - bos_token_id=128000, - eos_token_id=128009, - use_cache=False, - tie_word_embeddings=False, + protein_encoder_config: Optional[dict] = None, + vocab_size: Optional[int] = 128256, # llama vocab size + hidden_size: Optional[int] = 4096, # llama hidden size + intermediate_size: Optional[int] = 14336, # llama intermediate size + num_hidden_layers: Optional[int] = 32, # llama num layers + num_attention_heads: Optional[int] = 32, # llama num heads + num_key_value_heads: Optional[int] = 8, # llama num key-value heads + hidden_act: Optional[str] = "silu", # llama activation function + max_position_embeddings: Optional[int] = 8192, # llama rope max length + rms_norm_eps: Optional[int] = 1e-05, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + aligner_ffn_mult: Optional[int] = 4, + aligner_enable_bias: Optional[bool] = True, + aligner_attention_probs_dropout_prob: Optional[float] = 0.1, + aligner_num_add_layers: Optional[int] = 8, + resampler_depth: Optional[int] = 6, + resampler_dim_head: Optional[int] = 64, + resampler_heads: Optional[int] = 8, + resampler_num_latents: Optional[int] = 64, + resampler_ff_mult: Optional[int] = 4, + initializer_range: Optional[float] = 0.02, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 128000, + eos_token_id: Optional[int] = 128009, + use_cache: Optional[bool] = False, + tie_word_embeddings: Optional[bool] = False, **kwargs, ): self.vocab_size = vocab_size @@ -250,13 +249,13 @@ def __init__( self.resampler_heads = resampler_heads self.resampler_num_latents = resampler_num_latents self.resampler_ff_mult = resampler_ff_mult + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 500000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) # Subconfig diff --git a/src/transformers/models/evolla/modeling_evolla.py b/src/transformers/models/evolla/modeling_evolla.py index a92a5758bfdf..c405df1bb85c 100644 --- a/src/transformers/models/evolla/modeling_evolla.py +++ b/src/transformers/models/evolla/modeling_evolla.py @@ -982,20 +982,49 @@ class EvollaRotaryEmbedding(nn.Module): def __init__(self, config: EvollaConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[EvollaConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -1104,8 +1133,8 @@ def __init__(self, config: EvollaConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -1164,7 +1193,7 @@ def __init__(self, config: EvollaConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -1268,8 +1297,8 @@ def __init__(self, config: EvollaConfig): ) self.norm = EvollaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = EvollaRotaryEmbedding(config=config) self.gradient_checkpointing = getattr(config, "gradient_checkpointing", False) + self.rotary_emb = EvollaRotaryEmbedding(config=config) self.post_init() def get_input_embeddings(self): @@ -1349,9 +1378,7 @@ def forward( ) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers: hidden_states = decoder_layer( @@ -1361,7 +1388,6 @@ def forward( past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings, protein_kv_states=protein_feats, structure_kv_states=structure_feats, msa_kv_states=msa_feats, @@ -1369,6 +1395,7 @@ def forward( structure_batch_mask=structure_batch_mask, msa_batch_mask=msa_batch_mask, query_attn_mask=attention_mask, + position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/evolla/modular_evolla.py b/src/transformers/models/evolla/modular_evolla.py index 3b1ceeee8238..51d327370ee3 100644 --- a/src/transformers/models/evolla/modular_evolla.py +++ b/src/transformers/models/evolla/modular_evolla.py @@ -669,7 +669,7 @@ def __init__(self, config: EvollaConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -761,8 +761,8 @@ def __init__(self, config: EvollaConfig): ) self.norm = EvollaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = EvollaRotaryEmbedding(config=config) self.gradient_checkpointing = getattr(config, "gradient_checkpointing", False) + self.rotary_emb = EvollaRotaryEmbedding(config=config) self.post_init() def get_input_embeddings(self): @@ -842,9 +842,7 @@ def forward( ) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers: hidden_states = decoder_layer( @@ -854,7 +852,6 @@ def forward( past_key_values=past_key_values, 
use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings, protein_kv_states=protein_feats, structure_kv_states=structure_feats, msa_kv_states=msa_feats, @@ -862,6 +859,7 @@ def forward( structure_batch_mask=structure_batch_mask, msa_batch_mask=msa_batch_mask, query_attn_mask=attention_mask, + position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py index 7eac2bd588c3..68bdaf5ce9b3 100644 --- a/src/transformers/models/exaone4/configuration_exaone4.py +++ b/src/transformers/models/exaone4/configuration_exaone4.py @@ -19,7 +19,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Exaone4Config(PreTrainedConfig): @@ -69,45 +72,10 @@ class Exaone4Config(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. sliding_window (`int`, *optional*): @@ -161,26 +129,25 @@ class Exaone4Config(PreTrainedConfig): def __init__( self, - vocab_size=102400, - hidden_size=4096, - intermediate_size=16384, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - bos_token_id=0, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_dropout=0.0, - sliding_window=4096, - sliding_window_pattern=4, - layer_types=None, + vocab_size: Optional[int] = 102400, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 16384, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 32, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + bos_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_dropout: Optional[float] = 0.0, + sliding_window: Optional[int] = 4096, + sliding_window_pattern: Optional[int] = 4, + layer_types: Optional[list[str]] = None, **kwargs, ): self.vocab_size = vocab_size @@ -195,10 +162,11 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.sliding_window = sliding_window self.sliding_window_pattern = sliding_window_pattern + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.sliding_window is None: @@ -214,6 +182,11 @@ def __init__( self.cache_implementation = "hybrid" layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs ) diff --git a/src/transformers/models/exaone4/modeling_exaone4.py b/src/transformers/models/exaone4/modeling_exaone4.py index c6a517f93001..efc82d192f02 100644 --- a/src/transformers/models/exaone4/modeling_exaone4.py +++ b/src/transformers/models/exaone4/modeling_exaone4.py @@ -73,20 +73,49 @@ class
Exaone4RotaryEmbedding(nn.Module): def __init__(self, config: Exaone4Config, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Exaone4Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -191,7 +220,8 @@ def __init__(self, config: Exaone4Config, layer_idx: int): self.scaling = self.head_dim**-0.5 self.sliding_window = config.sliding_window self.sliding_window_pattern = config.sliding_window_pattern - self.is_sliding = config.layer_types[layer_idx] == "sliding_attention" + layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None + self.is_sliding = layer_type == "sliding_attention" self.q_proj = nn.Linear(self.hidden_size, self.num_attention_heads * self.head_dim, bias=False) self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) @@ -208,6 +238,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: input_shape = hidden_states.shape[:-1] @@ -287,7 +318,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -399,19 +430,18 @@ def forward( causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) for i, decoder_layer in enumerate(self.layers): layer_type = self.config.layer_types[i] hidden_states = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=causal_mask_mapping[layer_type], position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index 9c72c14f5796..d03510d54d46 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -30,12 +30,14 @@ BaseModelOutputWithPast, CausalLMOutputWithPast, ) +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import ( TransformersKwargs, logging, ) +from ..gemma2.modeling_gemma2 import Gemma2RotaryEmbedding from ..llama.modeling_llama import ( LlamaForCausalLM, LlamaForQuestionAnswering, @@ -44,7 +46,6 @@ LlamaModel, LlamaPreTrainedModel, LlamaRMSNorm, - LlamaRotaryEmbedding, apply_rotary_pos_emb, eager_attention_forward, ) @@ -104,45 +105,10 @@ class Exaone4Config(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. 
- Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities.
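Exaone 4 combines the consolidated RoPE parameters with its hybrid sliding/full attention layout. Below is a hedged construction sketch; it assumes the config derives `layer_types` from `sliding_window` and `sliding_window_pattern` when the argument is left unset, as the `layer_type_validation` call in the hunks above suggests.

```python
from transformers.models.exaone4.configuration_exaone4 import Exaone4Config

# Hybrid sliding/full attention layout plus the consolidated RoPE parameters;
# all values here are illustrative.
config = Exaone4Config(
    num_hidden_layers=8,
    sliding_window=4096,
    sliding_window_pattern=4,
    rope_parameters={"rope_type": "default", "rope_theta": 10000.0},
)
print(config.layer_types)
print(config.rope_parameters)
```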
sliding_window (`int`, *optional*): @@ -196,26 +162,25 @@ class Exaone4Config(PreTrainedConfig): def __init__( self, - vocab_size=102400, - hidden_size=4096, - intermediate_size=16384, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - bos_token_id=0, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_dropout=0.0, - sliding_window=4096, - sliding_window_pattern=4, - layer_types=None, + vocab_size: Optional[int] = 102400, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 16384, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 32, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + bos_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_dropout: Optional[float] = 0.0, + sliding_window: Optional[int] = 4096, + sliding_window_pattern: Optional[int] = 4, + layer_types: Optional[list[str]] = None, **kwargs, ): self.vocab_size = vocab_size @@ -230,10 +195,11 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.sliding_window = sliding_window self.sliding_window_pattern = sliding_window_pattern + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.sliding_window is None: @@ -249,6 +215,11 @@ def __init__( self.cache_implementation = "hybrid" layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs ) @@ -258,7 +229,7 @@ class Exaone4RMSNorm(LlamaRMSNorm): pass -class Exaone4RotaryEmbedding(LlamaRotaryEmbedding): +class Exaone4RotaryEmbedding(Gemma2RotaryEmbedding): pass @@ -277,7 +248,8 @@ def __init__(self, config: Exaone4Config, layer_idx: int): self.scaling = self.head_dim**-0.5 self.sliding_window = config.sliding_window self.sliding_window_pattern = config.sliding_window_pattern - self.is_sliding = config.layer_types[layer_idx] == "sliding_attention" + layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None + self.is_sliding = layer_type == "sliding_attention" self.q_proj = nn.Linear(self.hidden_size, self.num_attention_heads * self.head_dim, bias=False) self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) @@ -294,6 +266,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, 
Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: input_shape = hidden_states.shape[:-1] @@ -412,19 +385,18 @@ def forward( causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) for i, decoder_layer in enumerate(self.layers): layer_type = self.config.layer_types[i] hidden_states = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=causal_mask_mapping[layer_type], position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py index 27e12b43dcac..2a6da686b72e 100644 --- a/src/transformers/models/falcon/configuration_falcon.py +++ b/src/transformers/models/falcon/configuration_falcon.py @@ -14,7 +14,10 @@ # limitations under the License. """Falcon configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -74,45 +77,10 @@ class FalconConfig(PreTrainedConfig): max_position_embeddings (`int`, *optional*, defaults to 2048): The maximum sequence length that this model might ever be used with, when `alibi` is `False`. Pretrained Falcon models with RoPE support up to 2048 tokens. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. bos_token_id (`int`, *optional*, defaults to 11): The id of the "beginning-of-sequence" token. eos_token_id (`int`, *optional*, defaults to 11): @@ -143,29 +111,28 @@ class FalconConfig(PreTrainedConfig): def __init__( self, - vocab_size=65024, - hidden_size=4544, - num_hidden_layers=32, - num_attention_heads=71, - num_ln_in_parallel_attn=None, - layer_norm_epsilon=1e-5, - initializer_range=0.02, - use_cache=True, - hidden_dropout=0.0, - attention_dropout=0.0, - num_kv_heads=None, - alibi=False, - new_decoder_architecture=False, - multi_query=True, - parallel_attn=True, - bias=False, - max_position_embeddings=2048, - rope_theta=10000.0, - rope_scaling=None, - bos_token_id=11, - eos_token_id=11, - ffn_hidden_size=None, - activation="gelu", + vocab_size: Optional[int] = 65024, + hidden_size: Optional[int] = 4544, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 71, + num_ln_in_parallel_attn: Optional[int] = None, + layer_norm_epsilon: Optional[int] = 1e-5, + initializer_range: Optional[float] = 0.02, + use_cache: Optional[bool] = True, + hidden_dropout: Optional[float] = 0.0, + attention_dropout: Optional[float] = 0.0, + num_kv_heads: Optional[int] = None, + alibi: Optional[bool] = False, + new_decoder_architecture: Optional[bool] = False, + multi_query: Optional[bool] = True, + parallel_attn: Optional[bool] = True, + bias: Optional[bool] = False, + max_position_embeddings: Optional[int] = 2048, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + bos_token_id: Optional[int] = 11, + eos_token_id: Optional[int] = 11, + ffn_hidden_size: Optional[int] = None, + activation: Optional[str] = "gelu", **kwargs, ): self.vocab_size = vocab_size @@ -189,14 +156,21 @@ def __init__( self.bias = bias self.num_ln_in_parallel_attn = num_ln_in_parallel_attn self.max_position_embeddings = max_position_embeddings - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.activation = activation if ffn_hidden_size is None: self.ffn_hidden_size = hidden_size * 4 else: self.ffn_hidden_size = ffn_hidden_size + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @property diff 
--git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index 26d4fc1b621f..1b89172a19cd 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -15,6 +15,7 @@ """PyTorch Falcon model.""" import math +from collections.abc import Callable from typing import Optional, Union import torch @@ -37,7 +38,10 @@ SequenceClassifierOutputWithPast, TokenClassifierOutput, ) -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_rope_utils import ( + ROPE_INIT_FUNCTIONS, + dynamic_rope_update, +) from ...modeling_utils import PreTrainedModel from ...utils import ( auto_docstring, @@ -104,20 +108,49 @@ class FalconRotaryEmbedding(nn.Module): def __init__(self, config: FalconConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[FalconConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -194,7 +227,7 @@ def __init__(self, config: FalconConfig, layer_idx=None): self.split_size = self.hidden_size self.hidden_dropout = config.hidden_dropout self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta + self.is_causal = True self.layer_idx = layer_idx if layer_idx is None: @@ -293,7 +326,7 @@ def forward( use_cache: bool = False, output_attentions: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, ): fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] num_kv_heads = self.num_heads if self.new_decoder_architecture else self.num_kv_heads @@ -438,7 +471,7 @@ def forward( use_cache: bool = False, output_attentions: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, ): fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] num_kv_heads = self.num_heads if self.new_decoder_architecture else self.num_kv_heads @@ -580,7 +613,7 @@ def forward( use_cache: bool = False, output_attentions: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs, ): residual = hidden_states @@ -688,10 +721,8 @@ def __init__(self, config: FalconConfig): # Final Layer Norm self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) - - self.rotary_emb = FalconRotaryEmbedding(config=config) - self.gradient_checkpointing = False + self.rotary_emb = FalconRotaryEmbedding(config=config) # Initialize weights and apply final processing self.post_init() @@ -779,9 +810,7 @@ def forward( ) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) all_self_attentions = () if output_attentions else None all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/falcon_h1/configuration_falcon_h1.py b/src/transformers/models/falcon_h1/configuration_falcon_h1.py index fc46100ab92e..85a7e76f3901 100644 --- a/src/transformers/models/falcon_h1/configuration_falcon_h1.py +++ b/src/transformers/models/falcon_h1/configuration_falcon_h1.py @@ -14,7 +14,10 @@ # limitations under the License. """FalconH1 model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -103,9 +106,7 @@ class FalconH1Config(PreTrainedConfig): Whether to use RMSNorm instead of LayerNorm in the Mamba block projectors_bias (`bool`, *optional*, defaults to `False`): Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the attention block - rope_theta (`float`, *optional*, defaults to 100000.0): - The theta value used for the RoPE embeddings. 
- rope_scaling (`float`, *optional*): + rope_parameters (`float`, *optional*): The scaling value used for the RoPE embeddings. If `None`, no scaling is applied. lm_head_multiplier (`float`, *optional*, defaults to 1.0): The multiplier for the LM head. This is used to scale the output of the LM head. @@ -133,47 +134,46 @@ class FalconH1Config(PreTrainedConfig): def __init__( self, - vocab_size=128000, - tie_word_embeddings=False, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - hidden_act="silu", - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - num_logits_to_keep=1, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - max_position_embeddings=8192, - attention_dropout=0.0, - mamba_d_ssm=1024, - mamba_n_heads=128, - mamba_d_head="auto", - mamba_n_groups=1, - mamba_d_state=256, - mamba_d_conv=4, - mamba_expand=2, - mamba_chunk_size=256, - mamba_conv_bias=True, - mamba_proj_bias=False, - mamba_norm_before_gate=True, - mamba_rms_norm=False, - projectors_bias=False, - rope_theta=100000.0, - rope_scaling=None, - lm_head_multiplier=1.0, - embedding_multiplier=1.0, - mlp_multipliers=None, - key_multiplier=None, - attention_out_multiplier=None, - attention_in_multiplier=None, - ssm_multipliers=None, - ssm_in_multiplier=None, - ssm_out_multiplier=None, + vocab_size: Optional[int] = 128000, + tie_word_embeddings: Optional[bool] = False, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 14336, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 8, + hidden_act: Optional[str] = "silu", + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[int] = True, + num_logits_to_keep: Optional[int] = 1, + pad_token_id: Optional[int] = 0, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + max_position_embeddings: Optional[int] = 8192, + attention_dropout: Optional[float] = 0.0, + mamba_d_ssm: Optional[int] = 1024, + mamba_n_heads: Optional[int] = 128, + mamba_d_head: Optional[str] = "auto", + mamba_n_groups: Optional[int] = 1, + mamba_d_state: Optional[int] = 256, + mamba_d_conv: Optional[int] = 4, + mamba_expand: Optional[int] = 2, + mamba_chunk_size: Optional[int] = 256, + mamba_conv_bias: Optional[bool] = True, + mamba_proj_bias: Optional[bool] = False, + mamba_norm_before_gate: Optional[bool] = True, + mamba_rms_norm: Optional[bool] = False, + projectors_bias: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + lm_head_multiplier: Optional[float] = 1.0, + embedding_multiplier: Optional[float] = 1.0, + mlp_multipliers: Optional[int] = None, + key_multiplier: Optional[int] = None, + attention_out_multiplier: Optional[int] = None, + attention_in_multiplier: Optional[int] = None, + ssm_multipliers: Optional[int] = None, + ssm_in_multiplier: Optional[int] = None, + ssm_out_multiplier: Optional[int] = None, **kwargs, ): self.vocab_size = vocab_size @@ -197,10 +197,15 @@ def __init__( self.use_cache = use_cache self.num_logits_to_keep = num_logits_to_keep + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + 
rope_config_validation(self) - self.rope_theta = rope_theta - self.rope_scaling = None - self.rope_scaling = rope_scaling self.projectors_bias = projectors_bias mamba_intermediate = mamba_expand * hidden_size if mamba_d_ssm is None else mamba_d_ssm diff --git a/src/transformers/models/falcon_h1/modeling_falcon_h1.py b/src/transformers/models/falcon_h1/modeling_falcon_h1.py index d42e5d07ce25..451d2f68cb5b 100644 --- a/src/transformers/models/falcon_h1/modeling_falcon_h1.py +++ b/src/transformers/models/falcon_h1/modeling_falcon_h1.py @@ -227,20 +227,49 @@ class FalconH1RotaryEmbedding(nn.Module): def __init__(self, config: FalconH1Config, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[FalconH1Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -1085,7 +1114,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -1297,9 +1326,7 @@ def forward( attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions ) mamba_mask = self._update_mamba_mask(attention_mask, cache_position) - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None diff --git a/src/transformers/models/falcon_h1/modular_falcon_h1.py b/src/transformers/models/falcon_h1/modular_falcon_h1.py index 6724ba5f7817..6dc6da878567 100644 --- a/src/transformers/models/falcon_h1/modular_falcon_h1.py +++ b/src/transformers/models/falcon_h1/modular_falcon_h1.py @@ -851,7 +851,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -1063,9 +1063,7 @@ def forward( attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions ) mamba_mask = self._update_mamba_mask(attention_mask, cache_position) - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None diff --git a/src/transformers/models/flex_olmo/configuration_flex_olmo.py b/src/transformers/models/flex_olmo/configuration_flex_olmo.py index 655f4fe18390..0f0f63f2916b 100644 --- a/src/transformers/models/flex_olmo/configuration_flex_olmo.py +++ b/src/transformers/models/flex_olmo/configuration_flex_olmo.py @@ -19,9 +19,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class FlexOlmoConfig(PreTrainedConfig): @@ -73,16 +74,10 @@ class FlexOlmoConfig(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 500000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. 
When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -131,30 +126,29 @@ class FlexOlmoConfig(PreTrainedConfig): def __init__( self, - vocab_size=100352, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-06, - use_cache=True, - pad_token_id=100277, - bos_token_id=None, - eos_token_id=100257, - tie_word_embeddings=False, - rope_theta=500000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - num_experts_per_tok=5, - num_experts=7, - output_router_logits=False, - router_aux_loss_coef=0.01, - norm_topk_prob=False, + vocab_size: Optional[int] = 100352, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 4096, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-06, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 100277, + bos_token_id: Optional[int] = None, + eos_token_id: Optional[int] = 100257, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + num_experts_per_tok: Optional[int] = 5, + num_experts: Optional[int] = 7, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.01, + norm_topk_prob: Optional[bool] = False, **kwargs, ): self.vocab_size = vocab_size @@ -173,8 +167,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok @@ -182,10 +174,13 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.norm_topk_prob = norm_topk_prob + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 500000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/flex_olmo/modeling_flex_olmo.py b/src/transformers/models/flex_olmo/modeling_flex_olmo.py index 7693b24cc39b..4406e0bf03a9 100644 --- a/src/transformers/models/flex_olmo/modeling_flex_olmo.py +++ b/src/transformers/models/flex_olmo/modeling_flex_olmo.py @@ -66,20 +66,49 @@ class FlexOlmoRotaryEmbedding(nn.Module): def __init__(self, config: FlexOlmoConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[FlexOlmoConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -93,7 +122,7 @@ def forward(self, x, position_ids): emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling - return cos, sin + return cos, sin class FlexOlmoMLP(nn.Module): @@ -220,6 +249,7 @@ def forward( attention_mask: Optional[torch.Tensor], past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: input_shape = hidden_states.shape[:-1] diff --git a/src/transformers/models/flex_olmo/modular_flex_olmo.py b/src/transformers/models/flex_olmo/modular_flex_olmo.py index 8a341b19ea4e..e5c738aa4bc5 100644 --- a/src/transformers/models/flex_olmo/modular_flex_olmo.py +++ b/src/transformers/models/flex_olmo/modular_flex_olmo.py @@ -21,6 +21,7 @@ from ...cache_utils import Cache, DynamicCache from ...masking_utils import create_causal_mask from ...modeling_outputs import MoeModelOutputWithPast +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring from ...utils.generic import OutputRecorder, check_model_inputs @@ -84,16 +85,10 @@ class FlexOlmoConfig(OlmoeConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 500000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
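For reference, a minimal usage sketch of the migrated FlexOlmo configuration (assuming a Transformers build that already contains this refactor): the scaling information travels in a single `rope_parameters` dict instead of the old `rope_theta`/`rope_scaling` pair, and `standardize_rope_params` plus `rope_config_validation` run inside `__init__` as shown in the hunks above. The exact contents of the standardized dict are an assumption here.

```python
from transformers import FlexOlmoConfig

# Minimal sketch, assuming a build that includes this refactor: "linear" scaling
# only needs a `factor`, and `rope_theta` now lives inside the same dict.
config = FlexOlmoConfig(
    rope_parameters={"rope_type": "linear", "rope_theta": 500000.0, "factor": 2.0}
)

# Validation already ran in __init__ (see `rope_config_validation(self)` above);
# the standardized dict is what the rotary embedding later reads.
print(config.rope_parameters["rope_type"])   # "linear"
print(config.rope_parameters["rope_theta"])  # 500000.0
```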
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -142,30 +137,29 @@ class FlexOlmoConfig(OlmoeConfig): def __init__( self, - vocab_size=100352, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-06, - use_cache=True, - pad_token_id=100277, - bos_token_id=None, - eos_token_id=100257, - tie_word_embeddings=False, - rope_theta=500000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - num_experts_per_tok=5, - num_experts=7, - output_router_logits=False, - router_aux_loss_coef=0.01, - norm_topk_prob=False, + vocab_size: Optional[int] = 100352, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 4096, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-06, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 100277, + bos_token_id: Optional[int] = None, + eos_token_id: Optional[int] = 100257, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + num_experts_per_tok: Optional[int] = 5, + num_experts: Optional[int] = 7, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.01, + norm_topk_prob: Optional[bool] = False, **kwargs, ): super().__init__( @@ -180,8 +174,7 @@ def __init__( initializer_range=initializer_range, rms_norm_eps=rms_norm_eps, use_cache=use_cache, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, attention_bias=attention_bias, attention_dropout=attention_dropout, num_experts_per_tok=num_experts_per_tok, @@ -198,6 +191,11 @@ def __init__( del self.clip_qkv + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 500000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + # FlexOlmo RMS norm reuses Olmo2 RMS norm, which handles low precision slightly differently than the original Olmoe. class FlexOlmoRMSNorm(Olmo2RMSNorm): diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index c13208e8825d..ae3b692cb474 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -14,7 +14,10 @@ # limitations under the License. """Fuyu model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -64,16 +67,10 @@ class FuyuConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. Whether to tie weight embeddings tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie input and output embeddings. - rope_theta (`float`, *optional*, defaults to 25000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. 
Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalFuyu/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. qk_layernorm (`bool`, *optional*, defaults to `True`): Whether or not to normalize the Queries and Keys after projecting the hidden states hidden_dropout (`float`, *optional*, defaults to 0.0): @@ -107,31 +104,30 @@ class FuyuConfig(PreTrainedConfig): def __init__( self, - vocab_size=262144, - hidden_size=4096, - intermediate_size=16384, - num_hidden_layers=36, - num_attention_heads=64, - hidden_act="relu2", - max_position_embeddings=16384, - image_size=300, - patch_size=30, - num_channels=3, - initializer_range=0.02, - layer_norm_eps=1e-5, - use_cache=True, - tie_word_embeddings=False, - rope_theta=25000.0, - rope_scaling=None, - qk_layernorm=True, - hidden_dropout=0.0, - attention_dropout=0.0, - partial_rotary_factor=0.5, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - image_token_id=71011, - text_config=None, + vocab_size: Optional[int] = 262144, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 16384, + num_hidden_layers: Optional[int] = 36, + num_attention_heads: Optional[int] = 64, + hidden_act: Optional[str] = "relu2", + max_position_embeddings: Optional[int] = 16384, + image_size: Optional[int] = 300, + patch_size: Optional[int] = 30, + num_channels: Optional[int] = 3, + initializer_range: Optional[float] = 0.02, + layer_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + qk_layernorm: Optional[bool] = True, + hidden_dropout: Optional[float] = 0.0, + attention_dropout: Optional[float] = 0.0, + partial_rotary_factor: Optional[float] = 0.5, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + image_token_id: Optional[int] = 71011, + text_config: Optional[dict] = None, **kwargs, ): if text_config is None: @@ -146,8 +142,7 @@ def __init__( "initializer_range": initializer_range, "layer_norm_eps": layer_norm_eps, "use_cache": use_cache, - "rope_theta": rope_theta, - "rope_scaling": rope_scaling, + "rope_parameters": rope_parameters, "qk_layernorm": qk_layernorm, "hidden_dropout": hidden_dropout, "attention_dropout": attention_dropout, @@ -174,14 +169,19 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.qk_layernorm = qk_layernorm self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout self.partial_rotary_factor = partial_rotary_factor self.image_token_id = image_token_id - self._rope_scaling_validation() + # Try to set `rope_scaling` if available, 
otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 25000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( pad_token_id=pad_token_id, @@ -191,25 +191,5 @@ def __init__( **kwargs, ) - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. - """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") - __all__ = ["FuyuConfig"] diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index 6b248b76f3c1..986ab2c9aa94 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -19,7 +19,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from typing import Optional + +from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class GemmaConfig(PreTrainedConfig): @@ -72,14 +75,14 @@ class GemmaConfig(PreTrainedConfig): Beginning of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - layer_types (`list`, *optional*): - Attention pattern for each layer. use_bidirectional_attention (`bool`, *optional*): If True, the model will attend to all text tokens instead of using a causal mask. 
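The same backward-compatibility path is repeated in every migrated config above: legacy `rope_theta` and `rope_scaling` kwargs are popped in `__init__` and folded into `config.rope_parameters` by `standardize_rope_params`. A hedged sketch of what that looks like from the user side, assuming a build with this change (the exact merged dict shape is an assumption):

```python
from transformers import GemmaConfig

# Sketch of the BC path: a config created (or loaded) with the legacy kwargs
# should end up with a populated `rope_parameters` dict, so old checkpoints
# keep loading unchanged.
legacy = GemmaConfig(
    rope_theta=10000.0,
    rope_scaling={"rope_type": "linear", "factor": 4.0},
)
print(legacy.rope_parameters)
# expected to contain the rope type, scaling factor and theta, e.g.
# {"rope_type": "linear", "factor": 4.0, "rope_theta": 10000.0}
```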
@@ -112,27 +115,26 @@ class GemmaConfig(PreTrainedConfig): def __init__( self, - vocab_size=256000, - hidden_size=3072, - intermediate_size=24576, - num_hidden_layers=28, - num_attention_heads=16, - num_key_value_heads=16, - head_dim=256, - hidden_act="gelu_pytorch_tanh", - max_position_embeddings=8192, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - bos_token_id=2, - tie_word_embeddings=True, - rope_theta=10000.0, - attention_bias=False, - attention_dropout=0.0, - layer_types=None, - use_bidirectional_attention=None, + vocab_size: Optional[int] = 256000, + hidden_size: Optional[int] = 3072, + intermediate_size: Optional[int] = 24576, + num_hidden_layers: Optional[int] = 28, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = 16, + head_dim: Optional[int] = 256, + hidden_act: Optional[str] = "gelu_pytorch_tanh", + max_position_embeddings: Optional[int] = 8192, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + use_bidirectional_attention: Optional[bool] = None, **kwargs, ): self.vocab_size = vocab_size @@ -147,15 +149,17 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.use_bidirectional_attention = use_bidirectional_attention + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters - self.layer_types = layer_types - if self.layer_types is None: - self.layer_types = ["full_attention" for _ in range(self.num_hidden_layers)] - layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 686f66f186b7..335c2b2cf7b5 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -84,20 +84,49 @@ class GemmaRotaryEmbedding(nn.Module): def __init__(self, config: GemmaConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = 
ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[GemmaConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -216,8 +245,8 @@ def __init__(self, config: GemmaConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -267,7 +296,6 @@ def __init__(self, config: GemmaConfig, layer_idx: int): self.mlp = GemmaMLP(config) self.input_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.post_attention_layernorm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.attention_type = config.layer_types[layer_idx] def forward( self, @@ -277,7 +305,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -380,22 +408,18 @@ def forward( # It may already have been prepared by e.g. 
`generate` if not isinstance(causal_mask_mapping := attention_mask, dict): - causal_mask_mapping = { - "full_attention": create_causal_mask( - config=self.config, - input_embeds=inputs_embeds, - attention_mask=attention_mask, - cache_position=cache_position, - past_key_values=past_key_values, - position_ids=position_ids, - ) - } + causal_mask_mapping = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) # embed positions hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) # normalized # Gemma downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5 @@ -406,7 +430,7 @@ def forward( for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, - attention_mask=causal_mask_mapping[decoder_layer.attention_type], + attention_mask=causal_mask_mapping, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index 7ba39f9490de..cc4cf066958a 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -20,16 +20,16 @@ from torch import nn from ...cache_utils import Cache, DynamicCache -from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask from ...modeling_outputs import BaseModelOutputWithPast +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...tokenization_utils import AddedToken, PreTrainedTokenizer from ...utils import TransformersKwargs, logging from ..llama.modeling_llama import ( LlamaAttention, - LlamaDecoderLayer, LlamaForCausalLM, LlamaForSequenceClassification, LlamaForTokenClassification, @@ -102,14 +102,14 @@ class GemmaConfig(PreTrainedConfig): Beginning of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - layer_types (`list`, *optional*): - Attention pattern for each layer. use_bidirectional_attention (`bool`, *optional*): If True, the model will attend to all text tokens instead of using a causal mask. 
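Each refactored `*RotaryEmbedding` class above gains the same `compute_default_rope_parameters` helper. A rough standalone sketch of the quantities it derives, outside of any model class (numbers are illustrative, not tied to a checkpoint; the cos/sin part mirrors the rotary forward pass only in spirit):

```python
import torch

rope_theta = 10000.0  # config.rope_parameters["rope_theta"]
head_dim = 256        # getattr(config, "head_dim", hidden_size // num_attention_heads)

# inv_freq_i = 1 / theta^(2i / dim), for i = 0 .. dim/2 - 1
inv_freq = 1.0 / (
    rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.int64).float() / head_dim)
)
attention_scaling = 1.0  # unused for the "default" rope type

# cos/sin tables for a toy sequence of 16 positions
position_ids = torch.arange(16, dtype=torch.float)
freqs = torch.outer(position_ids, inv_freq)  # [seq_len, head_dim // 2]
emb = torch.cat((freqs, freqs), dim=-1)      # [seq_len, head_dim]
cos = emb.cos() * attention_scaling
sin = emb.sin() * attention_scaling
print(inv_freq.shape, cos.shape)  # torch.Size([128]) torch.Size([16, 256])
```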
@@ -142,27 +142,26 @@ class GemmaConfig(PreTrainedConfig): def __init__( self, - vocab_size=256000, - hidden_size=3072, - intermediate_size=24576, - num_hidden_layers=28, - num_attention_heads=16, - num_key_value_heads=16, - head_dim=256, - hidden_act="gelu_pytorch_tanh", - max_position_embeddings=8192, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - bos_token_id=2, - tie_word_embeddings=True, - rope_theta=10000.0, - attention_bias=False, - attention_dropout=0.0, - layer_types=None, - use_bidirectional_attention=None, + vocab_size: Optional[int] = 256000, + hidden_size: Optional[int] = 3072, + intermediate_size: Optional[int] = 24576, + num_hidden_layers: Optional[int] = 28, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = 16, + head_dim: Optional[int] = 256, + hidden_act: Optional[str] = "gelu_pytorch_tanh", + max_position_embeddings: Optional[int] = 8192, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + use_bidirectional_attention: Optional[bool] = None, **kwargs, ): self.vocab_size = vocab_size @@ -177,15 +176,17 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.use_bidirectional_attention = use_bidirectional_attention + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters - self.layer_types = layer_types - if self.layer_types is None: - self.layer_types = ["full_attention" for _ in range(self.num_hidden_layers)] - layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( pad_token_id=pad_token_id, @@ -392,12 +393,6 @@ def __init__(self, config: GemmaConfig, layer_idx: int): self.is_causal = not getattr(config, "use_bidirectional_attention", False) -class GemmaDecoderLayer(LlamaDecoderLayer): - def __init__(self, config: GemmaConfig, layer_idx: int): - super().__init__() - self.attention_type = config.layer_types[layer_idx] - - class GemmaPreTrainedModel(LlamaPreTrainedModel): def _init_weights(self, module): PreTrainedModel._init_weights(self, module) @@ -439,22 +434,18 @@ def forward( # It may already have been prepared by e.g. 
`generate` if not isinstance(causal_mask_mapping := attention_mask, dict): - causal_mask_mapping = { - "full_attention": create_causal_mask( - config=self.config, - input_embeds=inputs_embeds, - attention_mask=attention_mask, - cache_position=cache_position, - past_key_values=past_key_values, - position_ids=position_ids, - ) - } + causal_mask_mapping = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=position_ids, + ) # embed positions hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) # normalized # Gemma downcasts the below to float16, causing sqrt(3072)=55.4256 to become 55.5 @@ -465,7 +456,7 @@ def forward( for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, - attention_mask=causal_mask_mapping[decoder_layer.attention_type], + attention_mask=causal_mask_mapping, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py index ef55c16e5d45..7fa77dbb8347 100644 --- a/src/transformers/models/gemma2/configuration_gemma2.py +++ b/src/transformers/models/gemma2/configuration_gemma2.py @@ -19,7 +19,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Gemma2Config(PreTrainedConfig): @@ -73,8 +76,10 @@ class Gemma2Config(PreTrainedConfig): Beginning of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
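Unlike Gemma, Gemma2 keeps its `layer_types` / sliding-window machinery while adopting `rope_parameters`. A sketch under the same assumptions as above (that "yarn" only requires `factor` beyond the type and theta, with `beta_fast` / `beta_slow` left at their defaults, and that `layer_types` is auto-filled when not passed):

```python
from transformers import Gemma2Config

# Sketch only: RoPE settings move into `rope_parameters`, while the per-layer
# attention pattern stays in `layer_types` and is consumed by Gemma2Attention
# via `self.layer_type` as shown below.
config = Gemma2Config(
    rope_parameters={"rope_type": "yarn", "rope_theta": 10000.0, "factor": 2.0}
)
print(config.rope_parameters["rope_type"])  # "yarn"
print(config.layer_types[:4])               # default per-layer attention pattern
```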
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -121,31 +126,31 @@ class Gemma2Config(PreTrainedConfig): def __init__( self, - vocab_size=256000, - hidden_size=2304, - intermediate_size=9216, - num_hidden_layers=26, - num_attention_heads=8, - num_key_value_heads=4, - head_dim=256, - hidden_activation="gelu_pytorch_tanh", - max_position_embeddings=8192, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - bos_token_id=2, - tie_word_embeddings=True, - rope_theta=10000.0, - attention_bias=False, - attention_dropout=0.0, - query_pre_attn_scalar=256, - sliding_window=4096, - layer_types=None, - final_logit_softcapping=30.0, - attn_logit_softcapping=50.0, - use_bidirectional_attention=None, + vocab_size: Optional[int] = 256000, + hidden_size: Optional[int] = 2304, + intermediate_size: Optional[int] = 9216, + num_hidden_layers: Optional[int] = 26, + num_attention_heads: Optional[int] = 8, + num_key_value_heads: Optional[int] = 4, + head_dim: Optional[int] = 256, + hidden_activation: Optional[str] = "gelu_pytorch_tanh", + max_position_embeddings: Optional[int] = 8192, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + query_pre_attn_scalar: Optional[int] = 256, + sliding_window: Optional[int] = 4096, + layer_types: Optional[list[str]] = None, + final_logit_softcapping: Optional[float] = 30.0, + attn_logit_softcapping: Optional[float] = 50.0, + use_bidirectional_attention: Optional[bool] = None, **kwargs, ): super().__init__( @@ -166,7 +171,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.hidden_activation = hidden_activation @@ -176,6 +180,9 @@ def __init__( self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types self.use_bidirectional_attention = use_bidirectional_attention + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters if self.layer_types is None: self.layer_types = [ @@ -183,5 +190,10 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + __all__ = ["Gemma2Config"] diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py index 5f4d81093a9a..f824053201ad 100644 --- a/src/transformers/models/gemma2/modeling_gemma2.py +++ b/src/transformers/models/gemma2/modeling_gemma2.py @@ -88,20 +88,49 @@ class Gemma2RotaryEmbedding(nn.Module): def __init__(self, config: Gemma2Config, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - 
self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Gemma2Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -205,6 +234,7 @@ class Gemma2Attention(nn.Module): def __init__(self, config: Gemma2Config, layer_idx: int): super().__init__() + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) @@ -226,13 +256,13 @@ def __init__(self, config: Gemma2Config, layer_idx: int): config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias ) self.attn_logit_softcapping = self.config.attn_logit_softcapping - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[FlashAttentionKwargs], @@ -291,7 +321,7 @@ def __init__(self, config: Gemma2Config, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -371,7 +401,7 @@ def __init__(self, config: Gemma2Config): [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = Gemma2RotaryEmbedding(config=config) + self.rotary_emb = Gemma2RotaryEmbedding(config) self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -441,8 +471,6 @@ def forward( # embed positions hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) # normalized @@ -461,8 +489,8 @@ def forward( layer_outputs = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, output_attentions=output_attentions, diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index 7028123cf356..411c75ac516a 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -26,6 +26,13 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ( + ROPE_INIT_FUNCTIONS, + RopeParameters, + dynamic_rope_update, + rope_config_validation, + standardize_rope_params, +) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -98,8 +105,10 @@ class Gemma2Config(PreTrainedConfig): Beginning of stream token id. 
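The `compute_default_rope_parameters` helper added to `Gemma2RotaryEmbedding` earlier in this diff reduces to the standard RoPE frequency formula, `inv_freq[i] = 1 / theta^(2i / dim)`. A self-contained PyTorch sketch of that computation, using toy values rather than Gemma2's real configuration:

```python
# Standalone sketch of the default RoPE inverse-frequency computation.
# head_dim=8 and rope_theta=10000.0 are illustrative values only.
import torch


def default_inv_freq(rope_theta: float, head_dim: int, device=None) -> torch.Tensor:
    # One frequency per pair of channels: theta ** (-2i / dim) for i = 0 .. dim/2 - 1
    exponents = torch.arange(0, head_dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / head_dim
    return 1.0 / (rope_theta**exponents)


inv_freq = default_inv_freq(rope_theta=10000.0, head_dim=8)
print(inv_freq.shape)      # torch.Size([4]) -> head_dim // 2 frequencies
print(inv_freq[0].item())  # 1.0: the first channel pair rotates fastest
```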
tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -146,31 +155,31 @@ class Gemma2Config(PreTrainedConfig): def __init__( self, - vocab_size=256000, - hidden_size=2304, - intermediate_size=9216, - num_hidden_layers=26, - num_attention_heads=8, - num_key_value_heads=4, - head_dim=256, - hidden_activation="gelu_pytorch_tanh", - max_position_embeddings=8192, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - bos_token_id=2, - tie_word_embeddings=True, - rope_theta=10000.0, - attention_bias=False, - attention_dropout=0.0, - query_pre_attn_scalar=256, - sliding_window=4096, - layer_types=None, - final_logit_softcapping=30.0, - attn_logit_softcapping=50.0, - use_bidirectional_attention=None, + vocab_size: Optional[int] = 256000, + hidden_size: Optional[int] = 2304, + intermediate_size: Optional[int] = 9216, + num_hidden_layers: Optional[int] = 26, + num_attention_heads: Optional[int] = 8, + num_key_value_heads: Optional[int] = 4, + head_dim: Optional[int] = 256, + hidden_activation: Optional[str] = "gelu_pytorch_tanh", + max_position_embeddings: Optional[int] = 8192, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + query_pre_attn_scalar: Optional[int] = 256, + sliding_window: Optional[int] = 4096, + layer_types: Optional[list[str]] = None, + final_logit_softcapping: Optional[float] = 30.0, + attn_logit_softcapping: Optional[float] = 50.0, + use_bidirectional_attention: Optional[bool] = None, **kwargs, ): super().__init__( @@ -191,7 +200,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.hidden_activation = hidden_activation @@ -201,6 +209,9 @@ def __init__( self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types self.use_bidirectional_attention = use_bidirectional_attention + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters if self.layer_types is None: self.layer_types = [ @@ -208,6 +219,11 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + 
rope_config_validation(self) + class Gemma2RMSNorm(GemmaRMSNorm): pass @@ -220,7 +236,36 @@ def __init__(self, config): class Gemma2RotaryEmbedding(GemmaRotaryEmbedding): - pass + def __init__(self, config: Gemma2Config, device=None): + nn.Module.__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) def eager_attention_forward( @@ -260,18 +305,20 @@ def eager_attention_forward( class Gemma2Attention(GemmaAttention): def __init__(self, config: Gemma2Config, layer_idx: int): + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None + super().__init__(config, layer_idx) self.attn_logit_softcapping = self.config.attn_logit_softcapping self.attention_dropout = self.config.attention_dropout self.is_causal = not getattr(config, "use_bidirectional_attention", False) self.scaling = config.query_pre_attn_scalar**-0.5 - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[FlashAttentionKwargs], @@ -330,7 +377,7 @@ def __init__(self, config: Gemma2Config, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -382,6 +429,7 @@ def __init__(self, config: Gemma2Config): self.layers = nn.ModuleList( [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) + self.rotary_emb = Gemma2RotaryEmbedding(config) def forward( self, @@ -445,8 +493,6 @@ def forward( # embed positions hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = 
self.rotary_emb(hidden_states, position_ids) # normalized @@ -465,8 +511,8 @@ def forward( layer_outputs = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, output_attentions=output_attentions, diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index 04483c5c38f1..3b9cd24bb46d 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -22,7 +22,7 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging from ..siglip import SiglipVisionConfig @@ -81,8 +81,6 @@ class Gemma3TextConfig(PreTrainedConfig): Beginning of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -97,45 +95,10 @@ class Gemma3TextConfig(PreTrainedConfig): Scaling factor when applying tanh softcapping on the logits. attn_logit_softcapping (`float`, *optional*): Scaling factor when applying tanh softcapping on the attention scores. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE - rope_local_base_freq (float, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings for local attention. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. use_bidirectional_attention (`bool`, *optional*, defaults to `False`): If True, the model will attend to all text tokens instead of using a causal mask. This does not change behavior for vision tokens. @@ -170,33 +133,31 @@ class Gemma3TextConfig(PreTrainedConfig): def __init__( self, - vocab_size=262_208, - hidden_size=2304, - intermediate_size=9216, - num_hidden_layers=26, - num_attention_heads=8, - num_key_value_heads=4, - head_dim=256, - hidden_activation="gelu_pytorch_tanh", - max_position_embeddings=131_072, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - bos_token_id=2, - tie_word_embeddings=True, - rope_theta=1_000_000.0, - attention_bias=False, - attention_dropout=0.0, - query_pre_attn_scalar=256, - sliding_window=4096, - layer_types=None, - final_logit_softcapping=None, - attn_logit_softcapping=None, - rope_scaling=None, - rope_local_base_freq=10_000.0, - use_bidirectional_attention=False, + vocab_size: Optional[int] = 262_208, + hidden_size: Optional[int] = 2304, + intermediate_size: Optional[int] = 9216, + num_hidden_layers: Optional[int] = 26, + num_attention_heads: Optional[int] = 8, + num_key_value_heads: Optional[int] = 4, + head_dim: Optional[int] = 256, + hidden_activation: Optional[str] = "gelu_pytorch_tanh", + max_position_embeddings: Optional[int] = 131_072, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = True, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + query_pre_attn_scalar: Optional[int] = 256, + sliding_window: Optional[int] = 4096, + layer_types: Optional[list[str]] = None, + final_logit_softcapping: Optional[float] = None, + attn_logit_softcapping: Optional[float] = None, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + use_bidirectional_attention: Optional[bool] = False, **kwargs, ): super().__init__( @@ -217,7 +178,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.hidden_activation = hidden_activation @@ -226,14 +186,15 @@ def __init__( 
self.final_logit_softcapping = final_logit_softcapping self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + if rope_scaling is not None: + rope_parameters = {"sliding_attention": {"rope_type": "default"}, "full_attention": rope_scaling} + self.rope_parameters = rope_parameters self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: self.sliding_window = (self.sliding_window // 2) + 1 # due to fa we set exclusive bounds - self.rope_local_base_freq = rope_local_base_freq - self.rope_scaling = rope_scaling - rope_config_validation(self) - # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6) @@ -244,6 +205,14 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = getattr(self, "rope_theta", 1_000_000.0) + rope_local_base_freq = getattr(self, "rope_local_base_freq", 10000.0) + standardize_rope_params( + self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} + ) + rope_config_validation(self) + class Gemma3Config(PreTrainedConfig): r""" diff --git a/src/transformers/models/gemma3/convert_gemma3_weights.py b/src/transformers/models/gemma3/convert_gemma3_weights.py index aefd9648d3fe..b4b00dc22ec8 100644 --- a/src/transformers/models/gemma3/convert_gemma3_weights.py +++ b/src/transformers/models/gemma3/convert_gemma3_weights.py @@ -141,7 +141,7 @@ max_position_embeddings=1024, query_pre_attn_scalar=256, sliding_window=512, - rope_scaling=None, + rope_parameters=None, use_bidirectional_attention=True, ), vision_config=None, @@ -158,7 +158,7 @@ max_position_embeddings=32768, query_pre_attn_scalar=256, sliding_window=512, - rope_scaling=None, + rope_parameters=None, ), vision_config=None, ), @@ -190,7 +190,7 @@ num_hidden_layers=34, num_key_value_heads=4, sliding_window=1024, - rope_scaling={"rope_type": "linear", "factor": 8.0}, # used for global RoPE only + rope_parameters={"rope_type": "linear", "factor": 8.0}, # used for global RoPE only rope_theta=1_000_000, rope_local_base_freq=10_000, attn_logit_softcapping=None, @@ -208,7 +208,7 @@ num_hidden_layers=48, num_key_value_heads=8, sliding_window=1024, - rope_scaling={"rope_type": "linear", "factor": 8.0}, # used for global RoPE only + rope_parameters={"rope_type": "linear", "factor": 8.0}, # used for global RoPE only rope_theta=1_000_000, rope_local_base_freq=10_000, attn_logit_softcapping=None, @@ -226,7 +226,7 @@ num_key_value_heads=16, head_dim=128, sliding_window=1024, - rope_scaling={"rope_type": "linear", "factor": 8.0}, # used for global RoPE only + rope_parameters={"rope_type": "linear", "factor": 8.0}, # used for global RoPE only rope_theta=1_000_000, rope_local_base_freq=10_000, attn_logit_softcapping=None, diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py index aa21cf995ebe..3632e2336c5b 100644 --- a/src/transformers/models/gemma3/modeling_gemma3.py +++ b/src/transformers/models/gemma3/modeling_gemma3.py @@ -19,7 +19,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
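For Gemma3, the constructor above maps a legacy flat `rope_scaling` dict onto the nested per-layer-type layout, attaching the scaled parameters to `"full_attention"` and a plain default RoPE to `"sliding_attention"`, and then folds the two base frequencies (`rope_theta`, `rope_local_base_freq`) in per layer type. A rough standalone sketch of that conversion; the helper name and default values are illustrative, not the library's:

```python
# Standalone sketch (not library code) of converting legacy Gemma3-style RoPE kwargs
# into the nested per-layer-type `rope_parameters` layout used after this refactor.
from typing import Any, Optional


def to_per_layer_rope_parameters(  # hypothetical helper, for illustration only
    rope_scaling: Optional[dict[str, Any]] = None,
    rope_theta: float = 1_000_000.0,
    rope_local_base_freq: float = 10_000.0,
) -> dict[str, dict[str, Any]]:
    # Legacy `rope_scaling` only ever applied to global (full) attention
    full_attention = dict(rope_scaling) if rope_scaling is not None else {"rope_type": "default"}
    full_attention.setdefault("rope_theta", rope_theta)
    # Local (sliding-window) attention keeps a plain RoPE with its own base frequency
    sliding_attention = {"rope_type": "default", "rope_theta": rope_local_base_freq}
    return {"full_attention": full_attention, "sliding_attention": sliding_attention}


legacy = {"rope_type": "linear", "factor": 8.0}  # "used for global RoPE only", as in the convert script
print(to_per_layer_rope_parameters(rope_scaling=legacy))
# {'full_attention': {'rope_type': 'linear', 'factor': 8.0, 'rope_theta': 1000000.0},
#  'sliding_attention': {'rope_type': 'default', 'rope_theta': 10000.0}}
```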
-import copy from collections.abc import Callable from dataclasses import dataclass from typing import Optional, Union @@ -145,35 +144,80 @@ def extra_repr(self): class Gemma3RotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` - def __init__(self, config: Gemma3TextConfig, device=None): + def __init__(self, config: Gemma3TextConfig, device=None, layer_type=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.layer_types = list(set(config.layer_types)) + self.rope_type = {} + for layer_type in self.layer_types: + rope_params = self.config.rope_parameters[layer_type] + if rope_params is None: + continue + + self.rope_type[layer_type] = rope_params["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type[layer_type] != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type[layer_type]] + curr_inv_freq, curr_attention_scaling = rope_init_fn(self.config, device, layer_type=layer_type) + self.register_buffer(f"{layer_type}_inv_freq", curr_inv_freq, persistent=False) + setattr(self, f"{layer_type}_original_inv_freq", curr_inv_freq) + setattr(self, f"{layer_type}_attention_scaling", curr_attention_scaling) + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Gemma3TextConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + layer_type: Optional[str] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + layer_type (`str`, *optional*): + The current layer type if the model has different RoPE parameters per type. + Should not be used unless `config.layer_types is not None` + + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + # For backward compatibility standardize the `rope_parameters_dict` if it uses old format + base = config.rope_parameters[layer_type]["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + def forward(self, x, position_ids, layer_type=None): + inv_freq = getattr(self, f"{layer_type}_inv_freq") + attention_scaling = getattr(self, f"{layer_type}_attention_scaling") + + inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) position_ids_expanded = position_ids[:, None, :].float() device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with torch.autocast(device_type=device_type, enabled=False): # Force float32 freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling + cos = emb.cos() * attention_scaling + sin = emb.sin() * attention_scaling return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) @@ -264,7 +308,7 @@ class Gemma3Attention(nn.Module): def __init__(self, config: Gemma3TextConfig, layer_idx: int): super().__init__() - self.is_sliding = config.layer_types[layer_idx] == "sliding_attention" + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) @@ -286,7 +330,8 @@ def __init__(self, config: Gemma3TextConfig, layer_idx: int): config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias ) self.attn_logit_softcapping = self.config.attn_logit_softcapping - self.sliding_window = config.sliding_window if self.is_sliding else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None + self.is_sliding = self.layer_type == "sliding_attention" self.q_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps) self.k_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps) @@ -294,8 +339,8 @@ def __init__(self, config: Gemma3TextConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: torch.Tensor, - attention_mask: Optional[torch.Tensor], + position_embeddings: torch.Tensor = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[FlashAttentionKwargs], @@ -356,8 +401,7 @@ def __init__(self, config: Gemma3TextConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings_global: torch.Tensor, - position_embeddings_local: torch.Tensor, + position_embeddings: torch.Tensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -370,12 +414,6 @@ def forward( hidden_states = self.input_layernorm(hidden_states) - # apply global RoPE to non-sliding layer only - if self.self_attn.is_sliding: - position_embeddings = position_embeddings_local - else: - position_embeddings = position_embeddings_global - hidden_states, self_attn_weights = self.self_attn( hidden_states=hidden_states, position_embeddings=position_embeddings, @@ -466,16 +504,9 @@ def __init__(self, config: Gemma3TextConfig): [Gemma3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = Gemma3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - 
self.rotary_emb = Gemma3RotaryEmbedding(config=config) + self.rotary_emb = Gemma3RotaryEmbedding(config) self.gradient_checkpointing = False - # TODO: raushan fix this after RoPE refactor. For now we hack it by reassigning thetas - # when we want to create a local RoPE layer. Config defaults should hold values for global RoPE - config = copy.deepcopy(config) - config.rope_theta = config.rope_local_base_freq - config.rope_scaling = {"rope_type": "default"} - self.rotary_emb_local = Gemma3RotaryEmbedding(config=config) - # Initialize weights and apply final processing self.post_init() @@ -518,9 +549,7 @@ def forward( if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 cache_position = torch.arange( - past_seen_tokens, - past_seen_tokens + inputs_embeds.shape[1], - device=inputs_embeds.device, + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device ) if position_ids is None: @@ -551,10 +580,9 @@ def forward( # embed positions hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings_global = self.rotary_emb(hidden_states, position_ids) - position_embeddings_local = self.rotary_emb_local(hidden_states, position_ids) + position_embeddings = {} + for layer_type in self.config.layer_types: + position_embeddings[layer_type] = self.rotary_emb(hidden_states, position_ids, layer_type) # decoder layers all_hidden_states = () if output_hidden_states else None @@ -566,9 +594,8 @@ def forward( layer_outputs = decoder_layer( hidden_states, - position_embeddings_global=position_embeddings_global, - position_embeddings_local=position_embeddings_local, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings[decoder_layer.attention_type], position_ids=position_ids, past_key_values=past_key_values, output_attentions=output_attentions, diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index a814d200a172..2aae16434083 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -13,7 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import copy from collections.abc import Callable from typing import Any, Optional, Union @@ -26,7 +25,13 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, SequenceClassifierOutputWithPast -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import ( + ROPE_INIT_FUNCTIONS, + RopeParameters, + dynamic_rope_update, + rope_config_validation, + standardize_rope_params, +) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -106,8 +111,6 @@ class Gemma3TextConfig(Gemma2Config, PreTrainedConfig): Beginning of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. 
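The refactor above replaces the separate global/local rotary modules with a single module that keeps one `inv_freq` buffer per layer type; the model forward then builds one `(cos, sin)` pair per type and each decoder layer looks up the pair matching its `attention_type`. The following plain-PyTorch sketch shows the shape of that pattern; the class name, the two layer types, and the toy dimensions are illustrative, not the library implementation.

```python
# Standalone sketch of per-layer-type rotary embeddings: one buffer per layer type,
# cos/sin computed once per type, then looked up by each layer's declared type.
import torch
from torch import nn


class PerLayerTypeRotaryEmbedding(nn.Module):  # illustrative class, not the library module
    def __init__(self, head_dim: int, thetas: dict[str, float]):
        super().__init__()
        self.layer_types = list(thetas)
        for layer_type, theta in thetas.items():
            exponents = torch.arange(0, head_dim, 2, dtype=torch.float) / head_dim
            inv_freq = 1.0 / (theta**exponents)
            # Mirrors `register_buffer(f"{layer_type}_inv_freq", ...)` in the refactor
            self.register_buffer(f"{layer_type}_inv_freq", inv_freq, persistent=False)

    @torch.no_grad()
    def forward(self, x: torch.Tensor, position_ids: torch.Tensor, layer_type: str):
        inv_freq = getattr(self, f"{layer_type}_inv_freq")
        # Outer product of positions and frequencies -> (batch, seq_len, head_dim // 2)
        freqs = position_ids[:, :, None].float() * inv_freq[None, None, :]
        emb = torch.cat((freqs, freqs), dim=-1)
        return emb.cos().to(x.dtype), emb.sin().to(x.dtype)


# Toy usage: compute one (cos, sin) pair per layer type, then index by a layer's type.
rotary = PerLayerTypeRotaryEmbedding(
    head_dim=8, thetas={"full_attention": 1_000_000.0, "sliding_attention": 10_000.0}
)
hidden_states = torch.zeros(1, 5, 32)
position_ids = torch.arange(5)[None, :]
position_embeddings = {
    layer_type: rotary(hidden_states, position_ids, layer_type) for layer_type in rotary.layer_types
}
cos, sin = position_embeddings["sliding_attention"]  # what a sliding-window layer would receive
print(cos.shape)  # torch.Size([1, 5, 8])
```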
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -122,45 +125,10 @@ class Gemma3TextConfig(Gemma2Config, PreTrainedConfig): Scaling factor when applying tanh softcapping on the logits. attn_logit_softcapping (`float`, *optional*): Scaling factor when applying tanh softcapping on the attention scores. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE - rope_local_base_freq (float, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings for local attention. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. use_bidirectional_attention (`bool`, *optional*, defaults to `False`): If True, the model will attend to all text tokens instead of using a causal mask. This does not change behavior for vision tokens. 
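The rotary embedding classes refactored earlier in this diff pick their initialization function by looking the configured `rope_type` up in `ROPE_INIT_FUNCTIONS`, falling back to a local default helper. A minimal standalone sketch of that dispatch pattern with a toy "linear" variant (stretching positions by `factor` is equivalent to shrinking every frequency by `factor`); the registry and function names here are illustrative, not the library's:

```python
# Standalone sketch of rope-type dispatch: a registry of init functions keyed by
# `rope_type`, with "default" handled by the plain frequency computation.
from typing import Optional

import torch


def _default_rope(rope_theta: float, head_dim: int, factor: Optional[float] = None):
    # Plain RoPE frequencies; `factor` is accepted but unused for the default type
    exponents = torch.arange(0, head_dim, 2, dtype=torch.float) / head_dim
    return 1.0 / (rope_theta**exponents), 1.0  # (inv_freq, attention_scaling)


def _linear_rope(rope_theta: float, head_dim: int, factor: Optional[float] = 8.0):
    # Toy linear (position-interpolation) scaling: divide every frequency by `factor`
    inv_freq, attention_scaling = _default_rope(rope_theta, head_dim)
    return inv_freq / factor, attention_scaling


ROPE_INIT_REGISTRY = {"default": _default_rope, "linear": _linear_rope}  # illustrative registry

rope_parameters = {"rope_type": "linear", "rope_theta": 10_000.0, "factor": 8.0}
init_fn = ROPE_INIT_REGISTRY[rope_parameters["rope_type"]]
inv_freq, attention_scaling = init_fn(
    rope_parameters["rope_theta"], head_dim=8, factor=rope_parameters["factor"]
)
print(inv_freq[0].item())  # 0.125 -> the fastest channel pair now rotates 8x more slowly
```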
@@ -180,33 +148,31 @@ class Gemma3TextConfig(Gemma2Config, PreTrainedConfig): def __init__( self, - vocab_size=262_208, - hidden_size=2304, - intermediate_size=9216, - num_hidden_layers=26, - num_attention_heads=8, - num_key_value_heads=4, - head_dim=256, - hidden_activation="gelu_pytorch_tanh", - max_position_embeddings=131_072, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - bos_token_id=2, - tie_word_embeddings=True, - rope_theta=1_000_000.0, - attention_bias=False, - attention_dropout=0.0, - query_pre_attn_scalar=256, - sliding_window=4096, - layer_types=None, - final_logit_softcapping=None, - attn_logit_softcapping=None, - rope_scaling=None, - rope_local_base_freq=10_000.0, - use_bidirectional_attention=False, + vocab_size: Optional[int] = 262_208, + hidden_size: Optional[int] = 2304, + intermediate_size: Optional[int] = 9216, + num_hidden_layers: Optional[int] = 26, + num_attention_heads: Optional[int] = 8, + num_key_value_heads: Optional[int] = 4, + head_dim: Optional[int] = 256, + hidden_activation: Optional[str] = "gelu_pytorch_tanh", + max_position_embeddings: Optional[int] = 131_072, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = True, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + query_pre_attn_scalar: Optional[int] = 256, + sliding_window: Optional[int] = 4096, + layer_types: Optional[list[str]] = None, + final_logit_softcapping: Optional[float] = None, + attn_logit_softcapping: Optional[float] = None, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + use_bidirectional_attention: Optional[bool] = False, **kwargs, ): PreTrainedConfig.__init__( @@ -227,7 +193,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.hidden_activation = hidden_activation @@ -236,14 +201,15 @@ def __init__( self.final_logit_softcapping = final_logit_softcapping self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + if rope_scaling is not None: + rope_parameters = {"sliding_attention": {"rope_type": "default"}, "full_attention": rope_scaling} + self.rope_parameters = rope_parameters self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: self.sliding_window = (self.sliding_window // 2) + 1 # due to fa we set exclusive bounds - self.rope_local_base_freq = rope_local_base_freq - self.rope_scaling = rope_scaling - rope_config_validation(self) - # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6) @@ -254,6 +220,14 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = getattr(self, "rope_theta", 1_000_000.0) + rope_local_base_freq = getattr(self, "rope_local_base_freq", 10000.0) + standardize_rope_params( + self, rope_theta={"full_attention": rope_theta, "sliding_attention": 
rope_local_base_freq} + ) + rope_config_validation(self) + class Gemma3Config(PreTrainedConfig): r""" @@ -381,17 +355,90 @@ def __init__(self, dim: int, eps: float = 1e-6): class Gemma3RotaryEmbedding(Gemma2RotaryEmbedding): - def __init__(self, config: Gemma3TextConfig, device=None): - super().__init__(config) + def __init__(self, config: Gemma3TextConfig, device=None, layer_type=None): + nn.Module.__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.layer_types = list(set(config.layer_types)) + self.rope_type = {} + for layer_type in self.layer_types: + rope_params = self.config.rope_parameters[layer_type] + if rope_params is None: + continue + + self.rope_type[layer_type] = rope_params["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type[layer_type] != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type[layer_type]] + curr_inv_freq, curr_attention_scaling = rope_init_fn(self.config, device, layer_type=layer_type) + self.register_buffer(f"{layer_type}_inv_freq", curr_inv_freq, persistent=False) + setattr(self, f"{layer_type}_original_inv_freq", curr_inv_freq) + setattr(self, f"{layer_type}_attention_scaling", curr_attention_scaling) + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Gemma3TextConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + layer_type: Optional[str] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + layer_type (`str`, *optional*): + The current layer type if the model has different RoPE parameters per type. + Should not be used unless `config.layer_types is not None` + + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + # For backward compatibility standardize the `rope_parameters_dict` if it uses old format + base = config.rope_parameters[layer_type]["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids, layer_type=None): + inv_freq = getattr(self, f"{layer_type}_inv_freq") + attention_scaling = getattr(self, f"{layer_type}_attention_scaling") + + inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * attention_scaling + sin = emb.sin() * attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) # Weird way to inherit but otherwise the sliding window gets defined first and can't access `is_sliding` class Gemma3Attention(Gemma2Attention): def __init__(self, config: Gemma3TextConfig, layer_idx: int): - self.is_sliding = config.layer_types[layer_idx] == "sliding_attention" - super().__init__(config, layer_idx) - self.sliding_window = config.sliding_window if self.is_sliding else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None + self.is_sliding = self.layer_type == "sliding_attention" self.is_causal = not self.config.use_bidirectional_attention self.q_norm = Gemma3RMSNorm(dim=config.head_dim, eps=config.rms_norm_eps) @@ -400,8 +447,8 @@ def __init__(self, config: Gemma3TextConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: torch.Tensor, - attention_mask: Optional[torch.Tensor], + position_embeddings: torch.Tensor = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[FlashAttentionKwargs], @@ -462,8 +509,7 @@ def __init__(self, config: Gemma3TextConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings_global: torch.Tensor, - position_embeddings_local: torch.Tensor, + position_embeddings: torch.Tensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -476,12 +522,6 @@ def forward( hidden_states = self.input_layernorm(hidden_states) - # apply global RoPE to non-sliding layer only - if self.self_attn.is_sliding: - position_embeddings = position_embeddings_local - else: - position_embeddings = position_embeddings_global - hidden_states, self_attn_weights = self.self_attn( hidden_states=hidden_states, position_embeddings=position_embeddings, @@ -555,13 +595,6 @@ def __init__(self, config: Gemma3TextConfig): config.vocab_size, config.hidden_size, self.padding_idx, embed_scale=self.config.hidden_size**0.5 ) - # TODO: raushan fix this after RoPE refactor. For now we hack it by reassigning thetas - # when we want to create a local RoPE layer. 
Config defaults should hold values for global RoPE - config = copy.deepcopy(config) - config.rope_theta = config.rope_local_base_freq - config.rope_scaling = {"rope_type": "default"} - self.rotary_emb_local = Gemma3RotaryEmbedding(config=config) - def forward( self, input_ids: Optional[torch.LongTensor] = None, @@ -599,9 +632,7 @@ def forward( if cache_position is None: past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 cache_position = torch.arange( - past_seen_tokens, - past_seen_tokens + inputs_embeds.shape[1], - device=inputs_embeds.device, + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device ) if position_ids is None: @@ -632,10 +663,9 @@ def forward( # embed positions hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings_global = self.rotary_emb(hidden_states, position_ids) - position_embeddings_local = self.rotary_emb_local(hidden_states, position_ids) + position_embeddings = {} + for layer_type in self.config.layer_types: + position_embeddings[layer_type] = self.rotary_emb(hidden_states, position_ids, layer_type) # decoder layers all_hidden_states = () if output_hidden_states else None @@ -647,9 +677,8 @@ def forward( layer_outputs = decoder_layer( hidden_states, - position_embeddings_global=position_embeddings_global, - position_embeddings_local=position_embeddings_local, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings[decoder_layer.attention_type], position_ids=position_ids, past_key_values=past_key_values, output_attentions=output_attentions, diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index e3b36bab6128..cbc0e890d9cc 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -23,7 +23,7 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import is_timm_available, logging, requires_backends @@ -90,47 +90,10 @@ class Gemma3nTextConfig(PreTrainedConfig): End of stream token id. bos_token_id (`int`, *optional*, defaults to 2): Beginning of stream token id. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. - NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we - recommend you to update this value accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. 
- `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE - rope_local_base_freq (float, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings for local attention. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention.
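The same per-layer-type pattern used in the Gemma2 and Gemma3 attention changes above also drives sliding-window selection: each attention module records `config.layer_types[layer_idx]` and only enables a bounded window when that type is `"sliding_attention"`. A tiny standalone sketch of that selection, with toy config values rather than the real defaults:

```python
# Standalone sketch: derive per-layer sliding-window settings from `layer_types`,
# mirroring `sliding_window = config.sliding_window if layer_type == "sliding_attention" else None`.
from dataclasses import dataclass
from typing import Optional


@dataclass
class ToyConfig:  # illustrative stand-in for the model config
    sliding_window: int = 512
    layer_types: tuple = ("sliding_attention", "sliding_attention", "full_attention")


def sliding_window_for_layer(config: ToyConfig, layer_idx: int) -> Optional[int]:
    layer_type = config.layer_types[layer_idx]
    return config.sliding_window if layer_type == "sliding_attention" else None


config = ToyConfig()
print([sliding_window_for_layer(config, i) for i in range(len(config.layer_types))])
# [512, 512, None] -> only sliding-window layers get a bounded attention span
```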
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -214,9 +177,7 @@ def __init__( pad_token_id: int = 0, eos_token_id: int = 1, bos_token_id: int = 2, - rope_theta: float = 1_000_000.0, - rope_scaling: Optional[dict[str, Any]] = None, - rope_local_base_freq: float = 10_000.0, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, attention_bias: bool = False, attention_dropout: float = 0.0, sliding_window: int = 512, @@ -258,17 +219,15 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.hidden_activation = hidden_activation self.sliding_window = sliding_window self.final_logit_softcapping = final_logit_softcapping self.layer_types = layer_types - - self.rope_local_base_freq = rope_local_base_freq - self.rope_scaling = rope_scaling - rope_config_validation(self) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters if layer_types is None: self.layer_types = [ @@ -279,6 +238,14 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 1000000.0) + rope_local_base_freq = kwargs.get("rope_local_base_freq", 100000.0) + standardize_rope_params( + self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} + ) + rope_config_validation(self) + self.hidden_size_per_layer_input = hidden_size_per_layer_input self.num_kv_shared_layers = num_kv_shared_layers diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py index 7717caa34b6b..90fcbafaa2e4 100644 --- a/src/transformers/models/gemma3n/modeling_gemma3n.py +++ b/src/transformers/models/gemma3n/modeling_gemma3n.py @@ -19,7 +19,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import copy import math from collections.abc import Callable, Sequence from dataclasses import dataclass @@ -1143,42 +1142,6 @@ def scale_corrected_output(self, corrected: torch.Tensor) -> torch.Tensor: return self.forward(corrected) -class Gemma3nTextRotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: Gemma3nTextConfig, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -1268,7 +1231,7 @@ class Gemma3nTextAttention(nn.Module): def __init__(self, config: Gemma3nTextConfig, layer_idx: int): super().__init__() - self.is_sliding = config.layer_types[layer_idx] == "sliding_attention" + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) @@ -1288,7 +1251,8 @@ def __init__(self, config: Gemma3nTextConfig, layer_idx: int): self.o_proj = nn.Linear( config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias ) - self.sliding_window = config.sliding_window if self.is_sliding else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None + self.is_sliding = self.layer_type == "sliding_attention" self.q_norm = Gemma3nRMSNorm(dim=config.head_dim, eps=config.rms_norm_eps) self.k_norm = Gemma3nRMSNorm(dim=config.head_dim, eps=config.rms_norm_eps) @@ -1311,8 +1275,8 @@ def __init__(self, config: Gemma3nTextConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: torch.Tensor, - attention_mask: Optional[torch.Tensor], + position_embeddings: torch.Tensor = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[FlashAttentionKwargs], @@ -1321,7 +1285,6 @@ def forward( hidden_shape = (*input_shape, -1, self.config.head_dim) cos, sin = position_embeddings - query_states = self.q_proj(hidden_states).view(hidden_shape) query_states = self.q_norm(query_states) query_states = apply_rotary_pos_emb(query_states, cos, sin, unsqueeze_dim=2) @@ -1407,9 +1370,8 @@ def __init__(self, config: Gemma3nTextConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings_global: torch.Tensor, - position_embeddings_local: torch.Tensor, - per_layer_input: torch.Tensor, + position_embeddings: torch.Tensor = None, + per_layer_input: torch.Tensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -1424,17 +1386,11 @@ def forward( active_prediction_normed = self.input_layernorm(active_prediction) laurel_output = self.laurel(active_prediction_normed) - # apply global RoPE to non-sliding layer only - if self.self_attn.is_sliding: - position_embeddings = position_embeddings_local - else: - position_embeddings = position_embeddings_global - attn, self_attn_weights = self.self_attn( hidden_states=active_prediction_normed, - position_embeddings=position_embeddings, attention_mask=attention_mask, 
position_ids=position_ids, + position_embeddings=position_embeddings, past_key_values=past_key_values, output_attentions=output_attentions, use_cache=use_cache, @@ -1474,6 +1430,156 @@ def forward( return outputs +class Gemma3nMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_activation] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +class Gemma3nAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: Gemma3nConfig, layer_idx: int): + super().__init__() + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = config.query_pre_attn_scalar**-0.5 + self.attention_dropout = self.config.attention_dropout + self.is_causal = not getattr(config, "use_bidirectional_attention", False) + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + self.attn_logit_softcapping = self.config.attn_logit_softcapping + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_values is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + 
+ attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=self.attention_dropout if self.training else 0.0, + scaling=self.scaling, + sliding_window=self.sliding_window, + softcap=self.attn_logit_softcapping, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class Gemma3nDecoderLayer(GradientCheckpointingLayer): + def __init__(self, config: Gemma3nConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.config = config + self.attention_type = config.layer_types[layer_idx] + self.self_attn = Gemma3nAttention(config=config, layer_idx=layer_idx) + self.mlp = Gemma3nMLP(config) + self.input_layernorm = Gemma3nRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = Gemma3nRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.pre_feedforward_layernorm = Gemma3nRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_feedforward_layernorm = Gemma3nRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + **kwargs, + ) + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.pre_feedforward_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = self.post_feedforward_layernorm(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + @auto_docstring class Gemma3nPreTrainedModel(PreTrainedModel): config: Gemma3nConfig @@ -1488,8 +1594,8 @@ class Gemma3nPreTrainedModel(PreTrainedModel): _can_compile_fullgraph = True _supports_attention_backend = True _can_record_outputs = { - "hidden_states": Gemma3nTextDecoderLayer, - "attentions": Gemma3nTextAttention, + "hidden_states": Gemma3nDecoderLayer, + "attentions": Gemma3nAttention, } def _init_weights(self, module): @@ -1502,6 +1608,87 @@ def _init_weights(self, module): module.correct_output_scale.data.zero_() +class Gemma3nRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Gemma3nTextConfig, device=None, layer_type=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.layer_types = list(set(config.layer_types)) + self.rope_type = {} + for 
layer_type in self.layer_types: + rope_params = self.config.rope_parameters[layer_type] + if rope_params is None: + continue + + self.rope_type[layer_type] = rope_params["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type[layer_type] != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type[layer_type]] + curr_inv_freq, curr_attention_scaling = rope_init_fn(self.config, device, layer_type=layer_type) + self.register_buffer(f"{layer_type}_inv_freq", curr_inv_freq, persistent=False) + setattr(self, f"{layer_type}_original_inv_freq", curr_inv_freq) + setattr(self, f"{layer_type}_attention_scaling", curr_attention_scaling) + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Gemma3nTextConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + layer_type: Optional[str] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + layer_type (`str`, *optional*): + The current layer type if the model has different RoPE parameters per type. + Should not be used unless `config.layer_types is not None` + + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + # For backward compatibility standardize the `rope_parameters_dict` if it uses old format + base = config.rope_parameters[layer_type]["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids, layer_type=None): + inv_freq = getattr(self, f"{layer_type}_inv_freq") + attention_scaling = getattr(self, f"{layer_type}_attention_scaling") + + inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * attention_scaling + sin = emb.sin() * attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + @auto_docstring(custom_intro="The base Gemma 3n language model without a language modeling head.") class Gemma3nTextModel(Gemma3nPreTrainedModel): config: Gemma3nTextConfig @@ -1520,17 +1707,9 @@ def __init__(self, config: Gemma3nTextConfig): ) self.norm = Gemma3nRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = Gemma3nTextRotaryEmbedding(config=config) + self.rotary_emb = Gemma3nRotaryEmbedding(config) self.gradient_checkpointing = False - # TODO (raushan): Fix this after RoPE refactor. For now we hack it by - # reassigning thetas when we want to create a local RoPE layer. Config - # defaults should hold values for global RoPE. - config = copy.deepcopy(config) - config.rope_theta = config.rope_local_base_freq - config.rope_scaling = {"rope_type": "default"} - self.rotary_emb_local = Gemma3nTextRotaryEmbedding(config=config) - self.hidden_size = config.hidden_size self.hidden_size_per_layer_input = config.hidden_size_per_layer_input @@ -1638,10 +1817,6 @@ def forward( # embed positions hidden_states_0 = inputs_embeds - # Initialize RoPE embeddings - position_embeddings_global = self.rotary_emb(hidden_states_0, position_ids) - position_embeddings_local = self.rotary_emb_local(hidden_states_0, position_ids) - # Expand hidden_states to support per-layer inputs target_magnitude = torch.mean(hidden_states_0**2, dim=-1, keepdim=True) ** 0.5 epsilon_tensor = torch.tensor(1e-5) @@ -1657,6 +1832,9 @@ def forward( temp_hidden_states.append(current_hidden_state) hidden_states = torch.stack(temp_hidden_states, dim=0) # [num_altup_inputs, batch, seq_len, hidden_size] + position_embeddings = {} + for layer_type in self.config.layer_types: + position_embeddings[layer_type] = self.rotary_emb(hidden_states, position_ids, layer_type) # decoder layers all_hidden_states = () if output_hidden_states else None @@ -1671,8 +1849,7 @@ def forward( layer_outputs = decoder_layer( hidden_states, - position_embeddings_global, - position_embeddings_local, + position_embeddings[decoder_layer.attention_type], per_layer_input, attention_mask=causal_mask, position_ids=position_ids, diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index edf5fd4e3db4..70f6199445a4 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -13,7 +13,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
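As a quick illustration of the per-layer-type API introduced above, here is a minimal usage sketch (not part of the PR) that drives the new `Gemma3nRotaryEmbedding` directly. It assumes this branch is installed and that the class is importable from `transformers.models.gemma3n.modeling_gemma3n`; shapes and defaults are illustrative only.

```python
# Minimal sketch, assuming this PR's branch: a single rotary module serves all
# layer types, and the requested layer type selects the matching inv_freq buffer.
import torch
from transformers import Gemma3nTextConfig
from transformers.models.gemma3n.modeling_gemma3n import Gemma3nRotaryEmbedding

config = Gemma3nTextConfig()            # rope_parameters are standardized in __init__
rope = Gemma3nRotaryEmbedding(config)

x = torch.zeros(1, 8, config.hidden_size)      # only dtype/device are read from x
position_ids = torch.arange(8).unsqueeze(0)

for layer_type in set(config.layer_types):
    cos, sin = rope(x, position_ids, layer_type)   # same call pattern as in the model forward
    print(layer_type, tuple(cos.shape))
```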
-import copy import math from collections.abc import Callable, Sequence from typing import Any, Optional, Union @@ -28,7 +27,7 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -37,7 +36,6 @@ from ..gemma2.modeling_gemma2 import ( Gemma2MLP, Gemma2PreTrainedModel, - Gemma2RotaryEmbedding, eager_attention_forward, rotate_half, ) @@ -117,47 +115,10 @@ class Gemma3nTextConfig(Gemma2Config, PreTrainedConfig): End of stream token id. bos_token_id (`int`, *optional*, defaults to 2): Beginning of stream token id. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. - NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we - recommend you to update this value accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. 
Scaling factor applied to high frequency components of the RoPE - rope_local_base_freq (float, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings for local attention. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -226,9 +187,7 @@ def __init__( pad_token_id: int = 0, eos_token_id: int = 1, bos_token_id: int = 2, - rope_theta: float = 1_000_000.0, - rope_scaling: Optional[dict[str, Any]] = None, - rope_local_base_freq: float = 10_000.0, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, attention_bias: bool = False, attention_dropout: float = 0.0, sliding_window: int = 512, @@ -270,17 +229,15 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.hidden_activation = hidden_activation self.sliding_window = sliding_window self.final_logit_softcapping = final_logit_softcapping self.layer_types = layer_types - - self.rope_local_base_freq = rope_local_base_freq - self.rope_scaling = rope_scaling - rope_config_validation(self) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters if layer_types is None: self.layer_types = [ @@ -291,6 +248,14 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 1000000.0) + rope_local_base_freq = kwargs.get("rope_local_base_freq", 100000.0) + standardize_rope_params( + self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} + ) + rope_config_validation(self) + self.hidden_size_per_layer_input = hidden_size_per_layer_input self.num_kv_shared_layers = num_kv_shared_layers @@ -1702,10 +1667,6 @@ def scale_corrected_output(self, corrected: torch.Tensor) -> torch.Tensor: return self.forward(corrected) -class Gemma3nTextRotaryEmbedding(Gemma2RotaryEmbedding): - pass - - def apply_rotary_pos_emb( x: torch.Tensor, cos: torch.Tensor, @@ -1761,8 +1722,8 @@ def __init__(self, config: Gemma3nTextConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: torch.Tensor, - attention_mask: Optional[torch.Tensor], + position_embeddings: torch.Tensor = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[FlashAttentionKwargs], @@ -1771,7 +1732,6 @@ def forward( hidden_shape = (*input_shape, -1, self.config.head_dim) cos, sin = position_embeddings - query_states = self.q_proj(hidden_states).view(hidden_shape) query_states = self.q_norm(query_states) query_states = apply_rotary_pos_emb(query_states, cos, sin, unsqueeze_dim=2) @@ -1849,9 +1809,8 @@ def __init__(self, config: Gemma3nTextConfig, layer_idx: int): def forward( self, 
hidden_states: torch.Tensor, - position_embeddings_global: torch.Tensor, - position_embeddings_local: torch.Tensor, - per_layer_input: torch.Tensor, + position_embeddings: torch.Tensor = None, + per_layer_input: torch.Tensor = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -1866,17 +1825,11 @@ def forward( active_prediction_normed = self.input_layernorm(active_prediction) laurel_output = self.laurel(active_prediction_normed) - # apply global RoPE to non-sliding layer only - if self.self_attn.is_sliding: - position_embeddings = position_embeddings_local - else: - position_embeddings = position_embeddings_global - attn, self_attn_weights = self.self_attn( hidden_states=active_prediction_normed, - position_embeddings=position_embeddings, attention_mask=attention_mask, position_ids=position_ids, + position_embeddings=position_embeddings, past_key_values=past_key_values, output_attentions=output_attentions, use_cache=use_cache, @@ -1971,15 +1924,6 @@ def __init__(self, config: Gemma3nTextConfig): self.register_buffer("per_layer_projection_scale", torch.tensor(self.hidden_size**-0.5), persistent=False) self.register_buffer("per_layer_input_scale", torch.rsqrt(torch.tensor(2.0)), persistent=False) - self.rotary_emb = Gemma3nTextRotaryEmbedding(config=config) - - # TODO (raushan): Fix this after RoPE refactor. For now we hack it by - # reassigning thetas when we want to create a local RoPE layer. Config - # defaults should hold values for global RoPE. - config = copy.deepcopy(config) - config.rope_theta = config.rope_local_base_freq - config.rope_scaling = {"rope_type": "default"} - self.rotary_emb_local = Gemma3nTextRotaryEmbedding(config=config) def get_per_layer_inputs(self, input_ids: torch.LongTensor) -> torch.Tensor: return self.embed_tokens_per_layer(input_ids).reshape( @@ -2090,10 +2034,6 @@ def forward( # embed positions hidden_states_0 = inputs_embeds - # Initialize RoPE embeddings - position_embeddings_global = self.rotary_emb(hidden_states_0, position_ids) - position_embeddings_local = self.rotary_emb_local(hidden_states_0, position_ids) - # Expand hidden_states to support per-layer inputs target_magnitude = torch.mean(hidden_states_0**2, dim=-1, keepdim=True) ** 0.5 epsilon_tensor = torch.tensor(1e-5) @@ -2109,6 +2049,9 @@ def forward( temp_hidden_states.append(current_hidden_state) hidden_states = torch.stack(temp_hidden_states, dim=0) # [num_altup_inputs, batch, seq_len, hidden_size] + position_embeddings = {} + for layer_type in self.config.layer_types: + position_embeddings[layer_type] = self.rotary_emb(hidden_states, position_ids, layer_type) # decoder layers all_hidden_states = () if output_hidden_states else None @@ -2123,8 +2066,7 @@ def forward( layer_outputs = decoder_layer( hidden_states, - position_embeddings_global, - position_embeddings_local, + position_embeddings[decoder_layer.attention_type], per_layer_input, attention_mask=causal_mask, position_ids=position_ids, diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index 66a38610beb2..63685ce76729 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -14,7 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
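The forward-pass change above boils down to a small dispatch pattern: cos/sin are computed once per distinct layer type, and each decoder layer indexes the dict with its own `attention_type`, so layers sharing a type also share the tensors. A self-contained toy sketch of that pattern (names are illustrative, not library API):

```python
# Toy sketch of the per-layer-type dispatch used in the refactored forward pass.
layer_types = ["full_attention", "sliding_attention", "full_attention"]

def toy_rope(layer_type):
    # stands in for self.rotary_emb(hidden_states, position_ids, layer_type)
    return f"cos/sin computed with the '{layer_type}' frequencies"

# one entry per distinct type, shared by all layers of that type
position_embeddings = {t: toy_rope(t) for t in set(layer_types)}

for layer_idx, attention_type in enumerate(layer_types):   # one iteration per decoder layer
    print(layer_idx, "->", position_embeddings[attention_type])
```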
+from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class GlmConfig(PreTrainedConfig): @@ -63,8 +66,10 @@ class GlmConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. pad_token_id (`int`, *optional*, defaults to 151329): Padding token id. eos_token_id (`int` | `list`, *optional*, defaults to `[151329, 151336, 151338]`): @@ -101,26 +106,26 @@ class GlmConfig(PreTrainedConfig): def __init__( self, - vocab_size=151552, - hidden_size=4096, - intermediate_size=13696, - num_hidden_layers=40, - num_attention_heads=32, - num_key_value_heads=2, - partial_rotary_factor=0.5, - head_dim=128, - hidden_act="silu", - attention_dropout=0.0, - max_position_embeddings=131072, - initializer_range=0.02, - rms_norm_eps=0.00000015625, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - pad_token_id=151329, - eos_token_id=[151329, 151336, 151338], - bos_token_id=None, - attention_bias=True, + vocab_size: Optional[int] = 151552, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 13696, + num_hidden_layers: Optional[int] = 40, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 2, + partial_rotary_factor: Optional[float] = 0.5, + head_dim: Optional[int] = 128, + hidden_act: Optional[str] = "silu", + attention_dropout: Optional[float] = 0.0, + max_position_embeddings: Optional[int] = 131072, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 0.00000015625, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + pad_token_id: Optional[int] = 151329, + eos_token_id: Optional[list[int]] = [151329, 151336, 151338], + bos_token_id: Optional[int] = None, + attention_bias: Optional[bool] = True, **kwargs, ): self.vocab_size = vocab_size @@ -136,9 +141,16 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index a316640b718a..f72268465ece 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -62,6 +62,73 @@ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: return self.down_proj(up_states) +class 
GlmRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: GlmConfig, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[GlmConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: """ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, @@ -176,8 +243,8 @@ def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -238,42 +305,6 @@ def extra_repr(self): return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" -class GlmRotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: GlmConfig, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - class GlmDecoderLayer(GradientCheckpointingLayer): def __init__(self, config: GlmConfig, layer_idx: int): super().__init__() @@ -293,7 +324,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -397,16 +428,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 90730c0184a3..059cb296c972 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py 
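The `compute_default_rope_parameters` override shown above exists mainly to honour GLM's `partial_rotary_factor`: only a fraction of each head dimension carries rotary frequencies. A standalone numeric sketch of that frequency computation, mirroring the formula in the diff (values are illustrative):

```python
# Standalone numeric sketch of the partial-rotary inverse-frequency computation;
# with partial_rotary_factor=0.5 only half of each head dimension is rotated.
import torch

base = 10_000.0                  # rope_parameters["rope_theta"]
head_dim = 128
partial_rotary_factor = 0.5
dim = int(head_dim * partial_rotary_factor)        # rotated dimension: 64

inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
print(dim, inv_freq.shape)       # 64 torch.Size([32]) -> one frequency per rotated pair
```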
@@ -24,6 +24,7 @@ LlamaForCausalLM, LlamaForSequenceClassification, LlamaForTokenClassification, + LlamaRotaryEmbedding, ) from ..phi3.modeling_phi3 import Phi3MLP from .configuration_glm import GlmConfig @@ -38,6 +39,40 @@ class GlmMLP(Phi3MLP): pass +class GlmRotaryEmbedding(LlamaRotaryEmbedding): + @staticmethod + def compute_default_rope_parameters( + config: Optional[GlmConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., 0::2] diff --git a/src/transformers/models/glm4/configuration_glm4.py b/src/transformers/models/glm4/configuration_glm4.py index 10017e6d2ad9..026658fa0793 100644 --- a/src/transformers/models/glm4/configuration_glm4.py +++ b/src/transformers/models/glm4/configuration_glm4.py @@ -14,7 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Glm4Config(PreTrainedConfig): @@ -45,7 +48,8 @@ class Glm4Config(PreTrainedConfig): by meanpooling all the original heads within that group. For more details, check out [this paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `num_attention_heads`. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position. + partial_rotary_factor (`float`, *optional*, defaults to 0.5): + The factor of the partial rotary position. head_dim (`int`, *optional*, defaults to 128): The attention head dimension. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): @@ -63,8 +67,10 @@ class Glm4Config(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. 
pad_token_id (`int`, *optional*, defaults to 151329): Padding token id. eos_token_id (`int` | `list`, *optional*, defaults to `[151329, 151336, 151338]`): @@ -101,26 +107,26 @@ class Glm4Config(PreTrainedConfig): def __init__( self, - vocab_size=151552, - hidden_size=4096, - intermediate_size=13696, - num_hidden_layers=40, - num_attention_heads=32, - num_key_value_heads=2, - partial_rotary_factor=0.5, - head_dim=128, - hidden_act="silu", - attention_dropout=0.0, - max_position_embeddings=131072, - initializer_range=0.02, - rms_norm_eps=0.00000015625, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - pad_token_id=151329, - eos_token_id=[151329, 151336, 151338], - bos_token_id=None, - attention_bias=True, + vocab_size: Optional[int] = 151552, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 13696, + num_hidden_layers: Optional[int] = 40, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 2, + partial_rotary_factor: Optional[float] = 0.5, + head_dim: Optional[int] = 128, + hidden_act: Optional[str] = "silu", + attention_dropout: Optional[float] = 0.0, + max_position_embeddings: Optional[int] = 131072, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 0.00000015625, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + pad_token_id: Optional[int] = 151329, + eos_token_id: Optional[list[int]] = [151329, 151336, 151338], + bos_token_id: Optional[int] = None, + attention_bias: Optional[bool] = True, **kwargs, ): self.vocab_size = vocab_size @@ -136,9 +142,16 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/glm4/modeling_glm4.py b/src/transformers/models/glm4/modeling_glm4.py index 4baf729c0edc..935a722fd1db 100644 --- a/src/transformers/models/glm4/modeling_glm4.py +++ b/src/transformers/models/glm4/modeling_glm4.py @@ -83,7 +83,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: residual = hidden_states @@ -225,8 +225,8 @@ def __init__(self, config: Glm4Config, layer_idx: Optional[int] = None): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, 
**kwargs: Unpack[TransformersKwargs], @@ -266,46 +266,56 @@ def forward( return attn_output, attn_weights -@use_kernel_forward_from_hub("RMSNorm") -class Glm4RMSNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-6): - """ - Glm4RMSNorm is equivalent to T5LayerNorm - """ - super().__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.variance_epsilon = eps - - def forward(self, hidden_states): - input_dtype = hidden_states.dtype - hidden_states = hidden_states.to(torch.float32) - variance = hidden_states.pow(2).mean(-1, keepdim=True) - hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) - return self.weight * hidden_states.to(input_dtype) - - def extra_repr(self): - return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" - - class Glm4RotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` def __init__(self, config: Glm4Config, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Glm4Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -323,6 +333,27 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) +@use_kernel_forward_from_hub("RMSNorm") +class Glm4RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Glm4RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + @auto_docstring class Glm4PreTrainedModel(PreTrainedModel): config: Glm4Config @@ -401,16 +432,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/glm4/modular_glm4.py b/src/transformers/models/glm4/modular_glm4.py index 7b0ee26811a7..ff03376eb435 100644 --- a/src/transformers/models/glm4/modular_glm4.py +++ b/src/transformers/models/glm4/modular_glm4.py @@ -58,7 +58,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: residual = hidden_states diff --git a/src/transformers/models/glm4_moe/configuration_glm4_moe.py b/src/transformers/models/glm4_moe/configuration_glm4_moe.py index c86066daa98d..a35dec5f4e3f 100644 --- a/src/transformers/models/glm4_moe/configuration_glm4_moe.py +++ b/src/transformers/models/glm4_moe/configuration_glm4_moe.py @@ -19,8 +19,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Glm4MoeConfig(PreTrainedConfig): @@ -68,45 +70,10 @@ class Glm4MoeConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. 
- Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -172,33 +139,32 @@ class Glm4MoeConfig(PreTrainedConfig): def __init__( self, - vocab_size=151552, - hidden_size=4096, - intermediate_size=10944, - num_hidden_layers=46, - num_attention_heads=96, - partial_rotary_factor=0.5, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=131072, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - moe_intermediate_size=1408, - num_experts_per_tok=8, - n_shared_experts=1, - n_routed_experts=128, - routed_scaling_factor=1.0, - n_group=1, - topk_group=1, - first_k_dense_replace=1, - norm_topk_prob=True, - use_qk_norm=False, + vocab_size: Optional[int] = 151552, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 10944, + num_hidden_layers: Optional[int] = 46, + num_attention_heads: Optional[int] = 96, + partial_rotary_factor: Optional[float] = 0.5, + num_key_value_heads: Optional[int] = 8, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 131072, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + moe_intermediate_size: Optional[int] = 1408, + num_experts_per_tok: Optional[int] = 8, + n_shared_experts: Optional[int] = 1, + n_routed_experts: Optional[int] = 128, + routed_scaling_factor: Optional[float] = 1.0, + n_group: Optional[int] = 1, + topk_group: Optional[int] = 1, + first_k_dense_replace: Optional[int] = 1, + norm_topk_prob: Optional[bool] = True, + use_qk_norm: Optional[bool] = False, **kwargs, ): self.vocab_size = vocab_size @@ -214,14 +180,15 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) # MoE arguments diff --git a/src/transformers/models/glm4_moe/modeling_glm4_moe.py b/src/transformers/models/glm4_moe/modeling_glm4_moe.py index 8f971f126f12..109c1f99b71f 100644 --- a/src/transformers/models/glm4_moe/modeling_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modeling_glm4_moe.py @@ -42,6 +42,73 @@ from .configuration_glm4_moe import Glm4MoeConfig +class Glm4MoeRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Glm4MoeConfig, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Glm4MoeConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: """ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, @@ -135,7 +202,7 @@ def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = None): self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads self.scaling = self.head_dim**-0.5 - self.rope_scaling = config.rope_scaling + self.rope_parameters = config.rope_parameters self.attention_dropout = config.attention_dropout self.is_causal = True @@ -374,7 +441,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -423,42 +490,6 @@ def _init_weights(self, module): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) -class Glm4MoeRotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: Glm4MoeConfig, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - @auto_docstring class Glm4MoeModel(Glm4MoePreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"model\.layers\.92.*", r"model\.layers\.46.*"] @@ -520,16 +551,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/glm4_moe/modular_glm4_moe.py b/src/transformers/models/glm4_moe/modular_glm4_moe.py index 56cf3f08dd73..db1f22e58e45 100644 --- a/src/transformers/models/glm4_moe/modular_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modular_glm4_moe.py @@ -20,7 +20,7 @@ from torch import nn from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging from ..cohere.modeling_cohere import CohereAttention from ..deepseek_v3.modeling_deepseek_v3 import ( @@ -32,6 +32,7 @@ DeepseekV3RMSNorm, DeepseekV3TopkRouter, ) +from ..glm.modeling_glm import GlmRotaryEmbedding from ..gpt_neox.modeling_gpt_neox import apply_rotary_pos_emb # noqa @@ -83,45 +84,10 @@ class Glm4MoeConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. 
The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
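As a usage illustration for the `rope_parameters` argument documented above, here is a minimal sketch assuming the API introduced in this PR (not an excerpt from it); the legacy `rope_scaling` kwarg is still accepted and takes precedence, mirroring the `rope_scaling or rope_parameters` fallback added to the constructors.

```python
# Sketch only: assumes the `rope_parameters` API added in this PR.
from transformers import Glm4MoeConfig

# New style: a single dict carries the base frequency and any scaling settings
config = Glm4MoeConfig(
    rope_parameters={"rope_type": "linear", "rope_theta": 10000.0, "factor": 2.0}
)

# Legacy style: a `rope_scaling` kwarg is still picked up and standardized
# into `config.rope_parameters`
legacy = Glm4MoeConfig(rope_scaling={"rope_type": "linear", "factor": 2.0})
print(legacy.rope_parameters["rope_type"])  # expected: "linear"
```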
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -187,33 +153,32 @@ class Glm4MoeConfig(PreTrainedConfig): def __init__( self, - vocab_size=151552, - hidden_size=4096, - intermediate_size=10944, - num_hidden_layers=46, - num_attention_heads=96, - partial_rotary_factor=0.5, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=131072, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - moe_intermediate_size=1408, - num_experts_per_tok=8, - n_shared_experts=1, - n_routed_experts=128, - routed_scaling_factor=1.0, - n_group=1, - topk_group=1, - first_k_dense_replace=1, - norm_topk_prob=True, - use_qk_norm=False, + vocab_size: Optional[int] = 151552, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 10944, + num_hidden_layers: Optional[int] = 46, + num_attention_heads: Optional[int] = 96, + partial_rotary_factor: Optional[float] = 0.5, + num_key_value_heads: Optional[int] = 8, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 131072, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + moe_intermediate_size: Optional[int] = 1408, + num_experts_per_tok: Optional[int] = 8, + n_shared_experts: Optional[int] = 1, + n_routed_experts: Optional[int] = 128, + routed_scaling_factor: Optional[float] = 1.0, + n_group: Optional[int] = 1, + topk_group: Optional[int] = 1, + first_k_dense_replace: Optional[int] = 1, + norm_topk_prob: Optional[bool] = True, + use_qk_norm: Optional[bool] = False, **kwargs, ): self.vocab_size = vocab_size @@ -229,14 +194,15 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) # MoE arguments @@ -257,6 +223,10 @@ def __init__( ) +class Glm4MoeRotaryEmbedding(GlmRotaryEmbedding): + pass + + class Glm4MoeAttention(CohereAttention): def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = None): nn.Module.__init__(self) @@ -265,7 +235,7 @@ def __init__(self, config: Glm4MoeConfig, layer_idx: Optional[int] = None): self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads self.scaling = self.head_dim**-0.5 - self.rope_scaling = config.rope_scaling + self.rope_parameters = config.rope_parameters self.attention_dropout = config.attention_dropout self.is_causal = True diff --git a/src/transformers/models/glm4v/configuration_glm4v.py b/src/transformers/models/glm4v/configuration_glm4v.py index 8128065148c5..e316c14079bd 100644 --- a/src/transformers/models/glm4v/configuration_glm4v.py +++ b/src/transformers/models/glm4v/configuration_glm4v.py @@ -18,8 +18,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Glm4vVisionConfig(PreTrainedConfig): @@ -161,29 +163,12 @@ class Glm4vTextConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. 
The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. image_token_id (`int`, *optional*): Token index used as placeholder for image embeddings. video_token_id (`int`, *optional*): @@ -222,23 +207,22 @@ class Glm4vTextConfig(PreTrainedConfig): def __init__( self, - vocab_size=151552, - hidden_size=4096, - intermediate_size=13696, - num_hidden_layers=40, - num_attention_heads=32, - num_key_value_heads=2, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-05, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - attention_dropout=0.0, - rope_scaling=None, - image_token_id=None, - video_token_id=None, + vocab_size: Optional[int] = 151552, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 13696, + num_hidden_layers: Optional[int] = 40, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 2, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-05, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + image_token_id: Optional[int] = None, + video_token_id: Optional[int] = None, **kwargs, ): self.vocab_size = vocab_size @@ -257,14 +241,14 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_dropout = attention_dropout - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self, ignore_keys={"mrope_section"}) self.image_token_id = image_token_id self.video_token_id = video_token_id diff --git a/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py b/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py index 946dac551519..966ef9506027 100644 --- a/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py +++ b/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py @@ -626,8 +626,8 @@ def merge_tp_weights(model_path, output_path, vllm_config_path=None): } hf_config["vision_config"] = vision_config - if "rope_scaling" in model_config: - hf_config["rope_scaling"] = model_config["rope_scaling"] + if "rope_parameters" in model_config: + hf_config["rope_parameters"] = model_config["rope_parameters"] config_path = os.path.join(output_path, "config.json") with open(config_path, "w") as f: diff --git a/src/transformers/models/glm4v/modeling_glm4v.py b/src/transformers/models/glm4v/modeling_glm4v.py index d9506361ead6..26ab87bf0ec7 100644 --- a/src/transformers/models/glm4v/modeling_glm4v.py +++ b/src/transformers/models/glm4v/modeling_glm4v.py @@ -391,25 +391,56 @@ class Glm4vTextRotaryEmbedding(nn.Module): def __init__(self, config: Glm4vTextConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Glm4vTextConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) def forward(self, x, position_ids): - # In contrast to other models, Glm4vText has different position ids for the grids + # In contrast to other models, GLM4V different position ids for the grids # So we expand the inv_freq to shape (3, ...) inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1) position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions) @@ -506,7 +537,7 @@ def __init__(self, config: Glm4vTextConfig, layer_idx: Optional[int] = None): self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.is_causal = True self.attention_dropout = config.attention_dropout - self.rope_scaling = config.rope_scaling + self.rope_parameters = config.rope_parameters self.scaling = self.head_dim**-0.5 self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) @@ -517,9 +548,8 @@ def __init__(self, config: Glm4vTextConfig, layer_idx: Optional[int] = None): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[FlashAttentionKwargs], @@ -536,7 +566,7 @@ def forward( cos, sin = position_embeddings query_states, key_states = apply_multimodal_rotary_pos_emb( # diff with Llama - query_states, key_states, cos, sin, self.rope_scaling["mrope_section"] + query_states, key_states, cos, sin, self.rope_parameters["mrope_section"] ) if past_key_values is not None: @@ -596,7 +626,7 @@ def __init__(self, config: Glm4vTextConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -869,18 +899,16 @@ def forward( causal_mask = create_causal_mask(**mask_kwargs) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers: layer_outputs = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=causal_mask, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) hidden_states = layer_outputs diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 104f59860291..9f3d2debdc75 
100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -31,14 +31,14 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging from ...utils.generic import check_model_inputs from ...video_utils import VideoInput -from ..glm4.modeling_glm4 import Glm4MLP, Glm4RMSNorm, eager_attention_forward +from ..glm4.modeling_glm4 import Glm4MLP, Glm4RMSNorm, Glm4RotaryEmbedding, eager_attention_forward from ..qwen2_5_vl.modeling_qwen2_5_vl import ( Qwen2_5_VisionPatchEmbed, Qwen2_5_VisionRotaryEmbedding, @@ -48,7 +48,6 @@ Qwen2_5_VLModel, Qwen2_5_VLModelOutputWithPast, Qwen2_5_VLPreTrainedModel, - Qwen2_5_VLRotaryEmbedding, Qwen2_5_VLTextModel, Qwen2_5_VLVisionAttention, Qwen2_5_VLVisionBlock, @@ -201,29 +200,12 @@ class Glm4vTextConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. image_token_id (`int`, *optional*): Token index used as placeholder for image embeddings. 
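The attention layers in this model index `self.rope_parameters["mrope_section"]`, and validation runs with `ignore_keys={"mrope_section"}`, so the multimodal section split travels inside the same dict. A hedged sketch; the section sizes below are illustrative, not released defaults.

```python
# Sketch only: assumes the `rope_parameters` API from this PR; the
# `mrope_section` split below is illustrative, not a released default.
from transformers import Glm4vTextConfig

config = Glm4vTextConfig(
    rope_parameters={
        "rope_type": "default",
        "rope_theta": 10000.0,
        # temporal / height / width channel split; ignored by rope_config_validation
        "mrope_section": [16, 24, 24],
    }
)
print(config.rope_parameters["mrope_section"])
```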
video_token_id (`int`, *optional*): @@ -262,23 +244,22 @@ class Glm4vTextConfig(PreTrainedConfig): def __init__( self, - vocab_size=151552, - hidden_size=4096, - intermediate_size=13696, - num_hidden_layers=40, - num_attention_heads=32, - num_key_value_heads=2, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-05, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - attention_dropout=0.0, - rope_scaling=None, - image_token_id=None, - video_token_id=None, + vocab_size: Optional[int] = 151552, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 13696, + num_hidden_layers: Optional[int] = 40, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 2, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-05, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + image_token_id: Optional[int] = None, + video_token_id: Optional[int] = None, **kwargs, ): self.vocab_size = vocab_size @@ -297,14 +278,14 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_dropout = attention_dropout - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self, ignore_keys={"mrope_section"}) self.image_token_id = image_token_id self.video_token_id = video_token_id @@ -538,8 +519,22 @@ def __init__(self, config) -> None: self.mlp = Glm4VisionMlp(config, bias=False) -class Glm4vTextRotaryEmbedding(Qwen2_5_VLRotaryEmbedding): - pass +class Glm4vTextRotaryEmbedding(Glm4RotaryEmbedding): + # Ignore copy + def forward(self, x, position_ids): + # In contrast to other models, GLM4V different position ids for the grids + # So we expand the inv_freq to shape (3, ...) 
+ inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1) + position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions) + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) def rotate_half_llm(x): @@ -624,7 +619,7 @@ def __init__(self, config: Glm4vTextConfig, layer_idx: Optional[int] = None): self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.is_causal = True self.attention_dropout = config.attention_dropout - self.rope_scaling = config.rope_scaling + self.rope_parameters = config.rope_parameters self.scaling = self.head_dim**-0.5 self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) @@ -635,9 +630,8 @@ def __init__(self, config: Glm4vTextConfig, layer_idx: Optional[int] = None): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[FlashAttentionKwargs], @@ -654,7 +648,7 @@ def forward( cos, sin = position_embeddings query_states, key_states = apply_multimodal_rotary_pos_emb( # diff with Llama - query_states, key_states, cos, sin, self.rope_scaling["mrope_section"] + query_states, key_states, cos, sin, self.rope_parameters["mrope_section"] ) if past_key_values is not None: @@ -700,7 +694,7 @@ def __init__(self, config: Glm4vTextConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -934,18 +928,16 @@ def forward( causal_mask = create_causal_mask(**mask_kwargs) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers: layer_outputs = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=causal_mask, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) hidden_states = layer_outputs diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index 378caebb496e..dc0923801243 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -18,8 +18,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
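The `Glm4vTextRotaryEmbedding.forward` override above expands the inverse frequencies across three position-id streams before taking the cos/sin. A standalone shape walk-through of that computation, using plain tensors rather than the model classes:

```python
import torch

batch, seq_len, dim = 2, 5, 128  # dim = rotary dimensions per head

# Same inverse-frequency formula as compute_default_rope_parameters above
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))  # (dim/2,)

# Three position-id streams (temporal, height, width): shape (3, batch, seq_len)
position_ids = torch.arange(seq_len).expand(3, batch, seq_len).float()

inv_freq_expanded = inv_freq[None, None, :, None].expand(3, batch, -1, 1)  # (3, b, dim/2, 1)
position_ids_expanded = position_ids[:, :, None, :]                        # (3, b, 1, seq)

freqs = (inv_freq_expanded @ position_ids_expanded).transpose(2, 3)        # (3, b, seq, dim/2)
emb = torch.cat((freqs, freqs), dim=-1)                                    # (3, b, seq, dim)
cos, sin = emb.cos(), emb.sin()
print(cos.shape)  # torch.Size([3, 2, 5, 128])
```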
+from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Glm4vMoeVisionConfig(PreTrainedConfig): @@ -162,27 +164,10 @@ class Glm4vMoeTextConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `True`, *optional*, defaults to `True`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -244,33 +229,32 @@ class Glm4vMoeTextConfig(PreTrainedConfig): def __init__( self, - vocab_size=151424, - hidden_size=4096, - intermediate_size=10944, - num_hidden_layers=46, - num_attention_heads=96, - partial_rotary_factor=0.5, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=65536, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=True, - attention_dropout=0.0, - moe_intermediate_size=1408, - num_experts_per_tok=8, - n_shared_experts=1, - n_routed_experts=128, - routed_scaling_factor=1.0, - n_group=1, - topk_group=1, - first_k_dense_replace=1, - norm_topk_prob=True, - router_aux_loss_coef=0.0001, + vocab_size: Optional[int] = 151424, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 10944, + num_hidden_layers: Optional[int] = 46, + num_attention_heads: Optional[int] = 96, + partial_rotary_factor: Optional[float] = 0.5, + num_key_value_heads: Optional[int] = 8, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 65536, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = True, + attention_dropout: Optional[float] = 0.0, + moe_intermediate_size: Optional[int] = 1408, + num_experts_per_tok: Optional[int] = 8, + n_shared_experts: Optional[int] = 1, + n_routed_experts: Optional[int] = 128, + routed_scaling_factor: Optional[float] = 1.0, + n_group: Optional[int] = 1, + topk_group: Optional[int] = 1, + first_k_dense_replace: Optional[int] = 1, + norm_topk_prob: Optional[bool] = True, + router_aux_loss_coef: Optional[float] = 0.0001, **kwargs, ): super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -287,14 +271,15 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self, ignore_keys={"mrope_section"}) # MoE arguments diff --git a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py index 372a9a88d0bd..c7fe8e9d0145 100644 --- a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py @@ -65,30 +65,85 @@ def extra_repr(self): return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" +@dataclass +@auto_docstring( + custom_intro=""" + Base class for Llava outputs, with hidden states and attentions. 
+ """ +) +class Glm4vMoeModelOutputWithPast(ModelOutput): + r""" + past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): + The rope index difference between sequence length and multimodal rope. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + past_key_values: Optional[Cache] = None + hidden_states: Optional[tuple[torch.FloatTensor]] = None + attentions: Optional[tuple[torch.FloatTensor]] = None + rope_deltas: Optional[torch.LongTensor] = None + + class Glm4vMoeTextRotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` - def __init__(self, config: Glm4vMoeTextConfig, device=None): + def __init__(self, config: Glm4vMoeTextConfig, device=None, layer_type=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Glm4vMoeTextConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) def forward(self, x, position_ids): - # In contrast to other models, Glm4vMoeText has different position ids for the grids + # In contrast to other models, GLM4V_MOE different position ids for the grids # So we expand the inv_freq to shape (3, ...) inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1) position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions) @@ -103,30 +158,6 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -@dataclass -@auto_docstring( - custom_intro=""" - Base class for Llava outputs, with hidden states and attentions. - """ -) -class Glm4vMoeModelOutputWithPast(ModelOutput): - r""" - past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). - - Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see - `past_key_values` input) to speed up sequential decoding. - rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): - The rope index difference between sequence length and multimodal rope. - """ - - last_hidden_state: Optional[torch.FloatTensor] = None - past_key_values: Optional[Cache] = None - hidden_states: Optional[tuple[torch.FloatTensor]] = None - attentions: Optional[tuple[torch.FloatTensor]] = None - rope_deltas: Optional[torch.LongTensor] = None - - def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: """ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, @@ -248,7 +279,7 @@ def __init__(self, config: Glm4vMoeTextConfig, layer_idx: Optional[int] = None): config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias ) self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False) - self.rope_scaling = config.rope_scaling + self.rope_parameters = config.rope_parameters def forward( self, @@ -272,7 +303,7 @@ def forward( cos, sin = position_embeddings query_states, key_states = apply_multimodal_rotary_pos_emb( # diff with Llama - query_states, key_states, cos, sin, self.rope_scaling["mrope_section"] + query_states, key_states, cos, sin, self.rope_parameters["mrope_section"] ) if past_key_values is not None: @@ -468,7 +499,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index fbcc1de59611..2ae2ead4ef08 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -23,7 +23,7 @@ from ...masking_utils import create_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, 
standardize_rope_params from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, logging @@ -65,10 +65,6 @@ class Glm4vMoeRMSNorm(Glm4MoeRMSNorm): pass -class Glm4vMoeTextRotaryEmbedding(Glm4vTextRotaryEmbedding): - pass - - class Glm4vMoeTextConfig(Glm4MoeConfig): r""" This is the configuration class to store the configuration of a [`Glm4vMoeModel`]. It is used to instantiate a @@ -112,27 +108,10 @@ class Glm4vMoeTextConfig(Glm4MoeConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `True`, *optional*, defaults to `True`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
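Every constructor touched by this diff now funnels RoPE setup through the same two calls: `standardize_rope_params(self, rope_theta=...)` followed by `rope_config_validation(...)`. The snippet below is a simplified stand-in for what the standardization step is expected to do; it is an assumption for illustration, not the library implementation.

```python
# Illustrative stand-in only -- not the actual `standardize_rope_params` code.
def standardize_rope_params_sketch(config, rope_theta: float = 10000.0) -> None:
    """Normalize `config.rope_parameters` into a dict with `rope_type` and `rope_theta`."""
    params = dict(getattr(config, "rope_parameters", None) or {})
    if "type" in params:  # legacy key used by very old configs
        params.setdefault("rope_type", params.pop("type"))
    params.setdefault("rope_type", "default")
    params.setdefault("rope_theta", rope_theta)
    config.rope_parameters = params


class DummyConfig:
    def __init__(self):
        self.rope_parameters = {"type": "linear", "factor": 2.0}


cfg = DummyConfig()
standardize_rope_params_sketch(cfg, rope_theta=1_000_000.0)
print(cfg.rope_parameters)
# {'factor': 2.0, 'rope_type': 'linear', 'rope_theta': 1000000.0}
```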
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -191,33 +170,32 @@ class Glm4vMoeTextConfig(Glm4MoeConfig): def __init__( self, - vocab_size=151424, - hidden_size=4096, - intermediate_size=10944, - num_hidden_layers=46, - num_attention_heads=96, - partial_rotary_factor=0.5, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=65536, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=True, - attention_dropout=0.0, - moe_intermediate_size=1408, - num_experts_per_tok=8, - n_shared_experts=1, - n_routed_experts=128, - routed_scaling_factor=1.0, - n_group=1, - topk_group=1, - first_k_dense_replace=1, - norm_topk_prob=True, - router_aux_loss_coef=0.0001, + vocab_size: Optional[int] = 151424, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 10944, + num_hidden_layers: Optional[int] = 46, + num_attention_heads: Optional[int] = 96, + partial_rotary_factor: Optional[float] = 0.5, + num_key_value_heads: Optional[int] = 8, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 65536, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = True, + attention_dropout: Optional[float] = 0.0, + moe_intermediate_size: Optional[int] = 1408, + num_experts_per_tok: Optional[int] = 8, + n_shared_experts: Optional[int] = 1, + n_routed_experts: Optional[int] = 128, + routed_scaling_factor: Optional[float] = 1.0, + n_group: Optional[int] = 1, + topk_group: Optional[int] = 1, + first_k_dense_replace: Optional[int] = 1, + norm_topk_prob: Optional[bool] = True, + router_aux_loss_coef: Optional[float] = 0.0001, **kwargs, ): PreTrainedConfig.__init__(self, tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -234,14 +212,15 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self, ignore_keys={"mrope_section"}) # MoE arguments @@ -371,10 +350,47 @@ def apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim return q_embed, k_embed +class Glm4vMoeTextRotaryEmbedding(Glm4vTextRotaryEmbedding): + def __init__(self, config: Glm4vMoeTextConfig, device=None, layer_type=None): + super().__init__(config, device=device, layer_type=layer_type) + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Glm4vMoeTextConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + class Glm4vMoeTextAttention(Glm4Attention): def __init__(self, config: Glm4vMoeTextConfig, layer_idx: Optional[int] = None): super().__init__(config, layer_idx) - self.rope_scaling = config.rope_scaling + self.rope_parameters = config.rope_parameters def forward( self, @@ -398,7 +414,7 @@ def forward( cos, sin = position_embeddings query_states, key_states = apply_multimodal_rotary_pos_emb( # diff with Llama - query_states, key_states, cos, sin, self.rope_scaling["mrope_section"] + query_states, key_states, cos, sin, self.rope_parameters["mrope_section"] ) if past_key_values is not None: diff --git a/src/transformers/models/got_ocr2/configuration_got_ocr2.py b/src/transformers/models/got_ocr2/configuration_got_ocr2.py index 524226b099b2..35d2f5baa0dd 100644 --- a/src/transformers/models/got_ocr2/configuration_got_ocr2.py +++ b/src/transformers/models/got_ocr2/configuration_got_ocr2.py @@ -20,6 +20,8 @@ # limitations under the License. 
+from typing import Optional + from ...configuration_utils import PreTrainedConfig from ..auto import CONFIG_MAPPING, AutoConfig @@ -160,11 +162,11 @@ class GotOcr2Config(PreTrainedConfig): def __init__( self, - vision_config=None, - text_config=None, - image_token_index=151859, - image_seq_length=576, - pad_token_id=-1, + vision_config: Optional[dict] = None, + text_config: Optional[dict] = None, + image_token_index: Optional[int] = 151859, + image_seq_length: Optional[int] = 576, + pad_token_id: Optional[int] = -1, **kwargs, ): self.image_token_index = image_token_index @@ -196,7 +198,7 @@ def __init__( use_cache=True, tie_word_embeddings=True, rope_theta=1000000.0, - rope_scaling=None, + rope_parameters=None, use_sliding_window=False, sliding_window=4096, max_window_layers=21, diff --git a/src/transformers/models/got_ocr2/modular_got_ocr2.py b/src/transformers/models/got_ocr2/modular_got_ocr2.py index 9c3bce47fff0..84ffa0c22c59 100644 --- a/src/transformers/models/got_ocr2/modular_got_ocr2.py +++ b/src/transformers/models/got_ocr2/modular_got_ocr2.py @@ -181,11 +181,11 @@ class GotOcr2Config(PreTrainedConfig): def __init__( self, - vision_config=None, - text_config=None, - image_token_index=151859, - image_seq_length=576, - pad_token_id=-1, + vision_config: Optional[dict] = None, + text_config: Optional[dict] = None, + image_token_index: Optional[int] = 151859, + image_seq_length: Optional[int] = 576, + pad_token_id: Optional[int] = -1, **kwargs, ): self.image_token_index = image_token_index @@ -217,7 +217,7 @@ def __init__( use_cache=True, tie_word_embeddings=True, rope_theta=1000000.0, - rope_scaling=None, + rope_parameters=None, use_sliding_window=False, sliding_window=4096, max_window_layers=21, diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 1e1210be7117..1a2dafdf2668 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -14,8 +14,10 @@ # limitations under the License. """GPTNeoX model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -50,8 +52,6 @@ class GPTNeoXConfig(PreTrainedConfig): `"relu"`, `"selu"` and `"gelu_new"` are supported. rotary_pct (`float`, *optional*, defaults to 0.25): percentage of hidden dimensions to allocate to rotary embeddings - rotary_emb_base (`int`, *optional*, defaults to 10000) - base for computing rotary embeddings frequency attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio probability of the attention score. hidden_dropout (`float`, *optional*, defaults to 0.0): @@ -74,43 +74,10 @@ class GPTNeoXConfig(PreTrainedConfig): use_parallel_residual (`bool`, *optional*, defaults to `True`): Whether to use a "parallel" formulation in each Transformer layer, which can provide a slight training speedup at large scales (e.g. 20B). - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. 
Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `True`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
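GPTNeoX historically exposed `rotary_emb_base` rather than `rope_theta`; the constructor change in the next hunk keeps that spelling working by reading it back out of `kwargs` before standardizing. A hedged usage sketch, assuming the constructor behavior introduced here:

```python
# Sketch only: assumes the GPTNeoXConfig changes in this PR.
from transformers import GPTNeoXConfig

# New style: RoPE settings live in `rope_parameters`
config = GPTNeoXConfig(rope_parameters={"rope_type": "default", "rope_theta": 10000.0})

# Legacy style: `rotary_emb_base` is still honoured via `kwargs.get("rotary_emb_base", ...)`
legacy = GPTNeoXConfig(rotary_emb_base=20000)
print(legacy.rope_parameters["rope_theta"])  # expected: 20000 (assumption)
```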
@@ -146,27 +113,26 @@ class GPTNeoXConfig(PreTrainedConfig):
 
     def __init__(
         self,
-        vocab_size=50432,
-        hidden_size=6144,
-        num_hidden_layers=44,
-        num_attention_heads=64,
-        intermediate_size=24576,
-        hidden_act="gelu",
-        rotary_pct=0.25,
-        rotary_emb_base=10000,
-        attention_dropout=0.0,
-        hidden_dropout=0.0,
-        classifier_dropout=0.1,
-        max_position_embeddings=2048,
-        initializer_range=0.02,
-        layer_norm_eps=1e-5,
-        use_cache=True,
-        bos_token_id=0,
-        eos_token_id=2,
-        tie_word_embeddings=False,
-        use_parallel_residual=True,
-        rope_scaling=None,
-        attention_bias=True,
+        vocab_size: Optional[int] = 50432,
+        hidden_size: Optional[int] = 6144,
+        num_hidden_layers: Optional[int] = 44,
+        num_attention_heads: Optional[int] = 64,
+        intermediate_size: Optional[int] = 24576,
+        hidden_act: Optional[str] = "gelu",
+        rotary_pct: Optional[float] = 0.25,
+        attention_dropout: Optional[float] = 0.0,
+        hidden_dropout: Optional[float] = 0.0,
+        classifier_dropout: Optional[float] = 0.1,
+        max_position_embeddings: Optional[int] = 2048,
+        initializer_range: Optional[float] = 0.02,
+        layer_norm_eps: Optional[float] = 1e-5,
+        use_cache: Optional[bool] = True,
+        bos_token_id: Optional[int] = 0,
+        eos_token_id: Optional[int] = 2,
+        tie_word_embeddings: Optional[bool] = False,
+        use_parallel_residual: Optional[bool] = True,
+        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        attention_bias: Optional[bool] = True,
         **kwargs,
     ):
         super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -179,8 +145,6 @@ def __init__(
         self.hidden_act = hidden_act
         self.rotary_pct = rotary_pct
         self.partial_rotary_factor = rotary_pct
-        self.rotary_emb_base = rotary_emb_base
-        self.rope_theta = rotary_emb_base
         self.attention_dropout = attention_dropout
         self.hidden_dropout = hidden_dropout
         self.classifier_dropout = classifier_dropout
@@ -189,14 +153,15 @@ def __init__(
         self.use_cache = use_cache
         self.tie_word_embeddings = tie_word_embeddings
         self.use_parallel_residual = use_parallel_residual
-        self.rope_scaling = rope_scaling
         self.attention_bias = attention_bias
+        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
+        rope_scaling = kwargs.pop("rope_scaling", None)
+        self.rope_parameters = rope_scaling or rope_parameters
+
         # Validate the correctness of rotary position embeddings parameters
-        # BC: if there is a 'type' field, move it to 'rope_type'.
-        if self.rope_scaling is not None and "type" in self.rope_scaling:
-            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_theta = kwargs.get("rotary_emb_base", 10000.0)
+        standardize_rope_params(self, rope_theta=rope_theta)
         rope_config_validation(self)
-
         if self.hidden_size % self.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size is not divisible by the number of attention heads! Make sure to update them!"
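The `compute_default_rope_parameters` helpers repeated across the rotary-embedding classes in this diff (including the GPTNeoX one that follows) reduce to the same arithmetic: take `partial_rotary_factor` of the head dimension and derive inverse frequencies from `rope_theta`. A standalone recomputation with GPTNeoX-sized numbers:

```python
import torch

hidden_size, num_attention_heads = 6144, 64
rotary_pct = 0.25        # stored as `partial_rotary_factor` on the config
rope_theta = 10000.0     # `rope_parameters["rope_theta"]`

head_dim = hidden_size // num_attention_heads  # 96
dim = int(head_dim * rotary_pct)               # 24 rotary dimensions per head

# Same formula as compute_default_rope_parameters
inv_freq = 1.0 / (rope_theta ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
print(dim, inv_freq.shape)  # 24 torch.Size([12])
```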
diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index fe9ebd84114e..719ec08ce3e6 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -49,6 +49,73 @@ def forward(self, hidden_states): return hidden_states +class GPTNeoXRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: GPTNeoXConfig, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[GPTNeoXConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -140,9 +207,8 @@ def forward( hidden_states: torch.FloatTensor, attention_mask: torch.FloatTensor, layer_past: Optional[Cache] = None, - output_attentions: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ): input_shape = hidden_states.shape[:-1] @@ -207,7 +273,7 @@ def forward( layer_past: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ): attn_output, attn_weights = self.attention( @@ -216,7 +282,6 @@ def forward( position_ids=position_ids, layer_past=layer_past, use_cache=use_cache, - output_attentions=output_attentions, cache_position=cache_position, position_embeddings=position_embeddings, **kwargs, @@ -245,42 +310,6 @@ def forward( return outputs -class GPTNeoXRotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: GPTNeoXConfig, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - @use_kernel_forward_from_hub("RMSNorm") class GPTNeoXRMSNorm(nn.Module): def __init__(self, hidden_size, eps=1e-6): @@ -321,7 +350,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -439,9 +468,7 @@ def forward( ) hidden_states = self.emb_dropout(inputs_embeds) - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) all_attentions = () if output_attentions else None all_hidden_states = () if output_hidden_states else None @@ -455,9 +482,9 @@ def forward( position_ids=position_ids, layer_past=past_key_values, use_cache=use_cache, + position_embeddings=position_embeddings, output_attentions=output_attentions, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) hidden_states = outputs[0] diff --git a/src/transformers/models/gpt_neox/modular_gpt_neox.py b/src/transformers/models/gpt_neox/modular_gpt_neox.py index 6442dab84b69..dfd877825363 100644 --- a/src/transformers/models/gpt_neox/modular_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modular_gpt_neox.py @@ -21,6 +21,7 @@ from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging from ..llama.modeling_llama import LlamaModel, LlamaPreTrainedModel, LlamaRotaryEmbedding, rotate_half +from .configuration_gpt_neox import GPTNeoXConfig logger = logging.get_logger(__name__) @@ -40,6 +41,40 @@ def forward(self, hidden_states): return hidden_states +class GPTNeoXRotaryEmbedding(LlamaRotaryEmbedding): + @staticmethod + def compute_default_rope_parameters( + config: Optional[GPTNeoXConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. @@ -124,9 +159,8 @@ def forward( hidden_states: torch.FloatTensor, attention_mask: torch.FloatTensor, layer_past: Optional[Cache] = None, - output_attentions: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ): input_shape = hidden_states.shape[:-1] @@ -191,7 +225,7 @@ def forward( layer_past: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ): attn_output, attn_weights = self.attention( @@ -200,7 +234,6 @@ def forward( position_ids=position_ids, layer_past=layer_past, use_cache=use_cache, - output_attentions=output_attentions, cache_position=cache_position, position_embeddings=position_embeddings, **kwargs, @@ -229,10 +262,6 @@ def forward( return outputs -class GPTNeoXRotaryEmbedding(LlamaRotaryEmbedding): - pass - - class GPTNeoXPreTrainedModel(LlamaPreTrainedModel): base_model_prefix = "gpt_neox" _no_split_modules = ["GPTNeoXLayer"] @@ -318,9 +347,7 @@ def forward( ) hidden_states = self.emb_dropout(inputs_embeds) - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) all_attentions = () if output_attentions else None all_hidden_states = () if output_hidden_states else None @@ -334,9 +361,9 @@ def forward( position_ids=position_ids, layer_past=past_key_values, use_cache=use_cache, + position_embeddings=position_embeddings, output_attentions=output_attentions, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) hidden_states = outputs[0] diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py index b8343b73510d..f09bc8810da0 100644 --- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -14,8 +14,10 @@ # limitations under the License. 
"""GPTNeoX Japanese model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -49,8 +51,6 @@ class GPTNeoXJapaneseConfig(PreTrainedConfig): The non-linear activation function (function or string) in the encoder and pooler. rotary_pct (`float`, *optional*, defaults to 1.00): percentage of hidden dimensions to allocate to rotary embeddings - rotary_emb_base (`int`, *optional*, defaults to 10000) - base for computing rotary embeddings frequency max_position_embeddings (`int`, *optional*, defaults to 2048): The maximum sequence length that this model might ever be used with. initializer_range (`float`, *optional*, defaults to 0.02): @@ -60,43 +60,10 @@ class GPTNeoXJapaneseConfig(PreTrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. 
Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and, optionally, scaling parameters if you want to use RoPE + with longer `max_position_embeddings`. attention_dropout (`float`, *optional*, defaults to 0.1): The dropout ratio for the attention. hidden_dropout (`float`, *optional*, defaults to 0.0): @@ -120,23 +87,22 @@ class GPTNeoXJapaneseConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=2560, - num_hidden_layers=32, - num_attention_heads=32, - intermediate_multiple_size=4, - hidden_act="gelu", - rotary_pct=1.00, - rotary_emb_base=10000, - max_position_embeddings=2048, - initializer_range=0.02, - layer_norm_eps=1e-5, - use_cache=True, - bos_token_id=31996, - eos_token_id=31999, - rope_scaling=None, - attention_dropout=0.1, - hidden_dropout=0.0, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 2560, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + intermediate_multiple_size: Optional[int] = 4, + hidden_act: Optional[str] = "gelu", + rotary_pct: Optional[float] = 1.00, + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + layer_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + bos_token_id: Optional[int] = 31996, + eos_token_id: Optional[int] = 31999, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_dropout: Optional[float] = 0.1, + hidden_dropout: Optional[float] = 0.0, **kwargs, ): super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) @@ -149,18 +115,21 @@ def __init__( self.hidden_act = hidden_act self.rotary_pct = rotary_pct self.partial_rotary_factor = rotary_pct - self.rotary_emb_base = rotary_emb_base - self.rope_theta = rotary_emb_base self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.attention_dropout = attention_dropout self.hidden_dropout = hidden_dropout # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rotary_emb_base", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py index c87da821c958..5120929f9b4b 100755 --- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -15,6 +15,7 @@ """PyTorch GPTNeoX model.""" import math +from collections.abc import Callable from typing import Optional, Union import torch @@ -67,6 +68,107 @@ def _init_weights(self, module): module.dense_bias.data.zero_() +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->GPTNeoXJapanese +class GPTNeoXJapaneseRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: GPTNeoXJapaneseConfig, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[GPTNeoXJapaneseConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + class GPTNeoXJapaneseAttention(nn.Module): def __init__(self, config, use_bias=False, layer_idx=None): super().__init__() @@ -82,8 +184,6 @@ def __init__(self, config, use_bias=False, layer_idx=None): self.layer_idx = layer_idx self.rotary_ndims = int(self.head_size * config.rotary_pct) - self.rope_theta = config.rotary_emb_base - self.rotary_emb = GPTNeoXJapaneseRotaryEmbedding(config=config) self.attention_dropout = nn.Dropout(config.attention_dropout) self.norm_factor = math.sqrt(self.head_size) @@ -102,7 +202,7 @@ def forward( use_cache: Optional[bool] = False, output_attentions: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, ): # Compute QKV # Attention heads [batch, seq_len, hidden_size] @@ -212,78 +312,6 @@ def _attn(self, query, key, value, attention_mask=None): return attn_output, attn_weights -# Copied from transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXRotaryEmbedding with GPTNeoX->GPTNeoXJapanese -class GPTNeoXJapaneseRotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: GPTNeoXJapaneseConfig, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2 :] - return torch.cat((-x2, x1), dim=-1) - - -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb -def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): - """Applies Rotary Position Embedding to the query and key tensors. - - Args: - q (`torch.Tensor`): The query tensor. - k (`torch.Tensor`): The key tensor. - cos (`torch.Tensor`): The cosine part of the rotary embedding. 
- sin (`torch.Tensor`): The sine part of the rotary embedding. - position_ids (`torch.Tensor`, *optional*): - Deprecated and unused. - unsqueeze_dim (`int`, *optional*, defaults to 1): - The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and - sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note - that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and - k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes - cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have - the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. - Returns: - `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. - """ - cos = cos.unsqueeze(unsqueeze_dim) - sin = sin.unsqueeze(unsqueeze_dim) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - def bias_dropout_add(x: Tensor, bias: Tensor, residual: Optional[Tensor], prob: float, training: bool) -> Tensor: """add bias to x, apply dropout and residual connection @@ -343,7 +371,7 @@ def forward( layer_past: Optional[Cache] = None, output_attentions: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, ): residual = hidden_states ln_out = self.input_layernorm(hidden_states) @@ -457,9 +485,7 @@ def forward( ) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) all_attentions = () if output_attentions else None all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/gpt_oss/configuration_gpt_oss.py b/src/transformers/models/gpt_oss/configuration_gpt_oss.py index 65e6606a6be9..d7e714079e39 100644 --- a/src/transformers/models/gpt_oss/configuration_gpt_oss.py +++ b/src/transformers/models/gpt_oss/configuration_gpt_oss.py @@ -14,8 +14,10 @@ # limitations under the License. 
"""openai model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class GptOssConfig(PreTrainedConfig): @@ -47,22 +49,21 @@ class GptOssConfig(PreTrainedConfig): def __init__( self, - num_hidden_layers: int = 36, - num_local_experts: int = 128, - vocab_size: int = 201088, - hidden_size: int = 2880, - intermediate_size: int = 2880, - head_dim: int = 64, - num_attention_heads: int = 64, - num_key_value_heads: int = 8, - sliding_window: int = 128, - rope_theta: float = 150000.0, - tie_word_embeddings=False, - hidden_act: str = "silu", - initializer_range: float = 0.02, - max_position_embeddings=131072, - rms_norm_eps: float = 1e-5, - rope_scaling={ + num_hidden_layers: Optional[int] = 36, + num_local_experts: Optional[int] = 128, + vocab_size: Optional[int] = 201088, + hidden_size: Optional[int] = 2880, + intermediate_size: Optional[int] = 2880, + head_dim: Optional[int] = 64, + num_attention_heads: Optional[int] = 64, + num_key_value_heads: Optional[int] = 8, + sliding_window: Optional[int] = 128, + tie_word_embeddings: Optional[bool] = False, + hidden_act: Optional[str] = "silu", + initializer_range: Optional[float] = 0.02, + max_position_embeddings: Optional[int] = 131072, + rms_norm_eps: Optional[float] = 1e-5, + rope_parameters: Optional[RopeParameters] = { "rope_type": "yarn", "factor": 32.0, "beta_fast": 32.0, @@ -70,12 +71,12 @@ def __init__( "truncate": False, "original_max_position_embeddings": 4096, }, - attention_dropout: float = 0.0, - num_experts_per_tok=4, - router_aux_loss_coef: float = 0.9, - output_router_logits=False, - use_cache=True, - layer_types=None, + attention_dropout: Optional[float] = 0.0, + num_experts_per_tok: Optional[int] = 4, + router_aux_loss_coef: Optional[float] = 0.9, + output_router_logits: Optional[bool] = False, + use_cache: Optional[bool] = True, + layer_types: Optional[list[str]] = None, **kwargs, ): self.vocab_size = vocab_size @@ -94,8 +95,6 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_dropout = attention_dropout self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.layer_types = layer_types @@ -110,11 +109,13 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.output_router_logits = output_router_logits self.use_cache = use_cache + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 150000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py b/src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py index 736a95247dfb..a5f3cce78a40 100644 --- a/src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py +++ b/src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py @@ -157,10 +157,10 @@ def write_model( original_config = json.loads((Path(input_base_path) / "config.json").read_text()) num_local_experts = original_config.pop("num_experts") - rope_scaling = { + rope_parameters = { "beta_fast": float(original_config.pop("rope_ntk_beta")), "beta_slow": float(original_config.pop("rope_ntk_alpha")), "factor": float(original_config.pop("rope_scaling_factor")), "rope_type": "yarn", "truncate": False, "original_max_position_embeddings": 4096, @@ -168,7 +168,7 @@ config = GptOssConfig( num_local_experts=num_local_experts, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **original_config, diff --git a/src/transformers/models/gpt_oss/modeling_gpt_oss.py b/src/transformers/models/gpt_oss/modeling_gpt_oss.py index 06c7863d4dd3..92688a0ab341 100644 --- a/src/transformers/models/gpt_oss/modeling_gpt_oss.py +++ b/src/transformers/models/gpt_oss/modeling_gpt_oss.py @@ -176,20 +176,49 @@ class GptOssRotaryEmbedding(nn.Module): def __init__(self, config: GptOssConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[GptOssConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -275,6 +304,7 @@ class GptOssAttention(nn.Module): def __init__(self, config: GptOssConfig, layer_idx: int): super().__init__() + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) @@ -294,7 +324,7 @@ def __init__(self, config: GptOssConfig, layer_idx: int): self.o_proj = nn.Linear( config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias ) - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None self.sinks = nn.Parameter(torch.empty(config.num_attention_heads)) def forward( @@ -304,6 +334,7 @@ def forward( attention_mask: Optional[torch.Tensor], past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor]: input_shape = hidden_states.shape[:-1] @@ -333,6 +364,7 @@ def forward( dropout=0.0 if not self.training else self.attention_dropout, scaling=self.scaling, sliding_window=self.sliding_window, + position_ids=position_ids, s_aux=self.sinks, # diff with Llama **kwargs, ) @@ -360,7 +392,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -505,11 +537,11 @@ def forward( hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) hidden_states = self.norm(hidden_states) diff --git a/src/transformers/models/gpt_oss/modular_gpt_oss.py b/src/transformers/models/gpt_oss/modular_gpt_oss.py index eaa4ea1af48b..11e21245f41e 100644 --- a/src/transformers/models/gpt_oss/modular_gpt_oss.py +++ b/src/transformers/models/gpt_oss/modular_gpt_oss.py @@ -38,7 +38,6 @@ LlamaDecoderLayer, LlamaPreTrainedModel, LlamaRMSNorm, - LlamaRotaryEmbedding, repeat_kv, ) from ..mixtral.modeling_mixtral import ( @@ -47,7 +46,7 @@ MixtralForTokenClassification, MixtralModel, ) -from ..qwen2.modeling_qwen2 import Qwen2Attention +from ..qwen2.modeling_qwen2 import Qwen2Attention, Qwen2RotaryEmbedding from .configuration_gpt_oss import GptOssConfig @@ -170,7 +169,9 @@ def forward(self, hidden_states): return routed_out, router_scores -class GptOssRotaryEmbedding(LlamaRotaryEmbedding): +class 
GptOssRotaryEmbedding(Qwen2RotaryEmbedding): + pass + @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) def forward(self, x, position_ids): @@ -262,6 +263,7 @@ def forward( attention_mask: Optional[torch.Tensor], past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, torch.Tensor]: input_shape = hidden_states.shape[:-1] @@ -291,6 +293,7 @@ dropout=0.0 if not self.training else self.attention_dropout, scaling=self.scaling, sliding_window=self.sliding_window, + position_ids=position_ids, s_aux=self.sinks, # diff with Llama **kwargs, ) @@ -318,7 +321,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -435,11 +438,11 @@ hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) hidden_states = self.norm(hidden_states) diff --git a/src/transformers/models/granite/configuration_granite.py b/src/transformers/models/granite/configuration_granite.py index 04c797ecbfd9..65c04c3a67e1 100644 --- a/src/transformers/models/granite/configuration_granite.py +++ b/src/transformers/models/granite/configuration_granite.py @@ -19,8 +19,10 @@ # limitations under the License. """Granite model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -76,16 +78,10 @@ class GraniteConfig(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and, optionally, scaling parameters if you want to use RoPE + with longer `max_position_embeddings`. 
attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -130,30 +126,29 @@ class GraniteConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - embedding_multiplier=1.0, - logits_scaling=1.0, - residual_multiplier=1.0, - attention_multiplier=1.0, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + embedding_multiplier: Optional[float] = 1.0, + logits_scaling: Optional[float] = 1.0, + residual_multiplier: Optional[float] = 1.0, + attention_multiplier: Optional[float] = 1.0, **kwargs, ): self.vocab_size = vocab_size @@ -172,8 +167,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias @@ -182,6 +175,14 @@ def __init__( self.logits_scaling = logits_scaling self.residual_multiplier = residual_multiplier self.attention_multiplier = attention_multiplier + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py index 998c33736563..bf64a382700b 100644 --- a/src/transformers/models/granite/modeling_granite.py +++ b/src/transformers/models/granite/modeling_granite.py @@ -144,8 +144,8 @@ def __init__(self, config: GraniteConfig, layer_idx: Optional[int] = None): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: 
Unpack[TransformersKwargs], @@ -242,7 +242,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -323,20 +323,49 @@ class GraniteRotaryEmbedding(nn.Module): def __init__(self, config: GraniteConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[GraniteConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -430,9 +459,7 @@ def forward( ) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/granite/modular_granite.py b/src/transformers/models/granite/modular_granite.py index 9b2203cbff0e..1be9e3600679 100644 --- a/src/transformers/models/granite/modular_granite.py +++ b/src/transformers/models/granite/modular_granite.py @@ -59,7 +59,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -183,9 +183,7 @@ def forward( ) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/granitemoe/configuration_granitemoe.py b/src/transformers/models/granitemoe/configuration_granitemoe.py index beb4e7d7b0ae..f1263f080630 100644 --- a/src/transformers/models/granitemoe/configuration_granitemoe.py +++ b/src/transformers/models/granitemoe/configuration_granitemoe.py @@ -19,8 +19,10 @@ # limitations under the License. """GraniteMoe model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -76,16 +78,10 @@ class GraniteMoeConfig(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and, optionally, scaling parameters if you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -119,33 +115,32 @@ class GraniteMoeConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - embedding_multiplier=1.0, - logits_scaling=1.0, - residual_multiplier=1.0, - attention_multiplier=1.0, - num_local_experts=8, - num_experts_per_tok=2, - output_router_logits=False, - router_aux_loss_coef=0.001, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + embedding_multiplier: Optional[float] = 1.0, + logits_scaling: Optional[float] = 1.0, + residual_multiplier: Optional[float] = 1.0, + attention_multiplier: Optional[float] = 1.0, + num_local_experts: Optional[int] = 8, + num_experts_per_tok: Optional[int] = 2, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, **kwargs, ): self.vocab_size = vocab_size @@ -164,8 +159,14 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) self.attention_bias = attention_bias self.attention_dropout = attention_dropout diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index 7a26687dc539..0eefadc9a1b9 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -67,20 +67,49 @@ class GraniteMoeRotaryEmbedding(nn.Module): def __init__(self, config: GraniteMoeConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, 
self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[GraniteMoeConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -336,8 +365,8 @@ def __init__(self, config: GraniteMoeConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -394,7 +423,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs, ) -> torch.Tensor: residual = hidden_states diff --git a/src/transformers/models/granitemoe/modular_granitemoe.py b/src/transformers/models/granitemoe/modular_granitemoe.py index 94a88380590b..3c5b73ebf899 100644 --- a/src/transformers/models/granitemoe/modular_granitemoe.py +++ b/src/transformers/models/granitemoe/modular_granitemoe.py @@ -114,7 +114,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs, ) -> torch.Tensor: residual = hidden_states diff --git a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py index 
eed40cd6f012..55e1546fa435 100644 --- a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py @@ -15,7 +15,10 @@ # limitations under the License. """GraniteMoeHybrid model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -70,16 +73,10 @@ class GraniteMoeHybridConfig(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -135,44 +132,43 @@ class GraniteMoeHybridConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - embedding_multiplier=1.0, - logits_scaling=1.0, - residual_multiplier=1.0, - attention_multiplier=1.0, - num_local_experts=8, - num_experts_per_tok=2, - output_router_logits=False, - router_aux_loss_coef=0.001, - shared_intermediate_size=1024, - layer_types=None, - mamba_n_heads=128, - mamba_n_groups=1, - mamba_d_state=256, - mamba_d_head="auto", - mamba_d_conv=4, - mamba_expand=2, - mamba_chunk_size=256, - mamba_conv_bias=True, - mamba_proj_bias=False, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + embedding_multiplier: Optional[float] = 1.0, + logits_scaling: Optional[float] = 1.0, + residual_multiplier: Optional[float] = 1.0, + attention_multiplier: Optional[float] = 1.0, + num_local_experts: Optional[int] = 8, + num_experts_per_tok: Optional[int] = 2, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, + shared_intermediate_size: Optional[int] = 1024, + layer_types: Optional[list[str]] = None, + mamba_n_heads: Optional[int] = 128, + mamba_n_groups: Optional[int] = 1, + mamba_d_state: Optional[int] = 256, + mamba_d_head: Optional[str] = "auto", + mamba_d_conv: Optional[int] = 4, + mamba_expand: Optional[int] = 2, + mamba_chunk_size: Optional[int] = 256, + mamba_conv_bias: Optional[bool] = True, + mamba_proj_bias: Optional[bool] = False, **kwargs, ): self.vocab_size = vocab_size @@ -191,8 +187,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.embedding_multiplier = embedding_multiplier self.logits_scaling = logits_scaling @@ -204,6 +198,14 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.shared_intermediate_size = shared_intermediate_size + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + 
rope_config_validation(self) mamba_intermediate = mamba_expand * hidden_size diff --git a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py index 7f7667c9df78..0b296f26ac22 100644 --- a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py @@ -860,6 +860,71 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states +class GraniteMoeHybridRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: GraniteMoeHybridConfig, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[GraniteMoeHybridConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + class GraniteFlashAttentionKwargs(TypedDict, total=False): """ Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage. 
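The `compute_default_rope_parameters` staticmethod and the rotary `forward` added above implement the standard RoPE recipe: build inverse frequencies from `rope_theta`, take the outer product with the position ids, duplicate the angles across the head dimension, and scale the resulting cos/sin by `attention_scaling`. The snippet below is a minimal, self-contained sketch of that math in plain `torch`; the `base`, `dim`, and `seq_len` values are toy numbers chosen for illustration, not taken from any Granite checkpoint.

```python
import torch

# Illustrative values only; the module reads these from
# config.rope_parameters["rope_theta"] and the model's head dimension.
base = 10000.0
dim = 8
seq_len = 4

# Inverse frequencies: 1 / base^(2i / dim) for i = 0 .. dim/2 - 1
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))

# Angle for every (position, frequency) pair; the model code computes the same
# product with a matmul inside a float32 autocast block.
position_ids = torch.arange(seq_len, dtype=torch.float32)[None, :]   # (1, seq_len)
freqs = position_ids[:, :, None] * inv_freq[None, None, :]           # (1, seq_len, dim/2)
emb = torch.cat((freqs, freqs), dim=-1)                               # (1, seq_len, dim)

attention_scaling = 1.0  # unused for the "default" rope type
cos, sin = emb.cos() * attention_scaling, emb.sin() * attention_scaling
print(cos.shape, sin.shape)  # torch.Size([1, 4, 8]) torch.Size([1, 4, 8])
```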
@@ -1078,6 +1143,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[GraniteFlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: residual = hidden_states @@ -1098,6 +1164,7 @@ def forward( past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) @@ -1146,42 +1213,6 @@ def _init_weights(self, module): module.weight.data.fill_(1.0) -class GraniteMoeHybridRotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: GraniteMoeHybridConfig, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - @auto_docstring class GraniteMoeHybridModel(GraniteMoeHybridPreTrainedModel): def __init__(self, config: GraniteMoeHybridConfig): @@ -1207,6 +1238,7 @@ def forward( self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, inputs_embeds: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = None, @@ -1227,6 +1259,9 @@ def forward( past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + causal_mask = create_causal_mask( self.config, inputs_embeds, @@ -1238,6 +1273,8 @@ def forward( # embed positions hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + for decoder_layer in self.layers: # Depending on the layer type we opt for 2D base attention mask (Mamba) or 4D causal mask (Attention) layer_mask = mamba_mask if decoder_layer.layer_type == "mamba" else causal_mask @@ -1248,6 +1285,7 @@ def forward( past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) hidden_states = self.norm(hidden_states) 
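The configuration changes in this patch repeat the same backward-compatibility pattern shown in the Granite hunks above: a legacy `rope_scaling` kwarg is popped and reused as `rope_parameters`, `standardize_rope_params` folds the (possibly legacy) `rope_theta` into that dict, and `rope_config_validation` checks the result. Below is a rough usage sketch assuming the constructor behaviour introduced by this diff; the exact contents of the resulting dict depend on `standardize_rope_params`.

```python
from transformers import GraniteMoeConfig

# New-style construction: RoPE settings live in a single `rope_parameters` dict.
config_new = GraniteMoeConfig(
    rope_parameters={"rope_type": "linear", "rope_theta": 10000.0, "factor": 2.0},
)

# Legacy-style construction: older checkpoints still pass `rope_theta` and
# `rope_scaling`; the constructor pops `rope_scaling` and standardizes it.
config_legacy = GraniteMoeConfig(
    rope_theta=10000.0,
    rope_scaling={"rope_type": "linear", "factor": 2.0},
)

# Both paths end up with a validated `rope_parameters` dict on the config.
print(config_new.rope_parameters)
print(config_legacy.rope_parameters)
```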
diff --git a/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py index 9abf7dc30829..f1b8a5bfb110 100644 --- a/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py @@ -28,6 +28,7 @@ from ...utils.generic import check_model_inputs from ..bamba.configuration_bamba import BambaConfig from ..bamba.modeling_bamba import BambaMixer, BambaRMSNormGated, HybridMambaAttentionDynamicCache +from ..gemma2.modeling_gemma2 import Gemma2RotaryEmbedding from ..granitemoeshared.modeling_granitemoeshared import ( GraniteFlashAttentionKwargs, GraniteMoeSharedAttention, @@ -102,6 +103,10 @@ def __init__(self, config: GraniteMoeHybridConfig): super().__init__(config) +class GraniteMoeHybridRotaryEmbedding(Gemma2RotaryEmbedding): + pass + + class GraniteMoeHybridDecoderLayer(GraniteMoeSharedDecoderLayer): def __init__(self, config: GraniteMoeHybridConfig, layer_idx: int): super().__init__(config, layer_idx) @@ -127,6 +132,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[GraniteFlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: residual = hidden_states @@ -147,6 +153,7 @@ def forward( past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) @@ -193,6 +200,7 @@ def forward( self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, inputs_embeds: Optional[torch.FloatTensor] = None, use_cache: Optional[bool] = None, @@ -213,6 +221,9 @@ def forward( past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + causal_mask = create_causal_mask( self.config, inputs_embeds, @@ -224,6 +235,8 @@ def forward( # embed positions hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids) + for decoder_layer in self.layers: # Depending on the layer type we opt for 2D base attention mask (Mamba) or 4D causal mask (Attention) layer_mask = mamba_mask if decoder_layer.layer_type == "mamba" else causal_mask @@ -234,6 +247,7 @@ def forward( past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) hidden_states = self.norm(hidden_states) diff --git a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py index e75688a6f45e..00f87604bf51 100644 --- a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py @@ -19,8 +19,10 @@ # limitations under the License. 
"""GraniteMoeShared model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -76,16 +78,10 @@ class GraniteMoeSharedConfig(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -121,34 +117,33 @@ class GraniteMoeSharedConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - embedding_multiplier=1.0, - logits_scaling=1.0, - residual_multiplier=1.0, - attention_multiplier=1.0, - num_local_experts=8, - num_experts_per_tok=2, - output_router_logits=False, - router_aux_loss_coef=0.001, - shared_intermediate_size=0, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + embedding_multiplier: Optional[float] = 1.0, + logits_scaling: Optional[float] = 1.0, + residual_multiplier: Optional[float] = 1.0, + attention_multiplier: Optional[float] = 1.0, + num_local_experts: Optional[int] = 8, + 
num_experts_per_tok: Optional[int] = 2, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, + shared_intermediate_size: Optional[int] = 0, **kwargs, ): self.vocab_size = vocab_size @@ -167,10 +162,16 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling # this model has rope embedding type, hardcoded for BC self.position_embedding_type = "rope" + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) self.attention_bias = attention_bias self.attention_dropout = attention_dropout diff --git a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py index ddebc8af8862..8b1569722006 100644 --- a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py @@ -355,8 +355,8 @@ def __init__(self, config: GraniteMoeSharedConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -479,20 +479,49 @@ class GraniteMoeSharedRotaryEmbedding(nn.Module): def __init__(self, config: GraniteMoeSharedConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[GraniteMoeSharedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. 
+ Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) diff --git a/src/transformers/models/helium/configuration_helium.py b/src/transformers/models/helium/configuration_helium.py index a76a553e1425..db7ccaf185ae 100644 --- a/src/transformers/models/helium/configuration_helium.py +++ b/src/transformers/models/helium/configuration_helium.py @@ -14,7 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class HeliumConfig(PreTrainedConfig): @@ -63,8 +66,10 @@ class HeliumConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 100000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. pad_token_id (`int`, *optional*, defaults to 3): Padding token id.
eos_token_id (`int` | `list`, *optional*, defaults to 2): @@ -105,26 +110,26 @@ class HeliumConfig(PreTrainedConfig): def __init__( self, - vocab_size=48000, - hidden_size=2560, - intermediate_size=7040, - num_hidden_layers=24, - num_attention_heads=20, - num_key_value_heads=20, - head_dim=128, - hidden_act="silu", - attention_dropout=0.0, - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-8, - use_cache=True, - tie_word_embeddings=False, - rope_theta=100000.0, - pad_token_id=3, - eos_token_id=2, - bos_token_id=1, - attention_bias=False, - mlp_bias=False, + vocab_size: Optional[int] = 48000, + hidden_size: Optional[int] = 2560, + intermediate_size: Optional[int] = 7040, + num_hidden_layers: Optional[int] = 24, + num_attention_heads: Optional[int] = 20, + num_key_value_heads: Optional[int] = 20, + head_dim: Optional[int] = 128, + hidden_act: Optional[str] = "silu", + attention_dropout: Optional[float] = 0.0, + max_position_embeddings: Optional[int] = 4096, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-8, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + pad_token_id: Optional[int] = 3, + eos_token_id: Optional[int] = 2, + bos_token_id: Optional[int] = 1, + attention_bias: Optional[bool] = False, + mlp_bias: Optional[bool] = False, **kwargs, ): self.vocab_size = vocab_size @@ -139,10 +144,17 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 100000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/helium/modeling_helium.py b/src/transformers/models/helium/modeling_helium.py index 5e1aa247b65d..a1d0a09e848f 100644 --- a/src/transformers/models/helium/modeling_helium.py +++ b/src/transformers/models/helium/modeling_helium.py @@ -66,20 +66,49 @@ class HeliumRotaryEmbedding(nn.Module): def __init__(self, config: HeliumConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def 
compute_default_rope_parameters( + config: Optional[HeliumConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -218,8 +247,8 @@ def __init__(self, config: HeliumConfig, layer_idx: Optional[int] = None): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -278,7 +307,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -335,7 +364,7 @@ def __init__(self, config: HeliumConfig): [HeliumDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = HeliumRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = HeliumRotaryEmbedding(config) + self.rotary_emb = HeliumRotaryEmbedding(config=config) self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -382,16 +411,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/helium/modular_helium.py b/src/transformers/models/helium/modular_helium.py index 6c2538d438f9..79d995720e30 100644 --- a/src/transformers/models/helium/modular_helium.py +++ b/src/transformers/models/helium/modular_helium.py @@ -121,7 +121,6 @@ def __init__(self, config: HeliumConfig): [HeliumDecoderLayer(config, layer_idx) for layer_idx in 
range(config.num_hidden_layers)] ) self.norm = HeliumRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = HeliumRotaryEmbedding(config) self.gradient_checkpointing = False # Initialize weights and apply final processing diff --git a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py index 083fb8e3c40d..29dd3ac34f98 100644 --- a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +++ b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py @@ -14,8 +14,11 @@ # limitations under the License. """HunYuanDenseV1 model configuration""" -from transformers.configuration_utils import PreTrainedConfig -from transformers.utils import logging +from typing import Optional + +from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...utils import logging logger = logging.get_logger(__name__) @@ -79,16 +82,10 @@ class HunYuanDenseV1Config(PreTrainedConfig): issue](https://github.com/pytorch/pytorch/issues/76232). tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -102,28 +99,27 @@ class HunYuanDenseV1Config(PreTrainedConfig): def __init__( self, - vocab_size=290943, - hidden_size=4096, - intermediate_size: int = 11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - eod_token_id=3, - pretraining_tp=1, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - head_dim=None, + vocab_size: Optional[int] = 290943, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + eod_token_id: Optional[int] = 3, + pretraining_tp: Optional[int] = 1, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + head_dim: Optional[int] = None, **kwargs, ): self.vocab_size = vocab_size @@ -143,11 +139,16 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.pretraining_tp = pretraining_tp self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - # self._rope_scaling_validation() # TODO: Need validation? self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) # TODO needs model-specific validation? super().__init__( pad_token_id=pad_token_id, @@ -157,33 +158,35 @@ def __init__( **kwargs, ) - def _rope_scaling_validation(self): + def _rope_parameters_validation(self): """ - Validate the `rope_scaling` configuration. + Validate the `rope_parameters` configuration. 
""" - if self.rope_scaling is None: + if self.rope_parameters is None: return - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + if not isinstance(self.rope_parameters, dict) or len(self.rope_parameters) != 2: raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, `type` and `factor` or `type` and `alpha`, " - f"got {self.rope_scaling}" + "`rope_parameters` must be a dictionary with with two fields, `type` and `factor` or `type` and `alpha`, " + f"got {self.rope_parameters}" ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - rope_scaling_alpha = self.rope_scaling.get("alpha", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + rope_parameters_type = self.rope_parameters.get("type", None) + rope_parameters_factor = self.rope_parameters.get("factor", None) + rope_parameters_alpha = self.rope_parameters.get("alpha", None) + if rope_parameters_type is None or rope_parameters_type not in ["linear", "dynamic"]: raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + f"`rope_parameters`'s type field must be one of ['linear', 'dynamic'], got {rope_parameters_type}" ) - if rope_scaling_factor is None and rope_scaling_alpha is None: - raise ValueError("`rope_scaling`'s factor or alpha field must be have one, got both of none") - if rope_scaling_factor is not None: - if not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float > 1.0, got {rope_scaling_factor}") - if rope_scaling_alpha is not None: - if not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0: - raise ValueError(f"`rope_scaling`'s alpha field must be a float > 1.0, got {rope_scaling_alpha}") + if rope_parameters_factor is None and rope_parameters_alpha is None: + raise ValueError("`rope_parameters`'s factor or alpha field must be have one, got both of none") + if rope_parameters_factor is not None: + if not isinstance(rope_parameters_factor, float) or rope_parameters_factor <= 1.0: + raise ValueError( + f"`rope_parameters`'s factor field must be a float > 1.0, got {rope_parameters_factor}" + ) + if rope_parameters_alpha is not None: + if not isinstance(rope_parameters_alpha, float) or rope_parameters_alpha <= 1.0: + raise ValueError(f"`rope_parameters`'s alpha field must be a float > 1.0, got {rope_parameters_alpha}") __all__ = ["HunYuanDenseV1Config"] diff --git a/src/transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py index 8ebbe2e13c7a..e3a55c296f6f 100644 --- a/src/transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +++ b/src/transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py @@ -246,7 +246,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -307,27 +307,59 @@ class HunYuanDenseV1RotaryEmbedding(nn.Module): def __init__(self, config: HunYuanDenseV1Config, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, 
"rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - if self.rope_type == "dynamic" and config.rope_scaling["alpha"]: - # DynamicNTKAlphaRotary + + self.rope_type = self.config.rope_parameters["rope_type"] + + # Diff from Llama - DynamicNTKAlphaRotary + if self.rope_type == "dynamic" and self.config.rope_parameters.get("alpha"): self.dim = config.head_dim - base = config.rope_theta * config.rope_scaling.get("alpha") ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + base = self.config.rope_parameters["rope_theta"] * self.config.rope_parameters["alpha"] ** ( + self.config.head_dim / (self.config.head_dim - 2) + ) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.config.head_dim)) self.attention_scaling = 1.0 else: - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[HunYuanDenseV1Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -404,16 +436,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py index 1a2669b49de2..31a03ac05cc7 100644 --- a/src/transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +++ b/src/transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py @@ -25,7 +25,7 @@ logging, ) -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs @@ -38,6 +38,7 @@ LlamaModel, LlamaPreTrainedModel, LlamaRMSNorm, + LlamaRotaryEmbedding, apply_rotary_pos_emb, eager_attention_forward, ) @@ -131,47 +132,32 @@ def _init_weights(self, module): module.weight.data[module.padding_idx].zero_() -class HunYuanDenseV1RotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - +class HunYuanDenseV1RotaryEmbedding(LlamaRotaryEmbedding): def __init__(self, config: HunYuanDenseV1Config, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" + nn.Module.__init__() self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - if self.rope_type == "dynamic" and config.rope_scaling["alpha"]: - # DynamicNTKAlphaRotary + + self.rope_type = self.config.rope_parameters["rope_type"] + + # Diff from Llama - DynamicNTKAlphaRotary + if self.rope_type == "dynamic" and self.config.rope_parameters.get("alpha"): self.dim = config.head_dim - base = config.rope_theta * config.rope_scaling.get("alpha") ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + base = self.config.rope_parameters["rope_theta"] * self.config.rope_parameters["alpha"] ** ( + self.config.head_dim / (self.config.head_dim - 2) + ) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.config.head_dim)) self.attention_scaling = 1.0 else: - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + self.original_inv_freq = inv_freq class HunYuanDenseV1Model(LlamaModel): diff --git a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py index 431e19861ab7..497a5674f4f3 100644 --- a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +++ b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py @@ -14,10 +14,11 @@ # limitations under the License. """HunYuanMoEV1 model configuration""" -from typing import Union +from typing import Optional, Union -from transformers.configuration_utils import PreTrainedConfig -from transformers.utils import logging +from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...utils import logging logger = logging.get_logger(__name__) @@ -83,16 +84,10 @@ class HunYuanMoEV1Config(PreTrainedConfig): issue](https://github.com/pytorch/pytorch/issues/76232). tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -114,31 +109,30 @@ class HunYuanMoEV1Config(PreTrainedConfig): def __init__( self, - vocab_size=290943, - hidden_size=4096, - intermediate_size: int = 11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - eod_token_id=3, - sep_token_id=4, - pretraining_tp=1, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, + vocab_size: Optional[int] = 290943, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + eod_token_id: Optional[int] = 3, + sep_token_id: Optional[int] = 4, + pretraining_tp: Optional[int] = 1, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, num_experts: Union[int, list] = 1, moe_topk: Union[int, list] = 1, - head_dim=None, + head_dim: Optional[int] = None, **kwargs, ): self.vocab_size = vocab_size @@ -161,11 +155,16 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.pretraining_tp = pretraining_tp self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - # self._rope_scaling_validation() # TODO: Need validation? self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( pad_token_id=pad_token_id, @@ -176,33 +175,35 @@ def __init__( **kwargs, ) - def _rope_scaling_validation(self): + def _rope_parameters_validation(self): """ - Validate the `rope_scaling` configuration. + Validate the `rope_parameters` configuration. 
""" - if self.rope_scaling is None: + if self.rope_parameters is None: return - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + if not isinstance(self.rope_parameters, dict) or len(self.rope_parameters) != 2: raise ValueError( - "`rope_scaling` must be a dictionary with with two fields, `type` and `factor` or `type` and `alpha`, " - f"got {self.rope_scaling}" + "`rope_parameters` must be a dictionary with with two fields, `type` and `factor` or `type` and `alpha`, " + f"got {self.rope_parameters}" ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - rope_scaling_alpha = self.rope_scaling.get("alpha", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + rope_parameters_type = self.rope_parameters.get("type", None) + rope_parameters_factor = self.rope_parameters.get("factor", None) + rope_parameters_alpha = self.rope_parameters.get("alpha", None) + if rope_parameters_type is None or rope_parameters_type not in ["linear", "dynamic"]: raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + f"`rope_parameters`'s type field must be one of ['linear', 'dynamic'], got {rope_parameters_type}" ) - if rope_scaling_factor is None and rope_scaling_alpha is None: - raise ValueError("`rope_scaling`'s factor or alpha field must be have one, got both of none") - if rope_scaling_factor is not None: - if not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float > 1.0, got {rope_scaling_factor}") - if rope_scaling_alpha is not None: - if not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0: - raise ValueError(f"`rope_scaling`'s alpha field must be a float > 1.0, got {rope_scaling_alpha}") + if rope_parameters_factor is None and rope_parameters_alpha is None: + raise ValueError("`rope_parameters`'s factor or alpha field must be have one, got both of none") + if rope_parameters_factor is not None: + if not isinstance(rope_parameters_factor, float) or rope_parameters_factor <= 1.0: + raise ValueError( + f"`rope_parameters`'s factor field must be a float > 1.0, got {rope_parameters_factor}" + ) + if rope_parameters_alpha is not None: + if not isinstance(rope_parameters_alpha, float) or rope_parameters_alpha <= 1.0: + raise ValueError(f"`rope_parameters`'s alpha field must be a float > 1.0, got {rope_parameters_alpha}") __all__ = ["HunYuanMoEV1Config"] diff --git a/src/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py b/src/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py index 0c655f15ace5..a1fded6bdf77 100644 --- a/src/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +++ b/src/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py @@ -325,7 +325,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -385,27 +385,59 @@ class HunYuanMoEV1RotaryEmbedding(nn.Module): def __init__(self, config: HunYuanMoEV1Config, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and 
isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - if self.rope_type == "dynamic" and config.rope_scaling["alpha"]: - # DynamicNTKAlphaRotary + + self.rope_type = self.config.rope_parameters["rope_type"] + + # Diff from Llama - DynamicNTKAlphaRotary + if self.rope_type == "dynamic" and self.config.rope_parameters.get("alpha"): self.dim = config.head_dim - base = config.rope_theta * config.rope_scaling.get("alpha") ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + base = self.config.rope_parameters["rope_theta"] * self.config.rope_parameters["alpha"] ** ( + self.config.head_dim / (self.config.head_dim - 2) + ) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.config.head_dim)) self.attention_scaling = 1.0 else: - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[HunYuanMoEV1Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -482,16 +514,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py b/src/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py index 7c0573e81ab1..06269fedf784 100644 --- a/src/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +++ b/src/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py @@ -26,10 +26,10 @@ logging, ) -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs +from ..hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1RotaryEmbedding from ..llama.modeling_llama import ( LlamaAttention, LlamaDecoderLayer, @@ -189,47 +189,8 @@ def _init_weights(self, module): module.weight.data[module.padding_idx].zero_() -class HunYuanMoEV1RotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: HunYuanMoEV1Config, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - if self.rope_type == "dynamic" and config.rope_scaling["alpha"]: - # DynamicNTKAlphaRotary - self.dim = config.head_dim - base = config.rope_theta * config.rope_scaling.get("alpha") ** (self.dim / (self.dim - 2)) - inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.attention_scaling = 1.0 - else: - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) +class HunYuanMoEV1RotaryEmbedding(HunYuanDenseV1RotaryEmbedding): + pass class HunYuanMoEV1Model(LlamaModel): diff --git a/src/transformers/models/jetmoe/configuration_jetmoe.py b/src/transformers/models/jetmoe/configuration_jetmoe.py index bc466a8f44ae..9f5367d1c01c 100644 --- a/src/transformers/models/jetmoe/configuration_jetmoe.py +++ b/src/transformers/models/jetmoe/configuration_jetmoe.py @@ -14,7 +14,10 @@ # limitations under the License. """JetMoe model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -70,8 +73,10 @@ class JetMoeConfig(PreTrainedConfig): The id of the "end-of-sequence" token. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. rms_norm_eps (`float`, *optional*, defaults to 1e-06): The epsilon used by the rms normalization layers. 
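For readers following the HunYuanMoE hunk above: the `"dynamic"` + `alpha` branch rescales the frequency base before applying the usual inverse-frequency formula. The snippet below is a standalone, illustrative sketch of that arithmetic only (the head size and values are made up; it is not the model code itself):

```python
import torch


def default_inv_freq(rope_theta: float, dim: int) -> torch.Tensor:
    # Default RoPE: inv_freq_i = 1 / theta**(2i / dim), as in compute_default_rope_parameters
    return 1.0 / (rope_theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))


def ntk_alpha_inv_freq(rope_theta: float, alpha: float, dim: int) -> torch.Tensor:
    # HunYuan-style "dynamic" RoPE with an `alpha` key: the base itself is stretched,
    # base = theta * alpha**(dim / (dim - 2)), which compresses the lower-frequency
    # components and thereby extends the usable context window.
    base = rope_theta * alpha ** (dim / (dim - 2))
    return 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))


head_dim = 128  # hypothetical head size
print(default_inv_freq(10000.0, head_dim)[:3])
print(ntk_alpha_inv_freq(10000.0, alpha=4.0, dim=head_dim)[:3])
```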
initializer_range (`float`, *optional*, defaults to 0.01): @@ -98,26 +103,26 @@ class JetMoeConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=2048, - num_hidden_layers=12, - num_key_value_heads=16, - kv_channels=128, - intermediate_size=5632, - max_position_embeddings=4096, - activation_function="silu", - num_local_experts=8, - num_experts_per_tok=2, - output_router_logits=False, - aux_loss_coef=0.01, - use_cache=True, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=True, - rope_theta=10000.0, - rms_norm_eps=1e-6, - initializer_range=0.01, - attention_dropout=0.0, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 2048, + num_hidden_layers: Optional[int] = 12, + num_key_value_heads: Optional[int] = 16, + kv_channels: Optional[int] = 128, + intermediate_size: Optional[int] = 5632, + max_position_embeddings: Optional[int] = 4096, + activation_function: Optional[str] = "silu", + num_local_experts: Optional[int] = 8, + num_experts_per_tok: Optional[int] = 2, + output_router_logits: Optional[bool] = False, + aux_loss_coef: Optional[float] = 0.01, + use_cache: Optional[bool] = True, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rms_norm_eps: Optional[int] = 1e-6, + initializer_range: Optional[float] = 0.01, + attention_dropout: Optional[float] = 0.0, **kwargs, ): if num_experts_per_tok > num_local_experts: @@ -141,9 +146,15 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id - - self.rope_theta = rope_theta self.rms_norm_eps = rms_norm_eps + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/jetmoe/modeling_jetmoe.py b/src/transformers/models/jetmoe/modeling_jetmoe.py index 588a2e21ea39..1beb7be7626c 100644 --- a/src/transformers/models/jetmoe/modeling_jetmoe.py +++ b/src/transformers/models/jetmoe/modeling_jetmoe.py @@ -70,20 +70,49 @@ class JetMoeRotaryEmbedding(nn.Module): def __init__(self, config: JetMoeConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + 
@staticmethod + def compute_default_rope_parameters( + config: Optional[JetMoeConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -509,7 +538,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states diff --git a/src/transformers/models/jetmoe/modular_jetmoe.py b/src/transformers/models/jetmoe/modular_jetmoe.py index 3a7bff3d9659..d994388969e3 100644 --- a/src/transformers/models/jetmoe/modular_jetmoe.py +++ b/src/transformers/models/jetmoe/modular_jetmoe.py @@ -394,7 +394,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states diff --git a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py index 6618b8573aad..d43856daa96a 100644 --- a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py @@ -13,7 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License.s +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -56,8 +59,10 @@ class KyutaiSpeechToTextConfig(PreTrainedConfig): max_position_embeddings (`int`, *optional*, defaults to 750): The maximum sequence length that this model might ever be used with. Typically, set this to something large just in case (e.g., 512 or 1024 or 2048). 
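The JetMoe `__init__` above, like every other configuration touched by this diff, pops a legacy `rope_scaling` kwarg (or a bare `rope_theta`) and folds it into the new `rope_parameters` dict before `standardize_rope_params` and `rope_config_validation` run. The helper below is a hypothetical, simplified stand-in for that standardization step, written only to show the resulting dict shape (`rope_type` + `rope_theta` + optional scaling keys); the real utility in `modeling_rope_utils` may behave differently.

```python
from typing import Optional


def fold_legacy_rope_kwargs(rope_parameters: Optional[dict], kwargs: dict, default_theta: float) -> dict:
    # Hypothetical stand-in for `standardize_rope_params`: old checkpoints may carry
    # `rope_scaling` and/or `rope_theta` as separate keys; new ones ship a single dict.
    params = dict(kwargs.pop("rope_scaling", None) or rope_parameters or {})
    if "type" in params:  # BC: "rope_type" was originally "type"
        params.setdefault("rope_type", params.pop("type"))
    params.setdefault("rope_type", "default")
    params.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta))
    return params


# An old-style checkpoint with a separate theta and a linear scaling dict:
legacy = {"rope_theta": 100000.0, "rope_scaling": {"type": "linear", "factor": 4.0}}
print(fold_legacy_rope_kwargs(None, legacy, default_theta=10000.0))
# -> {'factor': 4.0, 'rope_type': 'linear', 'rope_theta': 100000.0}
```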
- rope_theta (`float`, *optional*, defaults to 100000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the decoder. head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`): @@ -117,29 +122,29 @@ class KyutaiSpeechToTextConfig(PreTrainedConfig): def __init__( self, - codebook_vocab_size=2049, - vocab_size=4001, - hidden_size=2048, - num_hidden_layers=48, - num_attention_heads=32, - num_key_value_heads=None, - max_position_embeddings=750, - rope_theta=100000.0, - hidden_act="silu", - head_dim=None, - initializer_range=0.02, - use_cache=True, - sliding_window=375, - attention_dropout=0.0, - ffn_dim=11264, - rms_norm_eps=1e-8, - num_codebooks=32, - audio_bos_token_id=2048, - audio_pad_token_id=69569, - tie_word_embeddings=False, - pad_token_id=3, - bos_token_id=48000, - codec_config=None, + codebook_vocab_size: Optional[int] = 2049, + vocab_size: Optional[int] = 4001, + hidden_size: Optional[int] = 2048, + num_hidden_layers: Optional[int] = 48, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + max_position_embeddings: Optional[int] = 750, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + hidden_act: Optional[str] = "silu", + head_dim: Optional[int] = None, + initializer_range: Optional[float] = 0.02, + use_cache: Optional[bool] = True, + sliding_window: Optional[int] = 375, + attention_dropout: Optional[float] = 0.0, + ffn_dim: Optional[int] = 11264, + rms_norm_eps: Optional[int] = 1e-8, + num_codebooks: Optional[int] = 32, + audio_bos_token_id: Optional[int] = 2048, + audio_pad_token_id: Optional[int] = 69569, + tie_word_embeddings: Optional[bool] = False, + pad_token_id: Optional[int] = 3, + bos_token_id: Optional[int] = 48000, + codec_config: Optional[dict] = None, **kwargs, ): if codec_config is None: @@ -175,10 +180,17 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_dropout = attention_dropout self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.sliding_window = sliding_window + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py index c68d6835e958..f5289763ff39 100644 --- a/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py @@ -21,6 +21,7 @@ import math import types +from 
collections.abc import Callable from typing import Optional, Union import torch @@ -271,20 +272,49 @@ class KyutaiSpeechToTextRotaryEmbedding(nn.Module): def __init__(self, config: KyutaiSpeechToTextConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[KyutaiSpeechToTextConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -425,7 +455,6 @@ def __init__( # rotary embeddings are not used in the depth decoder self.rotary_emb = None if use_rope: - self.rope_theta = config.rope_theta self.rotary_emb = KyutaiSpeechToTextRotaryEmbedding(config) def forward( diff --git a/src/transformers/models/lfm2/configuration_lfm2.py b/src/transformers/models/lfm2/configuration_lfm2.py index 3b75a640fc4c..4999f6ab433f 100644 --- a/src/transformers/models/lfm2/configuration_lfm2.py +++ b/src/transformers/models/lfm2/configuration_lfm2.py @@ -14,6 +14,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Lfm2Config(PreTrainedConfig): @@ -65,8 +66,10 @@ class Lfm2Config(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. 
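All of the refactored `*RotaryEmbedding` modules in this diff do the same thing in `forward`: outer-product the cached `inv_freq` with the position ids, duplicate the angles to the full head dimension, and return `cos`/`sin` scaled by `attention_scaling` (the math is visible in the removed modular HunYuan implementation earlier in this section). A minimal standalone sketch of that step, with made-up sizes:

```python
import torch


def rope_cos_sin(inv_freq: torch.Tensor, position_ids: torch.Tensor, attention_scaling: float = 1.0):
    # (batch, seq) position ids x (dim/2,) inverse frequencies -> per-position angles
    inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
    position_ids_expanded = position_ids[:, None, :].float()
    freqs = (inv_freq_expanded @ position_ids_expanded).transpose(1, 2)  # (batch, seq, dim/2)
    emb = torch.cat((freqs, freqs), dim=-1)                              # (batch, seq, dim)
    return emb.cos() * attention_scaling, emb.sin() * attention_scaling


inv_freq = 1.0 / (10000.0 ** (torch.arange(0, 64, 2, dtype=torch.float) / 64))
cos, sin = rope_cos_sin(inv_freq, position_ids=torch.arange(8)[None, :])
print(cos.shape, sin.shape)  # torch.Size([1, 8, 64]) torch.Size([1, 8, 64])
```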
+ rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. conv_bias (`bool`, *optional*, defaults to `False`): Whether to use bias in the conv layers. conv_L_cache (`int`, *optional*, defaults to 3): @@ -100,26 +103,26 @@ class Lfm2Config(PreTrainedConfig): def __init__( self, - vocab_size: int = 65536, - hidden_size: int = 2560, - intermediate_size: int = 12288, - num_hidden_layers: int = 32, - num_attention_heads: int = 32, - num_key_value_heads: int = 8, - max_position_embeddings: int = 128_000, - initializer_range: float = 0.02, - norm_eps: float = 0.00001, - use_cache: bool = True, - pad_token_id: int = 0, - bos_token_id: int = 1, - eos_token_id: int = 2, - tie_word_embeddings: bool = True, - rope_theta: float = 1000000.0, - conv_bias: bool = False, - conv_L_cache: int = 3, - block_multiple_of: int = 256, - block_ffn_dim_multiplier: float = 1.0, - block_auto_adjust_ff_dim: bool = True, + vocab_size: Optional[int] = 65536, + hidden_size: Optional[int] = 2560, + intermediate_size: Optional[int] = 12288, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 8, + max_position_embeddings: Optional[int] = 128_000, + initializer_range: Optional[float] = 0.02, + norm_eps: Optional[float] = 0.00001, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + conv_bias: Optional[bool] = False, + conv_L_cache: Optional[int] = 3, + block_multiple_of: Optional[int] = 256, + block_ffn_dim_multiplier: Optional[float] = 1.0, + block_auto_adjust_ff_dim: Optional[bool] = True, full_attn_idxs: Optional[list[int]] = None, layer_types: Optional[list[str]] = None, **kwargs, @@ -127,7 +130,6 @@ def __init__( self.vocab_size = vocab_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers - self.rope_theta = kwargs.get("theta", rope_theta) # to fit original config keys self.max_position_embeddings = max_position_embeddings self.use_cache = use_cache self.norm_eps = norm_eps @@ -146,12 +148,20 @@ def __init__( self.block_multiple_of = block_multiple_of self.block_ffn_dim_multiplier = block_ffn_dim_multiplier self.block_auto_adjust_ff_dim = block_auto_adjust_ff_dim + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: full_attn_idxs = full_attn_idxs if full_attn_idxs is not None else list(range(num_hidden_layers)) self.layer_types = ["full_attention" if i in full_attn_idxs else "conv" for i in range(num_hidden_layers)] + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("theta", kwargs.get("rope_theta", 1000000.0)) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + tie_word_embeddings = kwargs.get("tie_embedding", tie_word_embeddings) # to fit original config keys super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/lfm2/modeling_lfm2.py b/src/transformers/models/lfm2/modeling_lfm2.py index 
0d2cc30646b5..737e0f53255d 100644 --- a/src/transformers/models/lfm2/modeling_lfm2.py +++ b/src/transformers/models/lfm2/modeling_lfm2.py @@ -71,20 +71,49 @@ class Lfm2RotaryEmbedding(nn.Module): def __init__(self, config: Lfm2Config, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Lfm2Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -353,6 +382,7 @@ def forward( attention_mask: Optional[torch.Tensor], past_key_values: Optional[Lfm2HybridConvCache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: input_shape = hidden_states.shape[:-1] @@ -526,7 +556,7 @@ def __init__(self, config: Lfm2Config, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Lfm2HybridConvCache] = None, @@ -586,8 +616,8 @@ def __init__(self, config: Lfm2Config): self.layers = nn.ModuleList( [Lfm2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) + self.rotary_emb = Lfm2RotaryEmbedding(config=config) self.gradient_checkpointing = False - self.pos_emb = Lfm2RotaryEmbedding(config) self.embedding_norm = Lfm2RMSNorm(config.hidden_size, eps=config.norm_eps) # Initialize weights and apply final processing @@ -637,17 +667,17 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.pos_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) # decoder layers for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/lfm2/modular_lfm2.py b/src/transformers/models/lfm2/modular_lfm2.py index 5bbbf12b286f..778a5c505dbd 100644 --- a/src/transformers/models/lfm2/modular_lfm2.py +++ b/src/transformers/models/lfm2/modular_lfm2.py @@ -26,13 +26,13 @@ from ...utils import TransformersKwargs, logging from ...utils.import_utils import is_causal_conv1d_available from ..bamba.modeling_bamba import apply_mask_to_padding_states +from ..gemma2.modeling_gemma2 import Gemma2RotaryEmbedding from ..llama.modeling_llama import ( LlamaAttention, LlamaForCausalLM, LlamaModel, LlamaPreTrainedModel, LlamaRMSNorm, - LlamaRotaryEmbedding, apply_rotary_pos_emb, eager_attention_forward, ) @@ -56,7 +56,7 @@ class Lfm2RMSNorm(LlamaRMSNorm): pass -class Lfm2RotaryEmbedding(LlamaRotaryEmbedding): +class Lfm2RotaryEmbedding(Gemma2RotaryEmbedding): pass @@ -233,6 +233,7 @@ def forward( attention_mask: Optional[torch.Tensor], past_key_values: Optional[Lfm2HybridConvCache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: input_shape = hidden_states.shape[:-1] @@ -391,7 +392,7 @@ def __init__(self, config: Lfm2Config, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Lfm2HybridConvCache] = None, @@ -429,10 +430,8 @@ class Lfm2PreTrainedModel(LlamaPreTrainedModel): class Lfm2Model(LlamaModel): def __init__(self, config: Lfm2Config): super().__init__(config) - self.pos_emb = Lfm2RotaryEmbedding(config) 
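The `position_embeddings` tuple that the decoder layers now receive explicitly is consumed by the Llama-style `apply_rotary_pos_emb` helper that the modular files import. As a reminder of what that application looks like, here is a self-contained sketch; it mirrors the standard rotate-half formulation and is not copied from any file in this diff.

```python
import torch


def rotate_half(x: torch.Tensor) -> torch.Tensor:
    # (x1, x2) -> (-x2, x1) on the last dimension
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def apply_rope(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
    # cos/sin come from the model-level rotary embedding and are broadcast over heads
    cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)  # (batch, 1, seq, head_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


q = torch.randn(1, 4, 8, 64)  # (batch, num_heads, seq_len, head_dim)
k = torch.randn(1, 4, 8, 64)
cos, sin = torch.randn(1, 8, 64), torch.randn(1, 8, 64)  # from the rotary module's forward
q_rot, k_rot = apply_rope(q, k, cos, sin)
print(q_rot.shape, k_rot.shape)
```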
self.embedding_norm = Lfm2RMSNorm(config.hidden_size, eps=config.norm_eps) del self.norm - del self.rotary_emb def forward( self, @@ -476,17 +475,17 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.pos_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) # decoder layers for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py b/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py index 550954ecfd20..f65af16d77b6 100644 --- a/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py +++ b/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py @@ -13,18 +13,19 @@ # limitations under the License. from typing import Optional -from ...configuration_utils import PretrainedConfig +from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params -class Lfm2MoeConfig(PretrainedConfig): +class Lfm2MoeConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Lfm2MoeModel`]. It is used to instantiate a LFM2 Moe model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the LFM2-8B-A1B model. e.g. [LiquidAI/LFM2-8B-A1B](https://huggingface.co/LiquidAI/LFM2-8B-A1B) - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PretrainedConfig`] for more information. + Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PreTrainedConfig`] for more information. Args: @@ -47,8 +48,10 @@ class Lfm2MoeConfig(PretrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. max_position_embeddings (`int`, *optional*, defaults to 128000): The maximum sequence length that this model might ever be used with. 
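The configurations in this diff finish their `__init__` with `rope_config_validation(self)`. A rough idea of what such a check enforces can be taken from the HunYuan validator earlier in this section: a recognised `rope_type`, and any `factor`/`alpha` being a float strictly greater than 1. The function below is only a didactic approximation of those rules, not the shared utility itself:

```python
def validate_rope_parameters(rope_parameters: dict) -> None:
    # Didactic approximation; the real `rope_config_validation` covers more types and per-type keys.
    known_types = {"default", "linear", "dynamic", "yarn", "longrope", "llama3"}
    rope_type = rope_parameters.get("rope_type", "default")
    if rope_type not in known_types:
        raise ValueError(f"Unknown rope_type: {rope_type!r}")
    for key in ("factor", "alpha"):
        value = rope_parameters.get(key)
        if value is not None and (not isinstance(value, float) or value <= 1.0):
            raise ValueError(f"`{key}` must be a float > 1.0, got {value!r}")


validate_rope_parameters({"rope_type": "linear", "rope_theta": 10000.0, "factor": 4.0})  # passes
try:
    validate_rope_parameters({"rope_type": "linear", "rope_theta": 10000.0, "factor": 0.5})
except ValueError as err:
    print(err)
```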
use_cache (`bool`, *optional*, defaults to `True`): @@ -112,7 +115,7 @@ def __init__( bos_token_id: int = 1, eos_token_id: int = 2, tie_word_embeddings: bool = True, - rope_theta: float = 1000000.0, + rope_parameters: RopeParameters = None, max_position_embeddings: int = 128_000, use_cache: bool = True, norm_eps: float = 0.00001, @@ -133,7 +136,9 @@ def __init__( self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers - self.rope_theta = rope_theta + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.max_position_embeddings = max_position_embeddings self.use_cache = use_cache self.norm_eps = norm_eps @@ -156,6 +161,11 @@ def __init__( self.norm_topk_prob = norm_topk_prob self.layer_types = layer_types + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("theta", kwargs.get("rope_theta", 1000000.0)) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + tie_word_embeddings = kwargs.get("tie_embedding", tie_word_embeddings) # to fit original config keys super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/lfm2_moe/modeling_lfm2_moe.py b/src/transformers/models/lfm2_moe/modeling_lfm2_moe.py index 71b6a209db06..05f2f5389322 100644 --- a/src/transformers/models/lfm2_moe/modeling_lfm2_moe.py +++ b/src/transformers/models/lfm2_moe/modeling_lfm2_moe.py @@ -71,20 +71,49 @@ class Lfm2MoeRotaryEmbedding(nn.Module): def __init__(self, config: Lfm2MoeConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Lfm2MoeConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -416,6 +445,7 @@ def forward( attention_mask: Optional[torch.Tensor], past_key_values: Optional[Lfm2MoeHybridConvCache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: input_shape = hidden_states.shape[:-1] @@ -593,7 +623,7 @@ def __init__(self, config: Lfm2MoeConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Lfm2MoeHybridConvCache] = None, @@ -704,7 +734,7 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.pos_emb(hidden_states, position_ids) + position_embeddings = self.pos_emb(hidden_states, position_ids=position_ids) # decoder layers for decoder_layer in self.layers[: self.config.num_hidden_layers]: diff --git a/src/transformers/models/lfm2_moe/modular_lfm2_moe.py b/src/transformers/models/lfm2_moe/modular_lfm2_moe.py index 9a4f5ff73c84..5d5b1b08d8ea 100644 --- a/src/transformers/models/lfm2_moe/modular_lfm2_moe.py +++ b/src/transformers/models/lfm2_moe/modular_lfm2_moe.py @@ -21,8 +21,15 @@ from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging from ...utils.import_utils import is_causal_conv1d_available -from ..lfm2.modeling_lfm2 import Lfm2Attention, Lfm2DecoderLayer, Lfm2HybridConvCache, Lfm2MLP, Lfm2ShortConv -from ..llama.modeling_llama import LlamaForCausalLM, LlamaPreTrainedModel, LlamaRMSNorm, LlamaRotaryEmbedding +from ..lfm2.modeling_lfm2 import ( + Lfm2Attention, + Lfm2DecoderLayer, + Lfm2HybridConvCache, + Lfm2MLP, + Lfm2RotaryEmbedding, + Lfm2ShortConv, +) +from ..llama.modeling_llama import LlamaForCausalLM, LlamaPreTrainedModel, LlamaRMSNorm from ..mixtral.modeling_mixtral import MixtralModel from ..qwen2_moe.modeling_qwen2_moe import Qwen2MoeExperts from .configuration_lfm2_moe import Lfm2MoeConfig @@ -45,7 +52,7 @@ class Lfm2MoeRMSNorm(LlamaRMSNorm): pass -class Lfm2MoeRotaryEmbedding(LlamaRotaryEmbedding): +class Lfm2MoeRotaryEmbedding(Lfm2RotaryEmbedding): pass @@ -175,7 +182,7 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.pos_emb(hidden_states, position_ids) + position_embeddings = self.pos_emb(hidden_states, position_ids=position_ids) # decoder layers for decoder_layer in self.layers[: self.config.num_hidden_layers]: diff --git a/src/transformers/models/lightglue/modular_lightglue.py b/src/transformers/models/lightglue/modular_lightglue.py index e37469160c1d..cb6a69f0625c 100644 --- a/src/transformers/models/lightglue/modular_lightglue.py +++ b/src/transformers/models/lightglue/modular_lightglue.py @@ -363,6 +363,10 @@ def forward( class LightGlueAttention(LlamaAttention): + def __init__(self, config: LightGlueConfig, layer_idx: int): + super().__init__() + del self.rotary_emb + def 
forward( self, hidden_states: torch.Tensor, diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 5501785d46da..3b2543983e06 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -19,8 +19,10 @@ # limitations under the License. """LLaMA model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class LlamaConfig(PreTrainedConfig): @@ -79,45 +81,10 @@ class LlamaConfig(PreTrainedConfig): results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. 
Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -160,28 +127,27 @@ class LlamaConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - head_dim=None, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + pretraining_tp: Optional[int] = 1, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + head_dim: Optional[int] = None, **kwargs, ): self.vocab_size = vocab_size @@ -201,16 +167,17 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.pretraining_tp = pretraining_tp self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py index e63770a154de..191426910ec1 100644 --- a/src/transformers/models/llama/convert_llama_weights_to_hf.py +++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py @@ -375,7 +375,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): eos_token_id = 2 if llama_version in ["3.1", "3.2", "Guard-3"]: - rope_scaling = { + rope_parameters = { "factor": 32.0 if llama_version == "3.2" else 8.0, "low_freq_factor": 1.0, "high_freq_factor": 4.0, @@ -383,7 +383,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): "rope_type": "llama3", } else: - rope_scaling = None + rope_parameters = None config = LlamaConfig( hidden_size=dim, @@ -394,7 +394,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): num_key_value_heads=num_key_value_heads, vocab_size=vocab_size, rope_theta=base, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 60a181667282..3d8340091bee 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -75,20 +75,49 @@ class LlamaRotaryEmbedding(nn.Module): def __init__(self, config: LlamaConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[LlamaConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -223,8 +252,8 @@ def __init__(self, config: LlamaConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -283,7 +312,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -387,16 +416,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/llama4/configuration_llama4.py b/src/transformers/models/llama4/configuration_llama4.py index 214a90a1bc54..7d457cf8523c 100644 --- a/src/transformers/models/llama4/configuration_llama4.py +++ b/src/transformers/models/llama4/configuration_llama4.py @@ -15,8 +15,10 @@ # limitations under the License. 
import warnings +from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -66,7 +68,8 @@ class Llama4VisionConfig(PreTrainedConfig): multi_modal_projector_bias (`int`, *optional*, defaults to `False`): TODO projector_dropout (`int`, *optional*, defaults to 0.0): TODO attention_dropout (`int`, *optional*, defaults to 0.0): TODO - rope_theta (`int`, *optional*, defaults to 10000): TODO + rope_parameters (`RopeParameters`, *optional*): + RoPE Parameters """ base_model_tp_plan = { @@ -83,25 +86,25 @@ class Llama4VisionConfig(PreTrainedConfig): def __init__( self, - hidden_size: int = 768, - hidden_act: str = "gelu", - num_hidden_layers: int = 34, - num_attention_heads: int = 16, - num_channels: int = 3, - intermediate_size: int = 5632, - vision_output_dim: int = 7680, - image_size: int = 448, - patch_size: int = 14, - norm_eps: float = 1e-5, - vision_feature_select_strategy="default", - initializer_range: float = 0.02, - pixel_shuffle_ratio=0.5, - projector_input_dim=4096, - projector_output_dim=4096, - multi_modal_projector_bias=False, - projector_dropout=0.0, - attention_dropout=0.0, - rope_theta=10000, + hidden_size: Optional[int] = 768, + hidden_act: Optional[str] = "gelu", + num_hidden_layers: Optional[int] = 34, + num_attention_heads: Optional[int] = 16, + num_channels: Optional[int] = 3, + intermediate_size: Optional[int] = 5632, + vision_output_dim: Optional[int] = 7680, + image_size: Optional[int] = 448, + patch_size: Optional[int] = 14, + norm_eps: Optional[float] = 1e-5, + vision_feature_select_strategy: Optional[str] = "default", + initializer_range: Optional[float] = 0.02, + pixel_shuffle_ratio: Optional[float] = 0.5, + projector_input_dim: Optional[int] = 4096, + projector_output_dim: Optional[int] = 4096, + multi_modal_projector_bias: Optional[bool] = False, + projector_dropout: Optional[float] = 0.0, + attention_dropout: Optional[float] = 0.0, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, **kwargs, ): self.hidden_size = hidden_size @@ -122,9 +125,14 @@ def __init__( self.projector_dropout = projector_dropout self.attention_dropout = attention_dropout self.vision_feature_select_strategy = vision_feature_select_strategy - self.rope_theta = rope_theta + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters - self._vision_feature_layer = kwargs.get("vision_feature_layer", -1) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) @property def vision_feature_layer(self): @@ -187,8 +195,6 @@ class Llama4TextConfig(PreTrainedConfig): The id of the end of sentence token. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to `500000.0`): - The base period of the RoPE embeddings. 
attention_dropout (`int`, *optional*, defaults to 0.0): TODO num_experts_per_tok (`int`, *optional*, defaults to 1): TODO num_local_experts (`int`, *optional*, defaults to 16): TODO @@ -198,43 +204,10 @@ class Llama4TextConfig(PreTrainedConfig): output_router_logits (`int`, *optional*, defaults to `False`): TODO router_aux_loss_coef (`int`, *optional*, defaults to 0.001): TODO router_jitter_noise (`int`, *optional*, defaults to 0.0): TODO - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. 
no_rope_layers (`list[int]`, *optional*): @@ -308,7 +281,6 @@ def __init__( bos_token_id=1, eos_token_id=2, tie_word_embeddings=False, - rope_theta=500000, attention_dropout=0.0, num_experts_per_tok=1, num_local_experts=16, @@ -318,7 +290,7 @@ def __init__( output_router_logits=False, router_aux_loss_coef=0.001, router_jitter_noise=0.0, - rope_scaling=None, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, no_rope_layers=None, no_rope_layer_interval=4, attention_chunk_size=8192, @@ -345,7 +317,6 @@ def __init__( self.intermediate_size_mlp = intermediate_size_mlp self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.rope_scaling = rope_scaling self.attention_bias = False # for backward compatibility if num_key_value_heads is None: @@ -356,10 +327,12 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_dropout = attention_dropout self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.use_qk_norm = use_qk_norm + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.num_experts_per_tok = num_experts_per_tok self.num_local_experts = num_local_experts @@ -393,6 +366,11 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 500000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + class Llama4Config(PreTrainedConfig): r""" diff --git a/src/transformers/models/llama4/convert_llama4_weights_to_hf.py b/src/transformers/models/llama4/convert_llama4_weights_to_hf.py index 2363a77b1031..1cba9c678584 100644 --- a/src/transformers/models/llama4/convert_llama4_weights_to_hf.py +++ b/src/transformers/models/llama4/convert_llama4_weights_to_hf.py @@ -243,14 +243,14 @@ def write_model( config_kwargs = {} if params["use_scaled_rope"]: # some constants from original code - rope_scaling = { + rope_parameters = { "rope_type": "llama3", - "factor": params.get("rope_scaling_factor", 8.0), + "factor": params.get("rope_parameters_factor", 8.0), "low_freq_factor": 1.0, "high_freq_factor": params.get("rope_high_freq_factor", 4.0), "original_max_position_embeddings": 8192, } - config_kwargs.update({"rope_scaling": rope_scaling}) + config_kwargs.update({"rope_parameters": rope_parameters}) if attention_chunk_size is None: config_kwargs.update({"cache_implementation": "static"}) diff --git a/src/transformers/models/llama4/modeling_llama4.py b/src/transformers/models/llama4/modeling_llama4.py index c856288cb651..5522582f9592 100644 --- a/src/transformers/models/llama4/modeling_llama4.py +++ b/src/transformers/models/llama4/modeling_llama4.py @@ -32,7 +32,10 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, CausalLMOutputWithPast, ModelOutput -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_rope_utils import ( + ROPE_INIT_FUNCTIONS, + dynamic_rope_update, +) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import 
TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -166,24 +169,58 @@ def forward(self, hidden_states): return out, router_logits +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Llama4Text class Llama4TextRotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` + # Ignore copy def __init__(self, config: Llama4TextConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - self.rope_type = "llama3" if config.rope_scaling is not None else "default" - self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Llama4TextConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + # Ignore copy @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) def forward(self, x, position_ids): @@ -394,7 +431,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: residual = hidden_states diff --git a/src/transformers/models/longcat_flash/configuration_longcat_flash.py b/src/transformers/models/longcat_flash/configuration_longcat_flash.py index e9703a6b55c2..7933cb5bb0dc 100644 --- a/src/transformers/models/longcat_flash/configuration_longcat_flash.py +++ b/src/transformers/models/longcat_flash/configuration_longcat_flash.py @@ -15,8 +15,10 @@ """LongCat Flash model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class LongcatFlashConfig(PreTrainedConfig): @@ -69,12 +71,11 @@ class LongcatFlashConfig(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie input and output embeddings. - rope_theta (`float`, *optional*, defaults to 10000000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): + rope_parameters (`RopeParameters`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
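Tying the LongCat docstring above to the constructor changes below, here is a hedged sketch of what a scaled RoPE configuration might look like (the class name and the float-coercion behaviour are taken from this diff; the exact yarn key set is an assumption based on the generic RoPE validation):

```python
from transformers import LongcatFlashConfig

# The constructor casts "factor", "beta_fast" and "beta_slow" to float
# before running `rope_config_validation`, so integer values are accepted.
config = LongcatFlashConfig(
    rope_parameters={
        "rope_type": "yarn",
        "rope_theta": 10000000.0,
        "factor": 4,       # stored as 4.0
        "beta_fast": 32,   # stored as 32.0
        "beta_slow": 1,    # stored as 1.0
    }
)
print(config.rope_parameters["factor"])  # 4.0
```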
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -141,38 +142,37 @@ class LongcatFlashConfig(PreTrainedConfig): def __init__( self, - vocab_size=131072, - hidden_size=6144, - num_hidden_layers=56, - num_layers=28, - num_attention_heads=64, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=131072, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - ffn_hidden_size=12288, - q_lora_rank=1536, - kv_lora_rank=512, - qk_nope_head_dim=128, - qk_rope_head_dim=64, - head_dim=64, - v_head_dim=128, - qk_head_dim=None, - moe_topk=12, - n_routed_experts=512, - zero_expert_num=256, - expert_ffn_hidden_size=2048, - routed_scaling_factor=6.0, + vocab_size: Optional[int] = 131072, + hidden_size: Optional[int] = 6144, + num_hidden_layers: Optional[int] = 56, + num_layers: Optional[int] = 28, + num_attention_heads: Optional[int] = 64, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 131072, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + ffn_hidden_size: Optional[int] = 12288, + q_lora_rank: Optional[int] = 1536, + kv_lora_rank: Optional[int] = 512, + qk_nope_head_dim: Optional[int] = 128, + qk_rope_head_dim: Optional[int] = 64, + head_dim: Optional[int] = 64, + v_head_dim: Optional[int] = 128, + qk_head_dim: Optional[int] = None, + moe_topk: Optional[int] = 12, + n_routed_experts: Optional[int] = 512, + zero_expert_num: Optional[int] = 256, + expert_ffn_hidden_size: Optional[int] = 2048, + routed_scaling_factor: Optional[float] = 6.0, **kwargs, ): if num_key_value_heads is None: @@ -192,8 +192,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout @@ -212,14 +210,17 @@ def __init__( self.zero_expert_num = zero_expert_num self.expert_ffn_hidden_size = expert_ffn_hidden_size self.routed_scaling_factor = routed_scaling_factor + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000000.0) + standardize_rope_params(self, rope_theta=rope_theta) - if self.rope_scaling is not None: - for key in ["beta_fast", "beta_slow", "factor"]: - if key in self.rope_scaling: - self.rope_scaling[key] = float(self.rope_scaling[key]) + for key in ["beta_fast", "beta_slow", "factor"]: + if key in self.rope_parameters: + self.rope_parameters[key] = float(self.rope_parameters[key]) rope_config_validation(self) diff --git 
a/src/transformers/models/longcat_flash/modeling_longcat_flash.py b/src/transformers/models/longcat_flash/modeling_longcat_flash.py index 334666d33b72..c082eb43ee4d 100644 --- a/src/transformers/models/longcat_flash/modeling_longcat_flash.py +++ b/src/transformers/models/longcat_flash/modeling_longcat_flash.py @@ -69,20 +69,49 @@ class LongcatFlashRotaryEmbedding(nn.Module): def __init__(self, config: LongcatFlashConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[LongcatFlashConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -291,7 +320,7 @@ def __init__(self, config, layer_idx: int): self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads self.attention_dropout = config.attention_dropout self.num_heads = config.num_attention_heads - self.rope_theta = config.rope_theta + self.q_lora_rank = config.q_lora_rank self.qk_rope_head_dim = config.qk_rope_head_dim self.kv_lora_rank = config.kv_lora_rank @@ -326,9 +355,9 @@ def __init__(self, config, layer_idx: int): ) self.scaling = self.qk_head_dim ** (-0.5) - if self.config.rope_scaling is not None: - mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0) - scaling_factor = self.config.rope_scaling["factor"] + if self.config.rope_parameters.get("rope_type", "default") != "default": + mscale_all_dim = self.config.rope_parameters.get("mscale_all_dim", 0) + scaling_factor = self.config.rope_parameters["factor"] if mscale_all_dim: mscale = yarn_get_mscale(scaling_factor, mscale_all_dim) self.scaling = self.scaling * mscale * mscale diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py index 482aaa854d0f..733221273016 100644 --- a/src/transformers/models/mimi/configuration_mimi.py +++ b/src/transformers/models/mimi/configuration_mimi.py @@ -15,10 +15,12 @@ """Mimi model configuration""" import math +from typing import Optional import numpy as np from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -113,8 +115,10 @@ class MimiConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. use_streaming (`bool`, *optional*, defaults to `False`): Whether to use streaming mode. If `True`, the model encode method will return the padding cache that can be used in a subsequent call to the encode method. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. sliding_window (`int`, *optional*, defaults to 250): Sliding window attention window size. If not specified, will default to `250`. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -142,44 +146,44 @@ class MimiConfig(PreTrainedConfig): def __init__( self, - sampling_rate=24_000, - frame_rate=None, - audio_channels=1, - hidden_size=512, - num_filters=64, - num_residual_layers=1, - upsampling_ratios=None, - kernel_size=7, - last_kernel_size=3, - residual_kernel_size=3, - dilation_growth_rate=2, - use_causal_conv=True, - pad_mode="constant", - compress=2, - trim_right_ratio=1.0, - codebook_size=2048, - codebook_dim=256, - num_quantizers=32, - use_conv_shortcut=False, - vector_quantization_hidden_dimension=256, - num_semantic_quantizers=1, - upsample_groups=512, - num_hidden_layers=8, - intermediate_size=2048, - num_attention_heads=8, - num_key_value_heads=8, - head_dim=None, - hidden_act="gelu", - max_position_embeddings=8000, - initializer_range=0.02, - norm_eps=1e-5, - use_cache=False, - use_streaming=False, - rope_theta=10000.0, - sliding_window=250, - attention_dropout=0.0, - layer_scale_initial_scale=0.01, - attention_bias=False, + sampling_rate: Optional[int] = 24_000, + frame_rate: Optional[int] = None, + audio_channels: Optional[int] = 1, + hidden_size: Optional[int] = 512, + num_filters: Optional[int] = 64, + num_residual_layers: Optional[int] = 1, + upsampling_ratios: Optional[list[int]] = None, + kernel_size: Optional[int] = 7, + last_kernel_size: Optional[int] = 3, + residual_kernel_size: Optional[int] = 3, + dilation_growth_rate: Optional[int] = 2, + use_causal_conv: Optional[bool] = True, + pad_mode: Optional[str] = "constant", + compress: Optional[int] = 2, + trim_right_ratio: Optional[float] = 1.0, + codebook_size: Optional[int] = 2048, + codebook_dim: Optional[int] = 256, + num_quantizers: Optional[int] = 32, + use_conv_shortcut: Optional[bool] = False, + vector_quantization_hidden_dimension: Optional[int] = 256, + num_semantic_quantizers: Optional[int] = 1, + upsample_groups: Optional[int] = 512, + num_hidden_layers: Optional[int] = 8, + intermediate_size: Optional[int] = 2048, + num_attention_heads: Optional[int] = 8, + num_key_value_heads: Optional[int] = 8, + head_dim: Optional[int] = None, + hidden_act: Optional[str] = "gelu", + max_position_embeddings: Optional[int] = 8000, + initializer_range: Optional[float] = 0.02, + norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = False, + use_streaming: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + sliding_window: Optional[int] = 250, + attention_dropout: Optional[float] = 0.0, + layer_scale_initial_scale: Optional[float] = 0.01, + attention_bias: Optional[bool] = False, **kwargs, ): self.sampling_rate = sampling_rate @@ -212,12 +216,19 @@ def __init__( self.norm_eps = norm_eps self.use_cache = use_cache self.use_streaming = use_streaming - self.rope_theta = rope_theta self.sliding_window = sliding_window self.attention_dropout = attention_dropout self.head_dim = head_dim or hidden_size // num_attention_heads self.layer_scale_initial_scale = layer_scale_initial_scale self.attention_bias = attention_bias + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) # Handle backward compatibility for frame_rate: # If frame_rate is explicitly provided, use it 
(backward compatibility) diff --git a/src/transformers/models/mimi/modeling_mimi.py b/src/transformers/models/mimi/modeling_mimi.py index 686569d42a8d..d07d2941876b 100644 --- a/src/transformers/models/mimi/modeling_mimi.py +++ b/src/transformers/models/mimi/modeling_mimi.py @@ -15,6 +15,7 @@ """PyTorch Mimi model.""" import math +from collections.abc import Callable from dataclasses import dataclass from typing import Optional, Union @@ -500,26 +501,55 @@ def forward(self, x: torch.Tensor): return self.scale * x -# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Mimi +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mimi class MimiRotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` def __init__(self, config: MimiConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[MimiConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -625,7 +655,7 @@ def __init__(self, config: MimiConfig, layer_idx: Optional[int] = None): self.num_key_value_heads = config.num_key_value_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta + self.is_causal = True self.scaling = 1 / math.sqrt(config.head_dim) diff --git a/src/transformers/models/minimax/configuration_minimax.py b/src/transformers/models/minimax/configuration_minimax.py index d12264e2ae49..8c4737cc5b67 100644 --- a/src/transformers/models/minimax/configuration_minimax.py +++ b/src/transformers/models/minimax/configuration_minimax.py @@ -19,7 +19,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class MiniMaxConfig(PreTrainedConfig): @@ -75,8 +79,6 @@ class MiniMaxConfig(PreTrainedConfig): The id of the "end-of-sequence" token. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. sliding_window (`int`, *optional*): Sliding window attention window size. If not specified, will default to `4096`. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -93,6 +95,10 @@ The aux loss factor for the total loss. router_jitter_noise (`float`, *optional*, defaults to 0.0): Amount of noise to add to the router. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. layer_types (`list`, *optional*): Attention pattern for each layer.
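Because `MiniMaxConfig` now resolves both `rope_parameters` and the alternating attention pattern inside `__init__` (see the constructor changes that follow), a short sketch of the resulting attributes may help; the class name and defaults are taken from this diff, the rest is illustrative:

```python
from transformers import MiniMaxConfig

config = MiniMaxConfig(
    num_hidden_layers=4,
    rope_parameters={"rope_type": "default", "rope_theta": 1000000.0},
)

# With no explicit `layer_types`, layers alternate between full and linear attention:
# ['full_attention', 'linear_attention', 'full_attention', 'linear_attention']
print(config.layer_types)
print(config.rope_parameters["rope_theta"])  # 1000000.0
```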
block_size (`int`, *optional*, defaults to 256): @@ -147,48 +153,40 @@ class MiniMaxConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - head_dim=None, - hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=1e6, - sliding_window=None, - attention_dropout=0.0, - num_experts_per_tok=2, - num_local_experts=8, - output_router_logits=False, - router_aux_loss_coef=0.001, - router_jitter_noise=0.0, - layer_types=None, - block_size=256, - full_attn_alpha_factor=1, - full_attn_beta_factor=1, - linear_attn_alpha_factor=1, - linear_attn_beta_factor=1, - mlp_alpha_factor=1, - mlp_beta_factor=1, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 14336, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 8, + head_dim: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 4096 * 32, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + sliding_window: Optional[int] = None, + attention_dropout: Optional[float] = 0.0, + num_experts_per_tok: Optional[int] = 2, + num_local_experts: Optional[int] = 8, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, + router_jitter_noise: Optional[float] = 0.0, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + layer_types: Optional[list[str]] = None, + block_size: Optional[int] = 256, + full_attn_alpha_factor: Optional[int] = 1, + full_attn_beta_factor: Optional[int] = 1, + linear_attn_alpha_factor: Optional[int] = 1, + linear_attn_beta_factor: Optional[int] = 1, + mlp_alpha_factor: Optional[int] = 1, + mlp_beta_factor: Optional[int] = 1, **kwargs, ): - self.layer_types = layer_types - self.block_size = block_size - self.full_attn_alpha_factor = full_attn_alpha_factor - self.full_attn_beta_factor = full_attn_beta_factor - self.linear_attn_alpha_factor = linear_attn_alpha_factor - self.linear_attn_beta_factor = linear_attn_beta_factor - self.mlp_alpha_factor = mlp_alpha_factor - self.mlp_beta_factor = mlp_beta_factor self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -206,7 +204,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_dropout = attention_dropout self.head_dim = head_dim @@ -215,6 +212,29 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.router_jitter_noise = router_jitter_noise + + self.layer_types = layer_types + self.block_size = block_size + self.full_attn_alpha_factor = full_attn_alpha_factor + self.full_attn_beta_factor = full_attn_beta_factor + self.linear_attn_alpha_factor = linear_attn_alpha_factor + self.linear_attn_beta_factor = linear_attn_beta_factor + self.mlp_alpha_factor = mlp_alpha_factor + self.mlp_beta_factor = mlp_beta_factor + # 
Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + if self.layer_types is None: + self.layer_types = [ + "full_attention" if bool((i + 1) % 2) else "linear_attention" for i in range(self.num_hidden_layers) + ] + layer_type_validation(self.layer_types, self.num_hidden_layers) + + # Validate the correctness of rotary position embeddings parameters + rope_theta = getattr(self, "rope_theta", 1000000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -222,11 +242,6 @@ def __init__( tie_word_embeddings=tie_word_embeddings, **kwargs, ) - if self.layer_types is None: - self.layer_types = [ - "full_attention" if bool((i + 1) % 2) else "linear_attention" for i in range(self.num_hidden_layers) - ] - layer_type_validation(self.layer_types, self.num_hidden_layers) __all__ = ["MiniMaxConfig"] diff --git a/src/transformers/models/minimax/modeling_minimax.py b/src/transformers/models/minimax/modeling_minimax.py index d1df643bc2b2..84573a213a9f 100644 --- a/src/transformers/models/minimax/modeling_minimax.py +++ b/src/transformers/models/minimax/modeling_minimax.py @@ -253,6 +253,71 @@ def forward( return attn_output, attn_weights_inter +class MiniMaxRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: MiniMaxConfig, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[MiniMaxConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -478,7 +543,7 @@ def __init__(self, config: MiniMaxConfig, layer_idx: int): self.post_attention_layernorm = MiniMaxRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.layer_idx = layer_idx - self.layer_type = config.layer_types[layer_idx] + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.mlp_alpha_factor = config.mlp_alpha_factor self.mlp_beta_factor = config.mlp_beta_factor @@ -494,7 +559,7 @@ def __init__(self, config: MiniMaxConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -542,42 +607,6 @@ class MiniMaxPreTrainedModel(PreTrainedModel): } -class MiniMaxRotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: MiniMaxConfig, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - @auto_docstring class MiniMaxModel(MiniMaxPreTrainedModel): def __init__(self, config: MiniMaxConfig): @@ -640,8 +669,6 @@ def forward( ) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers: @@ -653,8 +680,8 @@ def forward( hidden_states = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=input_attention_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index 4afe2b57bf83..50a42c9d5cec 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -23,15 +23,16 @@ from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache -from ...configuration_utils import layer_type_validation +from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import MoeModelOutputWithPast +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging from ...utils.generic import OutputRecorder, check_model_inputs -from ..mixtral.configuration_mixtral import MixtralConfig +from ..gemma2.modeling_gemma2 import Gemma2RotaryEmbedding from ..mixtral.modeling_mixtral import ( MixtralAttention, MixtralDecoderLayer, @@ -49,7 +50,7 @@ logger = logging.get_logger(__name__) -class MiniMaxConfig(MixtralConfig): +class MiniMaxConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`MiniMaxModel`]. It is used to instantiate an MiniMax model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -102,8 +103,6 @@ class MiniMaxConfig(MixtralConfig): The id of the "end-of-sequence" token. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. sliding_window (`int`, *optional*): Sliding window attention window size. If not specified, will default to `4096`. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -120,6 +119,10 @@ class MiniMaxConfig(MixtralConfig): The aux loss factor for the total loss. 
router_jitter_noise (`float`, *optional*, defaults to 0.0): Amount of noise to add to the router. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. layer_types (`list`, *optional*): Attention pattern for each layer. block_size (`int`, *optional*, defaults to 256): @@ -151,18 +154,89 @@ class MiniMaxConfig(MixtralConfig): >>> configuration = model.config ```""" + model_type = "minimax" + keys_to_ignore_at_inference = ["past_key_values"] + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.block_sparse_moe.gate": "colwise_rep", # we need to replicate here to correctly route experts + "layers.*.block_sparse_moe.experts.*.w1": "colwise", + "layers.*.block_sparse_moe.experts.*.w2": "rowwise", + "layers.*.block_sparse_moe.experts.*.w3": "colwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + attribute_map = { + "num_experts": "num_local_experts", + } + def __init__( self, - layer_types=None, - block_size=256, - full_attn_alpha_factor=1, - full_attn_beta_factor=1, - linear_attn_alpha_factor=1, - linear_attn_beta_factor=1, - mlp_alpha_factor=1, - mlp_beta_factor=1, - **super_kwargs, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 14336, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 8, + head_dim: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 4096 * 32, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + sliding_window: Optional[int] = None, + attention_dropout: Optional[float] = 0.0, + num_experts_per_tok: Optional[int] = 2, + num_local_experts: Optional[int] = 8, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, + router_jitter_noise: Optional[float] = 0.0, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + layer_types: Optional[list[str]] = None, + block_size: Optional[int] = 256, + full_attn_alpha_factor: Optional[int] = 1, + full_attn_beta_factor: Optional[int] = 1, + linear_attn_alpha_factor: Optional[int] = 1, + linear_attn_beta_factor: Optional[int] = 1, + mlp_alpha_factor: Optional[int] = 1, + mlp_beta_factor: Optional[int] = 1, + **kwargs, ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = 
initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.attention_dropout = attention_dropout + self.head_dim = head_dim + + self.num_experts_per_tok = num_experts_per_tok + self.num_local_experts = num_local_experts + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + self.router_jitter_noise = router_jitter_noise + self.layer_types = layer_types self.block_size = block_size self.full_attn_alpha_factor = full_attn_alpha_factor @@ -171,14 +245,28 @@ def __init__( self.linear_attn_beta_factor = linear_attn_beta_factor self.mlp_alpha_factor = mlp_alpha_factor self.mlp_beta_factor = mlp_beta_factor + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters - super().__init__(**super_kwargs) if self.layer_types is None: self.layer_types = [ "full_attention" if bool((i + 1) % 2) else "linear_attention" for i in range(self.num_hidden_layers) ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = getattr(self, "rope_theta", 1000000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + class MiniMaxRMSNorm(MixtralRMSNorm): pass @@ -368,6 +456,10 @@ def forward( return attn_output, attn_weights_inter +class MiniMaxRotaryEmbedding(Gemma2RotaryEmbedding): + pass + + class MiniMaxAttention(MixtralAttention): pass @@ -381,7 +473,7 @@ def __init__(self, config: MiniMaxConfig, layer_idx: int): super().__init__(config, layer_idx) self.layer_idx = layer_idx - self.layer_type = config.layer_types[layer_idx] + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.mlp_alpha_factor = config.mlp_alpha_factor self.mlp_beta_factor = config.mlp_beta_factor @@ -397,7 +489,7 @@ def __init__(self, config: MiniMaxConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -480,8 +572,6 @@ def forward( ) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers: @@ -493,8 +583,8 @@ def forward( hidden_states = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=input_attention_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, diff --git a/src/transformers/models/ministral/configuration_ministral.py b/src/transformers/models/ministral/configuration_ministral.py index 0133b129d9be..3f286cd69a9f 100644 --- a/src/transformers/models/ministral/configuration_ministral.py +++ b/src/transformers/models/ministral/configuration_ministral.py @@ -4,7 +4,10 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_ministral.py file directly. One of our CI enforces this. 
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class MinistralConfig(PreTrainedConfig): @@ -61,8 +64,10 @@ class MinistralConfig(PreTrainedConfig): The id of the "end-of-sequence" token. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. sliding_window (`int`, *optional*, defaults to 4096): Sliding window attention window size. If not specified, will default to `4096`. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -103,26 +108,26 @@ class MinistralConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - head_dim=None, - hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - sliding_window=4096, - attention_dropout=0.0, - layer_types=None, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 14336, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 8, + head_dim: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 4096 * 32, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters] = None, + sliding_window: Optional[int] = 4096, + attention_dropout: Optional[float] = 0.0, + layer_types: Optional[list[str]] = None, **kwargs, ): super().__init__( @@ -150,14 +155,21 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_dropout = attention_dropout self.layer_types = layer_types + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters if self.layer_types is None: self.layer_types = [ "sliding_attention" if self.sliding_window is not None else "full_attention" ] * num_hidden_layers + # Validate the correctness of rotary position embeddings parameters + rope_theta = getattr(self, "rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + __all__ = ["MinistralConfig"] diff --git a/src/transformers/models/ministral/modeling_ministral.py b/src/transformers/models/ministral/modeling_ministral.py index 334e9661135c..239d2fc2047b 100644 --- a/src/transformers/models/ministral/modeling_ministral.py +++ 
b/src/transformers/models/ministral/modeling_ministral.py @@ -124,6 +124,7 @@ class MinistralAttention(nn.Module): def __init__(self, config, layer_idx: int): super().__init__() + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) @@ -136,7 +137,7 @@ def __init__(self, config, layer_idx: int): self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False) self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=False) self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False) - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None def forward( self, @@ -224,7 +225,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -274,20 +275,49 @@ class MinistralRotaryEmbedding(nn.Module): def __init__(self, config: MinistralConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[MinistralConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -372,8 +402,6 @@ def forward( } hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: diff --git a/src/transformers/models/ministral/modular_ministral.py b/src/transformers/models/ministral/modular_ministral.py index 13130dae29b2..f79600e82974 100644 --- a/src/transformers/models/ministral/modular_ministral.py +++ b/src/transformers/models/ministral/modular_ministral.py @@ -7,6 +7,7 @@ from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring from ...utils.generic import check_model_inputs @@ -80,8 +81,10 @@ class MinistralConfig(MistralConfig, PreTrainedConfig): The id of the "end-of-sequence" token. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. sliding_window (`int`, *optional*, defaults to 4096): Sliding window attention window size. If not specified, will default to `4096`.
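For Ministral the same pattern applies; the sketch below assumes the defaults from the `MinistralConfig` signature in this diff and shows a linear-scaled RoPE together with the sliding-window default for `layer_types`:

```python
from transformers import MinistralConfig

config = MinistralConfig(
    sliding_window=4096,
    rope_parameters={"rope_type": "linear", "rope_theta": 10000.0, "factor": 2.0},
)

# When `layer_types` is not given and a sliding window is set,
# every layer defaults to "sliding_attention".
print(set(config.layer_types))           # {"sliding_attention"}
print(config.rope_parameters["factor"])  # 2.0
```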
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -106,26 +109,26 @@ class MinistralConfig(MistralConfig, PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - head_dim=None, - hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - sliding_window=4096, - attention_dropout=0.0, - layer_types=None, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 14336, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 8, + head_dim: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 4096 * 32, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters] = None, + sliding_window: Optional[int] = 4096, + attention_dropout: Optional[float] = 0.0, + layer_types: Optional[list[str]] = None, **kwargs, ): PreTrainedConfig.__init__( @@ -154,15 +157,22 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_dropout = attention_dropout self.layer_types = layer_types + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters if self.layer_types is None: self.layer_types = [ "sliding_attention" if self.sliding_window is not None else "full_attention" ] * num_hidden_layers + # Validate the correctness of rotary position embeddings parameters + rope_theta = getattr(self, "rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + class MinistralMLP(Qwen2MLP): pass @@ -247,8 +257,6 @@ def forward( } hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: diff --git a/src/transformers/models/mistral/configuration_mistral.py b/src/transformers/models/mistral/configuration_mistral.py index bbc0c68b0e4e..e17bd2a65423 100644 --- a/src/transformers/models/mistral/configuration_mistral.py +++ b/src/transformers/models/mistral/configuration_mistral.py @@ -14,7 +14,10 @@ # limitations under the License. """Mistral model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -75,8 +78,10 @@ class MistralConfig(PreTrainedConfig): The id of the "end-of-sequence" token. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. 
+ rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. sliding_window (`int`, *optional*, defaults to 4096): Sliding window attention window size. If not specified, will default to `4096`. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -115,25 +120,25 @@ class MistralConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - head_dim=None, - hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - sliding_window=4096, - attention_dropout=0.0, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 14336, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 8, + head_dim: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 4096 * 32, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + sliding_window: Optional[int] = 4096, + attention_dropout: Optional[float] = 0.0, **kwargs, ): self.vocab_size = vocab_size @@ -154,7 +159,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_dropout = attention_dropout if "layer_types" in kwargs: @@ -162,6 +166,15 @@ def __init__( "Detected Mistral model with layer_types. Consider using AutoModel or Ministral classes instead to enable alternating attention compatibility." 
) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py index 279403418bed..ab3cae55bb6e 100644 --- a/src/transformers/models/mistral/modeling_mistral.py +++ b/src/transformers/models/mistral/modeling_mistral.py @@ -220,7 +220,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -270,20 +270,49 @@ class MistralRotaryEmbedding(nn.Module): def __init__(self, config: MistralConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[MistralConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -361,7 +390,7 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( diff --git a/src/transformers/models/mistral/modular_mistral.py b/src/transformers/models/mistral/modular_mistral.py index c2686afa013f..709ff855c399 100644 --- a/src/transformers/models/mistral/modular_mistral.py +++ b/src/transformers/models/mistral/modular_mistral.py @@ -153,7 +153,7 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index 06cc29fd92a2..9a8e2280c252 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -14,7 +14,10 @@ # limitations under the License. """Mixtral model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -75,8 +78,6 @@ class MixtralConfig(PreTrainedConfig): The id of the "end-of-sequence" token. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. sliding_window (`int`, *optional*): Sliding window attention window size. If not specified, will default to `4096`. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -93,6 +94,10 @@ class MixtralConfig(PreTrainedConfig): The aux loss factor for the total loss. router_jitter_noise (`float`, *optional*, defaults to 0.0): Amount of noise to add to the router. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`.
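Before the Mixtral usage example below, a hedged sketch of the backward-compatibility path added to `MistralConfig.__init__` above: a legacy `rope_scaling` kwarg is popped and stored as `rope_parameters`, and a plain `rope_theta` kwarg seeds `standardize_rope_params`. The exact key set of the resulting dict is an assumption about what the standardization produces, not a documented contract.

```python
from transformers import MistralConfig

# Legacy kwargs keep working: `rope_scaling` (if given) becomes `rope_parameters`,
# and `rope_theta` is folded in by `standardize_rope_params` during validation.
config = MistralConfig(rope_theta=10000.0, rope_scaling={"rope_type": "linear", "factor": 2.0})

# Assumed shape after standardization, e.g. {"rope_type": "linear", "factor": 2.0, "rope_theta": 10000.0}
print(config.rope_parameters)
```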
```python >>> from transformers import MixtralModel, MixtralConfig @@ -130,30 +135,30 @@ class MixtralConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=14336, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - head_dim=None, - hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=1e6, - sliding_window=None, - attention_dropout=0.0, - num_experts_per_tok=2, - num_local_experts=8, - output_router_logits=False, - router_aux_loss_coef=0.001, - router_jitter_noise=0.0, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 14336, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 8, + head_dim: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 4096 * 32, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = False, + sliding_window: Optional[int] = None, + attention_dropout: Optional[float] = 0.0, + num_experts_per_tok: Optional[int] = 2, + num_local_experts: Optional[int] = 8, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, + router_jitter_noise: Optional[float] = 0.0, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, **kwargs, ): self.vocab_size = vocab_size @@ -173,7 +178,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_dropout = attention_dropout self.head_dim = head_dim @@ -182,6 +186,15 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.router_jitter_noise = router_jitter_noise + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 1000000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index b63889d09bc8..a8fa4ed5619d 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -153,6 +153,71 @@ def extra_repr(self): return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" +class MixtralRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: MixtralConfig, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + 
rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[MixtralConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -301,7 +366,7 @@ def __init__(self, config: MixtralConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -327,42 +392,6 @@ def forward( return hidden_states -class MixtralRotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: MixtralConfig, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - 
@torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - @auto_docstring class MixtralPreTrainedModel(PreTrainedModel): config: MixtralConfig @@ -441,19 +470,17 @@ def forward( ) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=causal_mask, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/mixtral/modular_mixtral.py b/src/transformers/models/mixtral/modular_mixtral.py index 7c394c744e64..58a9100d388e 100644 --- a/src/transformers/models/mixtral/modular_mixtral.py +++ b/src/transformers/models/mixtral/modular_mixtral.py @@ -214,6 +214,10 @@ class MixtralRMSNorm(MistralRMSNorm): pass +class MixtralRotaryEmbedding(MistralRotaryEmbedding): + pass + + class MixtralAttention(MistralAttention): pass @@ -232,7 +236,7 @@ def __init__(self, config: MixtralConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -258,10 +262,6 @@ def forward( return hidden_states -class MixtralRotaryEmbedding(MistralRotaryEmbedding): - pass - - class MixtralPreTrainedModel(MistralPreTrainedModel): _can_compile_fullgraph = False # MoE models don't work with torch.compile (`torch.where(condition)` not supported) _can_record_outputs = { @@ -311,19 +311,17 @@ def forward( ) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=causal_mask, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py index 498cfa930ec3..85be3701a84a 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ 
b/src/transformers/models/mllama/configuration_mllama.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import rope_config_validation, standardize_rope_params from ...utils import logging @@ -166,45 +166,10 @@ class MllamaTextConfig(PreTrainedConfig): specified, will default to `num_attention_heads`. intermediate_size (`int`, *optional*, defaults to 14336): Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. - rope_theta (`float`, *optional*, defaults to `500000.0`): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. rms_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the rms normalization layers.
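As a concrete illustration of the new field (see also the conversion script further below for the values actually used there), a `llama3`-type `rope_parameters` dict reuses the scaling keys that the removed `rope_scaling` docstring listed. The values here are placeholders for illustration only, not those of a released Mllama checkpoint.

```python
from transformers import MllamaTextConfig

# Illustrative `llama3`-type RoPE parameters; the keys mirror the removed `rope_scaling`
# documentation above, the values are placeholders rather than real checkpoint values.
rope_parameters = {
    "rope_type": "llama3",
    "rope_theta": 500000.0,
    "factor": 8.0,
    "low_freq_factor": 1.0,
    "high_freq_factor": 4.0,                   # placeholder
    "original_max_position_embeddings": 8192,  # placeholder
}
config = MllamaTextConfig(rope_parameters=rope_parameters)
```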
max_position_embeddings (`int`, *optional*, defaults to 131072): @@ -253,8 +218,7 @@ def __init__( num_attention_heads: int = 32, num_key_value_heads: int = 8, intermediate_size: int = 14_336, - rope_theta: float = 500_000, - rope_scaling: Optional[dict] = None, + rope_parameters: Optional[dict] = None, rms_norm_eps: float = 1e-5, max_position_embeddings: int = 131_072, initializer_range: float = 0.02, @@ -278,13 +242,18 @@ def __init__( self.num_key_value_heads = num_key_value_heads self.initializer_range = initializer_range self.use_cache = use_cache - self.rope_theta = rope_theta self.rms_norm_eps = rms_norm_eps self.intermediate_size = intermediate_size self.dropout = dropout self.hidden_act = hidden_act - self.rope_scaling = rope_scaling self.max_position_embeddings = max_position_embeddings + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 500000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/mllama/convert_mllama_weights_to_hf.py b/src/transformers/models/mllama/convert_mllama_weights_to_hf.py index c773d0514f81..13729fc3bbf4 100644 --- a/src/transformers/models/mllama/convert_mllama_weights_to_hf.py +++ b/src/transformers/models/mllama/convert_mllama_weights_to_hf.py @@ -235,7 +235,7 @@ def write_model( cross_attention_num_layers = params["vision_num_cross_attention_layers"] # some constants from original code - rope_scaling = { + rope_parameters = { "rope_type": "llama3", "factor": 8.0, "low_freq_factor": 1.0, @@ -280,7 +280,7 @@ def write_model( cross_attention_layers=cross_attention_layers_shift, intermediate_size=text_intermediate_size, max_position_embeddings=max_position_embeddings, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, diff --git a/src/transformers/models/mllama/modeling_mllama.py b/src/transformers/models/mllama/modeling_mllama.py index 7f3bf4aa592a..7f4b0f6ebb2c 100644 --- a/src/transformers/models/mllama/modeling_mllama.py +++ b/src/transformers/models/mllama/modeling_mllama.py @@ -29,7 +29,10 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, CausalLMOutputWithPast -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_rope_utils import ( + ROPE_INIT_FUNCTIONS, + dynamic_rope_update, +) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging @@ -517,7 +520,7 @@ def __init__(self, config: MllamaTextConfig, layer_idx: int): self.head_dim = config.hidden_size // self.num_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.scaling = self.head_dim**-0.5 - self.rope_theta = config.rope_theta + self.layer_idx = layer_idx self.is_causal = True @@ -534,6 +537,7 @@ def forward( use_cache: bool = False, past_key_values=None, cache_position=None, + position_ids=None, **kwargs, ): bsz, q_len, _ = hidden_states.size() @@ -619,7 +623,7 @@ def forward( 
past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -720,21 +724,57 @@ def forward( return hidden_states +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with LlamaConfig->MllamaTextConfig,Llama->Mllama class MllamaRotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` def __init__(self, config: MllamaTextConfig, device=None): super().__init__() - self.rope_type = config.rope_scaling["rope_type"] self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[MllamaTextConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + # Ignore copy @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) def forward(self, x, position_ids): @@ -1155,6 +1195,7 @@ def __init__(self, config: MllamaTextConfig): self.layers = nn.ModuleList(layers) self.norm = MllamaTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.rotary_emb = MllamaRotaryEmbedding(config=config) + self.gradient_checkpointing = False self.post_init() @@ -1236,9 +1277,7 @@ def forward( position_ids = cache_position.unsqueeze(0) causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, past_key_values) - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) # decoder layers for idx, decoder_layer in enumerate(self.layers): diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py index deddab1d7f5b..6d378425284d 100644 --- a/src/transformers/models/modernbert/configuration_modernbert.py +++ b/src/transformers/models/modernbert/configuration_modernbert.py @@ -19,9 +19,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Literal +from typing import Literal, Optional -from ...configuration_utils import PreTrainedConfig +from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class ModernBertConfig(PreTrainedConfig): @@ -69,18 +70,18 @@ class ModernBertConfig(PreTrainedConfig): Classification token id. sep_token_id (`int`, *optional*, defaults to 50282): Separation token id. - global_rope_theta (`float`, *optional*, defaults to 160000.0): - The base period of the global RoPE embeddings. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - global_attn_every_n_layers (`int`, *optional*, defaults to 3): - The number of layers between global attention layers. + layer_types (`list`, *optional*): + Attention pattern for each layer. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. local_attention (`int`, *optional*, defaults to 128): The window size for local attention. - local_rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the local RoPE embeddings. embedding_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the embeddings.
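The `compute_default_rope_parameters` helpers added above (Mistral, Mixtral, Mllama) all reduce to the same inverse-frequency formula, `inv_freq_i = 1 / base**(2i / dim)`. A standalone reproduction with a small illustrative head dimension, not a call into the library:

```python
import torch

# inv_freq_i = 1 / base**(2i / dim), exactly as in the helpers above.
base, dim = 10000.0, 8  # illustrative; `dim` is normally the attention head size
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
print(inv_freq)  # tensor([1.0000e+00, 1.0000e-01, 1.0000e-02, 1.0000e-03])
```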
mlp_bias (`bool`, *optional*, defaults to `False`): @@ -134,41 +135,40 @@ class ModernBertConfig(PreTrainedConfig): def __init__( self, - vocab_size=50368, - hidden_size=768, - intermediate_size=1152, - num_hidden_layers=22, - num_attention_heads=12, - hidden_activation="gelu", - max_position_embeddings=8192, - initializer_range=0.02, - initializer_cutoff_factor=2.0, - norm_eps=1e-5, - norm_bias=False, - pad_token_id=50283, - eos_token_id=50282, - bos_token_id=50281, - cls_token_id=50281, - sep_token_id=50282, - global_rope_theta=160000.0, - attention_bias=False, - attention_dropout=0.0, - global_attn_every_n_layers=3, - local_attention=128, - local_rope_theta=10000.0, - embedding_dropout=0.0, - mlp_bias=False, - mlp_dropout=0.0, - decoder_bias=True, + vocab_size: Optional[int] = 50368, + hidden_size: Optional[int] = 768, + intermediate_size: Optional[int] = 1152, + num_hidden_layers: Optional[int] = 22, + num_attention_heads: Optional[int] = 12, + hidden_activation: Optional[str] = "gelu", + max_position_embeddings: Optional[int] = 8192, + initializer_range: Optional[float] = 0.02, + initializer_cutoff_factor: Optional[float] = 2.0, + norm_eps: Optional[int] = 1e-5, + norm_bias: Optional[bool] = False, + pad_token_id: Optional[int] = 50283, + eos_token_id: Optional[int] = 50282, + bos_token_id: Optional[int] = 50281, + cls_token_id: Optional[int] = 50281, + sep_token_id: Optional[int] = 50282, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + layer_types: Optional[list[str]] = None, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + local_attention: Optional[int] = 128, + embedding_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + mlp_dropout: Optional[float] = 0.0, + decoder_bias: Optional[bool] = True, classifier_pooling: Literal["cls", "mean"] = "cls", - classifier_dropout=0.0, - classifier_bias=False, - classifier_activation="gelu", - deterministic_flash_attn=False, - sparse_prediction=False, - sparse_pred_ignore_index=-100, - reference_compile=None, - repad_logits_with_grad=False, + classifier_dropout: Optional[float] = 0.0, + classifier_bias: Optional[bool] = False, + classifier_activation: Optional[str] = "gelu", + deterministic_flash_attn: Optional[bool] = False, + sparse_prediction: Optional[bool] = False, + sparse_pred_ignore_index: Optional[int] = -100, + reference_compile: Optional[bool] = None, + repad_logits_with_grad: Optional[bool] = False, **kwargs, ): super().__init__( @@ -189,13 +189,10 @@ def __init__( self.initializer_cutoff_factor = initializer_cutoff_factor self.norm_eps = norm_eps self.norm_bias = norm_bias - self.global_rope_theta = global_rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.hidden_activation = hidden_activation - self.global_attn_every_n_layers = global_attn_every_n_layers self.local_attention = local_attention - self.local_rope_theta = local_rope_theta self.embedding_dropout = embedding_dropout self.mlp_bias = mlp_bias self.mlp_dropout = mlp_dropout @@ -209,12 +206,35 @@ def __init__( self.sparse_pred_ignore_index = sparse_pred_ignore_index self.reference_compile = reference_compile self.repad_logits_with_grad = repad_logits_with_grad + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters if self.classifier_pooling not in ["cls", "mean"]: raise ValueError( f'Invalid value for 
`classifier_pooling`, should be either "cls" or "mean", but is {self.classifier_pooling}.' ) + self.layer_types = layer_types + + # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub + self.global_attn_every_n_layers = kwargs.get("global_attn_every_n_layers", 3) + + if self.layer_types is None: + self.layer_types = [ + "sliding_attention" if bool(i % self.global_attn_every_n_layers) else "full_attention" + for i in range(self.num_hidden_layers) + ] + layer_type_validation(self.layer_types, self.num_hidden_layers) + + # Validate the correctness of rotary position embeddings parameters + rope_theta = getattr(self, "global_rope_theta", 160_000.0) + rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) + standardize_rope_params( + self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} + ) + rope_config_validation(self) + def to_dict(self): output = super().to_dict() output.pop("reference_compile", None) diff --git a/src/transformers/models/modernbert/modeling_modernbert.py b/src/transformers/models/modernbert/modeling_modernbert.py index dbe835ea9336..c363eaefcf3c 100644 --- a/src/transformers/models/modernbert/modeling_modernbert.py +++ b/src/transformers/models/modernbert/modeling_modernbert.py @@ -19,8 +19,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy import math +from collections.abc import Callable from contextlib import nullcontext from typing import Optional, Union @@ -247,33 +247,78 @@ class ModernBertRotaryEmbedding(nn.Module): def __init__(self, config: ModernBertConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.layer_types = list(set(config.layer_types)) + self.rope_type = {} + for layer_type in self.layer_types: + rope_params = self.config.rope_parameters[layer_type] + if rope_params is None: + continue + + self.rope_type[layer_type] = rope_params["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type[layer_type] != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type[layer_type]] + curr_inv_freq, curr_attention_scaling = rope_init_fn(self.config, device, layer_type=layer_type) + self.register_buffer(f"{layer_type}_inv_freq", curr_inv_freq, persistent=False) + setattr(self, f"{layer_type}_original_inv_freq", curr_inv_freq) + setattr(self, f"{layer_type}_attention_scaling", curr_attention_scaling) + + @staticmethod + def compute_default_rope_parameters( + config: Optional[ModernBertConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + layer_type: Optional[str] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. 
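A standalone sketch of the backward-compatibility defaults in the ModernBert config `__init__` above: the attention pattern derived from `global_attn_every_n_layers`, and the two legacy base frequencies mapped onto a per-layer-type `rope_theta` for `standardize_rope_params`. The layer count is illustrative; the thetas are the documented defaults.

```python
# Reproduces the `layer_types` fallback from the ModernBert config above (illustrative depth).
global_attn_every_n_layers = 3
num_hidden_layers = 6
layer_types = [
    "sliding_attention" if bool(i % global_attn_every_n_layers) else "full_attention"
    for i in range(num_hidden_layers)
]
print(layer_types)
# ['full_attention', 'sliding_attention', 'sliding_attention',
#  'full_attention', 'sliding_attention', 'sliding_attention']

# The legacy global/local thetas are then passed per layer type:
rope_theta = {"full_attention": 160_000.0, "sliding_attention": 10_000.0}
```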
+ device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + layer_type (`str`, *optional*): + The current layer type if the model has different RoPE parameters per type. + Should not be used unless `config.layer_types is not None` + + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + # For backward compatibility standardize the `rope_parameters_dict` if it uses old format + base = config.rope_parameters[layer_type]["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + def forward(self, x, position_ids, layer_type=None): + inv_freq = getattr(self, f"{layer_type}_inv_freq") + attention_scaling = getattr(self, f"{layer_type}_attention_scaling") + + inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) position_ids_expanded = position_ids[:, None, :].float() device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with torch.autocast(device_type=device_type, enabled=False): # Force float32 freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling + cos = emb.cos() * attention_scaling + sin = emb.sin() * attention_scaling return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) @@ -321,11 +366,12 @@ def eager_attention_forward( local_attention: tuple[int, int], bs: int, dim: int, + position_embeddings: torch.Tensor, output_attentions: Optional[bool] = False, **_kwargs, ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: # qkv: [batch_size, seqlen, 3, nheads, headdim] - cos, sin = module.rotary_emb(qkv, position_ids=position_ids) + cos, sin = position_embeddings query, key, value = qkv.transpose(3, 1).unbind(dim=2) # query, key, value: [batch_size, heads, seq_len, head_dim] query, key = apply_rotary_pos_emb(query, key, cos, sin) @@ -401,10 +447,11 @@ def sdpa_attention_forward( local_attention: tuple[int, int], bs: int, dim: int, + position_embeddings: torch.Tensor, **_kwargs, ) -> tuple[torch.Tensor]: # qkv: [batch_size, seqlen, 3, nheads, headdim] - cos, sin = module.rotary_emb(qkv, position_ids=position_ids) + cos, sin = position_embeddings query, key, value = qkv.transpose(3, 1).unbind(dim=2) # query, key, value: [batch_size, heads, seq_len, head_dim] query, key = apply_rotary_pos_emb(query, key, cos, sin) @@ -460,24 +507,25 @@ def __init__(self, config: ModernBertConfig, layer_id: Optional[int] = None): self.head_dim = config.hidden_size // config.num_attention_heads self.all_head_size = self.head_dim * self.num_heads self.Wqkv = nn.Linear(config.hidden_size, 3 * self.all_head_size, 
bias=config.attention_bias) + layer_type = config.layer_types[layer_id] if layer_id % config.global_attn_every_n_layers != 0: self.local_attention = (config.local_attention // 2, config.local_attention // 2) - rope_theta = config.local_rope_theta if config.local_rope_theta is not None else config.global_rope_theta max_position_embeddings = config.local_attention else: self.local_attention = (-1, -1) max_position_embeddings = config.max_position_embeddings - rope_theta = config.global_rope_theta if config._attn_implementation == "flash_attention_2": + rope_parameters_dict = ( + self.config.rope_parameters[layer_type] if layer_type is not None else self.config.rope_parameters + ) + rope_theta = rope_parameters_dict["rope_theta"] self.rotary_emb = ModernBertUnpaddedRotaryEmbedding( dim=self.head_dim, max_seqlen=max_position_embeddings, base=rope_theta ) else: - config_copy = copy.deepcopy(config) - config_copy.rope_theta = rope_theta - self.rotary_emb = ModernBertRotaryEmbedding(config=config_copy) + self.rotary_emb = None self.Wo = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias) self.out_drop = nn.Dropout(config.attention_dropout) if config.attention_dropout > 0.0 else nn.Identity() @@ -485,6 +533,7 @@ def __init__(self, config: ModernBertConfig, layer_id: Optional[int] = None): def forward( self, hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> torch.Tensor: @@ -503,6 +552,7 @@ def forward( local_attention=self.local_attention, bs=bs, dim=self.all_head_size, + position_embeddings=position_embeddings, output_attentions=output_attentions, **kwargs, ) @@ -523,6 +573,7 @@ def __init__(self, config: ModernBertConfig, layer_id: Optional[int] = None): self.attn = ModernBertAttention(config=config, layer_id=layer_id) self.mlp_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias) self.mlp = ModernBertMLP(config) + self.attention_type = config.layer_types[layer_id] @torch.compile(dynamic=True) def compiled_mlp(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -536,6 +587,7 @@ def forward( position_ids: Optional[torch.LongTensor] = None, cu_seqlens: Optional[torch.Tensor] = None, max_seqlen: Optional[int] = None, + position_embeddings: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, ) -> torch.Tensor: attn_outputs = self.attn( @@ -545,6 +597,7 @@ def forward( position_ids=position_ids, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, + position_embeddings=position_embeddings, output_attentions=output_attentions, ) hidden_states = hidden_states + attn_outputs[0] @@ -769,6 +822,7 @@ def __init__(self, config: ModernBertConfig): [ModernBertEncoderLayer(config, layer_id) for layer_id in range(config.num_hidden_layers)] ) self.final_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias) + self.rotary_emb = ModernBertRotaryEmbedding(config=config) self.gradient_checkpointing = False self.post_init() @@ -860,6 +914,9 @@ def forward( ) hidden_states = self.embeddings(input_ids=input_ids, inputs_embeds=inputs_embeds) + position_embeddings = {} + for layer_type in self.config.layer_types: + position_embeddings[layer_type] = self.rotary_emb(hidden_states, position_ids, layer_type) for encoder_layer in self.layers: if output_hidden_states: @@ -872,6 +929,7 @@ def forward( position_ids=position_ids, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, + 
position_embeddings=position_embeddings[encoder_layer.attention_type], output_attentions=output_attentions, ) hidden_states = layer_outputs[0] diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index d93282bacc10..9e535d345f2f 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy import math from contextlib import nullcontext from typing import Literal, Optional, Union @@ -24,7 +23,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...configuration_utils import PreTrainedConfig +from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...modeling_attn_mask_utils import _prepare_4d_attention_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import ( @@ -35,10 +34,11 @@ SequenceClassifierOutput, TokenClassifierOutput, ) +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, is_flash_attn_2_available, logging from ...utils.import_utils import is_triton_available -from ..gemma.modeling_gemma import GemmaRotaryEmbedding, apply_rotary_pos_emb +from ..gemma3.modeling_gemma3 import Gemma3RotaryEmbedding, apply_rotary_pos_emb if is_flash_attn_2_available(): @@ -97,18 +97,18 @@ class ModernBertConfig(PreTrainedConfig): Classification token id. sep_token_id (`int`, *optional*, defaults to 50282): Separation token id. - global_rope_theta (`float`, *optional*, defaults to 160000.0): - The base period of the global RoPE embeddings. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - global_attn_every_n_layers (`int`, *optional*, defaults to 3): - The number of layers between global attention layers. + layer_types (`list`, *optional*): + Attention pattern for each layer. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. local_attention (`int`, *optional*, defaults to 128): The window size for local attention. - local_rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the local RoPE embeddings. embedding_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the embeddings.
mlp_bias (`bool`, *optional*, defaults to `False`): @@ -162,41 +162,40 @@ class ModernBertConfig(PreTrainedConfig): def __init__( self, - vocab_size=50368, - hidden_size=768, - intermediate_size=1152, - num_hidden_layers=22, - num_attention_heads=12, - hidden_activation="gelu", - max_position_embeddings=8192, - initializer_range=0.02, - initializer_cutoff_factor=2.0, - norm_eps=1e-5, - norm_bias=False, - pad_token_id=50283, - eos_token_id=50282, - bos_token_id=50281, - cls_token_id=50281, - sep_token_id=50282, - global_rope_theta=160000.0, - attention_bias=False, - attention_dropout=0.0, - global_attn_every_n_layers=3, - local_attention=128, - local_rope_theta=10000.0, - embedding_dropout=0.0, - mlp_bias=False, - mlp_dropout=0.0, - decoder_bias=True, + vocab_size: Optional[int] = 50368, + hidden_size: Optional[int] = 768, + intermediate_size: Optional[int] = 1152, + num_hidden_layers: Optional[int] = 22, + num_attention_heads: Optional[int] = 12, + hidden_activation: Optional[str] = "gelu", + max_position_embeddings: Optional[int] = 8192, + initializer_range: Optional[float] = 0.02, + initializer_cutoff_factor: Optional[float] = 2.0, + norm_eps: Optional[int] = 1e-5, + norm_bias: Optional[bool] = False, + pad_token_id: Optional[int] = 50283, + eos_token_id: Optional[int] = 50282, + bos_token_id: Optional[int] = 50281, + cls_token_id: Optional[int] = 50281, + sep_token_id: Optional[int] = 50282, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + layer_types: Optional[list[str]] = None, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + local_attention: Optional[int] = 128, + embedding_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + mlp_dropout: Optional[float] = 0.0, + decoder_bias: Optional[bool] = True, classifier_pooling: Literal["cls", "mean"] = "cls", - classifier_dropout=0.0, - classifier_bias=False, - classifier_activation="gelu", - deterministic_flash_attn=False, - sparse_prediction=False, - sparse_pred_ignore_index=-100, - reference_compile=None, - repad_logits_with_grad=False, + classifier_dropout: Optional[float] = 0.0, + classifier_bias: Optional[bool] = False, + classifier_activation: Optional[str] = "gelu", + deterministic_flash_attn: Optional[bool] = False, + sparse_prediction: Optional[bool] = False, + sparse_pred_ignore_index: Optional[int] = -100, + reference_compile: Optional[bool] = None, + repad_logits_with_grad: Optional[bool] = False, **kwargs, ): super().__init__( @@ -217,13 +216,10 @@ def __init__( self.initializer_cutoff_factor = initializer_cutoff_factor self.norm_eps = norm_eps self.norm_bias = norm_bias - self.global_rope_theta = global_rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.hidden_activation = hidden_activation - self.global_attn_every_n_layers = global_attn_every_n_layers self.local_attention = local_attention - self.local_rope_theta = local_rope_theta self.embedding_dropout = embedding_dropout self.mlp_bias = mlp_bias self.mlp_dropout = mlp_dropout @@ -237,12 +233,35 @@ def __init__( self.sparse_pred_ignore_index = sparse_pred_ignore_index self.reference_compile = reference_compile self.repad_logits_with_grad = repad_logits_with_grad + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters if self.classifier_pooling not in ["cls", "mean"]: raise ValueError( f'Invalid value for 
`classifier_pooling`, should be either "cls" or "mean", but is {self.classifier_pooling}.' ) + self.layer_types = layer_types + + # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub + self.global_attn_every_n_layers = kwargs.get("global_attn_every_n_layers", 3) + + if self.layer_types is None: + self.layer_types = [ + "sliding_attention" if bool(i % self.global_attn_every_n_layers) else "full_attention" + for i in range(self.num_hidden_layers) + ] + layer_type_validation(self.layer_types, self.num_hidden_layers) + + # Validate the correctness of rotary position embeddings parameters + rope_theta = getattr(self, "global_rope_theta", 160_000.0) + rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) + standardize_rope_params( + self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} + ) + rope_config_validation(self) + def to_dict(self): output = super().to_dict() output.pop("reference_compile", None) @@ -505,8 +524,18 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return self.Wo(self.drop(self.act(input) * gate)) -class ModernBertRotaryEmbedding(GemmaRotaryEmbedding): - pass +class ModernBertRotaryEmbedding(Gemma3RotaryEmbedding): + def __init__(self, config: ModernBertConfig, device=None): + super().__init__(config, device) + + @staticmethod + def compute_default_rope_parameters( + config: Optional[ModernBertConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + layer_type: Optional[str] = None, + ) -> tuple["torch.Tensor", float]: + return super().compute_default_rope_parameters(config, device, seq_len, layer_type) def eager_attention_forward( @@ -518,11 +547,12 @@ def eager_attention_forward( local_attention: tuple[int, int], bs: int, dim: int, + position_embeddings: torch.Tensor, output_attentions: Optional[bool] = False, **_kwargs, ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]: # qkv: [batch_size, seqlen, 3, nheads, headdim] - cos, sin = module.rotary_emb(qkv, position_ids=position_ids) + cos, sin = position_embeddings query, key, value = qkv.transpose(3, 1).unbind(dim=2) # query, key, value: [batch_size, heads, seq_len, head_dim] query, key = apply_rotary_pos_emb(query, key, cos, sin) @@ -598,10 +628,11 @@ def sdpa_attention_forward( local_attention: tuple[int, int], bs: int, dim: int, + position_embeddings: torch.Tensor, **_kwargs, ) -> tuple[torch.Tensor]: # qkv: [batch_size, seqlen, 3, nheads, headdim] - cos, sin = module.rotary_emb(qkv, position_ids=position_ids) + cos, sin = position_embeddings query, key, value = qkv.transpose(3, 1).unbind(dim=2) # query, key, value: [batch_size, heads, seq_len, head_dim] query, key = apply_rotary_pos_emb(query, key, cos, sin) @@ -657,24 +688,25 @@ def __init__(self, config: ModernBertConfig, layer_id: Optional[int] = None): self.head_dim = config.hidden_size // config.num_attention_heads self.all_head_size = self.head_dim * self.num_heads self.Wqkv = nn.Linear(config.hidden_size, 3 * self.all_head_size, bias=config.attention_bias) + layer_type = config.layer_types[layer_id] if layer_id % config.global_attn_every_n_layers != 0: self.local_attention = (config.local_attention // 2, config.local_attention // 2) - rope_theta = config.local_rope_theta if config.local_rope_theta is not None else config.global_rope_theta max_position_embeddings = config.local_attention else: self.local_attention = (-1, -1) max_position_embeddings = config.max_position_embeddings - rope_theta = 
config.global_rope_theta if config._attn_implementation == "flash_attention_2": + rope_parameters_dict = ( + self.config.rope_parameters[layer_type] if layer_type is not None else self.config.rope_parameters + ) + rope_theta = rope_parameters_dict["rope_theta"] self.rotary_emb = ModernBertUnpaddedRotaryEmbedding( dim=self.head_dim, max_seqlen=max_position_embeddings, base=rope_theta ) else: - config_copy = copy.deepcopy(config) - config_copy.rope_theta = rope_theta - self.rotary_emb = ModernBertRotaryEmbedding(config=config_copy) + self.rotary_emb = None self.Wo = nn.Linear(config.hidden_size, config.hidden_size, bias=config.attention_bias) self.out_drop = nn.Dropout(config.attention_dropout) if config.attention_dropout > 0.0 else nn.Identity() @@ -682,6 +714,7 @@ def __init__(self, config: ModernBertConfig, layer_id: Optional[int] = None): def forward( self, hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, **kwargs, ) -> torch.Tensor: @@ -700,6 +733,7 @@ def forward( local_attention=self.local_attention, bs=bs, dim=self.all_head_size, + position_embeddings=position_embeddings, output_attentions=output_attentions, **kwargs, ) @@ -720,6 +754,7 @@ def __init__(self, config: ModernBertConfig, layer_id: Optional[int] = None): self.attn = ModernBertAttention(config=config, layer_id=layer_id) self.mlp_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias) self.mlp = ModernBertMLP(config) + self.attention_type = config.layer_types[layer_id] @torch.compile(dynamic=True) def compiled_mlp(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -733,6 +768,7 @@ def forward( position_ids: Optional[torch.LongTensor] = None, cu_seqlens: Optional[torch.Tensor] = None, max_seqlen: Optional[int] = None, + position_embeddings: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, ) -> torch.Tensor: attn_outputs = self.attn( @@ -742,6 +778,7 @@ def forward( position_ids=position_ids, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, + position_embeddings=position_embeddings, output_attentions=output_attentions, ) hidden_states = hidden_states + attn_outputs[0] @@ -894,6 +931,7 @@ def __init__(self, config: ModernBertConfig): [ModernBertEncoderLayer(config, layer_id) for layer_id in range(config.num_hidden_layers)] ) self.final_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias) + self.rotary_emb = ModernBertRotaryEmbedding(config=config) self.gradient_checkpointing = False self.post_init() @@ -985,6 +1023,9 @@ def forward( ) hidden_states = self.embeddings(input_ids=input_ids, inputs_embeds=inputs_embeds) + position_embeddings = {} + for layer_type in self.config.layer_types: + position_embeddings[layer_type] = self.rotary_emb(hidden_states, position_ids, layer_type) for encoder_layer in self.layers: if output_hidden_states: @@ -997,6 +1038,7 @@ def forward( position_ids=position_ids, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, + position_embeddings=position_embeddings[encoder_layer.attention_type], output_attentions=output_attentions, ) hidden_states = layer_outputs[0] diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py index 864ae980817c..cc17f6ce6711 100644 --- a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py @@ -19,7 
+19,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class ModernBertDecoderConfig(PreTrainedConfig): @@ -67,8 +70,6 @@ class ModernBertDecoderConfig(PreTrainedConfig): Classification token id. sep_token_id (`int`, *optional*, defaults to 50282): Separation token id. - global_rope_theta (`float`, *optional*, defaults to 160000.0): - The base period of the global RoPE embeddings. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -95,11 +96,13 @@ class ModernBertDecoderConfig(PreTrainedConfig): the decoder to match ModernBERT this is actually half of the sliding window size, so 128 => 64. global_attn_every_n_layers (`int`, *optional*, defaults to 3): Every `global_attn_every_n_layers` layers will use global attention instead of local attention. - local_rope_theta (`float`, *optional*, defaults to 160000.0): - The base period of the local RoPE embeddings. If not specified, defaults to 160000.0 - layer_types (`list`, *optional*): + layer_types (`list[str]`, *optional*): List of layer types, one for each layer. If not specified, will be automatically generated based on `global_attn_every_n_layers`. Should contain "full_attention" or "sliding_attention". + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and, optionally, the scaling parameters needed when using RoPE + with a longer `max_position_embeddings`. 
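As a quick illustration of the standardization above, the sketch below shows the per-layer-type structure that `rope_parameters` is expected to end up with when nothing is passed explicitly. The defaults are assumed from the `getattr` fallbacks in the validation block; the exact dict contents depend on `standardize_rope_params`.

```python
from transformers import ModernBertDecoderConfig

# Minimal sketch (assumed behavior): with no `rope_parameters` passed,
# `standardize_rope_params` fills one entry per layer type.
config = ModernBertDecoderConfig()
print(config.rope_parameters["full_attention"])     # e.g. {"rope_type": "default", "rope_theta": 160000.0}
print(config.rope_parameters["sliding_attention"])  # e.g. {"rope_type": "default", "rope_theta": 10000.0}
```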
Examples: @@ -122,37 +125,36 @@ class ModernBertDecoderConfig(PreTrainedConfig): def __init__( self, - vocab_size=50368, - hidden_size=768, - intermediate_size=1152, - num_hidden_layers=22, - num_attention_heads=12, - hidden_activation="gelu", - max_position_embeddings=8192, - initializer_range=0.02, - initializer_cutoff_factor=2.0, - norm_eps=1e-5, - norm_bias=False, - pad_token_id=50283, - eos_token_id=50282, - bos_token_id=50281, - cls_token_id=50281, - sep_token_id=50282, - global_rope_theta=160000.0, - attention_bias=False, - attention_dropout=0.0, - embedding_dropout=0.0, - mlp_bias=False, - mlp_dropout=0.0, - decoder_bias=True, - classifier_dropout=0.0, - classifier_bias=False, - classifier_activation="gelu", - use_cache=True, - local_attention=128, - global_attn_every_n_layers=3, - local_rope_theta=160000.0, - layer_types=None, + vocab_size: Optional[int] = 50368, + hidden_size: Optional[int] = 768, + intermediate_size: Optional[int] = 1152, + num_hidden_layers: Optional[int] = 22, + num_attention_heads: Optional[int] = 12, + hidden_activation: Optional[str] = "gelu", + max_position_embeddings: Optional[int] = 8192, + initializer_range: Optional[float] = 0.02, + initializer_cutoff_factor: Optional[float] = 2.0, + norm_eps: Optional[int] = 1e-5, + norm_bias: Optional[bool] = False, + pad_token_id: Optional[int] = 50283, + eos_token_id: Optional[int] = 50282, + bos_token_id: Optional[int] = 50281, + cls_token_id: Optional[int] = 50281, + sep_token_id: Optional[int] = 50282, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + embedding_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + mlp_dropout: Optional[float] = 0.0, + decoder_bias: Optional[bool] = True, + classifier_dropout: Optional[float] = 0.0, + classifier_bias: Optional[bool] = False, + classifier_activation: Optional[str] = "gelu", + use_cache: Optional[bool] = True, + local_attention: Optional[int] = 128, + global_attn_every_n_layers: Optional[int] = 3, + layer_types: Optional[list[str]] = None, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, **kwargs, ): super().__init__( @@ -173,7 +175,6 @@ def __init__( self.initializer_cutoff_factor = initializer_cutoff_factor self.norm_eps = norm_eps self.norm_bias = norm_bias - self.global_rope_theta = global_rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.hidden_activation = hidden_activation @@ -186,7 +187,9 @@ def __init__( self.classifier_activation = classifier_activation self.use_cache = use_cache self.global_attn_every_n_layers = global_attn_every_n_layers - self.local_rope_theta = local_rope_theta + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters # for consistency with ModernBert self.reference_compile = False @@ -201,6 +204,14 @@ def __init__( else: self.layer_types.append("full_attention") + # Validate the correctness of rotary position embeddings parameters + rope_theta = getattr(self, "global_rope_theta", 160_000.0) + rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) + standardize_rope_params( + self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} + ) + rope_config_validation(self) + # NOTE: sliding window numbers matches ModernBERT but is only half of it self.sliding_window = local_attention // 2 if local_attention else -1 diff --git 
a/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py index df1acd0e04d5..bb5c8dad9fa4 100644 --- a/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modeling_modernbert_decoder.py @@ -99,33 +99,78 @@ class ModernBertDecoderRotaryEmbedding(nn.Module): def __init__(self, config: ModernBertDecoderConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.layer_types = list(set(config.layer_types)) + self.rope_type = {} + for layer_type in self.layer_types: + rope_params = self.config.rope_parameters[layer_type] + if rope_params is None: + continue + + self.rope_type[layer_type] = rope_params["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type[layer_type] != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type[layer_type]] + curr_inv_freq, curr_attention_scaling = rope_init_fn(self.config, device, layer_type=layer_type) + self.register_buffer(f"{layer_type}_inv_freq", curr_inv_freq, persistent=False) + setattr(self, f"{layer_type}_original_inv_freq", curr_inv_freq) + setattr(self, f"{layer_type}_attention_scaling", curr_attention_scaling) + + @staticmethod + def compute_default_rope_parameters( + config: Optional[ModernBertDecoderConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + layer_type: Optional[str] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + layer_type (`str`, *optional*): + The current layer type if the model has different RoPE parameters per type. + Should not be used unless `config.layer_types is not None` + + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + # For backward compatibility standardize the `rope_parameters_dict` if it uses old format + base = config.rope_parameters[layer_type]["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + def forward(self, x, position_ids, layer_type=None): + inv_freq = getattr(self, f"{layer_type}_inv_freq") + attention_scaling = getattr(self, f"{layer_type}_attention_scaling") + + inv_freq_expanded = inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) position_ids_expanded = position_ids[:, None, :].float() device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with torch.autocast(device_type=device_type, enabled=False): # Force float32 freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling + cos = emb.cos() * attention_scaling + sin = emb.sin() * attention_scaling return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) @@ -290,23 +335,15 @@ def __init__(self, config: ModernBertDecoderConfig, layer_idx: Optional[int] = N def forward( self, hidden_states: torch.Tensor, - position_embeddings_global: torch.Tensor, - position_embeddings_local: torch.Tensor, + position_embeddings: torch.Tensor = None, attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, - use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: residual = hidden_states hidden_states = self.attn_norm(hidden_states) - # apply global RoPE to non-sliding layer only - if self.attn.is_sliding: - position_embeddings = position_embeddings_local - else: - position_embeddings = position_embeddings_global - # Self Attention attn_outputs = self.attn( hidden_states=hidden_states, @@ -414,11 +451,9 @@ def __init__(self, config: ModernBertDecoderConfig): [ModernBertDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.final_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias) + self.rotary_emb = ModernBertDecoderRotaryEmbedding(config=config) self.gradient_checkpointing = False - self.global_rotary_emb = ModernBertDecoderRotaryEmbedding(config=config) - self.local_rotary_emb = ModernBertDecoderRotaryEmbedding(config=config) - self.post_init() def get_input_embeddings(self): @@ -484,19 +519,19 @@ def forward( "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs), } - # create position embeddings to be shared across the decoder layers - position_embeddings_global = self.global_rotary_emb(hidden_states, position_ids) - position_embeddings_local = self.local_rotary_emb(hidden_states, position_ids) + position_embeddings = {} + for layer_type in self.config.layer_types: + position_embeddings[layer_type] = self.rotary_emb(hidden_states, position_ids, layer_type) for idx, decoder_layer in enumerate(self.layers): hidden_states = decoder_layer( hidden_states, - position_embeddings_global=position_embeddings_global, - position_embeddings_local=position_embeddings_local, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings[decoder_layer.attention_type], past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, + position_ids=position_ids, **kwargs, ) diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py 
b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index 8e588599aac3..ffa7da7c130a 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -27,6 +27,7 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -89,8 +90,6 @@ class ModernBertDecoderConfig(PreTrainedConfig): Classification token id. sep_token_id (`int`, *optional*, defaults to 50282): Separation token id. - global_rope_theta (`float`, *optional*, defaults to 160000.0): - The base period of the global RoPE embeddings. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -117,11 +116,13 @@ class ModernBertDecoderConfig(PreTrainedConfig): the decoder to match ModernBERT this is actually half of the sliding window size, so 128 => 64. global_attn_every_n_layers (`int`, *optional*, defaults to 3): Every `global_attn_every_n_layers` layers will use global attention instead of local attention. - local_rope_theta (`float`, *optional*, defaults to 160000.0): - The base period of the local RoPE embeddings. If not specified, defaults to 160000.0 - layer_types (`list`, *optional*): + layer_types (`list[str]`, *optional*): List of layer types, one for each layer. If not specified, will be automatically generated based on `global_attn_every_n_layers`. Should contain "full_attention" or "sliding_attention". + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and, optionally, the scaling parameters needed when using RoPE + with a longer `max_position_embeddings`. 
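For reference, a standalone sketch of the layer-type pattern that is auto-generated when `layer_types` is not provided; it mirrors the list comprehension used in the ModernBERT config earlier in this diff (plain Python, no library calls assumed).

```python
# Every `global_attn_every_n_layers`-th layer uses full (global) attention,
# the rest use sliding-window attention.
global_attn_every_n_layers = 3
num_hidden_layers = 8
layer_types = [
    "sliding_attention" if i % global_attn_every_n_layers else "full_attention"
    for i in range(num_hidden_layers)
]
print(layer_types)
# ['full_attention', 'sliding_attention', 'sliding_attention', 'full_attention',
#  'sliding_attention', 'sliding_attention', 'full_attention', 'sliding_attention']
```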
Examples: @@ -144,37 +145,36 @@ class ModernBertDecoderConfig(PreTrainedConfig): def __init__( self, - vocab_size=50368, - hidden_size=768, - intermediate_size=1152, - num_hidden_layers=22, - num_attention_heads=12, - hidden_activation="gelu", - max_position_embeddings=8192, - initializer_range=0.02, - initializer_cutoff_factor=2.0, - norm_eps=1e-5, - norm_bias=False, - pad_token_id=50283, - eos_token_id=50282, - bos_token_id=50281, - cls_token_id=50281, - sep_token_id=50282, - global_rope_theta=160000.0, - attention_bias=False, - attention_dropout=0.0, - embedding_dropout=0.0, - mlp_bias=False, - mlp_dropout=0.0, - decoder_bias=True, - classifier_dropout=0.0, - classifier_bias=False, - classifier_activation="gelu", - use_cache=True, - local_attention=128, - global_attn_every_n_layers=3, - local_rope_theta=160000.0, - layer_types=None, + vocab_size: Optional[int] = 50368, + hidden_size: Optional[int] = 768, + intermediate_size: Optional[int] = 1152, + num_hidden_layers: Optional[int] = 22, + num_attention_heads: Optional[int] = 12, + hidden_activation: Optional[str] = "gelu", + max_position_embeddings: Optional[int] = 8192, + initializer_range: Optional[float] = 0.02, + initializer_cutoff_factor: Optional[float] = 2.0, + norm_eps: Optional[int] = 1e-5, + norm_bias: Optional[bool] = False, + pad_token_id: Optional[int] = 50283, + eos_token_id: Optional[int] = 50282, + bos_token_id: Optional[int] = 50281, + cls_token_id: Optional[int] = 50281, + sep_token_id: Optional[int] = 50282, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + embedding_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, + mlp_dropout: Optional[float] = 0.0, + decoder_bias: Optional[bool] = True, + classifier_dropout: Optional[float] = 0.0, + classifier_bias: Optional[bool] = False, + classifier_activation: Optional[str] = "gelu", + use_cache: Optional[bool] = True, + local_attention: Optional[int] = 128, + global_attn_every_n_layers: Optional[int] = 3, + layer_types: Optional[list[str]] = None, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, **kwargs, ): super().__init__( @@ -195,7 +195,6 @@ def __init__( self.initializer_cutoff_factor = initializer_cutoff_factor self.norm_eps = norm_eps self.norm_bias = norm_bias - self.global_rope_theta = global_rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.hidden_activation = hidden_activation @@ -208,7 +207,9 @@ def __init__( self.classifier_activation = classifier_activation self.use_cache = use_cache self.global_attn_every_n_layers = global_attn_every_n_layers - self.local_rope_theta = local_rope_theta + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters # for consistency with ModernBert self.reference_compile = False @@ -223,6 +224,14 @@ def __init__( else: self.layer_types.append("full_attention") + # Validate the correctness of rotary position embeddings parameters + rope_theta = getattr(self, "global_rope_theta", 160_000.0) + rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) + standardize_rope_params( + self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} + ) + rope_config_validation(self) + # NOTE: sliding window numbers matches ModernBERT but is only half of it self.sliding_window = local_attention // 2 if local_attention else -1 @@ -365,23 +374,15 @@ def 
__init__(self, config: ModernBertDecoderConfig, layer_idx: Optional[int] = N def forward( self, hidden_states: torch.Tensor, - position_embeddings_global: torch.Tensor, - position_embeddings_local: torch.Tensor, + position_embeddings: torch.Tensor = None, attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, - use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: residual = hidden_states hidden_states = self.attn_norm(hidden_states) - # apply global RoPE to non-sliding layer only - if self.attn.is_sliding: - position_embeddings = position_embeddings_local - else: - position_embeddings = position_embeddings_global - # Self Attention attn_outputs = self.attn( hidden_states=hidden_states, @@ -485,11 +486,9 @@ def __init__(self, config: ModernBertDecoderConfig): [ModernBertDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.final_norm = nn.LayerNorm(config.hidden_size, eps=config.norm_eps, bias=config.norm_bias) + self.rotary_emb = ModernBertDecoderRotaryEmbedding(config=config) self.gradient_checkpointing = False - self.global_rotary_emb = ModernBertDecoderRotaryEmbedding(config=config) - self.local_rotary_emb = ModernBertDecoderRotaryEmbedding(config=config) - self.post_init() def get_input_embeddings(self): @@ -555,19 +554,19 @@ def forward( "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs), } - # create position embeddings to be shared across the decoder layers - position_embeddings_global = self.global_rotary_emb(hidden_states, position_ids) - position_embeddings_local = self.local_rotary_emb(hidden_states, position_ids) + position_embeddings = {} + for layer_type in self.config.layer_types: + position_embeddings[layer_type] = self.rotary_emb(hidden_states, position_ids, layer_type) for idx, decoder_layer in enumerate(self.layers): hidden_states = decoder_layer( hidden_states, - position_embeddings_global=position_embeddings_global, - position_embeddings_local=position_embeddings_local, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings[decoder_layer.attention_type], past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, + position_ids=position_ids, **kwargs, ) diff --git a/src/transformers/models/moonshine/configuration_moonshine.py b/src/transformers/models/moonshine/configuration_moonshine.py index 2d6b026cf8a8..5237cd4e3d8c 100644 --- a/src/transformers/models/moonshine/configuration_moonshine.py +++ b/src/transformers/models/moonshine/configuration_moonshine.py @@ -18,8 +18,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class MoonshineConfig(PreTrainedConfig): @@ -81,45 +83,10 @@ class MoonshineConfig(PreTrainedConfig): the task. use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. 
NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. partial_rotary_factor (`float`, *optional*, defaults to 0.9): Percentage of the query and keys which will have rotary embedding. 
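A small worked example of what `partial_rotary_factor` means for the rotary dimension, mirroring the `dim = int(head_dim * partial_rotary_factor)` computation added for Moonshine later in this diff (values here are just the Moonshine defaults used for illustration).

```python
# With partial rotary embeddings, only a fraction of each head's dimensions
# receive the rotation; the remaining dimensions are left untouched.
hidden_size, num_attention_heads = 288, 8
partial_rotary_factor = 0.9
head_dim = hidden_size // num_attention_heads       # 36
rotary_dim = int(head_dim * partial_rotary_factor)  # 32 dimensions are rotated
```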
is_encoder_decoder (`bool`, *optional*, defaults to `True`): @@ -158,30 +125,29 @@ class MoonshineConfig(PreTrainedConfig): def __init__( self, - vocab_size=32768, - hidden_size=288, - intermediate_size=1152, - encoder_num_hidden_layers=6, - decoder_num_hidden_layers=6, - encoder_num_attention_heads=8, - decoder_num_attention_heads=8, - encoder_num_key_value_heads=None, - decoder_num_key_value_heads=None, - pad_head_dim_to_multiple_of=None, - encoder_hidden_act="gelu", - decoder_hidden_act="silu", - max_position_embeddings=512, - initializer_range=0.02, - decoder_start_token_id=1, - use_cache=True, - rope_theta=10000.0, - rope_scaling=None, - partial_rotary_factor=0.9, - is_encoder_decoder=True, - attention_bias=False, - attention_dropout=0.0, - bos_token_id=1, - eos_token_id=2, + vocab_size: Optional[int] = 32768, + hidden_size: Optional[int] = 288, + intermediate_size: Optional[int] = 1152, + encoder_num_hidden_layers: Optional[int] = 6, + decoder_num_hidden_layers: Optional[int] = 6, + encoder_num_attention_heads: Optional[int] = 8, + decoder_num_attention_heads: Optional[int] = 8, + encoder_num_key_value_heads: Optional[int] = None, + decoder_num_key_value_heads: Optional[int] = None, + pad_head_dim_to_multiple_of: Optional[int] = None, + encoder_hidden_act: Optional[str] = "gelu", + decoder_hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 512, + initializer_range: Optional[float] = 0.02, + decoder_start_token_id: Optional[int] = 1, + use_cache: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + partial_rotary_factor: Optional[float] = 0.9, + is_encoder_decoder: Optional[bool] = True, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, **kwargs, ): self.vocab_size = vocab_size @@ -208,16 +174,18 @@ def __init__( self.initializer_range = initializer_range self.decoder_start_token_id = decoder_start_token_id self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.partial_rotary_factor = partial_rotary_factor self.is_encoder_decoder = is_encoder_decoder self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) - super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, diff --git a/src/transformers/models/moonshine/modeling_moonshine.py b/src/transformers/models/moonshine/modeling_moonshine.py index 894cdf45ea6e..1e28fa7a021a 100644 --- a/src/transformers/models/moonshine/modeling_moonshine.py +++ b/src/transformers/models/moonshine/modeling_moonshine.py @@ -79,6 +79,73 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states +class MoonshineRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: MoonshineConfig, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + 
rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[MoonshineConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: """ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, @@ -294,42 +361,6 @@ def forward( return attn_output, attn_weights -class MoonshineRotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: MoonshineConfig, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - class MoonshineEncoderLayer(GradientCheckpointingLayer): def __init__(self, config: MoonshineConfig, layer_idx: int): super().__init__() @@ -355,7 +386,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -502,12 +533,12 @@ def __init__(self, config: MoonshineConfig): self.conv2 = nn.Conv1d(embed_dim, 2 * embed_dim, kernel_size=7, stride=3) self.conv3 = nn.Conv1d(2 * embed_dim, embed_dim, kernel_size=3, stride=2) self.groupnorm = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=1e-5) - self.rotary_emb = MoonshineRotaryEmbedding(config=config) self.layers = nn.ModuleList( [MoonshineEncoderLayer(config, idx) for idx in range(config.encoder_num_hidden_layers)] ) self.layer_norm = nn.LayerNorm(embed_dim, bias=False) + self.rotary_emb = MoonshineRotaryEmbedding(config=config) self.gradient_checkpointing = False self.post_init() @@ -559,7 +590,7 @@ def forward( attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) position_ids = torch.arange(0, hidden_states.shape[1], device=hidden_states.device).unsqueeze(0) - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for encoder_layer in self.layers: hidden_states = encoder_layer( @@ -654,7 +685,7 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) if 
encoder_attention_mask is not None: mask_len = encoder_hidden_states.shape[-2] diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index 818468f07ef2..2c318b61e657 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -35,7 +35,7 @@ Seq2SeqLMOutput, Seq2SeqModelOutput, ) -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -106,45 +106,10 @@ class MoonshineConfig(PreTrainedConfig): the task. use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. 
Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. partial_rotary_factor (`float`, *optional*, defaults to 0.9): Percentage of the query and keys which will have rotary embedding. is_encoder_decoder (`bool`, *optional*, defaults to `True`): @@ -183,30 +148,29 @@ class MoonshineConfig(PreTrainedConfig): def __init__( self, - vocab_size=32768, - hidden_size=288, - intermediate_size=1152, - encoder_num_hidden_layers=6, - decoder_num_hidden_layers=6, - encoder_num_attention_heads=8, - decoder_num_attention_heads=8, - encoder_num_key_value_heads=None, - decoder_num_key_value_heads=None, - pad_head_dim_to_multiple_of=None, - encoder_hidden_act="gelu", - decoder_hidden_act="silu", - max_position_embeddings=512, - initializer_range=0.02, - decoder_start_token_id=1, - use_cache=True, - rope_theta=10000.0, - rope_scaling=None, - partial_rotary_factor=0.9, - is_encoder_decoder=True, - attention_bias=False, - attention_dropout=0.0, - bos_token_id=1, - eos_token_id=2, + vocab_size: Optional[int] = 32768, + hidden_size: Optional[int] = 288, + intermediate_size: Optional[int] = 1152, + encoder_num_hidden_layers: Optional[int] = 6, + decoder_num_hidden_layers: Optional[int] = 6, + encoder_num_attention_heads: Optional[int] = 8, + decoder_num_attention_heads: Optional[int] = 8, + encoder_num_key_value_heads: Optional[int] = None, + decoder_num_key_value_heads: Optional[int] = None, + pad_head_dim_to_multiple_of: Optional[int] = None, + encoder_hidden_act: Optional[str] = "gelu", + decoder_hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 512, + initializer_range: Optional[float] = 0.02, + decoder_start_token_id: Optional[int] = 1, + use_cache: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + partial_rotary_factor: Optional[float] = 0.9, + is_encoder_decoder: Optional[bool] = True, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, **kwargs, ): self.vocab_size = vocab_size @@ -233,16 +197,18 @@ def __init__( self.initializer_range = initializer_range self.decoder_start_token_id = decoder_start_token_id self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.partial_rotary_factor = partial_rotary_factor self.is_encoder_decoder = is_encoder_decoder self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) - super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, @@ -283,6 +249,10 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return hidden_states +class MoonshineRotaryEmbedding(GlmRotaryEmbedding): + pass + + class MoonshineAttention(GlmAttention): def __init__( self, @@ -393,10 +363,6 @@ def forward( return attn_output, attn_weights -class 
MoonshineRotaryEmbedding(GlmRotaryEmbedding): - pass - - class MoonshineEncoderLayer(LlamaDecoderLayer): def __init__(self, config: MoonshineConfig, layer_idx: int): super().__init__(config, layer_idx) @@ -535,12 +501,12 @@ def __init__(self, config: MoonshineConfig): self.conv2 = nn.Conv1d(embed_dim, 2 * embed_dim, kernel_size=7, stride=3) self.conv3 = nn.Conv1d(2 * embed_dim, embed_dim, kernel_size=3, stride=2) self.groupnorm = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=1e-5) - self.rotary_emb = MoonshineRotaryEmbedding(config=config) self.layers = nn.ModuleList( [MoonshineEncoderLayer(config, idx) for idx in range(config.encoder_num_hidden_layers)] ) self.layer_norm = nn.LayerNorm(embed_dim, bias=False) + self.rotary_emb = MoonshineRotaryEmbedding(config=config) self.gradient_checkpointing = False self.post_init() @@ -592,7 +558,7 @@ def forward( attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) position_ids = torch.arange(0, hidden_states.shape[1], device=hidden_states.device).unsqueeze(0) - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for encoder_layer in self.layers: hidden_states = encoder_layer( @@ -677,7 +643,7 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) if encoder_attention_mask is not None: mask_len = encoder_hidden_states.shape[-2] diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index d5beb71400dd..8d2e2aef339a 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -14,7 +14,10 @@ # limitations under the License. """Moshi model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -179,8 +182,10 @@ class MoshiConfig(PreTrainedConfig): max_position_embeddings (`int`, *optional*, defaults to 3000): The maximum sequence length that this model might ever be used with. Typically, set this to something large just in case (e.g., 512 or 1024 or 2048). - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): The non-linear activation function (function or string) in the decoder. 
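For context, a minimal standalone sketch (plain PyTorch, broadcast-based rather than the matmul used in the modules below) of how the rotary embedding classes in this diff turn `inv_freq` and `position_ids` into the cos/sin tables consumed by `apply_rotary_pos_emb`.

```python
import torch

# Equivalent formulation of the `forward` methods below, for a toy size.
dim, seq_len = 8, 6
rope_theta = 10000.0
inv_freq = 1.0 / (rope_theta ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
position_ids = torch.arange(seq_len)[None, :]        # [batch=1, seq_len]
freqs = position_ids[..., None].float() * inv_freq   # [1, seq_len, dim/2]
emb = torch.cat((freqs, freqs), dim=-1)              # [1, seq_len, dim]
cos, sin = emb.cos(), emb.sin()                      # attention_scaling is 1.0 for "default" RoPE
```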
head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`): @@ -240,24 +245,24 @@ class MoshiConfig(PreTrainedConfig): def __init__( self, - vocab_size=32000, - hidden_size=4096, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - audio_vocab_size=None, - max_position_embeddings=3000, - rope_theta=10000.0, - hidden_act="silu", - head_dim=None, - initializer_range=0.02, - use_cache=True, - sliding_window=3000, - attention_dropout=0.0, - ffn_dim=22528, - rms_norm_eps=1e-8, - num_codebooks=8, - tie_word_embeddings=False, + vocab_size: Optional[int] = 32000, + hidden_size: Optional[int] = 4096, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + audio_vocab_size: Optional[int] = None, + max_position_embeddings: Optional[int] = 3000, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + hidden_act: Optional[str] = "silu", + head_dim: Optional[int] = None, + initializer_range: Optional[float] = 0.02, + use_cache: Optional[bool] = True, + sliding_window: Optional[int] = 3000, + attention_dropout: Optional[float] = 0.0, + ffn_dim: Optional[int] = 22528, + rms_norm_eps: Optional[int] = 1e-8, + num_codebooks: Optional[int] = 8, + tie_word_embeddings: Optional[bool] = False, **kwargs, ): self.vocab_size = vocab_size @@ -266,7 +271,6 @@ def __init__( self.num_attention_heads = num_attention_heads self.num_key_value_heads = num_key_value_heads if num_key_value_heads is not None else num_attention_heads self.max_position_embeddings = max_position_embeddings - self.rope_theta = rope_theta self.hidden_act = hidden_act self.head_dim = head_dim or hidden_size // num_attention_heads self.initializer_range = initializer_range @@ -278,6 +282,14 @@ def __init__( self.ffn_dim = ffn_dim self.rms_norm_eps = rms_norm_eps self.num_codebooks = num_codebooks + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) audio_encoder_config = kwargs.pop("audio_encoder_config", {}) audio_encoder_model_type = audio_encoder_config.pop("model_type", "mimi") diff --git a/src/transformers/models/moshi/modeling_moshi.py b/src/transformers/models/moshi/modeling_moshi.py index 3d496c21ae8f..964fbd46126a 100644 --- a/src/transformers/models/moshi/modeling_moshi.py +++ b/src/transformers/models/moshi/modeling_moshi.py @@ -15,6 +15,7 @@ """PyTorch Moshi model.""" import math +from collections.abc import Callable from dataclasses import dataclass from typing import Any, Optional, Union @@ -268,26 +269,55 @@ def forward(self, x, layer_idx=None): return self.linear(x) -# Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Moshi +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Moshi class MoshiRotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` def __init__(self, config: MoshiConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = 
"default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[MoshiConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -425,7 +455,6 @@ def __init__(self, config: MoshiConfig, layer_idx: Optional[int] = None, use_fle # rotary embeddings are not used in the depth decoder self.rotary_emb = None if use_rope: - self.rope_theta = config.rope_theta self.rotary_emb = MoshiRotaryEmbedding(config) def forward( diff --git a/src/transformers/models/nemotron/configuration_nemotron.py b/src/transformers/models/nemotron/configuration_nemotron.py index a728ead38b7a..084a674fc345 100644 --- a/src/transformers/models/nemotron/configuration_nemotron.py +++ b/src/transformers/models/nemotron/configuration_nemotron.py @@ -15,8 +15,10 @@ # limitations under the License. """Nemotron model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -74,8 +76,10 @@ class NemotronConfig(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. 
partial_rotary_factor (`float`, *optional*, defaults to 0.5): Percentage of the query and keys which will have rotary embedding. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. @@ -102,27 +106,27 @@ class NemotronConfig(PreTrainedConfig): def __init__( self, - vocab_size=256000, - hidden_size=6144, - intermediate_size=24576, - num_hidden_layers=32, - num_attention_heads=48, - head_dim=None, - num_key_value_heads=None, - hidden_act="relu2", - max_position_embeddings=4096, - initializer_range=0.0134, - norm_eps=1e-5, - use_cache=True, - pad_token_id=None, - bos_token_id=2, - eos_token_id=3, - tie_word_embeddings=False, - rope_theta=10000.0, - partial_rotary_factor=0.5, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, + vocab_size: Optional[int] = 256000, + hidden_size: Optional[int] = 6144, + intermediate_size: Optional[int] = 24576, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 48, + head_dim: Optional[int] = None, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "relu2", + max_position_embeddings: Optional[int] = 4096, + initializer_range: Optional[float] = 0.0134, + norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 2, + eos_token_id: Optional[int] = 3, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + partial_rotary_factor: Optional[float] = 0.5, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, **kwargs, ): self.vocab_size = vocab_size @@ -137,12 +141,18 @@ def __init__( self.initializer_range = initializer_range self.norm_eps = norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.partial_rotary_factor = partial_rotary_factor - rope_config_validation(self) self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index 8a4aac07c470..1c8c7eca861f 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -16,6 +16,7 @@ """PyTorch Nemotron model.""" import math +from collections.abc import Callable from typing import Optional, Union import torch @@ -37,7 +38,10 @@ BaseModelOutputWithPast, CausalLMOutputWithPast, ) -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_rope_utils import ( + ROPE_INIT_FUNCTIONS, + dynamic_rope_update, +) from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging from .configuration_nemotron import NemotronConfig @@ -90,24 +94,54 @@ def forward(self, input: Tensor) -> Tensor: class NemotronRotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix 
linting for `register_buffer` - # Ignore copy - def __init__( - self, - config: NemotronConfig, - device=None, - ): + def __init__(self, config: NemotronConfig, device=None): super().__init__() - - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + # Ignore copy + def compute_default_rope_parameters( + config: Optional[NemotronConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -214,9 +248,10 @@ def __init__(self, config: NemotronConfig, layer_idx: Optional[int] = None): self.num_key_value_heads = config.num_key_value_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings - self.rope_theta = config.rope_theta + self.partial_rotary_factor = config.partial_rotary_factor self.is_causal = True + self.rotary_emb = NemotronRotaryEmbedding(config=config) self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) @@ -244,8 +279,7 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - if position_embeddings is not None: - cos, sin = position_embeddings + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_values is not None: @@ -328,8 +362,7 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - if position_embeddings is not None: - cos, sin = position_embeddings + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_values is not None: @@ -447,8 +480,7 @@ def forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - if position_embeddings is not None: - cos, sin = position_embeddings + cos, sin = position_embeddings query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_values is not None: @@ -521,7 +553,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -682,9 +714,7 @@ def forward( # embed positions hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/olmo/configuration_olmo.py b/src/transformers/models/olmo/configuration_olmo.py index 817e6d38df11..6a1fb4f96526 100644 --- a/src/transformers/models/olmo/configuration_olmo.py +++ b/src/transformers/models/olmo/configuration_olmo.py @@ -19,7 +19,10 @@ # limitations under the License. """OLMo model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -73,16 +76,10 @@ class OlmoConfig(PreTrainedConfig): End of stream token id. 
tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -123,25 +120,24 @@ class OlmoConfig(PreTrainedConfig): def __init__( self, - vocab_size=50304, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - use_cache=True, - pad_token_id=1, - bos_token_id=None, - eos_token_id=50279, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - clip_qkv=None, + vocab_size: Optional[int] = 50304, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = None, + eos_token_id: Optional[int] = 50279, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + clip_qkv: Optional[float] = None, **kwargs, ): self.vocab_size = vocab_size @@ -159,12 +155,17 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self._rope_scaling_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.clip_qkv = clip_qkv + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( pad_token_id=pad_token_id, @@ -174,25 +175,5 @@ def __init__( **kwargs, )
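The `OlmoConfig` hunk above drops the separate `rope_theta`/`rope_scaling` arguments in favour of a single `rope_parameters` dict that is standardized and validated at construction time. A rough sketch of the intended call pattern, assuming the validation accepts the same keys as the documented default and scaled variants (the exact required keys depend on the registered validation function for each `rope_type`):

```python
from transformers import OlmoConfig

# New-style RoPE configuration: everything lives in one `rope_parameters` dict.
# The keys shown here follow the "linear" variant; other types expect their own
# keys (e.g. "original_max_position_embeddings" for "longrope").
config = OlmoConfig(
    rope_parameters={
        "rope_type": "linear",  # one of the registered RoPE variants
        "rope_theta": 10000.0,  # base frequency, now stored inside the dict
        "factor": 2.0,          # context-extension scale factor used by "linear"
    }
)
print(config.rope_parameters["rope_type"])  # expected: "linear"
```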
- def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. - """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") - __all__ = ["OlmoConfig"] diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py index e58840760cb5..6a3432c31d18 100644 --- a/src/transformers/models/olmo/modeling_olmo.py +++ b/src/transformers/models/olmo/modeling_olmo.py @@ -4,6 +4,26 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_olmo.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from collections.abc import Callable from typing import Optional, Union @@ -55,6 +75,70 @@ def forward(self, x): return down_proj +class OlmoRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: OlmoConfig, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[OlmoConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. 
+ device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + return cos, sin + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -161,6 +245,7 @@ def forward( attention_mask: Optional[torch.Tensor], past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: input_shape = hidden_states.shape[:-1] @@ -225,7 +310,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -251,41 +336,6 @@ def forward( return hidden_states -class OlmoRotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: OlmoConfig, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - return cos, sin - - @auto_docstring class OlmoPreTrainedModel(PreTrainedModel): config: OlmoConfig @@ -364,16 +414,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/olmo/modular_olmo.py b/src/transformers/models/olmo/modular_olmo.py index ef4f0c62b26c..4f2796837cb5 100644 --- a/src/transformers/models/olmo/modular_olmo.py +++ b/src/transformers/models/olmo/modular_olmo.py @@ -1,3 +1,23 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from collections.abc import Callable from typing import Optional @@ -6,6 +26,7 @@ import torch.nn.functional as F from ...cache_utils import Cache +from ...modeling_rope_utils import dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...utils import logging from ..llama.modeling_llama import ( @@ -46,6 +67,24 @@ def __init__(self, config): self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) +# This is identical to LlamaRotaryEmbedding except the output cos and sin are returned +# as float32 rather than the input type. +class OlmoRotaryEmbedding(LlamaRotaryEmbedding): + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + return cos, sin + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. @@ -82,6 +121,7 @@ def forward( attention_mask: Optional[torch.Tensor], past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: input_shape = hidden_states.shape[:-1] @@ -136,22 +176,6 @@ def __init__(self, config: OlmoConfig, layer_idx: int): self.self_attn = OlmoAttention(config=config, layer_idx=layer_idx) -# This is identical to LlamaRotaryEmbedding except the output cos and sin are returned -# as float32 rather than the input type. -class OlmoRotaryEmbedding(LlamaRotaryEmbedding): - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - return cos, sin - - class OlmoModel(LlamaModel): def __init__(self, config: OlmoConfig): super().__init__(config) diff --git a/src/transformers/models/olmo2/configuration_olmo2.py b/src/transformers/models/olmo2/configuration_olmo2.py index 64a11b4339bd..2b4af4c7523c 100644 --- a/src/transformers/models/olmo2/configuration_olmo2.py +++ b/src/transformers/models/olmo2/configuration_olmo2.py @@ -4,8 +4,30 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_olmo2.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Olmo2Config(PreTrainedConfig): @@ -55,16 +77,10 @@ class Olmo2Config(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -105,25 +121,24 @@ class Olmo2Config(PreTrainedConfig): def __init__( self, - vocab_size=50304, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - use_cache=True, - pad_token_id=1, - bos_token_id=None, - eos_token_id=50279, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - rms_norm_eps=1e-5, + vocab_size: Optional[int] = 50304, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = None, + eos_token_id: Optional[int] = 50279, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + rms_norm_eps: Optional[float] = 1e-5, **kwargs, ): self.vocab_size = vocab_size @@ -141,11 +156,16 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self._rope_scaling_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling
= kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__( pad_token_id=pad_token_id, @@ -157,25 +177,5 @@ def __init__( self.rms_norm_eps = rms_norm_eps - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. - """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: - raise ValueError( - f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_factor = self.rope_scaling.get("factor", None) - if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" - ) - if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: - raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") - __all__ = ["Olmo2Config"] diff --git a/src/transformers/models/olmo2/modeling_olmo2.py b/src/transformers/models/olmo2/modeling_olmo2.py index 70797b14bfc2..7315661282c9 100644 --- a/src/transformers/models/olmo2/modeling_olmo2.py +++ b/src/transformers/models/olmo2/modeling_olmo2.py @@ -4,6 +4,26 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_olmo2.py file directly. One of our CI enforces this. # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
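Like `OlmoConfig`, the `Olmo2Config` changes above keep backward compatibility by popping a legacy `rope_scaling` kwarg and folding a loose `rope_theta` kwarg into the new dict via `standardize_rope_params`. A minimal sketch of that behaviour, assuming `standardize_rope_params` injects `rope_theta` into the stored dict as the code suggests:

```python
from transformers import Olmo2Config

# Old-style kwargs, as they appear in existing checkpoints' config.json files.
legacy = Olmo2Config(
    rope_theta=500000.0,
    rope_scaling={"rope_type": "linear", "factor": 4.0},
)

# New-style equivalent: one dict holding the theta and the scaling parameters.
modern = Olmo2Config(
    rope_parameters={"rope_type": "linear", "rope_theta": 500000.0, "factor": 4.0},
)

# After standardization both spellings should expose the same `rope_parameters`.
print(legacy.rope_parameters)
print(modern.rope_parameters)
```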
+ from collections.abc import Callable from typing import Optional, Union @@ -48,6 +68,70 @@ def extra_repr(self): return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" +class Olmo2RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Olmo2Config, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Olmo2Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + return cos, sin + + def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: """ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, @@ -156,6 +240,7 @@ def forward( attention_mask: Optional[torch.Tensor], past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: input_shape = hidden_states.shape[:-1] @@ -231,7 +316,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -256,41 +341,6 @@ def forward( return hidden_states -class Olmo2RotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: Olmo2Config, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - return cos, sin - - @auto_docstring class Olmo2PreTrainedModel(PreTrainedModel): config: Olmo2Config @@ -369,16 +419,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/olmo2/modular_olmo2.py b/src/transformers/models/olmo2/modular_olmo2.py index 44b0f0ade07a..74eddd2d5af4 100644 --- a/src/transformers/models/olmo2/modular_olmo2.py +++ b/src/transformers/models/olmo2/modular_olmo2.py @@ -1,3 +1,23 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. 
It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from collections.abc import Callable from typing import Optional @@ -7,6 +27,7 @@ from transformers.utils.generic import TransformersKwargs from ...cache_utils import Cache +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import logging @@ -72,16 +93,10 @@ class Olmo2Config(OlmoConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -121,25 +136,24 @@ class Olmo2Config(OlmoConfig): def __init__( self, - vocab_size=50304, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - use_cache=True, - pad_token_id=1, - bos_token_id=None, - eos_token_id=50279, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - rms_norm_eps=1e-5, + vocab_size: Optional[int] = 50304, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = None, + eos_token_id: Optional[int] = 50279, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + rms_norm_eps: Optional[int] = 1e-5, **kwargs, ): super().__init__( @@ -157,8 +171,7 @@ def __init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, attention_bias=attention_bias, attention_dropout=attention_dropout, **kwargs, @@ -179,6 +192,10 @@ def forward(self, hidden_states): return (self.weight * hidden_states).to(input_dtype) +class Olmo2RotaryEmbedding(OlmoRotaryEmbedding): + pass + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -202,6 +219,7 @@ def forward( attention_mask: Optional[torch.Tensor], past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: input_shape = hidden_states.shape[:-1] @@ -262,7 +280,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -287,10 +305,6 @@ def forward( return hidden_states -class Olmo2RotaryEmbedding(OlmoRotaryEmbedding): - pass - - class Olmo2PreTrainedModel(LlamaPreTrainedModel): pass diff --git a/src/transformers/models/olmo3/configuration_olmo3.py b/src/transformers/models/olmo3/configuration_olmo3.py index e8dbe593dde9..08762d09ff61 100644 --- a/src/transformers/models/olmo3/configuration_olmo3.py +++ b/src/transformers/models/olmo3/configuration_olmo3.py @@ -19,8 +19,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
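Each of the new rotary embedding classes in this diff carries the same `compute_default_rope_parameters` static method, so the default inverse frequencies always reduce to `1 / base ** (2i / dim)`. A standalone numeric sketch of that formula (the `base` and `head_dim` values below are hypothetical, chosen only for illustration):

```python
import torch

base, head_dim = 10000.0, 64  # illustrative values, not taken from any of these models

# One inverse frequency per rotated coordinate pair, mirroring the
# `compute_default_rope_parameters` helpers shown above.
inv_freq = 1.0 / (
    base ** (torch.arange(0, head_dim, 2, dtype=torch.int64).float() / head_dim)
)

print(inv_freq.shape)      # torch.Size([32])
print(inv_freq[0].item())  # 1.0, the fastest-rotating pair
print(inv_freq[-1].item()) # roughly 1 / base**0.97, the slowest-rotating pair
```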
+from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Olmo3Config(PreTrainedConfig): @@ -70,45 +72,10 @@ class Olmo3Config(PreTrainedConfig): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -154,29 +121,35 @@ class Olmo3Config(PreTrainedConfig): def __init__( self, - vocab_size=50304, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - use_cache=True, - pad_token_id=1, - bos_token_id=None, - eos_token_id=50279, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - rms_norm_eps=1e-5, - sliding_window=4096, - layer_types=None, + vocab_size: Optional[int] = 50304, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = None, + eos_token_id: Optional[int] = 50279, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + rms_norm_eps: Optional[float] = 1e-5, + sliding_window: Optional[int] = 4096, + layer_types: Optional[list[str]] = None, **kwargs, ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -192,21 +165,12 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self._rope_scaling_validation() self.attention_bias = attention_bias self.attention_dropout = attention_dropout - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - self.rms_norm_eps = rms_norm_eps + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.sliding_window = sliding_window self.layer_types = layer_types @@ -214,12 +178,11 @@ def __init__( self.layer_types = [ "sliding_attention" if (i + 1) % 4 != 0 else "full_attention" for i in range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types) + layer_type_validation(self.layer_types, self.num_hidden_layers) - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. 
- """ + # Validate the correctness of rotary position embeddings parameters + rope_theta = getattr(self, "rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) diff --git a/src/transformers/models/olmo3/modeling_olmo3.py b/src/transformers/models/olmo3/modeling_olmo3.py index 5286e734f13a..2888f787399b 100644 --- a/src/transformers/models/olmo3/modeling_olmo3.py +++ b/src/transformers/models/olmo3/modeling_olmo3.py @@ -250,7 +250,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -278,26 +278,51 @@ def forward( class Olmo3RotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` - def __init__(self, config: Olmo3Config, device=None, rope_type: Optional[str] = None): + def __init__(self, config: Olmo3Config, device=None): super().__init__() - if rope_type is not None: - self.rope_type = rope_type - elif hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - # BC: "rope_type" was originally "type" - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - assert self.rope_type is not None - self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Olmo3Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -311,7 +336,8 @@ def forward(self, x, position_ids): emb = torch.cat((freqs, freqs), dim=-1) cos = emb.cos() * self.attention_scaling sin = emb.sin() * self.attention_scaling - return cos, sin + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) @auto_docstring @@ -345,13 +371,8 @@ def __init__(self, config: Olmo3Config): [Olmo3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = Olmo3RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = Olmo3RotaryEmbedding(config=config) self.gradient_checkpointing = False - self.rotary_embs = nn.ModuleDict( - { - "sliding_attention": Olmo3RotaryEmbedding(config=config, rope_type="default"), - "full_attention": Olmo3RotaryEmbedding(config=config), - } - ) # Initialize weights and apply final processing self.post_init() @@ -405,10 +426,7 @@ def forward( } hidden_states = inputs_embeds - position_embeddings_mapping = { - "sliding_attention": self.rotary_embs["sliding_attention"](hidden_states, position_ids), - "full_attention": self.rotary_embs["full_attention"](hidden_states, position_ids), - } + position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( @@ -417,7 +435,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings_mapping[decoder_layer.self_attn.attention_type], + position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/olmo3/modular_olmo3.py b/src/transformers/models/olmo3/modular_olmo3.py index cdd54fd00b1f..ab1b63752721 100644 --- a/src/transformers/models/olmo3/modular_olmo3.py +++ b/src/transformers/models/olmo3/modular_olmo3.py @@ -22,13 +22,13 @@ from transformers.utils.generic import TransformersKwargs from ...cache_utils import Cache, DynamicCache -from ...configuration_utils import layer_type_validation +from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack -from ..olmo2.configuration_olmo2 import Olmo2Config +from ..gemma2.modeling_gemma2 import Gemma2RotaryEmbedding from ..olmo2.modeling_olmo2 import ( Olmo2Attention, Olmo2DecoderLayer, @@ -36,13 +36,12 @@ Olmo2Model, Olmo2PreTrainedModel, Olmo2RMSNorm, - Olmo2RotaryEmbedding, apply_rotary_pos_emb, eager_attention_forward, ) -class Olmo3Config(Olmo2Config): +class Olmo3Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Olmo3Model`]. It is used to instantiate an OLMo3 model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -89,45 +88,10 @@ class Olmo3Config(Olmo2Config): End of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. 
NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -155,6 +119,7 @@ class Olmo3Config(Olmo2Config): """ model_type = "olmo3" + keys_to_ignore_at_inference = ["past_key_values"] base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k "layers.*.self_attn.k_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k @@ -172,51 +137,56 @@ class Olmo3Config(Olmo2Config): def __init__( self, - vocab_size=50304, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - use_cache=True, - pad_token_id=1, - bos_token_id=None, - eos_token_id=50279, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - rms_norm_eps=1e-5, - sliding_window=4096, - layer_types=None, + vocab_size: Optional[int] = 50304, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = None, + eos_token_id: Optional[int] = 50279, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + rms_norm_eps: Optional[float] = 1e-5, + sliding_window: Optional[int] = 4096, + layer_types: Optional[list[str]] = None, **kwargs, ): super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - use_cache=use_cache, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, - rope_theta=rope_theta, - rope_scaling=rope_scaling, - attention_bias=attention_bias, - attention_dropout=attention_dropout, - rms_norm_eps=rms_norm_eps, **kwargs, ) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.use_cache = use_cache + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.rms_norm_eps = rms_norm_eps + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.sliding_window = sliding_window self.layer_types = layer_types @@ -224,12 +194,11 @@ def __init__( self.layer_types = [ "sliding_attention" if (i + 1) % 4 != 0 else "full_attention" for i in 
range(self.num_hidden_layers) ] - layer_type_validation(self.layer_types) + layer_type_validation(self.layer_types, self.num_hidden_layers) - def _rope_scaling_validation(self): - """ - Validate the `rope_scaling` configuration. - """ + # Validate the correctness of rotary position embeddings parameters + rope_theta = getattr(self, "rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) @@ -299,29 +268,8 @@ class Olmo3DecoderLayer(Olmo2DecoderLayer): pass -# OLMo 3 RoPE is identical to OLMo 2 RoPE, except: -# - RoPE scaling is not applied to sliding window attention layers. -class Olmo3RotaryEmbedding(Olmo2RotaryEmbedding): - def __init__(self, config: Olmo3Config, device=None, rope_type: Optional[str] = None): - nn.Module.__init__(self) - if rope_type is not None: - self.rope_type = rope_type - elif hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - # BC: "rope_type" was originally "type" - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - assert self.rope_type is not None - - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq +class Olmo3RotaryEmbedding(Gemma2RotaryEmbedding): + pass class Olmo3PreTrainedModel(Olmo2PreTrainedModel): @@ -338,13 +286,7 @@ def __init__(self, config: Olmo3Config): self.layers = nn.ModuleList( [Olmo3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) - self.rotary_embs = nn.ModuleDict( - { - "sliding_attention": Olmo3RotaryEmbedding(config=config, rope_type="default"), - "full_attention": Olmo3RotaryEmbedding(config=config), - } - ) - del self.rotary_emb + self.rotary_emb = Olmo3RotaryEmbedding(config=config) def forward( self, @@ -393,10 +335,7 @@ def forward( } hidden_states = inputs_embeds - position_embeddings_mapping = { - "sliding_attention": self.rotary_embs["sliding_attention"](hidden_states, position_ids), - "full_attention": self.rotary_embs["full_attention"](hidden_states, position_ids), - } + position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( @@ -405,7 +344,7 @@ def forward( position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings_mapping[decoder_layer.self_attn.attention_type], + position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/olmoe/configuration_olmoe.py b/src/transformers/models/olmoe/configuration_olmoe.py index 99606d480f7d..5dae49098a29 100644 --- a/src/transformers/models/olmoe/configuration_olmoe.py +++ b/src/transformers/models/olmoe/configuration_olmoe.py @@ -11,8 +11,10 @@ # limitations under the License. """OLMoE model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class OlmoeConfig(PreTrainedConfig): @@ -64,16 +66,10 @@ class OlmoeConfig(PreTrainedConfig): End of stream token id. 
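The rewritten constructor above resolves a legacy `rope_scaling` kwarg into `rope_parameters` and then standardizes `rope_theta`. A standalone sketch of that precedence logic; `resolve_rope_parameters` is an illustrative helper name, not a function in the library:

```python
from typing import Optional


def resolve_rope_parameters(
    rope_parameters: Optional[dict] = None,
    rope_scaling: Optional[dict] = None,
    rope_theta: float = 10000.0,
) -> dict:
    # Prefer the legacy dict when given, mirroring `rope_scaling or rope_parameters`.
    params = dict(rope_scaling or rope_parameters or {})
    # Old checkpoints used "type" instead of "rope_type".
    if "type" in params and "rope_type" not in params:
        params["rope_type"] = params.pop("type")
    params.setdefault("rope_type", "default")
    # Fold the flat `rope_theta` kwarg into the dict, as `standardize_rope_params` is expected to do.
    params.setdefault("rope_theta", rope_theta)
    return params


print(resolve_rope_parameters(rope_scaling={"type": "linear", "factor": 4.0}, rope_theta=25000.0))
print(resolve_rope_parameters(rope_parameters={"rope_type": "default", "rope_theta": 10000.0}))
```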
tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling - strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is - `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. See the following thread for more information on how - these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an - experimental feature, subject to breaking API changes in future versions. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -111,31 +107,30 @@ class OlmoeConfig(PreTrainedConfig): def __init__( self, - vocab_size=50304, - hidden_size=2048, - intermediate_size=2048, - num_hidden_layers=16, - num_attention_heads=16, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-05, - use_cache=True, - pad_token_id=1, - bos_token_id=None, - eos_token_id=50279, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - clip_qkv=None, - num_experts_per_tok=8, - num_experts=64, - output_router_logits=False, - router_aux_loss_coef=0.01, - norm_topk_prob=False, + vocab_size: Optional[int] = 50304, + hidden_size: Optional[int] = 2048, + intermediate_size: Optional[int] = 2048, + num_hidden_layers: Optional[int] = 16, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = None, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 4096, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-05, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = None, + eos_token_id: Optional[int] = 50279, + tie_word_embeddings: Optional[int] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + clip_qkv: Optional[bool] = None, + num_experts_per_tok: Optional[int] = 8, + num_experts: Optional[int] = 64, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.01, + norm_topk_prob: Optional[bool] = False, **kwargs, ): self.vocab_size = vocab_size @@ -154,8 +149,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.clip_qkv = clip_qkv @@ -164,10 +157,13 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = 
router_aux_loss_coef self.norm_topk_prob = norm_topk_prob + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/olmoe/modeling_olmoe.py b/src/transformers/models/olmoe/modeling_olmoe.py index 0284d085aa48..3403b2e59667 100644 --- a/src/transformers/models/olmoe/modeling_olmoe.py +++ b/src/transformers/models/olmoe/modeling_olmoe.py @@ -63,20 +63,49 @@ class OlmoeRotaryEmbedding(nn.Module): def __init__(self, config: OlmoeConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[OlmoeConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -346,7 +375,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states diff --git a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py index 7fe5ae0b6205..0d079f76a797 100644 --- a/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py +++ b/src/transformers/models/perception_lm/convert_perception_lm_weights_to_hf.py @@ -359,7 +359,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): use_scaled_rope = model_params["use_scaled_rope"] if use_scaled_rope: - rope_scaling = { + rope_parameters = { "factor": model_params["rope_scale_factor"] * 1.0, "low_freq_factor": model_params.get("low_freq_factor", 1.0) * 1.0, "high_freq_factor": model_params.get("high_freq_factor", 4.0) * 1.0, @@ -367,7 +367,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): "rope_type": "llama3", } else: - rope_scaling = None + rope_parameters = None text_config = LlamaConfig( hidden_size=dim, @@ -378,7 +378,7 @@ def permute(w, n_heads, dim1=dim, dim2=dim): num_key_value_heads=num_key_value_heads, vocab_size=len(tokenizer), rope_theta=base, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, max_position_embeddings=max_position_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py index 5e80cd704ff3..5c2452526635 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -14,8 +14,10 @@ # limitations under the License. """Persimmon model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -58,45 +60,10 @@ class PersimmonConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings(`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 25000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. 
The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. 
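The `compute_default_rope_parameters` staticmethods added in this PR all reduce to the same inverse-frequency formula, optionally truncated by `partial_rotary_factor`. A standalone numeric sketch with illustrative sizes (not taken from a real checkpoint):

```python
import torch

base = 25000.0                # rope_parameters["rope_theta"]
head_dim = 64                 # hidden_size // num_attention_heads
partial_rotary_factor = 0.5   # only part of each head is rotated (Persimmon-style)
dim = int(head_dim * partial_rotary_factor)

# inv_freq[i] = 1 / base^(2i / dim), one frequency per pair of rotated channels
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
attention_factor = 1.0  # unused for the default RoPE type

print(inv_freq.shape)  # torch.Size([16]) -> dim // 2 frequencies
```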
qk_layernorm (`bool`, *optional*, default to `True`): Whether or not to normalize the Queries and Keys after projecting the hidden states hidden_dropout (`float`, *optional*, default to 0.0): @@ -120,26 +87,25 @@ class PersimmonConfig(PreTrainedConfig): def __init__( self, - vocab_size=262144, - hidden_size=4096, - intermediate_size=16384, - num_hidden_layers=36, - num_attention_heads=64, - hidden_act="relu2", - max_position_embeddings=16384, - initializer_range=0.02, - layer_norm_eps=1e-5, - use_cache=True, - tie_word_embeddings=False, - rope_theta=25000.0, - rope_scaling=None, - qk_layernorm=True, - hidden_dropout=0.0, - attention_dropout=0.0, - partial_rotary_factor=0.5, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, + vocab_size: Optional[int] = 262144, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 16384, + num_hidden_layers: Optional[int] = 36, + num_attention_heads: Optional[int] = 64, + hidden_act: Optional[str] = "relu2", + max_position_embeddings: Optional[int] = 16384, + initializer_range: Optional[float] = 0.02, + layer_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + qk_layernorm: Optional[bool] = True, + hidden_dropout: Optional[float] = 0.0, + attention_dropout: Optional[float] = 0.0, + partial_rotary_factor: Optional[float] = 0.5, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, **kwargs, ): self.vocab_size = vocab_size @@ -152,16 +118,17 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.qk_layernorm = qk_layernorm self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout self.partial_rotary_factor = partial_rotary_factor + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 25000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index b67bb3341008..205d5b1fc1d7 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -39,7 +39,10 @@ BaseModelOutputWithPast, CausalLMOutputWithPast, ) -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_rope_utils import ( + ROPE_INIT_FUNCTIONS, + dynamic_rope_update, +) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging @@ -61,20 +64,52 @@ class PersimmonRotaryEmbedding(nn.Module): def __init__(self, config: PersimmonConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + # Ignore copy + def compute_default_rope_parameters( + config: Optional[PersimmonConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -183,7 +218,7 @@ def __init__(self, config: PersimmonConfig, layer_idx: Optional[int] = None): self.hidden_size = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads - self.rope_theta = config.rope_theta + self.rotary_ndims = int(self.head_dim * config.partial_rotary_factor) self.is_causal = True @@ -205,7 +240,6 @@ def __init__(self, config: PersimmonConfig, layer_idx: Optional[int] = None): config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True ) self.attention_dropout = nn.Dropout(config.attention_dropout) - self.rotary_emb = PersimmonRotaryEmbedding(config=self.config) def _split_heads(self, fused_qkv: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ @@ -232,7 +266,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -253,8 +287,6 @@ def forward( key_states = key_states.transpose(1, 2) cos, sin = position_embeddings - - # Partial rotary embedding query_rot, query_pass = ( query_states[..., : self.rotary_ndims], query_states[..., self.rotary_ndims :], @@ -323,7 +355,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -431,9 +463,7 @@ def __init__(self, config: PersimmonConfig): [PersimmonDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - - self.rotary_emb = PersimmonRotaryEmbedding(config=config) - + self.rotary_emb = PersimmonRotaryEmbedding(config=self.config) self.gradient_checkpointing = False # Initialize weights and apply final processing self.post_init() @@ -488,9 +518,7 @@ def forward( ) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index 8457dd2709e7..427b453db981 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -15,8 +15,10 @@ """Phi model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -73,45 +75,10 @@ class PhiConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not. 
tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. partial_rotary_factor (`float`, *optional*, defaults to 0.5): Percentage of the query and keys which will have rotary embedding. 
qk_layernorm (`bool`, *optional*, defaults to `False`): @@ -155,27 +122,26 @@ class PhiConfig(PreTrainedConfig): def __init__( self, - vocab_size=51200, - hidden_size=2048, - intermediate_size=8192, - num_hidden_layers=24, - num_attention_heads=32, - num_key_value_heads=None, - resid_pdrop=0.0, - embd_pdrop=0.0, - attention_dropout=0.0, - hidden_act="gelu_new", - max_position_embeddings=2048, - initializer_range=0.02, - layer_norm_eps=1e-5, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - partial_rotary_factor=0.5, - qk_layernorm=False, - bos_token_id=1, - eos_token_id=2, + vocab_size: Optional[int] = 51200, + hidden_size: Optional[int] = 2048, + intermediate_size: Optional[int] = 8192, + num_hidden_layers: Optional[int] = 24, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + resid_pdrop: Optional[float] = 0.0, + embd_pdrop: Optional[float] = 0.0, + attention_dropout: Optional[float] = 0.0, + hidden_act: Optional[str] = "gelu_new", + max_position_embeddings: Optional[int] = 2048, + initializer_range: Optional[float] = 0.02, + layer_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + partial_rotary_factor: Optional[float] = 0.5, + qk_layernorm: Optional[bool] = False, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, **kwargs, ): self.vocab_size = vocab_size @@ -196,14 +162,15 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.partial_rotary_factor = partial_rotary_factor self.qk_layernorm = qk_layernorm + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index af71a136cbdc..3fb8de6e32e3 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -31,6 +31,73 @@ logger = logging.get_logger(__name__) +class PhiRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: PhiConfig, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[PhiConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -136,6 +203,7 @@ def forward( attention_mask: Optional[torch.Tensor], past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: input_shape = hidden_states.shape[:-1] @@ -223,7 +291,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: residual = hidden_states @@ -254,42 +322,6 @@ def forward( return outputs -class PhiRotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: PhiConfig, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - @auto_docstring class PhiPreTrainedModel(PreTrainedModel): config: PhiConfig @@ -384,9 +416,7 @@ def forward( inputs_embeds = self.embed_dropout(inputs_embeds) # diff with Llama hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/phi/modular_phi.py b/src/transformers/models/phi/modular_phi.py index c54dac28fcd0..3ecc9ba9d4f7 100644 --- a/src/transformers/models/phi/modular_phi.py +++ b/src/transformers/models/phi/modular_phi.py @@ -33,6 +33,40 @@ _CONFIG_FOR_DOC = "PhiConfig" +class PhiRotaryEmbedding(LlamaRotaryEmbedding): + @staticmethod + def compute_default_rope_parameters( + config: Optional[PhiConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + class PhiAttention(LlamaAttention): def __init__(self, config: PhiConfig, layer_idx: int): super().__init__(config, layer_idx) @@ -58,6 +92,7 @@ def forward( attention_mask: Optional[torch.Tensor], past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: input_shape = hidden_states.shape[:-1] @@ -134,7 +169,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: residual = hidden_states @@ -165,10 +200,6 @@ def forward( return outputs -class PhiRotaryEmbedding(LlamaRotaryEmbedding): - pass - - class PhiModel(LlamaModel): def __init__(self, config: PhiConfig): super().__init__(config) @@ -233,9 +264,7 @@ def forward( inputs_embeds = self.embed_dropout(inputs_embeds) # diff with Llama hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index 30429a673f48..ed096dd8a319 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -15,7 +15,10 @@ """Phi-3 model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -74,13 +77,10 @@ class Phi3Config(PreTrainedConfig): relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): - The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must - contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and - the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size - divided by the number of attention heads divided by 2. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. 
The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. partial_rotary_factor (`float`, *optional*, defaults to 1.0): Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0. bos_token_id (`int`, *optional*, defaults to 1): @@ -123,29 +123,28 @@ class Phi3Config(PreTrainedConfig): def __init__( self, - vocab_size=32064, - hidden_size=3072, - intermediate_size=8192, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - resid_pdrop=0.0, - embd_pdrop=0.0, - attention_dropout=0.0, - hidden_act="silu", - max_position_embeddings=4096, - original_max_position_embeddings=4096, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - partial_rotary_factor=1.0, - bos_token_id=1, - eos_token_id=32000, - pad_token_id=32000, - sliding_window=None, + vocab_size: Optional[int] = 32064, + hidden_size: Optional[int] = 3072, + intermediate_size: Optional[int] = 8192, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + resid_pdrop: Optional[float] = 0.0, + embd_pdrop: Optional[float] = 0.0, + attention_dropout: Optional[float] = 0.0, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 4096, + original_max_position_embeddings: Optional[int] = 4096, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + partial_rotary_factor: Optional[float] = 1.0, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 32000, + pad_token_id: Optional[int] = 32000, + sliding_window: Optional[int] = None, **kwargs, ): self.vocab_size = vocab_size @@ -167,11 +166,17 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.partial_rotary_factor = partial_rotary_factor - self._rope_scaling_adjustment() - self._rope_scaling_validation() + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + self._rope_parameters_adjustment() + self._rope_parameters_validation() self.sliding_window = sliding_window super().__init__( @@ -182,59 +187,54 @@ def __init__( **kwargs, ) - def _rope_scaling_adjustment(self): + def _rope_parameters_adjustment(self): """ - Adjust the `type` of the `rope_scaling` configuration for backward compatibility. + Adjust the `type` of the `rope_parameters` configuration for backward compatibility. 
""" - if self.rope_scaling is None: - return - - rope_scaling_type = self.rope_scaling.get("type", None) + rope_parameters_type = self.rope_parameters.get("rope_type", None) # For backward compatibility if previous version used "su" or "yarn" - if rope_scaling_type is not None and rope_scaling_type in ["su", "yarn"]: - self.rope_scaling["type"] = "longrope" + if rope_parameters_type is not None and rope_parameters_type in ["su", "yarn"]: + self.rope_parameters["rope_type"] = "longrope" - def _rope_scaling_validation(self): + def _rope_parameters_validation(self): """ - Validate the `rope_scaling` configuration. + Validate the `rope_parameters` configuration. """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3: - raise ValueError( - "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_short_factor = self.rope_scaling.get("short_factor", None) - rope_scaling_long_factor = self.rope_scaling.get("long_factor", None) - if rope_scaling_type is None or rope_scaling_type != "longrope": - raise ValueError(f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}") - if not ( - isinstance(rope_scaling_short_factor, list) - and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor) - ): - raise ValueError( - f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}" - ) + if not isinstance(self.rope_parameters, dict): + raise ValueError(f"`rope_parameters` must be a dictionary but got {self.rope_parameters}") + rope_parameters_type = self.rope_parameters.get("rope_type", None) + rope_parameters_short_factor = self.rope_parameters.get("short_factor", None) + rope_parameters_long_factor = self.rope_parameters.get("long_factor", None) rotary_ndims = int(self.hidden_size // self.num_attention_heads * self.partial_rotary_factor) - if not len(rope_scaling_short_factor) == rotary_ndims // 2: - raise ValueError( - f"`rope_scaling`'s short_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_short_factor)}" - ) - if not ( - isinstance(rope_scaling_long_factor, list) - and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor) - ): - raise ValueError( - f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}" - ) - if not len(rope_scaling_long_factor) == rotary_ndims // 2: - raise ValueError( - f"`rope_scaling`'s long_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_long_factor)}" - ) + if rope_parameters_type not in ["default", "longrope"]: + raise ValueError(f"`rope_parameters`'s type field must be one of ['longrope'], got {rope_parameters_type}") + + if rope_parameters_short_factor is not None: + if not ( + isinstance(rope_parameters_short_factor, list) + and all(isinstance(x, (int, float)) for x in rope_parameters_short_factor) + ): + raise ValueError( + f"`rope_parameters`'s short_factor field must be a list of numbers, got {rope_parameters_short_factor}" + ) + if not len(rope_parameters_short_factor) == rotary_ndims // 2: + raise ValueError( + f"`rope_parameters`'s short_factor field must have length {rotary_ndims // 2}, got {len(rope_parameters_short_factor)}" + ) + + if rope_parameters_long_factor is not None: + if not ( + isinstance(rope_parameters_long_factor, list) + and all(isinstance(x, (int, float)) for x in 
rope_parameters_long_factor) + ): + raise ValueError( + f"`rope_parameters`'s long_factor field must be a list of numbers, got {rope_parameters_long_factor}" + ) + if not len(rope_parameters_long_factor) == rotary_ndims // 2: + raise ValueError( + f"`rope_parameters`'s long_factor field must have length {rotary_ndims // 2}, got {len(rope_parameters_long_factor)}" + ) __all__ = ["Phi3Config"] diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 24a8463b70c8..d1ebf1ea99c0 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -65,6 +65,73 @@ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: return self.down_proj(up_states) +class Phi3RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Phi3Config, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Phi3Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -252,7 +319,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: residual = hidden_states @@ -297,42 +364,6 @@ class Phi3PreTrainedModel(PreTrainedModel): _version = "0.0.5" -class Phi3RotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: Phi3Config, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - @auto_docstring class Phi3Model(Phi3PreTrainedModel): def __init__(self, config: Phi3Config): @@ -393,7 +424,7 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( @@ -507,7 +538,7 @@ def prepare_inputs_for_generation( # It will cause downside of slower at this single token position, however, better than current failure. 
if ( past_key_values - and self.config.rope_scaling + and hasattr(self.config, "original_max_position_embeddings") and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1 ): past_length = cache_position[0] diff --git a/src/transformers/models/phi3/modular_phi3.py b/src/transformers/models/phi3/modular_phi3.py index e1d793fb79e4..5b0d5f76d69c 100644 --- a/src/transformers/models/phi3/modular_phi3.py +++ b/src/transformers/models/phi3/modular_phi3.py @@ -37,6 +37,7 @@ eager_attention_forward, rotate_half, ) +from ..phi.modeling_phi import PhiRotaryEmbedding from .configuration_phi3 import Phi3Config @@ -64,6 +65,10 @@ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: return self.down_proj(up_states) +class Phi3RotaryEmbedding(PhiRotaryEmbedding): + pass + + def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. @@ -182,7 +187,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: residual = hidden_states @@ -231,7 +236,7 @@ def prepare_inputs_for_generation( # It will cause downside of slower at this single token position, however, better than current failure. if ( past_key_values - and self.config.rope_scaling + and hasattr(self.config, "original_max_position_embeddings") and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1 ): past_length = cache_position[0] diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index bed51d1639fc..c9ea706b2c4c 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -19,8 +19,10 @@ # limitations under the License. import math +from typing import Optional from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Phi4MultimodalVisionConfig(PreTrainedConfig): @@ -290,13 +292,10 @@ class Phi4MultimodalConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): - The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must - contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and - the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size - divided by the number of attention heads divided by 2. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. 
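The `_rope_parameters_validation` logic in this PR (Phi3 above, Phi4Multimodal below) requires `short_factor` and `long_factor` lists of length `rotary_ndims // 2` when `rope_type` is `"longrope"`. A quick sketch of a dict that satisfies that constraint for Phi4Multimodal-like sizes; the numbers are illustrative:

```python
hidden_size, num_attention_heads, partial_rotary_factor = 3072, 32, 1.0
rotary_ndims = int(hidden_size // num_attention_heads * partial_rotary_factor)  # 96

rope_parameters = {
    "rope_type": "longrope",
    "rope_theta": 10000.0,
    "short_factor": [1.0] * (rotary_ndims // 2),  # 48 entries, one per frequency
    "long_factor": [2.0] * (rotary_ndims // 2),   # 48 entries, one per frequency
}

assert len(rope_parameters["short_factor"]) == rotary_ndims // 2
assert len(rope_parameters["long_factor"]) == rotary_ndims // 2
```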
partial_rotary_factor (`float`, *optional*, defaults to `1.0`): Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0. bos_token_id (`int`, *optional*, defaults to 199999): @@ -352,31 +351,30 @@ class Phi4MultimodalConfig(PreTrainedConfig): def __init__( self, - vocab_size=200064, - hidden_size=3072, - intermediate_size=8192, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - resid_pdrop=0.0, - embd_pdrop=0.0, - attention_dropout=0.0, - hidden_act="silu", - max_position_embeddings=131072, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - partial_rotary_factor=1, - bos_token_id=199999, - eos_token_id=[199999, 200020], - pad_token_id=199999, - original_max_position_embeddings=4096, - sliding_window=None, - vision_config=None, - audio_config=None, + vocab_size: Optional[int] = 200064, + hidden_size: Optional[int] = 3072, + intermediate_size: Optional[int] = 8192, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 8, + resid_pdrop: Optional[float] = 0.0, + embd_pdrop: Optional[float] = 0.0, + attention_dropout: Optional[float] = 0.0, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 131072, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + partial_rotary_factor: Optional[int] = 1, + bos_token_id: Optional[int] = 199999, + eos_token_id: Optional[list[int]] = [199999, 200020], + pad_token_id: Optional[int] = 199999, + original_max_position_embeddings: Optional[int] = 4096, + sliding_window: Optional[int] = None, + vision_config: Optional[dict] = None, + audio_config: Optional[dict] = None, **kwargs, ): if isinstance(vision_config, dict): @@ -409,11 +407,17 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.partial_rotary_factor = partial_rotary_factor - self._rope_scaling_adjustment() - self._rope_scaling_validation() + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + self._rope_parameters_adjustment() + self._rope_parameters_validation() self.sliding_window = sliding_window super().__init__( @@ -424,59 +428,54 @@ def __init__( **kwargs, ) - def _rope_scaling_adjustment(self): + def _rope_parameters_adjustment(self): """ - Adjust the `type` of the `rope_scaling` configuration for backward compatibility. + Adjust the `type` of the `rope_parameters` configuration for backward compatibility. 
""" - if self.rope_scaling is None: - return - - rope_scaling_type = self.rope_scaling.get("type", None) + rope_parameters_type = self.rope_parameters.get("rope_type", None) # For backward compatibility if previous version used "su" or "yarn" - if rope_scaling_type is not None and rope_scaling_type in ["su", "yarn"]: - self.rope_scaling["type"] = "longrope" + if rope_parameters_type is not None and rope_parameters_type in ["su", "yarn"]: + self.rope_parameters["rope_type"] = "longrope" - def _rope_scaling_validation(self): + def _rope_parameters_validation(self): """ - Validate the `rope_scaling` configuration. + Validate the `rope_parameters` configuration. """ - if self.rope_scaling is None: - return - - if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3: - raise ValueError( - "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, " - f"got {self.rope_scaling}" - ) - rope_scaling_type = self.rope_scaling.get("type", None) - rope_scaling_short_factor = self.rope_scaling.get("short_factor", None) - rope_scaling_long_factor = self.rope_scaling.get("long_factor", None) - if rope_scaling_type is None or rope_scaling_type != "longrope": - raise ValueError(f"`rope_scaling`'s type field must be one of ['longrope'], got {rope_scaling_type}") - if not ( - isinstance(rope_scaling_short_factor, list) - and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor) - ): - raise ValueError( - f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}" - ) + if not isinstance(self.rope_parameters, dict): + raise ValueError(f"`rope_parameters` must be a dictionary but got {self.rope_parameters}") + rope_parameters_type = self.rope_parameters.get("rope_type", None) + rope_parameters_short_factor = self.rope_parameters.get("short_factor", None) + rope_parameters_long_factor = self.rope_parameters.get("long_factor", None) rotary_ndims = int(self.hidden_size // self.num_attention_heads * self.partial_rotary_factor) - if not len(rope_scaling_short_factor) == rotary_ndims // 2: - raise ValueError( - f"`rope_scaling`'s short_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_short_factor)}" - ) - if not ( - isinstance(rope_scaling_long_factor, list) - and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor) - ): - raise ValueError( - f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}" - ) - if not len(rope_scaling_long_factor) == rotary_ndims // 2: - raise ValueError( - f"`rope_scaling`'s long_factor field must have length {rotary_ndims // 2}, got {len(rope_scaling_long_factor)}" - ) + if rope_parameters_type not in ["default", "longrope"]: + raise ValueError(f"`rope_parameters`'s type field must be one of ['longrope'], got {rope_parameters_type}") + + if rope_parameters_short_factor is not None: + if not ( + isinstance(rope_parameters_short_factor, list) + and all(isinstance(x, (int, float)) for x in rope_parameters_short_factor) + ): + raise ValueError( + f"`rope_parameters`'s short_factor field must be a list of numbers, got {rope_parameters_short_factor}" + ) + if not len(rope_parameters_short_factor) == rotary_ndims // 2: + raise ValueError( + f"`rope_parameters`'s short_factor field must have length {rotary_ndims // 2}, got {len(rope_parameters_short_factor)}" + ) + + if rope_parameters_long_factor is not None: + if not ( + isinstance(rope_parameters_long_factor, list) + and all(isinstance(x, (int, float)) for x in 
rope_parameters_long_factor) + ): + raise ValueError( + f"`rope_parameters`'s long_factor field must be a list of numbers, got {rope_parameters_long_factor}" + ) + if not len(rope_parameters_long_factor) == rotary_ndims // 2: + raise ValueError( + f"`rope_parameters`'s long_factor field must have length {rotary_ndims // 2}, got {len(rope_parameters_long_factor)}" + ) __all__ = ["Phi4MultimodalVisionConfig", "Phi4MultimodalAudioConfig", "Phi4MultimodalConfig"] diff --git a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py index b32ebf694eca..5040dbb80734 100644 --- a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py @@ -1392,7 +1392,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: residual = hidden_states @@ -1475,25 +1475,82 @@ def forward( return inputs_embeds +@auto_docstring +class Phi4MultimodalPreTrainedModel(PreTrainedModel): + config: Phi4MultimodalConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Phi4MultimodalDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": Phi4MultimodalDecoderLayer, + "attentions": Phi4MultimodalAttention, + } + _version = "0.0.5" + + def _init_weights(self, module): + super()._init_weights(module) + if isinstance(module, Phi4MultimodalImageEmbedding): + module.global_img_feature_extensor.data.zero_() + module.sub_img_feature_extensor.data.zero_() + + class Phi4MultimodalRotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` def __init__(self, config: Phi4MultimodalConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Phi4MultimodalConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according 
to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -1511,32 +1568,6 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -@auto_docstring -class Phi4MultimodalPreTrainedModel(PreTrainedModel): - config: Phi4MultimodalConfig - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["Phi4MultimodalDecoderLayer"] - _skip_keys_device_placement = ["past_key_values"] - _supports_flash_attn = True - _supports_sdpa = True - _supports_flex_attn = True - - _can_compile_fullgraph = True - _supports_attention_backend = True - _can_record_outputs = { - "hidden_states": Phi4MultimodalDecoderLayer, - "attentions": Phi4MultimodalAttention, - } - _version = "0.0.5" - - def _init_weights(self, module): - super()._init_weights(module) - if isinstance(module, Phi4MultimodalImageEmbedding): - module.global_img_feature_extensor.data.zero_() - module.sub_img_feature_extensor.data.zero_() - - @auto_docstring class Phi4MultimodalModel(Phi4MultimodalPreTrainedModel): def __init__(self, config: Phi4MultimodalConfig): @@ -1633,9 +1664,8 @@ def forward( ) hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers: hidden_states = decoder_layer( hidden_states, @@ -1793,7 +1823,7 @@ def prepare_inputs_for_generation( # It will cause downside of slower at this single token position, however, better than current failure. 
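The static `compute_default_rope_parameters` above derives one inverse frequency per pair of rotary dimensions, with `partial_rotary_factor` shrinking the rotary portion of the head dimension. A standalone sketch of that arithmetic with illustrative numbers:

```python
import torch

# Toy values; only the formula matches the method above.
hidden_size, num_attention_heads = 3072, 32
partial_rotary_factor = 0.75
rope_theta = 10000.0

head_dim = hidden_size // num_attention_heads  # 96
dim = int(head_dim * partial_rotary_factor)    # 72 rotary dimensions

inv_freq = 1.0 / (
    rope_theta ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)
)
print(inv_freq.shape)  # torch.Size([36]) -> one frequency per rotary dimension pair
```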
if ( past_key_values - and self.config.rope_scaling + and self.config.rope_parameters and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1 ): past_length = cache_position[0] diff --git a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py index 508dc085ef9b..86e9c302d337 100644 --- a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py @@ -32,6 +32,7 @@ BaseModelOutputWithPooling, CausalLMOutputWithPast, ) +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import auto_docstring, logging @@ -43,7 +44,6 @@ Phi3Model, Phi3PreTrainedModel, Phi3RMSNorm, - Phi3RotaryEmbedding, ) from ..siglip.configuration_siglip import SiglipVisionConfig from ..siglip.modeling_siglip import ( @@ -328,13 +328,10 @@ class Phi4MultimodalConfig(Phi3Config): relevant if `config.is_decoder=True`. Whether to tie weight embeddings or not. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): - The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must - contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `longrope` and - the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size - divided by the number of attention heads divided by 2. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. partial_rotary_factor (`float`, *optional*, defaults to `1.0`): Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0. 
bos_token_id (`int`, *optional*, defaults to 199999): @@ -376,31 +373,30 @@ class Phi4MultimodalConfig(Phi3Config): def __init__( self, - vocab_size=200064, - hidden_size=3072, - intermediate_size=8192, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - resid_pdrop=0.0, - embd_pdrop=0.0, - attention_dropout=0.0, - hidden_act="silu", - max_position_embeddings=131072, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - partial_rotary_factor=1, - bos_token_id=199999, - eos_token_id=[199999, 200020], - pad_token_id=199999, - original_max_position_embeddings=4096, - sliding_window=None, - vision_config=None, - audio_config=None, + vocab_size: Optional[int] = 200064, + hidden_size: Optional[int] = 3072, + intermediate_size: Optional[int] = 8192, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 8, + resid_pdrop: Optional[float] = 0.0, + embd_pdrop: Optional[float] = 0.0, + attention_dropout: Optional[float] = 0.0, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 131072, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + partial_rotary_factor: Optional[int] = 1, + bos_token_id: Optional[int] = 199999, + eos_token_id: Optional[list[int]] = [199999, 200020], + pad_token_id: Optional[int] = 199999, + original_max_position_embeddings: Optional[int] = 4096, + sliding_window: Optional[int] = None, + vision_config: Optional[dict] = None, + audio_config: Optional[dict] = None, **kwargs, ): if isinstance(vision_config, dict): @@ -431,8 +427,7 @@ def __init__( rms_norm_eps=rms_norm_eps, use_cache=use_cache, tie_word_embeddings=tie_word_embeddings, - rope_theta=rope_theta, - rope_scaling=rope_scaling, + rope_parameters=rope_parameters, partial_rotary_factor=partial_rotary_factor, bos_token_id=bos_token_id, eos_token_id=eos_token_id, @@ -1441,10 +1436,6 @@ def forward( return inputs_embeds -class Phi4MultimodalRotaryEmbedding(Phi3RotaryEmbedding): - pass - - class Phi4MultimodalPreTrainedModel(Phi3PreTrainedModel): def _init_weights(self, module): PreTrainedModel._init_weights(self, module) @@ -1546,9 +1537,8 @@ def forward( ) hidden_states = inputs_embeds + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers: hidden_states = decoder_layer( hidden_states, @@ -1701,7 +1691,7 @@ def prepare_inputs_for_generation( # It will cause downside of slower at this single token position, however, better than current failure. 
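As in the other models touched by this PR, the rotary embedding now runs once in the model `forward` and its `(cos, sin)` output is passed to every decoder layer as `position_embeddings`. A toy illustration of that call pattern (the modules below are stand-ins, not Transformers classes):

```python
import torch
import torch.nn as nn

class DummyRotary(nn.Module):
    def forward(self, x, position_ids=None):
        # Stand-in: real modules return position-dependent cos/sin tables
        cos = torch.ones(x.shape[0], x.shape[1], 8)
        sin = torch.zeros(x.shape[0], x.shape[1], 8)
        return cos, sin

class DummyLayer(nn.Module):
    def forward(self, hidden_states, position_embeddings=None):
        cos, sin = position_embeddings  # shared, not recomputed per layer
        return hidden_states + 0.0 * cos.mean()

rotary_emb = DummyRotary()
layers = nn.ModuleList([DummyLayer() for _ in range(2)])

hidden_states = torch.randn(1, 6, 8)
position_ids = torch.arange(6)[None, :]

position_embeddings = rotary_emb(hidden_states, position_ids=position_ids)
for layer in layers:
    hidden_states = layer(hidden_states, position_embeddings=position_embeddings)
print(hidden_states.shape)  # torch.Size([1, 6, 8])
```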
if ( past_key_values - and self.config.rope_scaling + and self.config.rope_parameters and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1 ): past_length = cache_position[0] diff --git a/src/transformers/models/phimoe/configuration_phimoe.py b/src/transformers/models/phimoe/configuration_phimoe.py index 189be828906f..8af5508daf93 100644 --- a/src/transformers/models/phimoe/configuration_phimoe.py +++ b/src/transformers/models/phimoe/configuration_phimoe.py @@ -15,8 +15,10 @@ """PyTorch Phi-MoE model.""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -70,14 +72,10 @@ class PhimoeConfig(PreTrainedConfig): The id of the "end-of-sequence" token. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. - rope_scaling (`dict`, *optional*): - The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must - contain the following keys: `type`, `short_factor`, `long_factor`, `short_mscale`, `long_mscale` and - `original_max_position_embeddings`. The `type` must be `longrope`, the `short_mscale` and `long_scale` must - be numbers, the `short_factor` and `long_factor` must be lists of numbers with the same length as half of - the attention head size and the `original_max_position_embeddings` must be an integer. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. sliding_window (`int`, *optional*): Sliding window attention window size. If not specified, will default to `262144`. 
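The configs in this PR share a backward-compatibility pattern, visible in the `__init__` changes just below: a legacy `rope_scaling` kwarg takes precedence over `rope_parameters`, and `rope_theta` is folded into the standardized dict before validation. A hedged sketch of that fallback logic (it mirrors the intent of `standardize_rope_params`, not its exact implementation):

```python
def resolve_rope_parameters(rope_parameters=None, default_theta=10000.0, **kwargs):
    rope_scaling = kwargs.pop("rope_scaling", None)  # legacy name wins if provided
    params = dict(rope_scaling or rope_parameters or {"rope_type": "default"})
    params.setdefault("rope_type", params.pop("type", "default"))  # old "type" key
    params.setdefault("rope_theta", kwargs.get("rope_theta", default_theta))
    return params

print(resolve_rope_parameters())
# {'rope_type': 'default', 'rope_theta': 10000.0}
print(resolve_rope_parameters(rope_scaling={"type": "longrope", "factor": 4.0}, rope_theta=1e6))
# {'factor': 4.0, 'rope_type': 'longrope', 'rope_theta': 1000000.0}
```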
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -115,33 +113,32 @@ class PhimoeConfig(PreTrainedConfig): def __init__( self, - vocab_size=32064, - hidden_size=4096, - intermediate_size=6400, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=4096 * 32, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=1e6, - rope_scaling=None, - sliding_window=None, - attention_dropout=0.0, - num_experts_per_tok=2, - num_local_experts=16, - output_router_logits=False, - router_aux_loss_coef=0.001, - router_jitter_noise=0.01, - input_jitter_noise=0.0, - attention_bias=False, - lm_head_bias=False, + vocab_size: Optional[int] = 32064, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 6400, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 8, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 4096 * 32, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = None, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[int] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + sliding_window: Optional[int] = None, + attention_dropout: Optional[float] = 0.0, + num_experts_per_tok: Optional[int] = 2, + num_local_experts: Optional[int] = 16, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, + router_jitter_noise: Optional[float] = 0.01, + input_jitter_noise: Optional[float] = 0.0, + attention_bias: Optional[bool] = False, + lm_head_bias: Optional[bool] = False, **kwargs, ): self.vocab_size = vocab_size @@ -162,7 +159,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_dropout = attention_dropout self.num_experts_per_tok = num_experts_per_tok @@ -171,21 +167,27 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.router_jitter_noise = router_jitter_noise self.input_jitter_noise = input_jitter_noise - - self.rope_scaling = rope_scaling - if isinstance(self.rope_scaling, dict): - if "rope_type" not in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling.get("type", None) - if "original_max_position_embeddings" in self.rope_scaling: - self.original_max_position_embeddings = self.rope_scaling["original_max_position_embeddings"] - rope_scaling_short_mscale = self.rope_scaling.get("short_mscale", None) - rope_scaling_long_mscale = self.rope_scaling.get("long_mscale", None) - if not isinstance(rope_scaling_short_mscale, (int, float)): + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 1000000.0) + standardize_rope_params(self, rope_theta=rope_theta) + + if self.rope_parameters["rope_type"] != "default": + if "original_max_position_embeddings" in self.rope_parameters: + self.original_max_position_embeddings = self.rope_parameters["original_max_position_embeddings"] + rope_parameters_short_mscale = 
self.rope_parameters.get("short_mscale", None) + rope_parameters_long_mscale = self.rope_parameters.get("long_mscale", None) + if not isinstance(rope_parameters_short_mscale, (int, float)): + raise TypeError( + f"`rope_parameters`'s short_mscale field must be a number, got {rope_parameters_short_mscale}" + ) + if not isinstance(rope_parameters_long_mscale, (int, float)): raise TypeError( - f"`rope_scaling`'s short_mscale field must be a number, got {rope_scaling_short_mscale}" + f"`rope_parameters`'s long_mscale field must be a number, got {rope_parameters_long_mscale}" ) - if not isinstance(rope_scaling_long_mscale, (int, float)): - raise TypeError(f"`rope_scaling`'s long_mscale field must be a number, got {rope_scaling_long_mscale}") rope_config_validation(self) diff --git a/src/transformers/models/phimoe/modeling_phimoe.py b/src/transformers/models/phimoe/modeling_phimoe.py index ebe815ddcee4..5f974d51e8c4 100644 --- a/src/transformers/models/phimoe/modeling_phimoe.py +++ b/src/transformers/models/phimoe/modeling_phimoe.py @@ -33,7 +33,7 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple @@ -42,28 +42,68 @@ class PhimoeRotaryEmbedding(nn.Module): - def __init__( - self, - config: Optional[PhimoeConfig] = None, - ): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: PhimoeConfig, device=None): super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings self.config = config - if config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - self.short_mscale = config.rope_scaling.get("short_mscale") - self.long_mscale = config.rope_scaling.get("long_mscale") - else: - self.rope_type = "default" - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - def forward(self, x, position_ids): + + self.rope_type = self.config.rope_parameters["rope_type"] + self.rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[PhimoeConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. 
+ Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids=None, layer_type=None): + if layer_type is not None: + raise ValueError( + f"{self.__class__.__name__} does not support layer types, but got `layer_type={layer_type}`" + ) + mscale = None seq_len = torch.max(position_ids) + 1 - if self.config.rope_scaling and seq_len: + if self.config.rope_parameters["rope_type"] != "default" and seq_len: mscale = ( self.long_mscale - if seq_len > self.config.rope_scaling["original_max_position_embeddings"] + if seq_len > self.config.rope_parameters["original_max_position_embeddings"] else self.short_mscale ) inv_freq, attention_scaling = self.rope_init_fn(self.config, x.device, seq_len) @@ -181,8 +221,8 @@ def __init__(self, config: PhimoeConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -558,7 +598,7 @@ def __init__(self, config: PhimoeConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Cache] = None, @@ -662,19 +702,17 @@ def forward( ) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=causal_mask, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) @@ -892,7 +930,7 @@ def prepare_inputs_for_generation( # It will cause downside of slower at this single token position, however, better than current failure. 
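The Phimoe rotary `forward` above switches between `short_mscale` and `long_mscale` depending on whether the current sequence length exceeds `original_max_position_embeddings`. A minimal sketch of that selection with toy values:

```python
import torch

# Toy values; the threshold logic matches the forward above.
original_max_position_embeddings = 4096
short_mscale, long_mscale = 1.0, 1.2

def pick_mscale(position_ids):
    seq_len = int(torch.max(position_ids)) + 1
    return long_mscale if seq_len > original_max_position_embeddings else short_mscale

print(pick_mscale(torch.arange(128)[None, :]))   # 1.0 -> short context
print(pick_mscale(torch.arange(8192)[None, :]))  # 1.2 -> long context
```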
if ( past_key_values - and self.config.rope_scaling + and hasattr(self.config, "original_max_position_embeddings") and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1 ): past_length = cache_position[0] diff --git a/src/transformers/models/phimoe/modular_phimoe.py b/src/transformers/models/phimoe/modular_phimoe.py index 7df807ccafac..59f5761987b9 100644 --- a/src/transformers/models/phimoe/modular_phimoe.py +++ b/src/transformers/models/phimoe/modular_phimoe.py @@ -15,7 +15,7 @@ """PyTorch Phimoe model.""" -from typing import Optional +from collections.abc import Callable import torch from torch import nn @@ -33,33 +33,40 @@ MixtralMLP, MixtralModel, MixtralPreTrainedModel, + MixtralRotaryEmbedding, ) from .configuration_phimoe import PhimoeConfig -class PhimoeRotaryEmbedding(nn.Module): - def __init__( - self, - config: Optional[PhimoeConfig] = None, - ): - super().__init__() +class PhimoeRotaryEmbedding(MixtralRotaryEmbedding): + def __init__(self, config: PhimoeConfig, device=None): + nn.Module.__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings self.config = config - if config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - self.short_mscale = config.rope_scaling.get("short_mscale") - self.long_mscale = config.rope_scaling.get("long_mscale") - else: - self.rope_type = "default" - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - def forward(self, x, position_ids): + + self.rope_type = self.config.rope_parameters["rope_type"] + self.rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + def forward(self, x, position_ids=None, layer_type=None): + if layer_type is not None: + raise ValueError( + f"{self.__class__.__name__} does not support layer types, but got `layer_type={layer_type}`" + ) + mscale = None seq_len = torch.max(position_ids) + 1 - if self.config.rope_scaling and seq_len: + if self.config.rope_parameters["rope_type"] != "default" and seq_len: mscale = ( self.long_mscale - if seq_len > self.config.rope_scaling["original_max_position_embeddings"] + if seq_len > self.config.rope_parameters["original_max_position_embeddings"] else self.short_mscale ) inv_freq, attention_scaling = self.rope_init_fn(self.config, x.device, seq_len) @@ -384,7 +391,7 @@ def prepare_inputs_for_generation( # It will cause downside of slower at this single token position, however, better than current failure. if ( past_key_values - and self.config.rope_scaling + and hasattr(self.config, "original_max_position_embeddings") and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1 ): past_length = cache_position[0] diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index 719aace6d541..3e2098adcd94 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -13,7 +13,10 @@ # limitations under the License. 
"""Pixtral model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -50,8 +53,8 @@ class PixtralVisionConfig(PreTrainedConfig): Activation function used in the hidden layers. attention_dropout (`float`, *optional*, defaults to 0.0): Dropout probability for the attention layers. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + The RopeParameters initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -74,17 +77,17 @@ class PixtralVisionConfig(PreTrainedConfig): def __init__( self, - hidden_size=1024, - intermediate_size=4096, - num_hidden_layers=24, - num_attention_heads=16, - num_channels=3, - image_size=1024, - patch_size=16, - hidden_act="gelu", - attention_dropout=0.0, - rope_theta=10000.0, - initializer_range=0.02, + hidden_size: Optional[int] = 1024, + intermediate_size: Optional[int] = 4096, + num_hidden_layers: Optional[int] = 24, + num_attention_heads: Optional[int] = 16, + num_channels: Optional[int] = 3, + image_size: Optional[int] = 1024, + patch_size: Optional[int] = 16, + hidden_act: Optional[str] = "gelu", + attention_dropout: Optional[float] = 0.0, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + initializer_range: Optional[float] = 0.02, **kwargs, ): super().__init__(**kwargs) @@ -98,9 +101,16 @@ def __init__( self.image_size = image_size self.attention_dropout = attention_dropout self.hidden_act = hidden_act - self.rope_theta = rope_theta self.head_dim = hidden_size // num_attention_heads self.initializer_range = initializer_range + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) __all__ = ["PixtralVisionConfig"] diff --git a/src/transformers/models/pixtral/modeling_pixtral.py b/src/transformers/models/pixtral/modeling_pixtral.py index 85530c0c01d1..a80fa76c8645 100644 --- a/src/transformers/models/pixtral/modeling_pixtral.py +++ b/src/transformers/models/pixtral/modeling_pixtral.py @@ -59,17 +59,52 @@ class PixtralRotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` - def __init__(self, config, device=None): + def __init__(self, config: PixtralVisionConfig, device=None, layer_type=None): super().__init__() - self.rope_type = "default" - self.dim = config.head_dim - self.base = config.rope_theta - max_patches_per_side = config.image_size // config.patch_size - freqs = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim)) - h = torch.arange(max_patches_per_side, device=freqs.device) - w = torch.arange(max_patches_per_side, device=freqs.device) + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + raise ValueError( + f"{self.__class__.__name__} does not support non-default RoPE, but got `rope_type={self.rope_type}`" + ) + inv_freq, attention_scaling = 
rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[PixtralVisionConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Here is the diff from Llama RoPE + max_patches_per_side = config.image_size // config.patch_size + h = torch.arange(max_patches_per_side) + w = torch.arange(max_patches_per_side) + + freqs = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) freqs_h = torch.outer(h, freqs[::2]).float() freqs_w = torch.outer(w, freqs[1::2]).float() inv_freq = torch.cat( @@ -78,17 +113,17 @@ def __init__(self, config, device=None): freqs_w[None, :, :].repeat(max_patches_per_side, 1, 1), ], dim=-1, - ).reshape(-1, self.dim // 2) # we reshape to only index on the position indexes, not tuple of indexes + ).reshape(-1, dim // 2) # we reshape to only index on the position indexes, not tuple of indexes # Different from paper, but it uses a different permutation in order to obtain the same calculation # TODO maybe make it torch compatible later on. We can also just slice - self.register_buffer("inv_freq", torch.cat((inv_freq, inv_freq), dim=-1), persistent=False) + inv_freq = torch.cat((inv_freq, inv_freq), dim=-1) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) def forward(self, x, position_ids): freqs = self.inv_freq[position_ids] - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" with torch.autocast(device_type=device_type, enabled=False): # Force float32 emb = freqs diff --git a/src/transformers/models/qwen2/configuration_qwen2.py b/src/transformers/models/qwen2/configuration_qwen2.py index 5fa2d8126bea..418b18027350 100644 --- a/src/transformers/models/qwen2/configuration_qwen2.py +++ b/src/transformers/models/qwen2/configuration_qwen2.py @@ -14,8 +14,10 @@ # limitations under the License. """Qwen2 model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -65,45 +67,10 @@ class Qwen2Config(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. 
- rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. use_sliding_window (`bool`, *optional*, defaults to `False`): Whether to use sliding window attention. 
sliding_window (`int`, *optional*, defaults to 4096): @@ -150,25 +117,24 @@ class Qwen2Config(PreTrainedConfig): def __init__( self, - vocab_size=151936, - hidden_size=4096, - intermediate_size=22016, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - use_sliding_window=False, - sliding_window=4096, - max_window_layers=28, - layer_types=None, - attention_dropout=0.0, + vocab_size: Optional[int] = 151936, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 22016, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 32, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + use_sliding_window: Optional[bool] = False, + sliding_window: Optional[int] = 4096, + max_window_layers: Optional[int] = 28, + layer_types: Optional[list[str]] = None, + attention_dropout: Optional[float] = 0.0, **kwargs, ): self.vocab_size = vocab_size @@ -190,14 +156,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_dropout = attention_dropout - # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] - rope_config_validation(self) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: @@ -209,6 +171,11 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + super().__init__( tie_word_embeddings=tie_word_embeddings, **kwargs, diff --git a/src/transformers/models/qwen2/modeling_qwen2.py b/src/transformers/models/qwen2/modeling_qwen2.py index 2efdfb69ddfe..59e038eb2552 100644 --- a/src/transformers/models/qwen2/modeling_qwen2.py +++ b/src/transformers/models/qwen2/modeling_qwen2.py @@ -47,6 +47,71 @@ def forward(self, x): return down_proj +class Qwen2RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Qwen2Config, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen2Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -124,6 +189,7 @@ class Qwen2Attention(nn.Module): def __init__(self, config: Qwen2Config, layer_idx: int): super().__init__() + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) @@ -135,7 +201,7 @@ def __init__(self, config: Qwen2Config, layer_idx: int): self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True) self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True) self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False) - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None def forward( self, @@ -223,7 +289,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -268,42 +334,6 @@ class Qwen2PreTrainedModel(PreTrainedModel): } -class Qwen2RotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: Qwen2Config, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - @auto_docstring class Qwen2Model(Qwen2PreTrainedModel): def __init__(self, config: Qwen2Config): @@ -374,19 +404,17 @@ def forward( causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/qwen2/modular_qwen2.py b/src/transformers/models/qwen2/modular_qwen2.py index 97161c2cf0f8..b06d3182b273 100644 --- a/src/transformers/models/qwen2/modular_qwen2.py +++ b/src/transformers/models/qwen2/modular_qwen2.py @@ -17,6 +17,7 @@ from ...utils import TransformersKwargs, auto_docstring, logging from ...utils.generic import check_model_inputs from ...utils.import_utils import get_torch_version +from ..gemma2.modeling_gemma2 import Gemma2RotaryEmbedding from ..llama.modeling_llama import ( LlamaAttention, LlamaDecoderLayer, @@ -44,14 +45,19 @@ def __init__(self, config): self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) +class Qwen2RotaryEmbedding(Gemma2RotaryEmbedding): + pass + + class Qwen2Attention(LlamaAttention): def __init__(self, config: Qwen2Config, layer_idx: int): + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None super().__init__(config, layer_idx) self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True) self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True) self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True) self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False) - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None def forward( self, @@ -193,19 +199,17 @@ def forward( causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + 
position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py index 74281ab88f97..69a4e3e0c66f 100644 --- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py @@ -19,8 +19,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -248,8 +250,6 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. use_sliding_window (`bool`, *optional*, defaults to `False`): Whether to use sliding window attention. sliding_window (`int`, *optional*, defaults to 32768): @@ -261,43 +261,10 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig): Attention pattern for each layer. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -343,25 +310,24 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig): def __init__( self, - vocab_size=152064, - hidden_size=3584, - intermediate_size=18944, - num_hidden_layers=28, - num_attention_heads=28, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=1000000.0, - rope_scaling=None, - use_sliding_window=False, - sliding_window=32768, - max_window_layers=28, - layer_types=None, - attention_dropout=0.0, + vocab_size: Optional[int] = 152064, + hidden_size: Optional[int] = 3584, + intermediate_size: Optional[int] = 18944, + num_hidden_layers: Optional[int] = 28, + num_attention_heads: Optional[int] = 28, + num_key_value_heads: Optional[int] = 4, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + use_sliding_window: Optional[bool] = False, + sliding_window: Optional[int] = 32768, + max_window_layers: Optional[int] = 28, + layer_types: Optional[list[str]] = None, + attention_dropout: Optional[float] = 0.0, **kwargs, ): super().__init__( @@ -387,17 +353,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_dropout = attention_dropout - # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'.
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] - rope_config_validation(self) - - if self.rope_scaling is None: - self.rope_scaling = {"mrope_section": [16, 24, 24], "rope_type": "default", "type": "default"} + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: @@ -409,6 +368,11 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = getattr(self, "rope_theta", 1000000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self, ignore_keys={"mrope_section"}) + class Qwen2_5OmniThinkerConfig(PreTrainedConfig): r""" @@ -599,8 +563,6 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. use_sliding_window (`bool`, *optional*, defaults to `False`): Whether to use sliding window attention. sliding_window (`int`, *optional*, defaults to 32768): @@ -610,43 +572,10 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig): additional layer afterwards will use SWA (Sliding Window Attention). attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. position_id_per_seconds (`int`, *optional*, defaults to 25): The increment of position id per second. seconds_per_chunk (`int`, *optional*, defaults to 2): @@ -717,12 +646,11 @@ def __init__( head_dim=128, use_cache=True, tie_word_embeddings=False, - rope_theta=1000000.0, use_sliding_window=False, sliding_window=32768, max_window_layers=28, attention_dropout=0.0, - rope_scaling=None, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, position_id_per_seconds=25, seconds_per_chunk=2, audio_start_token_id=151647, @@ -768,9 +696,10 @@ def __init__( self.hidden_act = hidden_act self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_dropout = attention_dropout - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.position_id_per_seconds = position_id_per_seconds # zf self.seconds_per_chunk = seconds_per_chunk # zf self.audio_start_token_id = audio_start_token_id # zf @@ -789,6 +718,11 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 1000000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -847,7 +781,7 @@ def __init__( ff_mult=2, emb_dim=512, head_dim=64, - rope_theta=10000.0, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, max_position_embeddings=32768, block_size=24, look_ahead_layers=[10], @@ -872,7 +806,6 @@ def __init__( self.ff_mult = ff_mult self.emb_dim = emb_dim self.head_dim = head_dim - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.block_size = block_size self.look_ahead_layers = look_ahead_layers @@ -889,6 +822,14 @@ def __init__( self.enc_attention_channels = enc_attention_channels self.enc_res2net_scale = enc_res2net_scale self.enc_se_channels = enc_se_channels + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self)
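The hunks above replace the old `rope_theta`/`rope_scaling` attributes of the Qwen2.5-Omni sub-configs with a single `rope_parameters` dict, normalized by `standardize_rope_params` and checked by `rope_config_validation`. A minimal sketch of how the two entry points are expected to behave after this patch; the exact defaults that `standardize_rope_params` fills in when nothing is passed are an assumption here, not something shown in the diff:

```python
# Sketch only: exercises the new `rope_parameters` path added in this patch.
# Assumes `standardize_rope_params` fills in `rope_type`/`rope_theta` when they are not provided.
from transformers.models.qwen2_5_omni.configuration_qwen2_5_omni import Qwen2_5OmniTextConfig

# New style: pass the whole dict explicitly.
cfg = Qwen2_5OmniTextConfig(
    rope_parameters={"rope_type": "default", "rope_theta": 1000000.0, "mrope_section": [16, 24, 24]}
)
print(cfg.rope_parameters["rope_type"])  # expected: "default"

# Legacy style: the old `rope_scaling` kwarg is still accepted and routed into `rope_parameters`.
legacy_cfg = Qwen2_5OmniTextConfig(
    rope_scaling={"rope_type": "default", "mrope_section": [16, 24, 24]}
)
print(legacy_cfg.rope_parameters["mrope_section"])  # expected: [16, 24, 24]
```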
super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py index 08cf3aa1a133..34fa81614d19 100644 --- a/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py @@ -1232,23 +1232,51 @@ class Qwen2_5OmniRotaryEmbedding(nn.Module): def __init__(self, config: Qwen2_5OmniThinkerConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen2_5OmniConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + # Ignore copy def forward(self, x, position_ids): # In contrast to other models, Qwen2_5Omni has different position ids for the grids # So we expand the inv_freq to shape (3, ...) 
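The inlined `compute_default_rope_parameters` above implements the standard inverse-frequency rule `inv_freq[i] = 1 / rope_theta ** (2i / head_dim)`. A standalone sketch of that computation, separate from the patch and with purely illustrative `head_dim`/`rope_theta` values:

```python
# Standalone sketch of the default RoPE inverse-frequency formula used above.
# head_dim and rope_theta are illustrative; real values come from the model config.
import torch

def default_inv_freq(head_dim: int = 128, rope_theta: float = 1_000_000.0) -> torch.Tensor:
    # One frequency per pair of rotated channels: inv_freq[i] = 1 / rope_theta ** (2 * i / head_dim)
    exponents = torch.arange(0, head_dim, 2, dtype=torch.int64).float() / head_dim
    return 1.0 / (rope_theta**exponents)

inv_freq = default_inv_freq()
print(inv_freq.shape)      # torch.Size([64]) -> head_dim // 2 frequencies
print(inv_freq[0].item())  # 1.0 for the lowest rotation index
```

The `forward` then multiplies these frequencies by the position ids and takes `cos`/`sin`, scaled by `attention_scaling` for the non-default RoPE variants.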
@@ -1334,16 +1362,15 @@ def __init__(self, config: Qwen2_5OmniConfig, layer_idx: Optional[int] = None): self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.is_causal = True self.attention_dropout = config.attention_dropout - self.rope_scaling = config.rope_scaling + self.rope_parameters = config.rope_parameters self.scaling = self.head_dim**-0.5 self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None - - self.rotary_emb = Qwen2_5OmniRotaryEmbedding(config=config) + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None def forward( self, @@ -1354,7 +1381,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -1369,7 +1396,7 @@ def forward( cos, sin = position_embeddings query_states, key_states = apply_multimodal_rotary_pos_emb( - query_states, key_states, cos, sin, self.rope_scaling["mrope_section"] + query_states, key_states, cos, sin, self.config.rope_parameters["mrope_section"] ) if past_key_values is not None: @@ -1438,7 +1465,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -1511,8 +1538,8 @@ def __init__(self, config: Qwen2_5OmniTextConfig): ) self._attn_implementation = config._attn_implementation self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = Qwen2_5OmniRotaryEmbedding(config=config) self.has_sliding_layers = "sliding_attention" in self.config.layer_types + self.rotary_emb = Qwen2_5OmniRotaryEmbedding(config=config) self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -1607,8 +1634,6 @@ def forward( causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) # decoder layers @@ -1622,12 +1647,12 @@ def forward( layer_outputs = decoder_layer( hidden_states, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, position_ids=text_position_ids, past_key_values=past_key_values, output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, - 
position_embeddings=position_embeddings, **kwargs, ) @@ -2086,8 +2111,8 @@ def __init__(self, config: Qwen2_5OmniTalkerConfig): ) self._attn_implementation = config._attn_implementation self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = Qwen2_5OmniRotaryEmbedding(config=config) self.has_sliding_layers = "sliding_attention" in self.config.layer_types + self.rotary_emb = Qwen2_5OmniRotaryEmbedding(config=config) self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -2182,8 +2207,6 @@ def forward( causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) # decoder layers @@ -2197,12 +2220,12 @@ def forward( layer_outputs = decoder_layer( hidden_states, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, position_ids=text_position_ids, past_key_values=past_key_values, output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) @@ -2474,33 +2497,67 @@ def _update_model_kwargs_for_generation( return model_kwargs -############################ -# Start Token2Wav # -############################ - - -# Using custom RoPE, will use LlamaRotaryEmbedding next version class Qwen2_5OmniDiTRotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` - def __init__(self, dim, base=10000): + def __init__(self, config: Qwen2_5OmniDiTConfig, device=None): super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen2_5OmniDiTConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor - inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) - self.register_buffer("inv_freq", inv_freq) - - def forward(self, x): - batch_size, seq_len = x.shape[0], x.shape[1] - t = torch.arange(seq_len, device=x.device) - device_type = x.device.type - device_type = device_type if device_type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): - freqs = t.unsqueeze(1).float() @ self.inv_freq.unsqueeze(0).float() - freqs = torch.stack((freqs, freqs), dim=-1) - freqs = freqs.reshape(*freqs.shape[:-2], -1) - freqs = freqs.repeat(batch_size, *([1] * freqs.dim())) - cos = freqs.cos() - sin = freqs.sin() + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) @@ -3482,7 +3539,7 @@ def __init__(self, config: Qwen2_5OmniDiTConfig): self.text_embed = DiTCodecEmbedding(config.num_embeds, config.emb_dim, config.repeats) self.input_embed = DiTInputEmbedding(config) - self.rotary_embed = Qwen2_5OmniDiTRotaryEmbedding(config.head_dim) + self.rotary_embed = Qwen2_5OmniDiTRotaryEmbedding(config=config) self.hidden_size = config.hidden_size self.layers = config.num_hidden_layers @@ -3543,7 +3600,9 @@ def forward( ) # Compute positional encodings - position_embeddings = self.rotary_embed(hidden_states) + position_ids = torch.arange(hidden_states.shape[1], device=hidden_states.device) + position_ids = position_ids[None, :].repeat(batch_size, 1) + position_embeddings = self.rotary_embed(hidden_states, position_ids) blockwise_difference = self._create_block_diff(hidden_states) # Transformer blocks diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index 043c7e4ebf3c..7b7c95f2847b 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -26,7 +26,7 @@ from torch import nn from torch.nn import Parameter -from transformers.models.llama.modeling_llama import rotate_half +from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding, rotate_half from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import ( Qwen2_5_VisionTransformerPretrainedModel, @@ -45,7 +45,7 @@ from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...generation import GenerationMixin from ...modeling_outputs import 
BaseModelOutput, ModelOutput -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import ( @@ -283,8 +283,6 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. use_sliding_window (`bool`, *optional*, defaults to `False`): Whether to use sliding window attention. sliding_window (`int`, *optional*, defaults to 32768): @@ -296,43 +294,10 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig): Attention pattern for each layer. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. 
Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -378,25 +343,24 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig): def __init__( self, - vocab_size=152064, - hidden_size=3584, - intermediate_size=18944, - num_hidden_layers=28, - num_attention_heads=28, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=1000000.0, - rope_scaling=None, - use_sliding_window=False, - sliding_window=32768, - max_window_layers=28, - layer_types=None, - attention_dropout=0.0, + vocab_size: Optional[int] = 152064, + hidden_size: Optional[int] = 3584, + intermediate_size: Optional[int] = 18944, + num_hidden_layers: Optional[int] = 28, + num_attention_heads: Optional[int] = 28, + num_key_value_heads: Optional[int] = 4, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + use_sliding_window: Optional[bool] = False, + sliding_window: Optional[int] = 32768, + max_window_layers: Optional[int] = 28, + layer_types: Optional[list[str]] = None, + attention_dropout: Optional[float] = 0.0, **kwargs, ): super().__init__( @@ -422,17 +386,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_dropout = attention_dropout - # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] - rope_config_validation(self) - - if self.rope_scaling is None: - self.rope_scaling = {"mrope_section": [16, 24, 24], "rope_type": "default", "type": "default"} + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: @@ -444,6 +401,11 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = getattr(self, "rope_theta", 1000000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self, ignore_keys={"mrope_section"}) + class Qwen2_5OmniThinkerConfig(PreTrainedConfig): r""" @@ -634,8 +596,6 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings.
use_sliding_window (`bool`, *optional*, defaults to `False`): Whether to use sliding window attention. sliding_window (`int`, *optional*, defaults to 32768): @@ -645,43 +605,10 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig): additional layer afterwards will use SWA (Sliding Window Attention). attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. position_id_per_seconds (`int`, *optional*, defaults to 25): The increment of position id per second.
seconds_per_chunk (`int`, *optional*, defaults to 2): @@ -752,12 +679,11 @@ def __init__( head_dim=128, use_cache=True, tie_word_embeddings=False, - rope_theta=1000000.0, use_sliding_window=False, sliding_window=32768, max_window_layers=28, attention_dropout=0.0, - rope_scaling=None, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, position_id_per_seconds=25, seconds_per_chunk=2, audio_start_token_id=151647, @@ -803,9 +729,10 @@ def __init__( self.hidden_act = hidden_act self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_dropout = attention_dropout - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.position_id_per_seconds = position_id_per_seconds # zf self.seconds_per_chunk = seconds_per_chunk # zf self.audio_start_token_id = audio_start_token_id # zf @@ -824,6 +751,11 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 1000000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -882,7 +814,7 @@ def __init__( ff_mult=2, emb_dim=512, head_dim=64, - rope_theta=10000.0, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, max_position_embeddings=32768, block_size=24, look_ahead_layers=[10], @@ -907,7 +839,6 @@ def __init__( self.ff_mult = ff_mult self.emb_dim = emb_dim self.head_dim = head_dim - self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings self.block_size = block_size self.look_ahead_layers = look_ahead_layers @@ -924,6 +855,14 @@ def __init__( self.enc_attention_channels = enc_attention_channels self.enc_res2net_scale = enc_res2net_scale self.enc_se_channels = enc_se_channels + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) super().__init__(**kwargs) @@ -2082,16 +2021,15 @@ def __init__(self, config: Qwen2_5OmniConfig, layer_idx: Optional[int] = None): self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.is_causal = True self.attention_dropout = config.attention_dropout - self.rope_scaling = config.rope_scaling + self.rope_parameters = config.rope_parameters self.scaling = self.head_dim**-0.5 self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True) self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None - - self.rotary_emb = Qwen2_5OmniRotaryEmbedding(config=config) + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None + self.sliding_window = config.sliding_window if self.layer_type == 
"sliding_attention" else None class Qwen2MLP(Qwen2_5_VLMLP): @@ -2782,30 +2720,21 @@ def _update_model_kwargs_for_generation( ############################ -# Using custom RoPE, will use LlamaRotaryEmbedding next version -class Qwen2_5OmniDiTRotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, dim, base=10000): - super().__init__() - - inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) - self.register_buffer("inv_freq", inv_freq) - - def forward(self, x): - batch_size, seq_len = x.shape[0], x.shape[1] - t = torch.arange(seq_len, device=x.device) - device_type = x.device.type - device_type = device_type if device_type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): - freqs = t.unsqueeze(1).float() @ self.inv_freq.unsqueeze(0).float() - freqs = torch.stack((freqs, freqs), dim=-1) - freqs = freqs.reshape(*freqs.shape[:-2], -1) - freqs = freqs.repeat(batch_size, *([1] * freqs.dim())) - cos = freqs.cos() - sin = freqs.sin() - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) +class Qwen2_5OmniDiTRotaryEmbedding(LlamaRotaryEmbedding): + def __init__(self, config: Qwen2_5OmniDiTConfig, device=None): + super().__init__(config, device=device) + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen2_5OmniDiTConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + return super().compute_default_rope_parameters( + config, + device=device, + seq_len=seq_len, + ) # Modified from Llama with a different rotate function, will fixed in next release @@ -3785,7 +3714,7 @@ def __init__(self, config: Qwen2_5OmniDiTConfig): self.text_embed = DiTCodecEmbedding(config.num_embeds, config.emb_dim, config.repeats) self.input_embed = DiTInputEmbedding(config) - self.rotary_embed = Qwen2_5OmniDiTRotaryEmbedding(config.head_dim) + self.rotary_embed = Qwen2_5OmniDiTRotaryEmbedding(config=config) self.hidden_size = config.hidden_size self.layers = config.num_hidden_layers @@ -3846,7 +3775,9 @@ def forward( ) # Compute positional encodings - position_embeddings = self.rotary_embed(hidden_states) + position_ids = torch.arange(hidden_states.shape[1], device=hidden_states.device) + position_ids = position_ids[None, :].repeat(batch_size, 1) + position_embeddings = self.rotary_embed(hidden_states, position_ids) blockwise_difference = self._create_block_diff(hidden_states) # Transformer blocks diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py index 06b1c6965a76..ad556be59a8a 100644 --- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py @@ -23,8 +23,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Qwen2_5_VLVisionConfig(PreTrainedConfig): @@ -109,8 +112,6 @@ class Qwen2_5_VLTextConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. 
tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. use_sliding_window (`bool`, *optional*, defaults to `False`): Whether to use sliding window attention. sliding_window (`int`, *optional*, defaults to 4096): @@ -122,43 +123,10 @@ class Qwen2_5_VLTextConfig(PreTrainedConfig): Attention pattern for each layer. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`.
```python >>> from transformers import Qwen2_5_VLTextModel, Qwen2_5_VLConfig @@ -194,25 +162,24 @@ class Qwen2_5_VLTextConfig(PreTrainedConfig): def __init__( self, - vocab_size=152064, - hidden_size=8192, - intermediate_size=29568, - num_hidden_layers=80, - num_attention_heads=64, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-05, - use_cache=True, - tie_word_embeddings=False, - rope_theta=1000000.0, - use_sliding_window=False, - sliding_window=4096, - max_window_layers=80, - layer_types=None, - attention_dropout=0.0, - rope_scaling=None, + vocab_size: Optional[int] = 152064, + hidden_size: Optional[int] = 8192, + intermediate_size: Optional[int] = 29568, + num_hidden_layers: Optional[int] = 80, + num_attention_heads: Optional[int] = 64, + num_key_value_heads: Optional[int] = 8, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-05, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + use_sliding_window: Optional[bool] = False, + sliding_window: Optional[int] = 4096, + max_window_layers: Optional[int] = 80, + layer_types: Optional[list[str]] = None, + attention_dropout: Optional[float] = 0.0, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, **kwargs, ): self.vocab_size = vocab_size @@ -234,9 +201,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_dropout = attention_dropout - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: @@ -249,14 +217,10 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. - # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations - # one can set it to "linear"/"dynamic" etc. 
to have scaled RoPE - # TODO: @raushan update config in the hub - if self.rope_scaling is not None and "type" in self.rope_scaling: - if self.rope_scaling["type"] == "mrope": - self.rope_scaling["type"] = "default" - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 1000000.0) + standardize_rope_params(self, rope_theta=rope_theta) + if self.rope_parameters["rope_type"] == "mrope": + self.rope_parameters["rope_type"] = "default" rope_config_validation(self, ignore_keys={"mrope_section"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 7511eb77379f..122da95565b2 100644 --- a/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -39,7 +39,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -491,25 +491,53 @@ class Qwen2_5_VLModelOutputWithPast(ModelOutput): class Qwen2_5_VLRotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` - def __init__(self, config: Qwen2_5_VLTextConfig, device=None): + def __init__(self, config: Qwen2_5_VLConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen2_5_VLConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + # Ignore copy def forward(self, x, position_ids): # In contrast to other models, Qwen2_5_VL has different position ids for the grids # So we expand the inv_freq to shape (3, ...) @@ -611,7 +639,7 @@ def __init__(self, config: Qwen2_5_VLTextConfig, layer_idx: Optional[int] = None self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.is_causal = True self.attention_dropout = config.attention_dropout - self.rope_scaling = config.rope_scaling + self.rope_parameters = config.rope_parameters self.scaling = self.head_dim**-0.5 if (self.head_dim * self.num_heads) != self.hidden_size: @@ -623,9 +651,8 @@ def __init__(self, config: Qwen2_5_VLTextConfig, layer_idx: Optional[int] = None self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None - - self.rotary_emb = Qwen2_5_VLRotaryEmbedding(config=config) + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None def forward( self, @@ -636,7 +663,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -651,7 +678,7 @@ def forward( cos, sin = position_embeddings query_states, key_states = apply_multimodal_rotary_pos_emb( - query_states, key_states, cos, sin, self.rope_scaling["mrope_section"] + query_states, key_states, cos, sin, self.config.rope_parameters["mrope_section"] ) if past_key_values is not None: @@ -706,7 +733,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -778,8 +805,8 @@ def __init__(self, config: Qwen2_5_VLTextConfig): ) self._attn_implementation = config._attn_implementation self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = Qwen2_5_VLRotaryEmbedding(config=config) self.has_sliding_layers = "sliding_attention" in self.config.layer_types + self.rotary_emb = Qwen2_5_VLRotaryEmbedding(config=config) 
self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -874,8 +901,6 @@ def forward( causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) # decoder layers @@ -889,12 +914,12 @@ def forward( layer_outputs = decoder_layer( hidden_states, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, position_ids=text_position_ids, past_key_values=past_key_values, output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py index a5a951d2f89b..f45577e91516 100644 --- a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py @@ -14,8 +14,10 @@ # limitations under the License. """Qwen2MoE model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -64,45 +66,10 @@ class Qwen2MoeConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. use_sliding_window (`bool`, *optional*, defaults to `False`): Whether to use sliding window attention. sliding_window (`int`, *optional*, defaults to 4096): @@ -171,35 +138,34 @@ class Qwen2MoeConfig(PreTrainedConfig): def __init__( self, - vocab_size=151936, - hidden_size=2048, - intermediate_size=5632, - num_hidden_layers=24, - num_attention_heads=16, - num_key_value_heads=16, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - use_sliding_window=False, - sliding_window=4096, - max_window_layers=28, - attention_dropout=0.0, - decoder_sparse_step=1, - moe_intermediate_size=1408, - shared_expert_intermediate_size=5632, - num_experts_per_tok=4, - num_experts=60, - norm_topk_prob=False, - output_router_logits=False, - router_aux_loss_coef=0.001, - mlp_only_layers=None, - qkv_bias=True, - layer_types=None, + vocab_size: Optional[int] = 151936, + hidden_size: Optional[int] = 2048, + intermediate_size: Optional[int] = 5632, + num_hidden_layers: Optional[int] = 24, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = 16, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + use_sliding_window: Optional[bool] = False, + sliding_window: Optional[int] = 4096, + max_window_layers: Optional[int] = 28, + attention_dropout: Optional[float] = 0.0, + decoder_sparse_step: Optional[int] = 1, + moe_intermediate_size: Optional[int] = 1408, + shared_expert_intermediate_size: Optional[int] = 5632, + num_experts_per_tok: Optional[int] = 4, + num_experts: Optional[int] = 60, + norm_topk_prob: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, + mlp_only_layers: Optional[bool] = None, + qkv_bias: Optional[bool] = True, + layer_types: Optional[list[str]] = None, **kwargs, ): self.layer_types = layer_types @@ -218,14 +184,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_dropout = attention_dropout - # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type'
field, move it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] - rope_config_validation(self) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -246,6 +208,12 @@ def __init__( for i in range(self.num_hidden_layers) ] layer_type_validation(self.layer_types) + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + super().__init__( tie_word_embeddings=tie_word_embeddings, **kwargs, diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index 24a86fbf21d5..55af7da767ce 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -77,20 +77,49 @@ class Qwen2MoeRotaryEmbedding(nn.Module): def __init__(self, config: Qwen2MoeConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen2MoeConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -219,8 +248,8 @@ def __init__(self, config: Qwen2MoeConfig, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[TransformersKwargs], @@ -351,7 +380,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -463,8 +492,6 @@ def forward( } hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) for i, decoder_layer in enumerate(self.layers[: self.config.num_hidden_layers]): diff --git a/src/transformers/models/qwen2_moe/modular_qwen2_moe.py b/src/transformers/models/qwen2_moe/modular_qwen2_moe.py index 9b82be47635f..56c100f94b93 100644 --- a/src/transformers/models/qwen2_moe/modular_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modular_qwen2_moe.py @@ -39,7 +39,8 @@ from ...utils import TransformersKwargs, auto_docstring from ...utils.generic import OutputRecorder, check_model_inputs from ..gemma.modeling_gemma import GemmaMLP -from ..llama.modeling_llama import LlamaAttention, LlamaDecoderLayer, LlamaRMSNorm, LlamaRotaryEmbedding +from ..gemma2.modeling_gemma2 import Gemma2RotaryEmbedding +from ..llama.modeling_llama import LlamaAttention, LlamaDecoderLayer, LlamaRMSNorm from ..mixtral.modeling_mixtral import ( MixtralExperts, MixtralForCausalLM, @@ -53,7 +54,7 @@ class Qwen2MoeRMSNorm(LlamaRMSNorm): pass -class Qwen2MoeRotaryEmbedding(LlamaRotaryEmbedding): +class Qwen2MoeRotaryEmbedding(Gemma2RotaryEmbedding): pass @@ -207,8 +208,6 @@ def forward( } hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) for i, decoder_layer in enumerate(self.layers[: self.config.num_hidden_layers]): diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 12c0ce8509cf..043ae4ee993e 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -14,8 +14,10 @@ # limitations under the License. """Qwen2VL model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -98,8 +100,6 @@ class Qwen2VLTextConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 1000000.0): - The base period of the RoPE embeddings. 
use_sliding_window (`bool`, *optional*, defaults to `False`): Whether to use sliding window attention. sliding_window (`int`, *optional*, defaults to 4096): @@ -111,43 +111,10 @@ Attention pattern for each layer. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`.
```python >>> from transformers import Qwen2VLTextModel, Qwen2VLConfig @@ -183,25 +150,24 @@ class Qwen2VLTextConfig(PreTrainedConfig): def __init__( self, - vocab_size=152064, - hidden_size=8192, - intermediate_size=29568, - num_hidden_layers=80, - num_attention_heads=64, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-05, - use_cache=True, - tie_word_embeddings=False, - rope_theta=1000000.0, - use_sliding_window=False, - sliding_window=4096, - max_window_layers=80, - layer_types=None, - attention_dropout=0.0, - rope_scaling=None, + vocab_size: Optional[int] = 152064, + hidden_size: Optional[int] = 8192, + intermediate_size: Optional[int] = 29568, + num_hidden_layers: Optional[int] = 80, + num_attention_heads: Optional[int] = 64, + num_key_value_heads: Optional[int] = 8, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-05, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + use_sliding_window: Optional[bool] = False, + sliding_window: Optional[int] = 4096, + max_window_layers: Optional[int] = 80, + layer_types: Optional[list[str]] = None, + attention_dropout: Optional[float] = 0.0, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, **kwargs, ): self.vocab_size = vocab_size @@ -223,9 +189,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_dropout = attention_dropout - self.rope_scaling = rope_scaling + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: @@ -238,14 +205,10 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. - # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations - # one can set it to "linear"/"dynamic" etc. 
to have scaled RoPE - # TODO: @raushan update config in the hub - if self.rope_scaling is not None and "type" in self.rope_scaling: - if self.rope_scaling["type"] == "mrope": - self.rope_scaling["type"] = "default" - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 1000000.0) + standardize_rope_params(self, rope_theta=rope_theta) + if self.rope_parameters["rope_type"] == "mrope": + self.rope_parameters["rope_type"] = "default" rope_config_validation(self, ignore_keys={"mrope_section"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py index 3489728b62eb..2e6ef2727ec5 100644 --- a/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py @@ -35,7 +35,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import ( @@ -107,28 +107,57 @@ class Qwen2VLCausalLMOutputWithPast(ModelOutput): rope_deltas: Optional[torch.LongTensor] = None +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Qwen2VL class Qwen2VLRotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` - def __init__(self, config: Qwen2VLTextConfig, device=None): + def __init__(self, config: Qwen2VLConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen2VLConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + # Ignore copy def forward(self, x, position_ids): # In contrast to other models, Qwen2_VL has different position ids for the grids # So we expand the inv_freq to shape (3, ...) @@ -471,7 +500,7 @@ def __init__(self, config: Qwen2VLTextConfig, layer_idx: Optional[int] = None): self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.is_causal = True self.attention_dropout = config.attention_dropout - self.rope_scaling = config.rope_scaling + self.rope_parameters = config.rope_parameters self.scaling = self.head_dim**-0.5 if (self.head_dim * self.num_heads) != self.hidden_size: @@ -483,9 +512,8 @@ def __init__(self, config: Qwen2VLTextConfig, layer_idx: Optional[int] = None): self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True) self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None - - self.rotary_emb = Qwen2VLRotaryEmbedding(config=config) + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None def forward( self, @@ -496,7 +524,7 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -511,7 +539,7 @@ def forward( cos, sin = position_embeddings query_states, key_states = apply_multimodal_rotary_pos_emb( - query_states, key_states, cos, sin, self.rope_scaling["mrope_section"] + query_states, key_states, cos, sin, self.config.rope_parameters["mrope_section"] ) if past_key_values is not None: @@ -566,7 +594,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -749,8 +777,8 @@ def __init__(self, config: Qwen2VLTextConfig): ) self._attn_implementation = config._attn_implementation self.norm = Qwen2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = Qwen2VLRotaryEmbedding(config=config) self.has_sliding_layers = "sliding_attention" in self.config.layer_types + self.rotary_emb = Qwen2VLRotaryEmbedding(config=config) self.gradient_checkpointing 
= False # Initialize weights and apply final processing @@ -845,8 +873,6 @@ def forward( causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) # decoder layers @@ -860,12 +886,12 @@ def forward( layer_outputs = decoder_layer( hidden_states, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, position_ids=text_position_ids, past_key_values=past_key_values, output_attentions=output_attentions, use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/qwen3/configuration_qwen3.py b/src/transformers/models/qwen3/configuration_qwen3.py index ed21fbe767a8..90edaff6aaa3 100644 --- a/src/transformers/models/qwen3/configuration_qwen3.py +++ b/src/transformers/models/qwen3/configuration_qwen3.py @@ -14,8 +14,10 @@ # limitations under the License. """Qwen3 model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -67,45 +69,10 @@ class Qwen3Config(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. use_sliding_window (`bool`, *optional*, defaults to `False`): @@ -154,27 +121,26 @@ class Qwen3Config(PreTrainedConfig): def __init__( self, - vocab_size=151936, - hidden_size=4096, - intermediate_size=22016, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - head_dim=128, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - use_sliding_window=False, - sliding_window=4096, - max_window_layers=28, - layer_types=None, - attention_dropout=0.0, + vocab_size: Optional[int] = 151936, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 22016, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 32, + head_dim: Optional[int] = 128, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + use_sliding_window: Optional[bool] = False, + sliding_window: Optional[int] = 4096, + max_window_layers: Optional[int] = 28, + layer_types: Optional[list[str]] = None, + attention_dropout: Optional[float] = 0.0, **kwargs, ): self.vocab_size = vocab_size @@ -197,15 +163,11 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] - rope_config_validation(self) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: @@ -217,6 +179,11 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + super().__init__( tie_word_embeddings=tie_word_embeddings, **kwargs, diff --git a/src/transformers/models/qwen3/modeling_qwen3.py b/src/transformers/models/qwen3/modeling_qwen3.py index 6af7a8c0316f..1973de1b19ef 100644 --- a/src/transformers/models/qwen3/modeling_qwen3.py +++ b/src/transformers/models/qwen3/modeling_qwen3.py @@ -83,6 +83,71 @@ def forward(self, x): return down_proj +class Qwen3RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Qwen3Config, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen3Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -160,6 +225,7 @@ class Qwen3Attention(nn.Module): def __init__(self, config: Qwen3Config, layer_idx: int): super().__init__() + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) @@ -182,7 +248,7 @@ def __init__(self, config: Qwen3Config, layer_idx: int): ) self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps) # unlike olmo, only on the head dim! self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps) # thus post q_norm does not need reshape - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None def forward( self, @@ -249,7 +315,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -294,42 +360,6 @@ class Qwen3PreTrainedModel(PreTrainedModel): } -class Qwen3RotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: Qwen3Config, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - @auto_docstring class Qwen3Model(Qwen3PreTrainedModel): def __init__(self, config: Qwen3Config): @@ -400,19 +430,17 @@ def forward( causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/qwen3/modular_qwen3.py b/src/transformers/models/qwen3/modular_qwen3.py index fb7c2da0ca95..2f113785a94e 100644 --- a/src/transformers/models/qwen3/modular_qwen3.py +++ b/src/transformers/models/qwen3/modular_qwen3.py @@ -30,14 +30,12 @@ LlamaAttention, ) from ..qwen2.modeling_qwen2 import ( - Qwen2DecoderLayer, Qwen2ForCausalLM, Qwen2ForQuestionAnswering, Qwen2ForSequenceClassification, Qwen2ForTokenClassification, - Qwen2Model, - Qwen2PreTrainedModel, Qwen2RMSNorm, + Qwen2RotaryEmbedding, apply_rotary_pos_emb, eager_attention_forward, ) @@ -57,12 +55,17 @@ class Qwen3MLP(GemmaMLP): pass +class Qwen3RotaryEmbedding(Qwen2RotaryEmbedding): + pass + + class Qwen3Attention(LlamaAttention): def __init__(self, config: Qwen3Config, layer_idx: int): + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None super().__init__(config, layer_idx) self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps) # unlike olmo, only on the head dim! 
self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps) # thus post q_norm does not need reshape - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None def forward( self, @@ -109,18 +112,6 @@ def forward( return attn_output, attn_weights -class Qwen3DecoderLayer(Qwen2DecoderLayer): - pass - - -class Qwen3PreTrainedModel(Qwen2PreTrainedModel): - pass - - -class Qwen3Model(Qwen2Model): - pass - - class Qwen3ForCausalLM(Qwen2ForCausalLM): def forward( self, @@ -166,8 +157,8 @@ class Qwen3ForQuestionAnswering(Qwen2ForQuestionAnswering): __all__ = [ "Qwen3ForCausalLM", "Qwen3ForQuestionAnswering", - "Qwen3PreTrainedModel", - "Qwen3Model", + "Qwen3PreTrainedModel", # noqa: F822 + "Qwen3Model", # noqa: F822 "Qwen3ForSequenceClassification", "Qwen3ForTokenClassification", ] diff --git a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py index 5f25cc232fe3..e5003c509118 100644 --- a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py @@ -14,8 +14,10 @@ # limitations under the License. """Qwen3MoE model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -65,45 +67,10 @@ class Qwen3MoeConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. 
- `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. use_sliding_window (`bool`, *optional*, defaults to `False`): @@ -169,32 +136,31 @@ class Qwen3MoeConfig(PreTrainedConfig): def __init__( self, - vocab_size=151936, - hidden_size=2048, - intermediate_size=6144, - num_hidden_layers=24, - num_attention_heads=32, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - use_sliding_window=False, - sliding_window=4096, - attention_dropout=0.0, - decoder_sparse_step=1, - moe_intermediate_size=768, - num_experts_per_tok=8, - num_experts=128, - norm_topk_prob=False, - output_router_logits=False, - router_aux_loss_coef=0.001, - mlp_only_layers=None, + vocab_size: Optional[int] = 151936, + hidden_size: Optional[int] = 2048, + intermediate_size: Optional[int] = 6144, + num_hidden_layers: Optional[int] = 24, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 4, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + use_sliding_window: Optional[bool] = False, + sliding_window: Optional[int] = 4096, + attention_dropout: Optional[float] = 0.0, + decoder_sparse_step: Optional[int] = 1, + moe_intermediate_size: Optional[int] = 768, + num_experts_per_tok: Optional[int] = 8, + num_experts: Optional[int] = 128, + norm_topk_prob: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, + mlp_only_layers: Optional[bool] = None, **kwargs, ): self.vocab_size = vocab_size @@ -211,14 +177,15 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` 
if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) # MoE arguments diff --git a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py index 6385f6f9bddf..7c0a7cb72a77 100644 --- a/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py @@ -311,7 +311,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -361,20 +361,49 @@ class Qwen3MoeRotaryEmbedding(nn.Module): def __init__(self, config: Qwen3MoeConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen3MoeConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -451,19 +480,17 @@ def forward( ) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=causal_mask, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, + position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/qwen3_moe/modular_qwen3_moe.py b/src/transformers/models/qwen3_moe/modular_qwen3_moe.py index 1f07a8e20e5e..87a4bbfa9625 100644 --- a/src/transformers/models/qwen3_moe/modular_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/modular_qwen3_moe.py @@ -49,6 +49,7 @@ class Qwen3MoeAttention(Qwen3Attention): # This is the main diff with qwen2Moe! def __init__(self, config: Qwen3MoeConfig, layer_idx: int): super().__init__(config, layer_idx) + del self.layer_type self.sliding_window = getattr(config, "sliding_window", None) diff --git a/src/transformers/models/qwen3_next/configuration_qwen3_next.py b/src/transformers/models/qwen3_next/configuration_qwen3_next.py index 564f3938ac2b..da6dde8c9db7 100644 --- a/src/transformers/models/qwen3_next/configuration_qwen3_next.py +++ b/src/transformers/models/qwen3_next/configuration_qwen3_next.py @@ -14,8 +14,10 @@ # limitations under the License. """Qwen3-Next model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -65,45 +67,10 @@ class Qwen3NextConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. 
- `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. partial_rotary_factor (`float`, *optional*, defaults to 0.25): Percentage of the query and keys which will have rotary embedding.
attention_bias (`bool`, *optional*, defaults to `False`): @@ -186,39 +153,38 @@ class Qwen3NextConfig(PreTrainedConfig): def __init__( self, - vocab_size=151936, - hidden_size=2048, - intermediate_size=5632, - num_hidden_layers=48, - num_attention_heads=16, - num_key_value_heads=2, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - partial_rotary_factor=0.25, - attention_bias=False, - attention_dropout=0.0, - head_dim=256, - linear_conv_kernel_dim=4, - linear_key_head_dim=128, - linear_value_head_dim=128, - linear_num_key_heads=16, - linear_num_value_heads=32, - decoder_sparse_step=1, - moe_intermediate_size=512, - shared_expert_intermediate_size=512, - num_experts_per_tok=10, - num_experts=512, - norm_topk_prob=True, - output_router_logits=False, - router_aux_loss_coef=0.001, - mlp_only_layers=[], - layer_types=None, + vocab_size: Optional[int] = 151936, + hidden_size: Optional[int] = 2048, + intermediate_size: Optional[int] = 5632, + num_hidden_layers: Optional[int] = 48, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = 2, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-6, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + partial_rotary_factor: Optional[float] = 0.25, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + head_dim: Optional[int] = 256, + linear_conv_kernel_dim: Optional[int] = 4, + linear_key_head_dim: Optional[int] = 128, + linear_value_head_dim: Optional[int] = 128, + linear_num_key_heads: Optional[int] = 16, + linear_num_value_heads: Optional[int] = 32, + decoder_sparse_step: Optional[int] = 1, + moe_intermediate_size: Optional[int] = 512, + shared_expert_intermediate_size: Optional[int] = 512, + num_experts_per_tok: Optional[int] = 10, + num_experts: Optional[int] = 512, + norm_topk_prob: Optional[bool] = True, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, + mlp_only_layers: Optional[list[int]] = [], + layer_types: Optional[list[str]] = None, **kwargs, ): super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -233,13 +199,13 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.head_dim = head_dim - rope_config_validation(self) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: @@ -250,6 +216,11 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = getattr(self, "rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + # linear attention part self.linear_conv_kernel_dim = linear_conv_kernel_dim self.linear_key_head_dim = linear_key_head_dim diff --git 
a/src/transformers/models/qwen3_next/modeling_qwen3_next.py b/src/transformers/models/qwen3_next/modeling_qwen3_next.py index 282327b4f96d..036d2a30a55e 100644 --- a/src/transformers/models/qwen3_next/modeling_qwen3_next.py +++ b/src/transformers/models/qwen3_next/modeling_qwen3_next.py @@ -178,20 +178,51 @@ class Qwen3NextRotaryEmbedding(nn.Module): def __init__(self, config: Qwen3NextConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen3NextConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -1006,8 +1037,6 @@ def forward( linear_attn_mask = self._update_linear_attn_mask(attention_mask, cache_position) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: diff --git a/src/transformers/models/qwen3_next/modular_qwen3_next.py b/src/transformers/models/qwen3_next/modular_qwen3_next.py index 0f8353c0e47f..1e9962797775 100644 --- a/src/transformers/models/qwen3_next/modular_qwen3_next.py +++ b/src/transformers/models/qwen3_next/modular_qwen3_next.py @@ -35,6 +35,7 @@ is_flash_linear_attention_available, ) from ..bamba.modeling_bamba import apply_mask_to_padding_states, apply_rotary_pos_emb +from ..gemma2.modeling_gemma2 import Gemma2RotaryEmbedding from ..gemma3.modeling_gemma3 import Gemma3RMSNorm from ..llama.modeling_llama import ( LlamaForQuestionAnswering, @@ -47,7 +48,6 @@ Qwen3MoeAttention, Qwen3MoeDecoderLayer, Qwen3MoeMLP, - Qwen3MoeRotaryEmbedding, eager_attention_forward, ) from .configuration_qwen3_next import Qwen3NextConfig @@ -181,8 +181,38 @@ def has_previous_state(self): return self.conv_states[self.last_linear_layer] is not None -class Qwen3NextRotaryEmbedding(Qwen3MoeRotaryEmbedding): - pass +class Qwen3NextRotaryEmbedding(Gemma2RotaryEmbedding): + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen3NextConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor class Qwen3NextRMSNorm(Gemma3RMSNorm): @@ -766,8 +796,6 @@ def forward( linear_attn_mask = self._update_linear_attn_mask(attention_mask, cache_position) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index a96b7699c7b5..281c2a2bf509 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -19,8 +19,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -90,23 +92,23 @@ class Qwen3OmniMoeAudioEncoderConfig(PreTrainedConfig): def __init__( self, - num_mel_bins=128, - encoder_layers=32, - encoder_attention_heads=20, - encoder_ffn_dim=5120, - d_model=1280, - dropout=0, - attention_dropout=0, - activation_function="gelu", - activation_dropout=0, - scale_embedding=False, - initializer_range=0.02, - max_source_positions=1500, - n_window=100, - output_dim=3584, - n_window_infer=400, - conv_chunksize=500, - downsample_hidden_size=480, + num_mel_bins: Optional[int] = 128, + encoder_layers: Optional[int] = 32, + encoder_attention_heads: Optional[int] = 20, + encoder_ffn_dim: Optional[int] = 5120, + d_model: Optional[int] = 1280, + dropout: Optional[int] = 0, + attention_dropout: Optional[int] = 0, + activation_function: Optional[int] = "gelu", + activation_dropout: Optional[int] = 0, + scale_embedding: Optional[int] = False, + initializer_range: Optional[int] = 0.02, + max_source_positions: Optional[int] = 1500, + n_window: Optional[int] = 100, + output_dim: Optional[int] = 3584, + n_window_infer: Optional[int] = 400, + conv_chunksize: Optional[int] = 500, + downsample_hidden_size: Optional[int] = 480, **kwargs, ): super().__init__(**kwargs) @@ -212,45 +214,10 @@ class Qwen3OmniMoeTextConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. 
- `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. use_sliding_window (`bool`, *optional*, defaults to `False`): @@ -316,31 +283,30 @@ class Qwen3OmniMoeTextConfig(PreTrainedConfig): def __init__( self, - vocab_size=3584, - hidden_size=2048, - intermediate_size=18944, - num_hidden_layers=28, - num_attention_heads=28, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=1000000.0, - rope_scaling=None, - attention_bias=False, - sliding_window=None, - attention_dropout=0, - decoder_sparse_step=1, - moe_intermediate_size=768, - num_experts_per_tok=8, - num_experts=128, - norm_topk_prob=True, - output_router_logits=False, - router_aux_loss_coef=0.001, - mlp_only_layers=None, + vocab_size: Optional[int] = 3584, + hidden_size: Optional[int] = 2048, + intermediate_size: Optional[int] = 18944, + num_hidden_layers: Optional[int] = 28, + num_attention_heads: Optional[int] = 28, + num_key_value_heads: Optional[int] = 4, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-6, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + sliding_window: Optional[int] = None, + attention_dropout: Optional[int] = 0, + decoder_sparse_step: Optional[int] = 1, + moe_intermediate_size: Optional[int] = 768, + num_experts_per_tok: Optional[int] = 8, + num_experts: Optional[int] = 128, + norm_topk_prob: Optional[bool] = True, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, + mlp_only_layers: Optional[list[int]] = None, **kwargs, ): self.vocab_size = vocab_size @@ -356,14 +322,15 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = 
kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 1000000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) # MoE arguments @@ -528,45 +495,10 @@ class Qwen3OmniMoeTalkerCodePredictorConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. 
The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. use_sliding_window (`bool`, *optional*, defaults to `False`): @@ -615,26 +547,25 @@ class Qwen3OmniMoeTalkerCodePredictorConfig(PreTrainedConfig): def __init__( self, - vocab_size=2048, - hidden_size=1024, - intermediate_size=3072, - num_hidden_layers=5, - num_attention_heads=16, - num_key_value_heads=8, - head_dim=128, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=0.000001, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000, - rope_scaling=None, - attention_bias=False, - sliding_window=None, - layer_types=None, - attention_dropout=0, - num_code_groups=32, + vocab_size: Optional[int] = 2048, + hidden_size: Optional[int] = 1024, + intermediate_size: Optional[int] = 3072, + num_hidden_layers: Optional[int] = 5, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = 8, + head_dim: Optional[int] = 128, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 0.000001, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[int] = None, + attention_bias: Optional[bool] = False, + sliding_window: Optional[int] = None, + layer_types: Optional[list[str]] = None, + attention_dropout: Optional[int] = 0, + num_code_groups: Optional[int] = 32, **kwargs, ): self.vocab_size = vocab_size @@ -655,15 +586,11 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] - rope_config_validation(self) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: @@ -675,6 +602,11 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + super().__init__( tie_word_embeddings=tie_word_embeddings, **kwargs, @@ -725,45 +657,10 @@ class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. 
NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
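The configuration hunks in this file all follow the same backward-compatibility pattern: a legacy `rope_scaling` kwarg is popped and used as `rope_parameters` when present, then `standardize_rope_params` folds in `rope_theta` (taken from kwargs or the model's historical default) before `rope_config_validation` runs. The sketch below is a hedged usage example only; the import path and constructor behaviour assume a Transformers checkout that already includes this patch.

```python
# Hedged sketch of the backward-compatibility path introduced in these configs.
# The import path follows this patch and will not resolve on releases that
# predate the `rope_parameters` refactor.
from transformers.models.qwen3_omni_moe.configuration_qwen3_omni_moe import Qwen3OmniMoeTextConfig

# Legacy kwargs: `rope_scaling` is popped from **kwargs and becomes
# `config.rope_parameters`; `standardize_rope_params` then folds `rope_theta` in.
legacy = Qwen3OmniMoeTextConfig(
    rope_theta=1000000.0,
    rope_scaling={"rope_type": "linear", "factor": 4.0},
)

# New-style call: pass `rope_parameters` directly, including `rope_theta`.
modern = Qwen3OmniMoeTextConfig(
    rope_parameters={"rope_type": "linear", "rope_theta": 1000000.0, "factor": 4.0},
)

# Both routes should end with a populated, validated `rope_parameters` dict.
print(legacy.rope_parameters)
print(modern.rope_parameters)
```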
use_sliding_window (`bool`, *optional*, defaults to `False`): @@ -829,31 +726,30 @@ class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig): def __init__( self, - vocab_size=3072, - hidden_size=1024, - intermediate_size=2048, - num_hidden_layers=20, - num_attention_heads=16, - num_key_value_heads=2, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=0.000001, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000, - rope_scaling=None, - attention_bias=False, - sliding_window=None, - attention_dropout=0, - decoder_sparse_step=1, - moe_intermediate_size=384, - num_experts_per_tok=8, - num_experts=128, - norm_topk_prob=False, - output_router_logits=False, - router_aux_loss_coef=0.001, - mlp_only_layers=None, + vocab_size: Optional[int] = 3072, + hidden_size: Optional[int] = 1024, + intermediate_size: Optional[int] = 2048, + num_hidden_layers: Optional[int] = 20, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = 2, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 0.000001, + use_cache: Optional[int] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + sliding_window: Optional[int] = None, + attention_dropout: Optional[int] = 0, + decoder_sparse_step: Optional[int] = 1, + moe_intermediate_size: Optional[int] = 384, + num_experts_per_tok: Optional[int] = 8, + num_experts: Optional[int] = 128, + norm_topk_prob: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, + mlp_only_layers: Optional[list[int]] = None, **kwargs, ): self.vocab_size = vocab_size @@ -869,14 +765,15 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) # MoE arguments diff --git a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py index 60bf314c00bf..2e167cdbea65 100644 --- a/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py @@ -1219,38 +1219,51 @@ class Qwen3OmniMoeThinkerTextRotaryEmbedding(nn.Module): def __init__(self, config: Qwen3OmniMoeTextConfig, device=None): super().__init__() - if hasattr(config, "rope_scaling") and config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", "default") - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq - self.mrope_section = config.rope_scaling.get("mrope_section", [24, 20, 20]) + self.mrope_section = config.rope_parameters.get("mrope_section", [24, 20, 20]) - def apply_interleaved_mrope(self, freqs, mrope_section): - """Apply interleaved MRoPE to 3D rotary embeddings. - Reorganizes frequency layout from chunked [TTT...HHH...WWW] to - interleaved [THTHWHTHW...TT], preserving frequency continuity. - args: - x: (3, bs, seq_len, head_dim // 2) - mrope_section: (3,) - returns: - x_t: (bs, seq_len, head_dim // 2) + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen3OmniMoeTextConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: """ - freqs_t = freqs[0] # just overwrite the first dimension T - for dim, offset in enumerate((1, 2), start=1): # H, W - length = mrope_section[dim] * 3 - idx = slice(offset, length, 3) - freqs_t[..., idx] = freqs[dim, ..., idx] - return freqs_t + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -1272,6 +1285,23 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + def apply_interleaved_mrope(self, freqs, mrope_section): + """Apply interleaved MRoPE to 3D rotary embeddings. + Reorganizes frequency layout from chunked [TTT...HHH...WWW] to + interleaved [THTHWHTHW...TT], preserving frequency continuity. + args: + x: (3, bs, seq_len, head_dim // 2) + mrope_section: (3,) + returns: + x_t: (bs, seq_len, head_dim // 2) + """ + freqs_t = freqs[0] # just overwrite the first dimension T + for dim, offset in enumerate((1, 2), start=1): # H, W + length = mrope_section[dim] * 3 + idx = slice(offset, length, 3) + freqs_t[..., idx] = freqs[dim, ..., idx] + return freqs_t + class Qwen3OmniMoeThinkerTextMLP(nn.Module): def __init__(self, config, intermediate_size=None): @@ -1496,7 +1526,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -2248,6 +2278,7 @@ class Qwen3OmniMoeTalkerCodePredictorAttention(nn.Module): def __init__(self, config: Qwen3OmniMoeConfig, layer_idx: int): super().__init__() + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) @@ -2272,7 +2303,7 @@ def __init__(self, config: Qwen3OmniMoeConfig, layer_idx: int): self.k_norm = Qwen3OmniMoeRMSNorm( self.head_dim, eps=config.rms_norm_eps ) # thus post q_norm does not need reshape - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None def forward( self, @@ -2354,7 +2385,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -2385,20 +2416,49 @@ class Qwen3OmniMoeRotaryEmbedding(nn.Module): def __init__(self, config: Qwen3OmniMoeConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - 
self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen3OmniMoeConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -2492,7 +2552,7 @@ def forward( hidden_states = inputs_embeds - # create position embeddings to be shared across the decoder layers + hidden_states = inputs_embeds position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: @@ -2609,16 +2669,76 @@ def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_ @dataclass class Qwen3OmniMoeTalkerOutputWithPast(MoeCausalLMOutputWithPast): r""" - Args: - generation_step (`int`, *optional*): - Current generation step, used to track which `trailing_text_hidden` should be used. + generation_step (`int`, *optional*): + Current generation step, used to track which `trailing_text_hidden` should be used. 
""" generation_step: Optional[int] = None -class Qwen3OmniMoeTalkerRotaryEmbedding(Qwen3OmniMoeThinkerTextRotaryEmbedding): - pass +class Qwen3OmniMoeTalkerRotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: Qwen3OmniMoeConfig, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen3OmniMoeConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) class Qwen3OmniMoeTalkerTextMLP(nn.Module): @@ -2731,7 +2851,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -3229,47 +3349,12 @@ def forward(self, hidden_states): return hidden_states -class Qwen3OmniMoeCode2WavRotatoryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: Qwen3OmniMoeConfig, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - class Qwen3OmniMoeCode2WavAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__(self, config: Qwen3OmniMoeCode2WavConfig, layer_idx): super().__init__() + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) @@ -3531,19 +3616,17 @@ def forward( causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index 1bdc46fbba82..f511b57bcf84 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -41,6 +41,7 @@ MoeCausalLMOutputWithPast, MoeModelOutputWithPast, ) +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...processing_utils import ProcessorMixin, Unpack from ...tokenization_utils_base import TextInput from ...utils import auto_docstring, can_return_tuple, logging @@ -108,23 +109,23 @@ def _get_feat_extract_output_lengths(input_lengths): class Qwen3OmniMoeAudioEncoderConfig(Qwen2_5OmniAudioEncoderConfig): def __init__( self, - num_mel_bins=128, - encoder_layers=32, - encoder_attention_heads=20, - encoder_ffn_dim=5120, - d_model=1280, - dropout=0, - attention_dropout=0, - activation_function="gelu", - activation_dropout=0, - scale_embedding=False, - initializer_range=0.02, - max_source_positions=1500, - n_window=100, - output_dim=3584, - n_window_infer=400, - conv_chunksize=500, - downsample_hidden_size=480, + num_mel_bins: Optional[int] = 128, + encoder_layers: Optional[int] = 32, + encoder_attention_heads: Optional[int] = 20, + encoder_ffn_dim: Optional[int] = 5120, + d_model: Optional[int] = 1280, + dropout: Optional[int] = 0, + attention_dropout: Optional[int] = 0, + activation_function: Optional[int] = "gelu", + activation_dropout: Optional[int] = 0, + scale_embedding: Optional[int] = False, + initializer_range: Optional[int] = 0.02, + max_source_positions: Optional[int] = 1500, + n_window: Optional[int] = 100, + output_dim: 
Optional[int] = 3584, + n_window_infer: Optional[int] = 400, + conv_chunksize: Optional[int] = 500, + downsample_hidden_size: Optional[int] = 480, **kwargs, ): super().__init__( @@ -156,31 +157,30 @@ class Qwen3OmniMoeVisionEncoderConfig(Qwen3VLMoeVisionConfig): class Qwen3OmniMoeTextConfig(Qwen3MoeConfig): def __init__( self, - vocab_size=3584, - hidden_size=2048, - intermediate_size=18944, - num_hidden_layers=28, - num_attention_heads=28, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=1000000.0, - rope_scaling=None, - attention_bias=False, - sliding_window=None, - attention_dropout=0, - decoder_sparse_step=1, - moe_intermediate_size=768, - num_experts_per_tok=8, - num_experts=128, - norm_topk_prob=True, - output_router_logits=False, - router_aux_loss_coef=0.001, - mlp_only_layers=None, + vocab_size: Optional[int] = 3584, + hidden_size: Optional[int] = 2048, + intermediate_size: Optional[int] = 18944, + num_hidden_layers: Optional[int] = 28, + num_attention_heads: Optional[int] = 28, + num_key_value_heads: Optional[int] = 4, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-6, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + sliding_window: Optional[int] = None, + attention_dropout: Optional[int] = 0, + decoder_sparse_step: Optional[int] = 1, + moe_intermediate_size: Optional[int] = 768, + num_experts_per_tok: Optional[int] = 8, + num_experts: Optional[int] = 128, + norm_topk_prob: Optional[bool] = True, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, + mlp_only_layers: Optional[list[int]] = None, **kwargs, ): super().__init__( @@ -196,8 +196,7 @@ def __init__( rms_norm_eps, use_cache, tie_word_embeddings, - rope_theta, - rope_scaling, + rope_parameters, attention_bias, False, sliding_window, @@ -215,6 +214,11 @@ def __init__( del self.use_sliding_window self.sliding_window = sliding_window + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 1000000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + class Qwen3OmniMoeThinkerConfig(Qwen2_5OmniThinkerConfig): r""" @@ -311,26 +315,25 @@ def __init__( class Qwen3OmniMoeTalkerCodePredictorConfig(Qwen3Config): def __init__( self, - vocab_size=2048, - hidden_size=1024, - intermediate_size=3072, - num_hidden_layers=5, - num_attention_heads=16, - num_key_value_heads=8, - head_dim=128, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=0.000001, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000, - rope_scaling=None, - attention_bias=False, - sliding_window=None, - layer_types=None, - attention_dropout=0, - num_code_groups=32, + vocab_size: Optional[int] = 2048, + hidden_size: Optional[int] = 1024, + intermediate_size: Optional[int] = 3072, + num_hidden_layers: Optional[int] = 5, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = 8, + head_dim: Optional[int] = 128, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: 
Optional[float] = 0.000001, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[int] = None, + attention_bias: Optional[bool] = False, + sliding_window: Optional[int] = None, + layer_types: Optional[list[str]] = None, + attention_dropout: Optional[int] = 0, + num_code_groups: Optional[int] = 32, **kwargs, ): super().__init__( @@ -347,8 +350,7 @@ def __init__( rms_norm_eps, use_cache, tie_word_embeddings, - rope_theta, - rope_scaling, + rope_parameters, attention_bias, False, sliding_window, @@ -366,31 +368,30 @@ def __init__( class Qwen3OmniMoeTalkerTextConfig(Qwen3MoeConfig): def __init__( self, - vocab_size=3072, - hidden_size=1024, - intermediate_size=2048, - num_hidden_layers=20, - num_attention_heads=16, - num_key_value_heads=2, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=0.000001, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10000, - rope_scaling=None, - attention_bias=False, - sliding_window=None, - attention_dropout=0, - decoder_sparse_step=1, - moe_intermediate_size=384, - num_experts_per_tok=8, - num_experts=128, - norm_topk_prob=False, - output_router_logits=False, - router_aux_loss_coef=0.001, - mlp_only_layers=None, + vocab_size: Optional[int] = 3072, + hidden_size: Optional[int] = 1024, + intermediate_size: Optional[int] = 2048, + num_hidden_layers: Optional[int] = 20, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = 2, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 0.000001, + use_cache: Optional[int] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + sliding_window: Optional[int] = None, + attention_dropout: Optional[int] = 0, + decoder_sparse_step: Optional[int] = 1, + moe_intermediate_size: Optional[int] = 384, + num_experts_per_tok: Optional[int] = 8, + num_experts: Optional[int] = 128, + norm_topk_prob: Optional[bool] = False, + output_router_logits: Optional[bool] = False, + router_aux_loss_coef: Optional[float] = 0.001, + mlp_only_layers: Optional[list[int]] = None, **kwargs, ): super().__init__( @@ -406,8 +407,7 @@ def __init__( rms_norm_eps, use_cache, tie_word_embeddings, - rope_theta, - rope_scaling, + rope_parameters, attention_bias, False, sliding_window, @@ -1494,6 +1494,10 @@ def __init__(self, config, layer_idx): self.self_attn = Qwen3OmniMoeTalkerCodePredictorAttention(config=config, layer_idx=layer_idx) +class Qwen3OmniMoeRotaryEmbedding(Qwen3RotaryEmbedding): + pass + + class Qwen3OmniMoeTalkerCodePredictorModel(Qwen3Model): config_class = Qwen3OmniMoeTalkerCodePredictorConfig base_model_prefix = "talker.code_predictor.model" @@ -1564,7 +1568,7 @@ def forward( hidden_states = inputs_embeds - # create position embeddings to be shared across the decoder layers + hidden_states = inputs_embeds position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: @@ -1668,15 +1672,14 @@ def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_ @dataclass class Qwen3OmniMoeTalkerOutputWithPast(MoeCausalLMOutputWithPast): r""" - Args: - generation_step (`int`, *optional*): - Current generation step, used to track which `trailing_text_hidden` should be used. 
+ generation_step (`int`, *optional*): + Current generation step, used to track which `trailing_text_hidden` should be used. """ generation_step: Optional[int] = None -class Qwen3OmniMoeTalkerRotaryEmbedding(Qwen3OmniMoeThinkerTextRotaryEmbedding): +class Qwen3OmniMoeTalkerRotaryEmbedding(Qwen3RotaryEmbedding): pass @@ -2042,10 +2045,6 @@ def forward(self, hidden_states): return hidden_states -class Qwen3OmniMoeCode2WavRotatoryEmbedding(Qwen3RotaryEmbedding): - pass - - class Qwen3OmniMoeCode2WavAttention(Qwen3Attention): def __init__(self, config: Qwen3OmniMoeCode2WavConfig, layer_idx): super().__init__(config, layer_idx) diff --git a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py index 2d1f6a6d4bd9..f4228ddb3f87 100644 --- a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py @@ -18,8 +18,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Qwen3VLVisionConfig(PreTrainedConfig): @@ -104,45 +106,10 @@ class Qwen3VLTextConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 5000000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -166,23 +133,22 @@ class Qwen3VLTextConfig(PreTrainedConfig): def __init__( self, - vocab_size=151936, - hidden_size=4096, - intermediate_size=22016, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - head_dim=128, - hidden_act="silu", - max_position_embeddings=128000, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=5000000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, + vocab_size: Optional[int] = 151936, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 22016, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 32, + head_dim: Optional[int] = 128, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 128000, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-6, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, **kwargs, ): self.vocab_size = vocab_size @@ -202,11 +168,15 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 5000000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py b/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py index 7d9a00d9cf31..f58550ffe2d9 100644 --- a/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modeling_qwen3_vl.py @@ -280,38 +280,51 @@ class 
Qwen3VLTextRotaryEmbedding(nn.Module): def __init__(self, config: Qwen3VLTextConfig, device=None): super().__init__() - if hasattr(config, "rope_scaling") and config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", "default") - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq - self.mrope_section = config.rope_scaling.get("mrope_section", [24, 20, 20]) + self.mrope_section = config.rope_parameters.get("mrope_section", [24, 20, 20]) - def apply_interleaved_mrope(self, freqs, mrope_section): - """Apply interleaved MRoPE to 3D rotary embeddings. - Reorganizes frequency layout from chunked [TTT...HHH...WWW] to - interleaved [THTHWHTHW...TT], preserving frequency continuity. - args: - x: (3, bs, seq_len, head_dim // 2) - mrope_section: (3,) - returns: - x_t: (bs, seq_len, head_dim // 2) + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen3VLTextConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: """ - freqs_t = freqs[0] # just overwrite the first dimension T - for dim, offset in enumerate((1, 2), start=1): # H, W - length = mrope_section[dim] * 3 - idx = slice(offset, length, 3) - freqs_t[..., idx] = freqs[dim, ..., idx] - return freqs_t + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -333,6 +346,23 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + def apply_interleaved_mrope(self, freqs, mrope_section): + """Apply interleaved MRoPE to 3D rotary embeddings. + Reorganizes frequency layout from chunked [TTT...HHH...WWW] to + interleaved [THTHWHTHW...TT], preserving frequency continuity. 
+ args: + x: (3, bs, seq_len, head_dim // 2) + mrope_section: (3,) + returns: + x_t: (bs, seq_len, head_dim // 2) + """ + freqs_t = freqs[0] # just overwrite the first dimension T + for dim, offset in enumerate((1, 2), start=1): # H, W + length = mrope_section[dim] * 3 + idx = slice(offset, length, 3) + freqs_t[..., idx] = freqs[dim, ..., idx] + return freqs_t + @use_kernel_forward_from_hub("RMSNorm") class Qwen3VLTextRMSNorm(nn.Module): @@ -387,6 +417,7 @@ class Qwen3VLTextAttention(nn.Module): def __init__(self, config: Qwen3VLTextConfig, layer_idx: int): super().__init__() + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py index 1aa9774f223b..5d1c88d03bc4 100644 --- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py @@ -30,13 +30,14 @@ from ...masking_utils import create_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update, rope_config_validation +from ...modeling_rope_utils import RopeParameters, dynamic_rope_update, rope_config_validation, standardize_rope_params from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import ProcessingKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import auto_docstring, is_torchdynamo_compiling, logging from ...utils.generic import check_model_inputs from ...video_utils import VideoInput +from ..llama.modeling_llama import LlamaRotaryEmbedding from ..qwen2_5_vl.modeling_qwen2_5_vl import ( Qwen2_5_VLCausalLMOutputWithPast, Qwen2_5_VLForConditionalGeneration, @@ -146,45 +147,10 @@ class Qwen3VLTextConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 5000000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. 
- `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -208,23 +174,22 @@ class Qwen3VLTextConfig(PreTrainedConfig): def __init__( self, - vocab_size=151936, - hidden_size=4096, - intermediate_size=22016, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - head_dim=128, - hidden_act="silu", - max_position_embeddings=128000, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=5000000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, + vocab_size: Optional[int] = 151936, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 22016, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 32, + head_dim: Optional[int] = 128, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 128000, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-6, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, **kwargs, ): self.vocab_size = vocab_size @@ -244,11 +209,15 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 
5000000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -386,26 +355,13 @@ def __init__(self, config, attn_implementation: str = "sdpa") -> None: self.mlp = Qwen3VLVisionMLP(config=config) -class Qwen3VLTextRotaryEmbedding(nn.Module): +class Qwen3VLTextRotaryEmbedding(LlamaRotaryEmbedding): inv_freq: torch.Tensor # fix linting for `register_buffer` def __init__(self, config: Qwen3VLTextConfig, device=None): - super().__init__() - if hasattr(config, "rope_scaling") and config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", "default") - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + super().__init__(config, device=device) - self.mrope_section = config.rope_scaling.get("mrope_section", [24, 20, 20]) + self.mrope_section = config.rope_parameters.get("mrope_section", [24, 20, 20]) def apply_interleaved_mrope(self, freqs, mrope_section): """Apply interleaved MRoPE to 3D rotary embeddings. diff --git a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py index b05bb7d8a01d..eab77fa368a2 100644 --- a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py @@ -18,8 +18,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Qwen3VLMoeTextConfig(PreTrainedConfig): @@ -64,8 +67,6 @@ class Qwen3VLMoeTextConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 5000000.0): - The base period of the RoPE embeddings. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -78,51 +79,14 @@ class Qwen3VLMoeTextConfig(PreTrainedConfig): Number of selected experts. num_experts (`int`, *optional*, defaults to 60): Number of routed experts. - norm_topk_prob (`bool`, *optional*, defaults to `True`): - Whether to normalize the topk probabilities. - router_aux_loss_coef (`float`, *optional*, defaults to 0.001): - The aux loss factor for the total loss. mlp_only_layers (`List[int]`, *optional*, defaults to `[]`): Indicate which layers use Qwen3VLMoeMLP rather than Qwen3VLMoeSparseMoeBlock The list contains layer index, from 0 to num_layers-1 if we have num_layers layers If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity. 
- rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. head_dim (`int`, *optional*): The dimension of the head. If not specified, will default to `hidden_size // num_attention_heads`.
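The `__init__` hunk just below keeps old checkpoints loadable: a legacy `rope_scaling` kwarg is popped and stored as `rope_parameters`, and `standardize_rope_params` folds `rope_theta` into it before validation. A minimal sketch of both call styles (assuming the standardization helper normalizes the dictionaries as intended; the exact output layout may differ):

```python
from transformers import Qwen3VLMoeTextConfig

# New style: pass `rope_parameters` directly.
new_style = Qwen3VLMoeTextConfig(
    rope_parameters={"rope_type": "linear", "rope_theta": 5_000_000.0, "factor": 4.0}
)

# Legacy style: `rope_theta` / `rope_scaling` kwargs from an old config.json are
# still accepted and re-routed into `rope_parameters` by the BC shim below.
legacy_style = Qwen3VLMoeTextConfig(
    rope_theta=5_000_000.0,
    rope_scaling={"rope_type": "linear", "factor": 4.0},
)

print(new_style.rope_parameters)
print(legacy_style.rope_parameters)
```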
@@ -160,30 +124,27 @@ class Qwen3VLMoeTextConfig(PreTrainedConfig): def __init__( self, - vocab_size=151936, - hidden_size=2048, - intermediate_size=5632, - num_hidden_layers=24, - num_attention_heads=16, - num_key_value_heads=16, - hidden_act="silu", - max_position_embeddings=128000, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=5000000.0, - attention_bias=False, - attention_dropout=0.0, - decoder_sparse_step=1, - moe_intermediate_size=1408, - num_experts_per_tok=4, - num_experts=60, - norm_topk_prob=True, - router_aux_loss_coef=0.001, - mlp_only_layers=None, - rope_scaling=None, - head_dim=None, + vocab_size: Optional[int] = 151936, + hidden_size: Optional[int] = 2048, + intermediate_size: Optional[int] = 5632, + num_hidden_layers: Optional[int] = 24, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = 16, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 128000, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-6, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + decoder_sparse_step: Optional[int] = 1, + moe_intermediate_size: Optional[int] = 1408, + num_experts_per_tok: Optional[int] = 4, + num_experts: Optional[int] = 60, + mlp_only_layers: Optional[list[int]] = None, + rope_parameters: Optional[RopeParameters] = None, + head_dim: Optional[int] = None, **kwargs, ): self.vocab_size = vocab_size @@ -202,12 +163,16 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout - self.rope_scaling = rope_scaling self.head_dim = head_dim or hidden_size // num_attention_heads + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 5000000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) # MoE arguments @@ -215,8 +180,6 @@ def __init__( self.moe_intermediate_size = moe_intermediate_size self.num_experts_per_tok = num_experts_per_tok self.num_experts = num_experts - self.norm_topk_prob = norm_topk_prob - self.router_aux_loss_coef = router_aux_loss_coef self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py index fa6446241d33..264902c2d8a4 100644 --- a/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py @@ -229,6 +229,7 @@ class Qwen3VLMoeTextAttention(nn.Module): def __init__(self, config: Qwen3VLMoeTextConfig, layer_idx: int): super().__init__() + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) @@ -338,7 +339,7 @@ def forward( past_key_values: 
Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -777,38 +778,51 @@ class Qwen3VLMoeTextRotaryEmbedding(nn.Module): def __init__(self, config: Qwen3VLMoeTextConfig, device=None): super().__init__() - if hasattr(config, "rope_scaling") and config.rope_scaling is not None: - self.rope_type = config.rope_scaling.get("rope_type", "default") - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq - self.mrope_section = config.rope_scaling.get("mrope_section", [24, 20, 20]) + self.mrope_section = config.rope_parameters.get("mrope_section", [24, 20, 20]) - def apply_interleaved_mrope(self, freqs, mrope_section): - """Apply interleaved MRoPE to 3D rotary embeddings. - Reorganizes frequency layout from chunked [TTT...HHH...WWW] to - interleaved [THTHWHTHW...TT], preserving frequency continuity. - args: - x: (3, bs, seq_len, head_dim // 2) - mrope_section: (3,) - returns: - x_t: (bs, seq_len, head_dim // 2) + @staticmethod + def compute_default_rope_parameters( + config: Optional[Qwen3VLMoeTextConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: """ - freqs_t = freqs[0] # just overwrite the first dimension T - for dim, offset in enumerate((1, 2), start=1): # H, W - length = mrope_section[dim] * 3 - idx = slice(offset, length, 3) - freqs_t[..., idx] = freqs[dim, ..., idx] - return freqs_t + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -830,6 +844,23 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + def apply_interleaved_mrope(self, freqs, mrope_section): + """Apply interleaved MRoPE to 3D rotary embeddings. + Reorganizes frequency layout from chunked [TTT...HHH...WWW] to + interleaved [THTHWHTHW...TT], preserving frequency continuity. + args: + x: (3, bs, seq_len, head_dim // 2) + mrope_section: (3,) + returns: + x_t: (bs, seq_len, head_dim // 2) + """ + freqs_t = freqs[0] # just overwrite the first dimension T + for dim, offset in enumerate((1, 2), start=1): # H, W + length = mrope_section[dim] * 3 + idx = slice(offset, length, 3) + freqs_t[..., idx] = freqs[dim, ..., idx] + return freqs_t + @auto_docstring( custom_intro=( diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py index 1cd833cdfeae..c0c4be2ddb68 100644 --- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py @@ -22,7 +22,7 @@ from ...activations import ACT2FN from ...cache_utils import Cache from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -88,8 +88,6 @@ class Qwen3VLMoeTextConfig(PreTrainedConfig): relevant if `config.is_decoder=True`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to 5000000.0): - The base period of the RoPE embeddings. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -102,51 +100,14 @@ class Qwen3VLMoeTextConfig(PreTrainedConfig): Number of selected experts. num_experts (`int`, *optional*, defaults to 60): Number of routed experts. - norm_topk_prob (`bool`, *optional*, defaults to `True`): - Whether to normalize the topk probabilities. - router_aux_loss_coef (`float`, *optional*, defaults to 0.001): - The aux loss factor for the total loss. mlp_only_layers (`List[int]`, *optional*, defaults to `[]`): Indicate which layers use Qwen3VLMoeMLP rather than Qwen3VLMoeSparseMoeBlock The list contains layer index, from 0 to num_layers-1 if we have num_layers layers If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. 
- `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. head_dim (`int`, *optional*): The dimension of the head. If not specified, will default to `hidden_size // num_attention_heads`. 
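Because `mrope_section` now travels inside `rope_parameters` (and is explicitly ignored by `rope_config_validation` via `ignore_keys` below), it is worth sanity-checking that the default split still covers every rotary frequency pair. A small arithmetic sketch using the default text-config values (`hidden_size=2048`, `num_attention_heads=16`):

```python
# Sketch: the default T/H/W split carried in rope_parameters covers head_dim // 2 pairs.
hidden_size = 2048
num_attention_heads = 16
head_dim = hidden_size // num_attention_heads  # 128
num_freq_pairs = head_dim // 2                 # 64 rotary frequency pairs

rope_parameters = {
    "rope_type": "default",
    "rope_theta": 5_000_000.0,
    "mrope_section": [24, 20, 20],             # temporal / height / width sections
}

mrope_section = rope_parameters.get("mrope_section", [24, 20, 20])
assert sum(mrope_section) == num_freq_pairs    # 24 + 20 + 20 == 64
```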
@@ -184,30 +145,27 @@ class Qwen3VLMoeTextConfig(PreTrainedConfig): def __init__( self, - vocab_size=151936, - hidden_size=2048, - intermediate_size=5632, - num_hidden_layers=24, - num_attention_heads=16, - num_key_value_heads=16, - hidden_act="silu", - max_position_embeddings=128000, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - tie_word_embeddings=False, - rope_theta=5000000.0, - attention_bias=False, - attention_dropout=0.0, - decoder_sparse_step=1, - moe_intermediate_size=1408, - num_experts_per_tok=4, - num_experts=60, - norm_topk_prob=True, - router_aux_loss_coef=0.001, - mlp_only_layers=None, - rope_scaling=None, - head_dim=None, + vocab_size: Optional[int] = 151936, + hidden_size: Optional[int] = 2048, + intermediate_size: Optional[int] = 5632, + num_hidden_layers: Optional[int] = 24, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = 16, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 128000, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-6, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + decoder_sparse_step: Optional[int] = 1, + moe_intermediate_size: Optional[int] = 1408, + num_experts_per_tok: Optional[int] = 4, + num_experts: Optional[int] = 60, + mlp_only_layers: Optional[list[int]] = None, + rope_parameters: Optional[RopeParameters] = None, + head_dim: Optional[int] = None, **kwargs, ): self.vocab_size = vocab_size @@ -226,12 +184,16 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout - self.rope_scaling = rope_scaling self.head_dim = head_dim or hidden_size // num_attention_heads + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 5000000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) # MoE arguments @@ -239,8 +201,6 @@ def __init__( self.moe_intermediate_size = moe_intermediate_size self.num_experts_per_tok = num_experts_per_tok self.num_experts = num_experts - self.norm_topk_prob = norm_topk_prob - self.router_aux_loss_coef = router_aux_loss_coef self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py index 77a7cae6aa4f..3a3aca4ddacd 100644 --- a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py @@ -14,7 +14,10 @@ # limitations under the License. 
"""RecurrentGemma model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -74,8 +77,10 @@ class RecurrentGemmaConfig(PreTrainedConfig): The hidden activation used in the recurrent block as well as the MLP layer of the decoder layers. partial_rotary_factor (`float`, *optional*, defaults to 0.5): The partial rotary factor used in the initialization of the rotary embeddings. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. block_types (`list[str]`, *optional*, defaults to `('recurrent', 'recurrent', 'attention')`): List of aleternating blocks that will be repeated to initialize the `temporal_block` layer. attention_dropout (`float`, *optional*, defaults to 0.0): dropout value to use after the attention softmax. @@ -99,28 +104,28 @@ class RecurrentGemmaConfig(PreTrainedConfig): def __init__( self, - num_hidden_layers=26, - vocab_size=256000, - hidden_size=2560, - intermediate_size=3 * 2560, - num_attention_heads=10, - lru_width=None, - attention_window_size=2048, - conv1d_width=4, - logits_soft_cap=30.0, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - bos_token_id=2, - hidden_activation="gelu_pytorch_tanh", - partial_rotary_factor=0.5, - rope_theta=10000.0, - block_types=("recurrent", "recurrent", "attention"), - attention_dropout=0.0, - num_key_value_heads=None, - attention_bias=False, - w_init_variance_scale=0.01, + num_hidden_layers: Optional[int] = 26, + vocab_size: Optional[int] = 256000, + hidden_size: Optional[int] = 2560, + intermediate_size: Optional[int] = 3 * 2560, + num_attention_heads: Optional[int] = 10, + lru_width: Optional[int] = None, + attention_window_size: Optional[int] = 2048, + conv1d_width: Optional[int] = 4, + logits_soft_cap: Optional[float] = 30.0, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = 2, + hidden_activation: Optional[str] = "gelu_pytorch_tanh", + partial_rotary_factor: Optional[float] = 0.5, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + block_types: Optional[list[str]] = ("recurrent", "recurrent", "attention"), + attention_dropout: Optional[float] = 0.0, + num_key_value_heads: Optional[int] = None, + attention_bias: Optional[str] = False, + w_init_variance_scale: Optional[float] = 0.01, **kwargs, ): self.num_hidden_layers = num_hidden_layers @@ -134,7 +139,6 @@ def __init__( self.logits_soft_cap = logits_soft_cap self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.partial_rotary_factor = partial_rotary_factor self.block_types = list(block_types) self.hidden_activation = hidden_activation @@ -146,6 +150,15 @@ def __init__( self.attention_bias = attention_bias self.w_init_variance_scale = w_init_variance_scale self.final_w_init_variance_scale = 2.0 / self.num_hidden_layers + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + 
self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py index 9abff584c789..3b6041c9d046 100644 --- a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py @@ -16,6 +16,7 @@ """PyTorch RecurrentGemma model.""" import math +from collections.abc import Callable from typing import Optional, Union import torch @@ -26,6 +27,7 @@ from ...modeling_attn_mask_utils import AttentionMaskConverter from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithNoAttention, CausalLMOutput +from ...modeling_rope_utils import dynamic_rope_update from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, logging from ...utils.import_utils import is_torchdynamo_compiling @@ -57,31 +59,73 @@ def extra_repr(self): return f"{tuple(self.weight.shape)}, eps={self.eps}" +# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->RecurrentGemma class RecurrentGemmaRotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` - def __init__(self, dim, base=10000, device=None): + # Ignore copy + def __init__(self, config: RecurrentGemmaConfig, device=None): super().__init__() - self.dim = dim - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) - self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + raise ValueError( + f"RecurrentGemmaRotaryEmbedding does not support RoPE types other than `default` but got {self.rope_type}" + ) + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + # Ignore copy + def compute_default_rope_parameters( + config: Optional[RecurrentGemmaConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() - def forward(self, x, position_ids, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - self.inv_freq.to(x.device) - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) position_ids_expanded = position_ids[:, None, :].float() - # Force float32 since bfloat16 loses precision on long contexts - # See https://github.com/huggingface/transformers/pull/29285 - device_type = x.device.type - device_type = device_type if device_type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() - sin = emb.sin() + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) @@ -152,10 +196,7 @@ def __init__(self, config: RecurrentGemmaConfig): self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.o_proj = nn.Linear(self.num_attention_heads * self.head_dim, self.hidden_size, bias=True) - self.rotary_emb = RecurrentGemmaRotaryEmbedding( - int(self.partial_rotary_factor * self.head_dim), - base=config.rope_theta, - ) + self.rotary_emb = RecurrentGemmaRotaryEmbedding(config=config) def forward( self, diff --git a/src/transformers/models/seed_oss/configuration_seed_oss.py b/src/transformers/models/seed_oss/configuration_seed_oss.py index d4ac29378813..7961646ae2d8 100644 --- a/src/transformers/models/seed_oss/configuration_seed_oss.py +++ b/src/transformers/models/seed_oss/configuration_seed_oss.py @@ -13,8 +13,10 @@ # limitations under the License. """SeedOss model configuration""" +from typing import Optional + from transformers.configuration_utils import PreTrainedConfig -from transformers.modeling_rope_utils import rope_config_validation +from transformers.modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class SeedOssConfig(PreTrainedConfig): @@ -71,45 +73,10 @@ class SeedOssConfig(PreTrainedConfig): results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. 
- rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, *optional*, defaults to `True`): Whether to use a bias in the query, key, value layers during self-attention. 
attention_out_bias (`bool`, *optional*, defaults to `False`): @@ -156,30 +123,29 @@ class SeedOssConfig(PreTrainedConfig): def __init__( self, - vocab_size=155136, - hidden_size=4096, - intermediate_size=27648, - num_hidden_layers=64, - num_attention_heads=80, - num_key_value_heads=8, - hidden_act="silu", - max_position_embeddings=524288, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=1, - bos_token_id=0, - eos_token_id=2, - pretraining_tp=1, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=True, - attention_out_bias=False, - attention_dropout=0.1, - residual_dropout=0.1, - mlp_bias=False, - head_dim=128, + vocab_size: Optional[int] = 155136, + hidden_size: Optional[int] = 4096, + intermediate_size: Optional[int] = 27648, + num_hidden_layers: Optional[int] = 64, + num_attention_heads: Optional[int] = 80, + num_key_value_heads: Optional[int] = 8, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 524288, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[float] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 2, + pretraining_tp: Optional[int] = 1, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = True, + attention_out_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.1, + residual_dropout: Optional[float] = 0.1, + mlp_bias: Optional[bool] = False, + head_dim: Optional[int] = 128, **kwargs, ): self.vocab_size = vocab_size @@ -198,18 +164,19 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.pretraining_tp = pretraining_tp self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_out_bias = attention_out_bias self.attention_dropout = attention_dropout self.residual_dropout = residual_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/seed_oss/modeling_seed_oss.py b/src/transformers/models/seed_oss/modeling_seed_oss.py index 73b5a56616f5..7e645e3ce052 100644 --- a/src/transformers/models/seed_oss/modeling_seed_oss.py +++ b/src/transformers/models/seed_oss/modeling_seed_oss.py @@ -248,7 +248,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -298,20 +298,49 @@ class SeedOssRotaryEmbedding(nn.Module): def __init__(self, config: SeedOssConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[SeedOssConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -388,16 +417,16 @@ def forward( ) hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask, + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/shieldgemma2/convert_shieldgemma2_weights_orbax_to_hf.py b/src/transformers/models/shieldgemma2/convert_shieldgemma2_weights_orbax_to_hf.py index 057d59f569ac..d5b31541ac0c 100644 --- a/src/transformers/models/shieldgemma2/convert_shieldgemma2_weights_orbax_to_hf.py +++ b/src/transformers/models/shieldgemma2/convert_shieldgemma2_weights_orbax_to_hf.py @@ -406,7 +406,7 @@ def main(*args): num_hidden_layers=34, num_key_value_heads=4, sliding_window=1024, - rope_scaling={"rope_type": "linear", "factor": 8.0}, # used for global RoPE only + rope_parameters={"rope_type": "linear", "factor": 8.0}, # used for global RoPE only rope_theta=1_000_000, rope_local_base_freq=10_000, attn_logit_softcapping=None, diff --git a/src/transformers/models/smollm3/configuration_smollm3.py b/src/transformers/models/smollm3/configuration_smollm3.py index 1f4c665460f1..2ffdf53008c6 100644 --- a/src/transformers/models/smollm3/configuration_smollm3.py +++ b/src/transformers/models/smollm3/configuration_smollm3.py @@ -19,8 +19,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class SmolLM3Config(PreTrainedConfig): @@ -69,45 +71,10 @@ class SmolLM3Config(PreTrainedConfig): The id of the beginning of sentence token. eos_token_id (`int`, *optional*, defaults to 128001): The id of the end of sentence token. - rope_theta (`float`, *optional*, defaults to 2000000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. 
- `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. use_sliding_window (`bool`, *optional*, defaults to `False`): Whether to use sliding window attention. sliding_window (`int`, *optional*): @@ -159,30 +126,29 @@ class SmolLM3Config(PreTrainedConfig): def __init__( self, - vocab_size=128256, - hidden_size=2048, - intermediate_size=11008, - num_hidden_layers=36, - num_attention_heads=16, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=128004, - bos_token_id=128000, - eos_token_id=128001, - rope_theta=2000000.0, - rope_scaling=None, - use_sliding_window=False, - sliding_window=None, - no_rope_layers=None, - no_rope_layer_interval=4, - layer_types=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, + vocab_size: Optional[int] = 128256, + hidden_size: Optional[int] = 2048, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 36, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = 4, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 128004, + bos_token_id: Optional[int] = 128000, + eos_token_id: Optional[int] = 128001, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + use_sliding_window: Optional[bool] = False, + sliding_window: Optional[int] = None, + no_rope_layers: Optional[int] = None, + no_rope_layer_interval: Optional[int] = 4, + layer_types: Optional[int] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, **kwargs, ): super().__init__( @@ -210,8 +176,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias 
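With `rope_parameters` now a first-class argument of `SmolLM3Config` (see the new signature above), a context-extension setup can be expressed directly in the constructor. A hedged sketch, assuming the config round-trips `rope_parameters` the way the rotary embedding expects; the scaling values are illustrative, not tuned:

```python
from transformers import SmolLM3Config

# Default config: rope_theta falls back to 2_000_000.0 via standardize_rope_params.
base = SmolLM3Config()
print(base.rope_parameters)

# YaRN-scaled RoPE for longer contexts (illustrative factor / original length).
extended = SmolLM3Config(
    max_position_embeddings=131_072,
    rope_parameters={
        "rope_type": "yarn",
        "rope_theta": 2_000_000.0,
        "factor": 4.0,
        "original_max_position_embeddings": 32_768,
    },
)
print(extended.rope_parameters)
```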
self.attention_dropout = attention_dropout @@ -238,9 +202,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = getattr(self, "rope_theta", 2000000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) diff --git a/src/transformers/models/smollm3/modeling_smollm3.py b/src/transformers/models/smollm3/modeling_smollm3.py index 44f2dcee0bdf..e11c1138b490 100644 --- a/src/transformers/models/smollm3/modeling_smollm3.py +++ b/src/transformers/models/smollm3/modeling_smollm3.py @@ -46,6 +46,71 @@ from .configuration_smollm3 import SmolLM3Config +class SmolLM3RotaryEmbedding(nn.Module): + inv_freq: torch.Tensor # fix linting for `register_buffer` + + def __init__(self, config: SmolLM3Config, device=None): + super().__init__() + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[SmolLM3Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + def rotate_half(x): """Rotates half the hidden dims of the input.""" x1 = x[..., : x.shape[-1] // 2] @@ -253,7 +318,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -298,42 +363,6 @@ class SmolLM3PreTrainedModel(PreTrainedModel): } -class SmolLM3RotaryEmbedding(nn.Module): - inv_freq: torch.Tensor # fix linting for `register_buffer` - - def __init__(self, config: SmolLM3Config, device=None): - super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" - self.max_seq_len_cached = config.max_position_embeddings - self.original_max_seq_len = config.max_position_embeddings - - self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq - - @torch.no_grad() - @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) - def forward(self, x, position_ids): - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) - position_ids_expanded = position_ids[:, None, :].float() - - device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): # Force float32 - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - cos = emb.cos() * self.attention_scaling - sin = emb.sin() * self.attention_scaling - - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) - - @auto_docstring class SmolLM3Model(SmolLM3PreTrainedModel): def __init__(self, config: SmolLM3Config): @@ -404,19 +433,17 @@ def forward( causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, use_cache=use_cache, cache_position=cache_position, - position_embeddings=position_embeddings, **kwargs, ) diff --git a/src/transformers/models/smollm3/modular_smollm3.py b/src/transformers/models/smollm3/modular_smollm3.py index 436d8512613f..fe8bcb52080d 100644 --- a/src/transformers/models/smollm3/modular_smollm3.py +++ b/src/transformers/models/smollm3/modular_smollm3.py @@ -21,7 +21,7 @@ from ...cache_utils import Cache from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...modeling_flash_attention_utils import FlashAttentionKwargs -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import logging @@ -36,7 +36,7 @@ apply_rotary_pos_emb, eager_attention_forward, ) -from ..qwen2.modeling_qwen2 import Qwen2Model +from ..qwen2.modeling_qwen2 import Qwen2Model, Qwen2RotaryEmbedding logger = logging.get_logger(__name__) @@ -88,45 +88,10 @@ class SmolLM3Config(PreTrainedConfig): The id of the beginning of sentence token. eos_token_id (`int`, *optional*, defaults to 128001): The id of the end of sentence token. - rope_theta (`float`, *optional*, defaults to 2000000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. 
- `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`List[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. use_sliding_window (`bool`, *optional*, defaults to `False`): Whether to use sliding window attention.
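As a quick illustration of the `rope_parameters` argument documented just above for SmolLM3 (values are illustrative only; the keys each RoPE variant requires are enforced by `rope_config_validation`):

```python
from transformers import SmolLM3Config

# Sketch: pass the RoPE settings directly to the constructor instead of the
# removed `rope_theta`/`rope_scaling` arguments. Keys beyond `rope_type` and
# `rope_theta` depend on the chosen variant (here, an assumed YaRN setup).
config = SmolLM3Config(
    rope_parameters={
        "rope_type": "yarn",
        "rope_theta": 2000000.0,
        "factor": 4.0,
        "original_max_position_embeddings": 32768,
    }
)
print(config.rope_parameters["rope_type"])  # "yarn"
```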
sliding_window (`int`, *optional*): @@ -178,30 +143,29 @@ class SmolLM3Config(PreTrainedConfig): def __init__( self, - vocab_size=128256, - hidden_size=2048, - intermediate_size=11008, - num_hidden_layers=36, - num_attention_heads=16, - num_key_value_heads=4, - hidden_act="silu", - max_position_embeddings=32768, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=128004, - bos_token_id=128000, - eos_token_id=128001, - rope_theta=2000000.0, - rope_scaling=None, - use_sliding_window=False, - sliding_window=None, - no_rope_layers=None, - no_rope_layer_interval=4, - layer_types=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, + vocab_size: Optional[int] = 128256, + hidden_size: Optional[int] = 2048, + intermediate_size: Optional[int] = 11008, + num_hidden_layers: Optional[int] = 36, + num_attention_heads: Optional[int] = 16, + num_key_value_heads: Optional[int] = 4, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 32768, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 128004, + bos_token_id: Optional[int] = 128000, + eos_token_id: Optional[int] = 128001, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + use_sliding_window: Optional[bool] = False, + sliding_window: Optional[int] = None, + no_rope_layers: Optional[int] = None, + no_rope_layer_interval: Optional[int] = 4, + layer_types: Optional[int] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + mlp_bias: Optional[bool] = False, **kwargs, ): super().__init__( @@ -229,8 +193,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_bias = attention_bias self.attention_dropout = attention_dropout @@ -257,12 +219,15 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = getattr(self, "rope_theta", 2000000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) +class SmolLM3RotaryEmbedding(Qwen2RotaryEmbedding): + pass + + class SmolLM3Attention(LlamaAttention): def __init__(self, config: SmolLM3Config, layer_idx: int): super().__init__(config, layer_idx) diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index 37725fefda70..9beed377ad69 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -14,8 +14,10 @@ # limitations under the License. """StableLM model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -69,45 +71,10 @@ class StableLmConfig(PreTrainedConfig): (not used by all models). Only relevant if `config.is_decoder=True`. 
tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - rope_theta (`float`, *optional*, defaults to `10000.0`): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. use_qkv_bias (`bool`, *optional*, defaults to `False`): Whether or not the model should use bias for qkv layers. 
qk_layernorm (`bool`, *optional*, defaults to `False`): @@ -140,28 +107,27 @@ class StableLmConfig(PreTrainedConfig): def __init__( self, - vocab_size=50304, - intermediate_size=6912, - hidden_size=2560, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=32, - hidden_act="silu", - max_position_embeddings=4096, - initializer_range=0.02, - layer_norm_eps=1.0e-5, - use_cache=True, - tie_word_embeddings=False, - rope_theta=10_000, - rope_scaling=None, - use_qkv_bias=False, - qk_layernorm=False, - use_parallel_residual=False, - hidden_dropout=0.0, - attention_dropout=0.0, - partial_rotary_factor=0.25, - bos_token_id=0, - eos_token_id=0, + vocab_size: Optional[int] = 50304, + intermediate_size: Optional[int] = 6912, + hidden_size: Optional[int] = 2560, + num_hidden_layers: Optional[int] = 32, + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = 32, + hidden_act: Optional[str] = "silu", + max_position_embeddings: Optional[int] = 4096, + initializer_range: Optional[float] = 0.02, + layer_norm_eps: Optional[float] = 1.0e-5, + use_cache: Optional[bool] = True, + tie_word_embeddings: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + use_qkv_bias: Optional[bool] = False, + qk_layernorm: Optional[bool] = False, + use_parallel_residual: Optional[bool] = False, + hidden_dropout: Optional[float] = 0.0, + attention_dropout: Optional[float] = 0.0, + partial_rotary_factor: Optional[float] = 0.25, + bos_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 0, **kwargs, ): self.vocab_size = vocab_size @@ -177,18 +143,19 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.use_qkv_bias = use_qkv_bias self.qk_layernorm = qk_layernorm self.use_parallel_residual = use_parallel_residual self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout self.partial_rotary_factor = partial_rotary_factor + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. - if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index 17ea7c59fd36..6698273cfae3 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -19,7 +19,7 @@ # limitations under the License. 
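The configuration change above keeps backward compatibility: legacy `rope_theta`/`rope_scaling` kwargs are popped and folded into `rope_parameters` through `standardize_rope_params`. A minimal sketch of the expected behavior (the exact contents of the standardized dict are an assumption here):

```python
from transformers import StableLmConfig

# Legacy kwargs still work; they are expected to end up in `rope_parameters`,
# including the `rope_theta` value picked up by `standardize_rope_params`.
legacy = StableLmConfig(rope_theta=10000.0, rope_scaling={"rope_type": "linear", "factor": 2.0})

# New-style equivalent: pass everything through `rope_parameters` directly.
modern = StableLmConfig(
    rope_parameters={"rope_type": "linear", "rope_theta": 10000.0, "factor": 2.0}
)

print(legacy.rope_parameters)
print(modern.rope_parameters)
```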
"""PyTorch StableLM model.""" -import math +from collections.abc import Callable from typing import Optional, Union import torch @@ -29,7 +29,6 @@ from ...cache_utils import Cache, DynamicCache from ...generation import GenerationMixin from ...modeling_attn_mask_utils import AttentionMaskConverter -from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available from ...modeling_layers import ( GenericForSequenceClassification, GenericForTokenClassification, @@ -39,9 +38,13 @@ BaseModelOutputWithPast, CausalLMOutputWithPast, ) -from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update -from ...modeling_utils import PreTrainedModel -from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging +from ...modeling_rope_utils import ( + ROPE_INIT_FUNCTIONS, + dynamic_rope_update, +) +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging from .configuration_stablelm import StableLmConfig @@ -51,11 +54,6 @@ from ...integrations.flex_attention import make_flex_block_causal_mask -if is_flash_attn_available(): - from ...integrations.flash_attention import get_target_dtype - from ...modeling_flash_attention_utils import _flash_attention_forward - - logger = logging.get_logger(__name__) @@ -65,20 +63,52 @@ class StableLmRotaryEmbedding(nn.Module): def __init__(self, config: StableLmConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + # Ignore copy + def compute_default_rope_parameters( + config: Optional[StableLmConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -177,6 +207,33 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) +# Copied from transformers.models.llama.modeling_llama.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs: Unpack[TransformersKwargs], +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + class StableLmAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -196,9 +253,10 @@ def __init__(self, config: StableLmConfig, layer_idx: Optional[int] = None): self.head_dim = self.hidden_size // self.num_heads self.num_key_value_heads = config.num_key_value_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.rope_theta = config.rope_theta + self.rotary_ndims = int(self.head_dim * config.partial_rotary_factor) self.is_causal = True + self.scaling = self.head_dim**-0.5 if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( @@ -217,8 +275,7 @@ def __init__(self, config: StableLmConfig, layer_idx: Optional[int] = None): self.head_dim, self.num_key_value_heads, eps=config.layer_norm_eps ) - self.attention_dropout = nn.Dropout(config.attention_dropout) - self.rotary_emb = StableLmRotaryEmbedding(config=self.config) + self.attention_dropout = config.attention_dropout def forward( self, @@ -229,7 +286,8 @@ def forward( output_attentions: bool = False, use_cache: bool = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + **kwargs, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: bsz, q_len, _ = hidden_states.size() @@ -246,8 +304,6 @@ def forward( key_states = self.k_layernorm(key_states) cos, sin = position_embeddings - - # Partial rotary embedding query_rot, query_pass = ( query_states[..., : self.rotary_ndims], query_states[..., self.rotary_ndims :], @@ -273,266 +329,34 @@ def forward( } key_states, 
value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) - # Repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) - - if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - attn_weights += causal_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query_states.dtype) - attn_weights = self.attention_dropout(attn_weights) - - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights - - -class StableLmSdpaAttention(StableLmAttention): - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: - if output_attentions: - # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. - logger.warning_once( - "StableLmModel is using StableLmSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " - 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states=hidden_states, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - output_attentions=output_attentions, - use_cache=use_cache, - cache_position=cache_position, - position_embeddings=position_embeddings, - ) - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] - if self.qk_layernorm: - query_states = self.q_layernorm(query_states) - key_states = self.k_layernorm(key_states) - - cos, sin = position_embeddings - - # Partial rotary embedding - query_rot, query_pass = ( - query_states[..., : self.rotary_ndims], - query_states[..., self.rotary_ndims :], - ) - key_rot, key_pass = ( - key_states[..., : self.rotary_ndims], - key_states[..., self.rotary_ndims :], - ) - # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor] - query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin) - - # [batch_size, seq_length, num_heads, head_dim] - query_states = torch.cat((query_rot, query_pass), dim=-1) - key_states = torch.cat((key_rot, key_pass), dim=-1) - - if past_key_values is not None: - # Specific to RoPE models with partial rotation - cache_kwargs = { - "sin": sin, - "cos": cos, - "partial_rotation_size": self.rotary_ndims, - "cache_position": cache_position, - } - key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # Repeat k/v heads if n_kv_heads < n_heads - key_states = repeat_kv(key_states, self.num_key_value_groups) - value_states = repeat_kv(value_states, self.num_key_value_groups) - - causal_mask = attention_mask - if attention_mask is not None: # no matter the length, we just slice it - causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] - - # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, - # Reference: https://github.com/pytorch/pytorch/issues/112577. - if query_states.device.type == "cuda" and attention_mask is not None: - query_states = query_states.contiguous() - key_states = key_states.contiguous() - value_states = value_states.contiguous() - - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment - # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. - # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
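With the dedicated SDPA and FlashAttention-2 subclasses removed in this hunk, the backend is selected through the usual `attn_implementation` switch and routed via `ALL_ATTENTION_FUNCTIONS`, as in the dispatch added above. A minimal usage sketch (the checkpoint name is illustrative):

```python
from transformers import AutoModelForCausalLM

# Request a specific attention backend at load time; StableLmAttention then
# dispatches to the matching entry in ALL_ATTENTION_FUNCTIONS in its forward.
model = AutoModelForCausalLM.from_pretrained(
    "stabilityai/stablelm-2-1_6b",  # illustrative checkpoint
    attn_implementation="sdpa",
)
```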
- is_causal = bool(causal_mask is None and q_len > 1) - - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, - key_states, - value_states, - attn_mask=causal_mask, - dropout_p=self.attention_dropout.p if self.training else 0.0, - is_causal=is_causal, - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - return attn_output, None - - -class StableLmFlashAttention2(StableLmAttention): - """ - StableLM flash attention module. This module inherits from `StableLmAttention` as the weights of the module stays - untouched. The only required change would be on the forward pass where it needs to correctly call the public API of - flash attention and deal with padding tokens in case the input contains any of them. - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. - # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. - # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). - self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask() - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.LongTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[Cache] = None, - output_attentions: bool = False, - use_cache: bool = False, - cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC - **kwargs, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: - # StableLmFlashAttention2 attention does not support output_attentions - - output_attentions = False - - bsz, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - # Flash attention requires the input to have the shape - # batch_size x seq_length x head_dim x hidden_dim - # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - if self.qk_layernorm: - query_states = self.q_layernorm(query_states) - key_states = self.k_layernorm(key_states) - - cos, sin = position_embeddings - - # Partial rotary embedding - query_rot, query_pass = ( - query_states[..., : self.rotary_ndims], - query_states[..., self.rotary_ndims :], - ) - key_rot, key_pass = ( - key_states[..., : self.rotary_ndims], - key_states[..., self.rotary_ndims :], - ) - query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin) - - # [batch_size, seq_length, num_heads, head_dim] - query_states = torch.cat((query_rot, query_pass), dim=-1) - key_states = torch.cat((key_rot, key_pass), dim=-1) - - if past_key_values is not None: - cache_kwargs = { - "sin": sin, - "cos": cos, - "partial_rotation_size": self.rotary_ndims, - 
"cache_position": cache_position, - } - key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs) - - # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache - # to be able to avoid many of these transpose/reshape/view. - query_states = query_states.transpose(1, 2) - key_states = key_states.transpose(1, 2) - value_states = value_states.transpose(1, 2) - - dropout_rate = self.attention_dropout.p if self.training else 0.0 - - target_dtype = get_target_dtype(query_states, self) - - attn_output = _flash_attention_forward( + attn_output, attn_weights = attention_interface( + self, query_states, key_states, value_states, attention_mask, - q_len, - position_ids=position_ids, - dropout=dropout_rate, - use_top_left_mask=self._flash_attn_uses_top_left_mask, - is_causal=self.is_causal, - target_dtype=target_dtype, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + position_ids=position_ids, # pass `position_ids` for FA2 + **kwargs, ) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = attn_output.reshape(bsz, q_len, -1) attn_output = self.o_proj(attn_output) - if not output_attentions: - attn_weights = None - return attn_output, attn_weights -ATTENTION_CLASSES = { - "eager": StableLmAttention, - "sdpa": StableLmSdpaAttention, - "flash_attention_2": StableLmFlashAttention2, -} - - class StableLmDecoderLayer(GradientCheckpointingLayer): def __init__(self, config: StableLmConfig, layer_idx: int): super().__init__() self.use_parallel_residual = config.use_parallel_residual self.hidden_size = config.hidden_size - self.self_attn = ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) + self.self_attn = StableLmAttention(config, layer_idx=layer_idx) self.mlp = StableLmMLP(config) self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.post_attention_layernorm = None @@ -549,7 +373,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ Args: @@ -662,10 +486,11 @@ def __init__(self, config: StableLmConfig): [StableLmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.rotary_emb = StableLmRotaryEmbedding(config=config) self._attn_implementation = config._attn_implementation self.gradient_checkpointing = False + self.rotary_emb = StableLmRotaryEmbedding(config=self.config) + # Initialize weights and apply final processing self.post_init() @@ -718,9 +543,7 @@ def forward( ) hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/starcoder2/configuration_starcoder2.py b/src/transformers/models/starcoder2/configuration_starcoder2.py index 
8e1872cfba6c..9d87dd6eefa1 100644 --- a/src/transformers/models/starcoder2/configuration_starcoder2.py +++ b/src/transformers/models/starcoder2/configuration_starcoder2.py @@ -14,8 +14,10 @@ # limitations under the License. """Starcoder2 model configuration""" +from typing import Optional + from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params from ...utils import logging @@ -68,45 +70,10 @@ class Starcoder2Config(PreTrainedConfig): The id of the "beginning-of-sequence" token. eos_token_id (`int`, *optional*, defaults to 50256): The id of the "end-of-sequence" token. - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type - and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value - accordingly. - Expected contents: - `rope_type` (`str`): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - `factor` (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - `original_max_position_embeddings` (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - `attention_factor` (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - `beta_fast` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - `beta_slow` (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - `short_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `long_factor` (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - `low_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - `high_freq_factor` (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. 
sliding_window (`int`, *optional*): Sliding window attention window size. If not specified, will default to `None` (no sliding window). attention_dropout (`float`, *optional*, defaults to 0.0): @@ -151,26 +118,25 @@ class Starcoder2Config(PreTrainedConfig): def __init__( self, - vocab_size=49152, - hidden_size=3072, - intermediate_size=12288, - num_hidden_layers=30, - num_attention_heads=24, - num_key_value_heads=2, - hidden_act="gelu_pytorch_tanh", - max_position_embeddings=4096, - initializer_range=0.018042, - norm_epsilon=1e-5, - use_cache=True, - bos_token_id=50256, - eos_token_id=50256, - rope_theta=10000.0, - rope_scaling=None, - sliding_window=None, - attention_dropout=0.0, - residual_dropout=0.0, - embedding_dropout=0.0, - use_bias=True, + vocab_size: Optional[int] = 49152, + hidden_size: Optional[int] = 3072, + intermediate_size: Optional[int] = 12288, + num_hidden_layers: Optional[int] = 30, + num_attention_heads: Optional[int] = 24, + num_key_value_heads: Optional[int] = 2, + hidden_act: Optional[str] = "gelu_pytorch_tanh", + max_position_embeddings: Optional[int] = 4096, + initializer_range: Optional[float] = 0.018042, + norm_epsilon: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + bos_token_id: Optional[int] = 50256, + eos_token_id: Optional[int] = 50256, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + sliding_window: Optional[int] = None, + attention_dropout: Optional[float] = 0.0, + residual_dropout: Optional[float] = 0.0, + embedding_dropout: Optional[float] = 0.0, + use_bias: Optional[bool] = True, **kwargs, ): self.vocab_size = vocab_size @@ -186,15 +152,16 @@ def __init__( self.initializer_range = initializer_range self.norm_epsilon = norm_epsilon self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling self.attention_dropout = attention_dropout self.residual_dropout = residual_dropout self.embedding_dropout = embedding_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + # Validate the correctness of rotary position embeddings parameters - # BC: if there is a 'type' field, move it to 'rope_type'. 
- if self.rope_scaling is not None and "type" in self.rope_scaling: - self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) rope_config_validation(self) super().__init__( diff --git a/src/transformers/models/starcoder2/modeling_starcoder2.py b/src/transformers/models/starcoder2/modeling_starcoder2.py index b1efca6bcea4..6b93c18a3d17 100644 --- a/src/transformers/models/starcoder2/modeling_starcoder2.py +++ b/src/transformers/models/starcoder2/modeling_starcoder2.py @@ -223,7 +223,7 @@ def forward( past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, - position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, **kwargs: Unpack[TransformersKwargs], ) -> torch.Tensor: residual = hidden_states @@ -249,25 +249,73 @@ def forward( return hidden_states +@auto_docstring +class Starcoder2PreTrainedModel(PreTrainedModel): + config: Starcoder2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Starcoder2DecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn = True + _supports_sdpa = True + _supports_flex_attn = True + + _can_compile_fullgraph = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": Starcoder2DecoderLayer, + "attentions": Starcoder2Attention, + } + + class Starcoder2RotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` def __init__(self, config: Starcoder2Config, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Starcoder2Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
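The function body continues right below; as a quick numeric check of the inverse-frequency formula it implements (toy dimensions, unrelated to any real checkpoint):

```python
import torch

# inv_freq[i] = 1 / base**(2i / dim), here with base=10000.0 and dim=8.
base, dim = 10000.0, 8
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
print(inv_freq)  # tensor([1.0000e+00, 1.0000e-01, 1.0000e-02, 1.0000e-03])
```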
+ """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -285,25 +333,6 @@ def forward(self, x, position_ids): return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) -@auto_docstring -class Starcoder2PreTrainedModel(PreTrainedModel): - config: Starcoder2Config - base_model_prefix = "model" - supports_gradient_checkpointing = True - _no_split_modules = ["Starcoder2DecoderLayer"] - _skip_keys_device_placement = ["past_key_values"] - _supports_flash_attn = True - _supports_sdpa = True - _supports_flex_attn = True - - _can_compile_fullgraph = True - _supports_attention_backend = True - _can_record_outputs = { - "hidden_states": Starcoder2DecoderLayer, - "attentions": Starcoder2Attention, - } - - @auto_docstring class Starcoder2Model(Starcoder2PreTrainedModel): def __init__(self, config: Starcoder2Config): @@ -367,9 +396,8 @@ def forward( hidden_states = nn.functional.dropout( hidden_states, p=self.embedding_dropout, training=self.training ) # main diff with Llama + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, diff --git a/src/transformers/models/starcoder2/modular_starcoder2.py b/src/transformers/models/starcoder2/modular_starcoder2.py index 18b584998631..2c0a27f81bdf 100644 --- a/src/transformers/models/starcoder2/modular_starcoder2.py +++ b/src/transformers/models/starcoder2/modular_starcoder2.py @@ -42,7 +42,6 @@ MistralForSequenceClassification, MistralForTokenClassification, MistralModel, - MistralRotaryEmbedding, apply_rotary_pos_emb, eager_attention_forward, ) @@ -136,10 +135,6 @@ def __init__(self, config: Starcoder2Config, layer_idx: int): self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon) -class Starcoder2RotaryEmbedding(MistralRotaryEmbedding): - pass - - class Starcoder2Model(MistralModel): def __init__(self, config: Starcoder2Config): super().__init__(config) @@ -193,9 +188,8 @@ def forward( hidden_states = nn.functional.dropout( hidden_states, p=self.embedding_dropout, training=self.training ) # main diff with Llama + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) - # create position embeddings to be shared across the decoder layers - position_embeddings = self.rotary_emb(hidden_states, position_ids) for decoder_layer in self.layers[: self.config.num_hidden_layers]: hidden_states = decoder_layer( hidden_states, diff --git a/src/transformers/models/t5gemma/configuration_t5gemma.py b/src/transformers/models/t5gemma/configuration_t5gemma.py index 494e8b757a89..bea8916d3e6b 100644 --- a/src/transformers/models/t5gemma/configuration_t5gemma.py +++ b/src/transformers/models/t5gemma/configuration_t5gemma.py @@ -22,6 +22,7 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...modeling_rope_utils import RopeParameters, 
rope_config_validation, standardize_rope_params class T5GemmaModuleConfig(PreTrainedConfig): @@ -75,8 +76,10 @@ class T5GemmaModuleConfig(PreTrainedConfig): Beginning of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -121,30 +124,30 @@ class T5GemmaModuleConfig(PreTrainedConfig): def __init__( self, - vocab_size=256000, - hidden_size=2304, - intermediate_size=9216, - num_hidden_layers=26, - num_attention_heads=8, - num_key_value_heads=4, - head_dim=256, - hidden_activation="gelu_pytorch_tanh", - max_position_embeddings=8192, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - bos_token_id=2, - tie_word_embeddings=True, - rope_theta=10000.0, - attention_bias=False, - attention_dropout=0.0, - query_pre_attn_scalar=256, - sliding_window=4096, - layer_types=None, - final_logit_softcapping=30.0, - attn_logit_softcapping=50.0, + vocab_size: Optional[int] = 256000, + hidden_size: Optional[int] = 2304, + intermediate_size: Optional[int] = 9216, + num_hidden_layers: Optional[int] = 26, + num_attention_heads: Optional[int] = 8, + num_key_value_heads: Optional[int] = 4, + head_dim: Optional[int] = 256, + hidden_activation: Optional[str] = "gelu_pytorch_tanh", + max_position_embeddings: Optional[int] = 8192, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + query_pre_attn_scalar: Optional[int] = 256, + sliding_window: Optional[int] = 4096, + layer_types: Optional[list[str]] = None, + final_logit_softcapping: Optional[float] = 30.0, + attn_logit_softcapping: Optional[float] = 50.0, **kwargs, ): super().__init__( @@ -165,7 +168,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.hidden_activation = hidden_activation @@ -174,6 +176,9 @@ def __init__( self.final_logit_softcapping = final_logit_softcapping self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters if self.layer_types is None: self.layer_types = [ @@ -181,6 +186,11 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = 
kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + class T5GemmaConfig(PreTrainedConfig): r""" @@ -255,12 +265,12 @@ def __init__( self, encoder: Optional[Union[T5GemmaModuleConfig, dict[Any, Any]]] = None, decoder: Optional[Union[T5GemmaModuleConfig, dict[Any, Any]]] = None, - is_encoder_decoder: bool = True, - dropout_rate: float = 0.0, - classifier_dropout_rate: float = 0.0, - attention_dropout: float = 0.0, - tie_word_embeddings: bool = True, - vocab_size: int = 256000, + is_encoder_decoder: Optional[bool] = True, + dropout_rate: Optional[float] = 0.0, + classifier_dropout_rate: Optional[float] = 0.0, + attention_dropout: Optional[float] = 0.0, + tie_word_embeddings: Optional[bool] = True, + vocab_size: Optional[int] = 256000, **kwargs, ): if isinstance(encoder, dict): diff --git a/src/transformers/models/t5gemma/modeling_t5gemma.py b/src/transformers/models/t5gemma/modeling_t5gemma.py index 223cda69845c..864fd7b50cba 100644 --- a/src/transformers/models/t5gemma/modeling_t5gemma.py +++ b/src/transformers/models/t5gemma/modeling_t5gemma.py @@ -92,22 +92,51 @@ def forward(self, x): class T5GemmaRotaryEmbedding(nn.Module): inv_freq: torch.Tensor # fix linting for `register_buffer` - def __init__(self, config, device=None): + def __init__(self, config: T5GemmaConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[T5GemmaConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
+ """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) @@ -211,6 +240,7 @@ class T5GemmaSelfAttention(nn.Module): def __init__(self, config: T5GemmaModuleConfig, layer_idx: int): super().__init__() + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) @@ -233,13 +263,13 @@ def __init__(self, config: T5GemmaModuleConfig, layer_idx: int): config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias ) self.attn_logit_softcapping = self.config.attn_logit_softcapping - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[FlashAttentionKwargs], @@ -392,7 +422,7 @@ def __init__(self, config, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, **kwargs, @@ -430,7 +460,7 @@ def __init__(self, config, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[EncoderDecoderCache] = None, @@ -622,13 +652,13 @@ def __init__(self, config): self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) self.norm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = T5GemmaRotaryEmbedding(config=config) self.gradient_checkpointing = False self.layers = nn.ModuleList( [T5GemmaEncoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.dropout = nn.Dropout(config.dropout_rate) + self.rotary_emb = T5GemmaRotaryEmbedding(config=config) # Initialize weights and apply final processing self.post_init() @@ -681,12 +711,12 @@ def forward( } hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) - normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype) hidden_states = hidden_states * normalizer hidden_states = self.dropout(hidden_states) + position_embeddings = self.rotary_emb(hidden_states, position_ids) + for layer_module in self.layers[: self.config.num_hidden_layers]: hidden_states = 
layer_module( hidden_states, @@ -714,6 +744,7 @@ def __init__(self, config): self.layers = nn.ModuleList( [T5GemmaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) + self.rotary_emb = T5GemmaRotaryEmbedding(config=config) self.post_init() @@ -784,12 +815,12 @@ def forward( } hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) - normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype) hidden_states = hidden_states * normalizer hidden_states = self.dropout(hidden_states) + position_embeddings = self.rotary_emb(hidden_states, position_ids) + for layer_module in self.layers[: self.config.num_hidden_layers]: hidden_states = layer_module( hidden_states, diff --git a/src/transformers/models/t5gemma/modular_t5gemma.py b/src/transformers/models/t5gemma/modular_t5gemma.py index db2c9a804aff..6cda2eb063e3 100644 --- a/src/transformers/models/t5gemma/modular_t5gemma.py +++ b/src/transformers/models/t5gemma/modular_t5gemma.py @@ -32,6 +32,7 @@ SequenceClassifierOutput, TokenClassifierOutput, ) +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import ( @@ -111,8 +112,10 @@ class T5GemmaModuleConfig(Gemma2Config): Beginning of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -140,30 +143,30 @@ class T5GemmaModuleConfig(Gemma2Config): def __init__( self, - vocab_size=256000, - hidden_size=2304, - intermediate_size=9216, - num_hidden_layers=26, - num_attention_heads=8, - num_key_value_heads=4, - head_dim=256, - hidden_activation="gelu_pytorch_tanh", - max_position_embeddings=8192, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - bos_token_id=2, - tie_word_embeddings=True, - rope_theta=10000.0, - attention_bias=False, - attention_dropout=0.0, - query_pre_attn_scalar=256, - sliding_window=4096, - layer_types=None, - final_logit_softcapping=30.0, - attn_logit_softcapping=50.0, + vocab_size: Optional[int] = 256000, + hidden_size: Optional[int] = 2304, + intermediate_size: Optional[int] = 9216, + num_hidden_layers: Optional[int] = 26, + num_attention_heads: Optional[int] = 8, + num_key_value_heads: Optional[int] = 4, + head_dim: Optional[int] = 256, + hidden_activation: Optional[str] = "gelu_pytorch_tanh", + max_position_embeddings: Optional[int] = 8192, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + query_pre_attn_scalar: Optional[int] = 256, + sliding_window: Optional[int] = 4096, + layer_types: Optional[list[str]] = None, + final_logit_softcapping: Optional[float] = 30.0, + attn_logit_softcapping: Optional[float] = 50.0, **kwargs, ): super().__init__( @@ -183,7 +186,7 @@ def __init__( eos_token_id=eos_token_id, bos_token_id=bos_token_id, tie_word_embeddings=tie_word_embeddings, - rope_theta=rope_theta, + rope_parameters=rope_parameters, attention_bias=attention_bias, attention_dropout=attention_dropout, query_pre_attn_scalar=query_pre_attn_scalar, @@ -270,12 +273,12 @@ def __init__( self, encoder: Optional[Union[T5GemmaModuleConfig, dict[Any, Any]]] = None, decoder: Optional[Union[T5GemmaModuleConfig, dict[Any, Any]]] = None, - is_encoder_decoder: bool = True, - dropout_rate: float = 0.0, - classifier_dropout_rate: float = 0.0, - attention_dropout: float = 0.0, - tie_word_embeddings: bool = True, - vocab_size: int = 256000, + is_encoder_decoder: Optional[bool] = True, + dropout_rate: Optional[float] = 0.0, + classifier_dropout_rate: Optional[float] = 0.0, + attention_dropout: Optional[float] = 0.0, + tie_word_embeddings: Optional[bool] = True, + vocab_size: Optional[int] = 256000, **kwargs, ): if isinstance(encoder, dict): @@ -357,8 +360,7 @@ def forward(self, x): class T5GemmaRotaryEmbedding(Gemma2RotaryEmbedding): - def __init__(self, config, device=None): - super().__init__(config, device) + pass class T5GemmaSelfAttention(Gemma2Attention): @@ -372,6 +374,7 @@ class T5GemmaCrossAttention(Gemma2Attention): def __init__(self, config: T5GemmaModuleConfig, layer_idx: int): super().__init__(config, layer_idx) del self.sliding_window + del self.layer_type self.is_causal = False if config.cross_attention_hidden_size is None: @@ -488,7 +491,7 @@ def __init__(self, config, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, 
attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, **kwargs, @@ -526,7 +529,7 @@ def __init__(self, config, layer_idx: int): def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[EncoderDecoderCache] = None, @@ -687,13 +690,13 @@ def __init__(self, config): self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) self.norm = T5GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = T5GemmaRotaryEmbedding(config=config) self.gradient_checkpointing = False self.layers = nn.ModuleList( [T5GemmaEncoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.dropout = nn.Dropout(config.dropout_rate) + self.rotary_emb = T5GemmaRotaryEmbedding(config=config) # Initialize weights and apply final processing self.post_init() @@ -746,12 +749,12 @@ def forward( } hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) - normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype) hidden_states = hidden_states * normalizer hidden_states = self.dropout(hidden_states) + position_embeddings = self.rotary_emb(hidden_states, position_ids) + for layer_module in self.layers[: self.config.num_hidden_layers]: hidden_states = layer_module( hidden_states, @@ -779,6 +782,7 @@ def __init__(self, config): self.layers = nn.ModuleList( [T5GemmaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) + self.rotary_emb = T5GemmaRotaryEmbedding(config=config) self.post_init() @@ -849,12 +853,12 @@ def forward( } hidden_states = inputs_embeds - position_embeddings = self.rotary_emb(hidden_states, position_ids) - normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype) hidden_states = hidden_states * normalizer hidden_states = self.dropout(hidden_states) + position_embeddings = self.rotary_emb(hidden_states, position_ids) + for layer_module in self.layers[: self.config.num_hidden_layers]: hidden_states = layer_module( hidden_states, diff --git a/src/transformers/models/vaultgemma/configuration_vaultgemma.py b/src/transformers/models/vaultgemma/configuration_vaultgemma.py index 9536eb029e8e..d50cf5ed93d7 100644 --- a/src/transformers/models/vaultgemma/configuration_vaultgemma.py +++ b/src/transformers/models/vaultgemma/configuration_vaultgemma.py @@ -19,7 +19,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig, layer_type_validation +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class VaultGemmaConfig(PreTrainedConfig): @@ -73,8 +76,10 @@ class VaultGemmaConfig(PreTrainedConfig): Beginning of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. 
The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -119,30 +124,30 @@ class VaultGemmaConfig(PreTrainedConfig): def __init__( self, - vocab_size=256000, - hidden_size=2304, - intermediate_size=9216, - num_hidden_layers=26, - num_attention_heads=8, - num_key_value_heads=4, - head_dim=256, - hidden_activation="gelu_pytorch_tanh", - max_position_embeddings=8192, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - bos_token_id=2, - tie_word_embeddings=True, - rope_theta=10000.0, - attention_bias=False, - attention_dropout=0.0, - query_pre_attn_scalar=256, - sliding_window=4096, - layer_types=None, - final_logit_softcapping=30.0, - attn_logit_softcapping=50.0, + vocab_size: Optional[int] = 256000, + hidden_size: Optional[int] = 2304, + intermediate_size: Optional[int] = 9216, + num_hidden_layers: Optional[int] = 26, + num_attention_heads: Optional[int] = 8, + num_key_value_heads: Optional[int] = 4, + head_dim: Optional[int] = 256, + hidden_activation: Optional[str] = "gelu_pytorch_tanh", + max_position_embeddings: Optional[int] = 8192, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + query_pre_attn_scalar: Optional[int] = 256, + sliding_window: Optional[int] = 4096, + layer_types: Optional[list[str]] = None, + final_logit_softcapping: Optional[float] = 30.0, + attn_logit_softcapping: Optional[float] = 50.0, **kwargs, ): super().__init__( @@ -163,7 +168,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.hidden_activation = hidden_activation @@ -172,6 +176,9 @@ def __init__( self.final_logit_softcapping = final_logit_softcapping self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters if self.layer_types is None: self.layer_types = [ @@ -179,5 +186,10 @@ ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + __all__ = ["VaultGemmaConfig"] diff --git a/src/transformers/models/vaultgemma/modeling_vaultgemma.py b/src/transformers/models/vaultgemma/modeling_vaultgemma.py index 9b4934af5b58..51071e59997b 100644 --- a/src/transformers/models/vaultgemma/modeling_vaultgemma.py +++ b/src/transformers/models/vaultgemma/modeling_vaultgemma.py @@ -165,6 +165,7 @@ class VaultGemmaAttention(nn.Module): def
__init__(self, config: VaultGemmaConfig, layer_idx: int): super().__init__() + self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None self.config = config self.layer_idx = layer_idx self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) @@ -186,13 +187,13 @@ def __init__(self, config: VaultGemmaConfig, layer_idx: int): config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias ) self.attn_logit_softcapping = self.config.attn_logit_softcapping - self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None + self.sliding_window = config.sliding_window if self.layer_type == "sliding_attention" else None def forward( self, hidden_states: torch.Tensor, - position_embeddings: tuple[torch.Tensor, torch.Tensor], - attention_mask: Optional[torch.Tensor], + position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Cache] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs: Unpack[FlashAttentionKwargs], @@ -291,20 +292,49 @@ class VaultGemmaRotaryEmbedding(nn.Module): def __init__(self, config: VaultGemmaConfig, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[VaultGemmaConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -360,7 +390,7 @@ def __init__(self, config: VaultGemmaConfig): [VaultGemmaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) self.norm = VaultGemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.rotary_emb = VaultGemmaRotaryEmbedding(config=config) + self.rotary_emb = VaultGemmaRotaryEmbedding(config) self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -430,8 +460,6 @@ def forward( # embed positions hidden_states = inputs_embeds - - # create position embeddings to be shared across the decoder layers position_embeddings = self.rotary_emb(hidden_states, position_ids) # normalized @@ -450,8 +478,8 @@ def forward( layer_outputs = decoder_layer( hidden_states, - position_embeddings=position_embeddings, attention_mask=causal_mask_mapping[decoder_layer.attention_type], + position_embeddings=position_embeddings, position_ids=position_ids, past_key_values=past_key_values, output_attentions=output_attentions, diff --git a/src/transformers/models/vaultgemma/modular_vaultgemma.py b/src/transformers/models/vaultgemma/modular_vaultgemma.py index 0a6531179053..e8b3a4ee6773 100644 --- a/src/transformers/models/vaultgemma/modular_vaultgemma.py +++ b/src/transformers/models/vaultgemma/modular_vaultgemma.py @@ -18,6 +18,7 @@ import torch from ...cache_utils import Cache +from ...modeling_rope_utils import RopeParameters from ..gemma2.configuration_gemma2 import Gemma2Config from ..gemma2.modeling_gemma2 import Gemma2Attention, Gemma2DecoderLayer, Gemma2ForCausalLM, Gemma2MLP, Gemma2RMSNorm @@ -73,8 +74,10 @@ class VaultGemmaConfig(Gemma2Config): Beginning of stream token id. tie_word_embeddings (`bool`, *optional*, defaults to `True`): Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention.
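The `compute_default_rope_parameters` helper added in the hunk above reduces to the standard RoPE inverse-frequency formula, `inv_freq_i = 1 / base^(2i / dim)`. A self-contained sketch of the same computation follows; the `base` and `dim` values are illustrative defaults, not taken from any particular checkpoint:

```python
import torch

def default_inv_freq(base: float = 10000.0, dim: int = 256) -> torch.Tensor:
    # One inverse frequency per pair of head dimensions: 1 / base^(2i / dim)
    return 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))

inv_freq = default_inv_freq()
print(inv_freq.shape)                            # torch.Size([128])
print(inv_freq[0].item(), inv_freq[-1].item())   # decays from 1.0 towards roughly 1/base
```

The scaling factor returned alongside `inv_freq` in the diff is fixed at `1.0` for the default RoPE type and only becomes relevant for the scaled variants.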
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -102,30 +105,30 @@ class VaultGemmaConfig(Gemma2Config): def __init__( self, - vocab_size=256000, - hidden_size=2304, - intermediate_size=9216, - num_hidden_layers=26, - num_attention_heads=8, - num_key_value_heads=4, - head_dim=256, - hidden_activation="gelu_pytorch_tanh", - max_position_embeddings=8192, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=0, - eos_token_id=1, - bos_token_id=2, - tie_word_embeddings=True, - rope_theta=10000.0, - attention_bias=False, - attention_dropout=0.0, - query_pre_attn_scalar=256, - sliding_window=4096, - layer_types=None, - final_logit_softcapping=30.0, - attn_logit_softcapping=50.0, + vocab_size: Optional[int] = 256000, + hidden_size: Optional[int] = 2304, + intermediate_size: Optional[int] = 9216, + num_hidden_layers: Optional[int] = 26, + num_attention_heads: Optional[int] = 8, + num_key_value_heads: Optional[int] = 4, + head_dim: Optional[int] = 256, + hidden_activation: Optional[str] = "gelu_pytorch_tanh", + max_position_embeddings: Optional[int] = 8192, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-6, + use_cache: Optional[bool] = True, + pad_token_id: Optional[int] = 0, + eos_token_id: Optional[int] = 1, + bos_token_id: Optional[int] = 2, + tie_word_embeddings: Optional[bool] = True, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + attention_bias: Optional[bool] = False, + attention_dropout: Optional[float] = 0.0, + query_pre_attn_scalar: Optional[int] = 256, + sliding_window: Optional[int] = 4096, + layer_types: Optional[list[str]] = None, + final_logit_softcapping: Optional[float] = 30.0, + attn_logit_softcapping: Optional[float] = 50.0, **kwargs, ): super().__init__( @@ -145,7 +148,7 @@ def __init__( eos_token_id=eos_token_id, bos_token_id=bos_token_id, tie_word_embeddings=tie_word_embeddings, - rope_theta=rope_theta, + rope_parameters=rope_parameters, attention_bias=attention_bias, attention_dropout=attention_dropout, query_pre_attn_scalar=query_pre_attn_scalar, diff --git a/src/transformers/models/zamba/modeling_zamba.py b/src/transformers/models/zamba/modeling_zamba.py index 57f7bac835af..a144fbd589cf 100644 --- a/src/transformers/models/zamba/modeling_zamba.py +++ b/src/transformers/models/zamba/modeling_zamba.py @@ -657,6 +657,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, transformer_hidden_states: Optional[torch.Tensor] = None, **kwargs, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: diff --git a/src/transformers/models/zamba2/configuration_zamba2.py b/src/transformers/models/zamba2/configuration_zamba2.py index 9023c9137dad..40e30822cf59 100644 --- a/src/transformers/models/zamba2/configuration_zamba2.py +++ b/src/transformers/models/zamba2/configuration_zamba2.py @@ -20,7 +20,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +from typing import Optional + from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params class Zamba2Config(PreTrainedConfig): @@ -91,8 +94,10 @@ class Zamba2Config(PreTrainedConfig): Rank of the adapter in the shared MLP and shared attention layers. 
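The configuration hunks above fold legacy `rope_scaling` / `rope_theta` kwargs into the new `rope_parameters` attribute via `standardize_rope_params` and `rope_config_validation`, and the Zamba2 config below follows the same pattern. A hedged sketch of the behaviour this implies for old-style checkpoints; the import path comes from this diff, while the exact contents of the resulting dict are an assumption inferred from the hunk rather than verified output:

```python
# Sketch only: legacy kwargs should still resolve into `rope_parameters` after this change.
from transformers.models.vaultgemma.configuration_vaultgemma import VaultGemmaConfig

legacy_style = VaultGemmaConfig(
    rope_scaling={"rope_type": "linear", "factor": 4.0},  # old-style scaling dict
    rope_theta=10000.0,                                   # old-style base frequency
)
# Expected (per the `standardize_rope_params` call in the hunk): a single dict carrying
# rope_type, factor and rope_theta.
print(legacy_style.rope_parameters)
```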
use_mem_rope (`bool`, *optional*, defaults to `False`): If True, includes RoPE in the shared attention layers. - rope_theta (`float`, *optional*, defaults to `10000.0`): - The base period of the RoPE embeddings. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. rms_norm_eps (`float`, *optional*, defaults to 1e-05): @@ -130,42 +135,42 @@ def __init__( self, - vocab_size=32000, - max_position_embeddings=4096, - hidden_size=2560, - num_hidden_layers=54, - layers_block_type=None, - mamba_d_state=64, - mamba_d_conv=4, - mamba_expand=2, - mamba_ngroups=1, - time_step_min=0.001, - time_step_max=0.1, - time_step_floor=1e-4, - time_step_limit=None, - n_mamba_heads=8, - use_conv_bias=True, - chunk_size=256, - use_mem_eff_path=False, - add_bias_linear=False, - intermediate_size=None, - hidden_act="gelu", - num_attention_heads=32, - num_key_value_heads=None, - attention_dropout=0.0, - num_mem_blocks=1, - use_shared_attention_adapter=False, - adapter_rank=128, - use_mem_rope=False, - rope_theta=10000, - initializer_range=0.02, - rms_norm_eps=1e-5, - use_cache=True, - num_logits_to_keep=1, - pad_token_id=0, - bos_token_id=1, - eos_token_id=2, - use_long_context=False, + vocab_size: Optional[int] = 32000, + max_position_embeddings: Optional[int] = 4096, + hidden_size: Optional[int] = 2560, + num_hidden_layers: Optional[int] = 54, + layers_block_type: Optional[list[str]] = None, + mamba_d_state: Optional[int] = 64, + mamba_d_conv: Optional[int] = 4, + mamba_expand: Optional[int] = 2, + mamba_ngroups: Optional[int] = 1, + time_step_min: Optional[float] = 0.001, + time_step_max: Optional[float] = 0.1, + time_step_floor: Optional[int] = 1e-4, + time_step_limit: Optional[int] = None, + n_mamba_heads: Optional[int] = 8, + use_conv_bias: Optional[bool] = True, + chunk_size: Optional[int] = 256, + use_mem_eff_path: Optional[bool] = False, + add_bias_linear: Optional[bool] = False, + intermediate_size: Optional[int] = None, + hidden_act: Optional[str] = "gelu", + num_attention_heads: Optional[int] = 32, + num_key_value_heads: Optional[int] = None, + attention_dropout: Optional[float] = 0.0, + num_mem_blocks: Optional[int] = 1, + use_shared_attention_adapter: Optional[bool] = False, + adapter_rank: Optional[int] = 128, + use_mem_rope: Optional[bool] = False, + rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + initializer_range: Optional[float] = 0.02, + rms_norm_eps: Optional[int] = 1e-5, + use_cache: Optional[bool] = True, + num_logits_to_keep: Optional[int] = 1, + pad_token_id: Optional[int] = 0, + bos_token_id: Optional[int] = 1, + eos_token_id: Optional[int] = 2, + use_long_context: Optional[bool] = False, **kwargs, ): super().__init__( @@ -190,10 +195,15 @@ def __init__( self.attention_dropout = attention_dropout self.use_mem_rope = use_mem_rope self.use_long_context = use_long_context - if use_mem_rope and use_long_context: - a = 8 - rope_theta = rope_theta * a ** (self.attention_head_dim / (self.attention_head_dim - 2)) - self.rope_theta = rope_theta + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling =
kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or rope_parameters + + # Validate the correctness of rotary position embeddings parameters + rope_theta = kwargs.get("rope_theta", 10000.0) + standardize_rope_params(self, rope_theta=rope_theta) + rope_config_validation(self) + self.mamba_d_state = mamba_d_state self.mamba_d_conv = mamba_d_conv self.mamba_expand = mamba_expand diff --git a/src/transformers/models/zamba2/modeling_zamba2.py b/src/transformers/models/zamba2/modeling_zamba2.py index c4b0250f1564..8f6efc7dbe1c 100644 --- a/src/transformers/models/zamba2/modeling_zamba2.py +++ b/src/transformers/models/zamba2/modeling_zamba2.py @@ -210,20 +210,49 @@ class Zamba2RotaryEmbedding(nn.Module): def __init__(self, config: Zamba2Config, device=None): super().__init__() - # BC: "rope_type" was originally "type" - if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict): - self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) - else: - self.rope_type = "default" self.max_seq_len_cached = config.max_position_embeddings self.original_max_seq_len = config.max_position_embeddings self.config = config - self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.rope_type = self.config.rope_parameters["rope_type"] + rope_init_fn: Callable = self.compute_default_rope_parameters + if self.rope_type != "default": + rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + inv_freq, self.attention_scaling = rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.original_inv_freq = inv_freq + + @staticmethod + def compute_default_rope_parameters( + config: Optional[Zamba2Config] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + ) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PreTrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + base = config.rope_parameters["rope_theta"] + dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / ( + base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim) + ) + return inv_freq, attention_factor @torch.no_grad() @dynamic_rope_update # power user: used with advanced RoPE types (e.g. 
dynamic rope) @@ -392,6 +421,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Zamba2HybridDynamicCache] = None, position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + position_ids: Optional[torch.Tensor] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: input_shape = hidden_states.shape[:-1] @@ -1046,6 +1076,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cache_position: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, transformer_hidden_states: Optional[torch.Tensor] = None, **kwargs, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: @@ -1116,6 +1147,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, position_embeddings: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]: """ Args: @@ -1145,6 +1177,7 @@ def forward( past_key_values=past_key_values, output_attentions=output_attentions, position_embeddings=position_embeddings, + position_ids=position_ids, ) transformer_hidden_states = layer_outputs[0] @@ -1302,12 +1335,7 @@ def forward( position_ids = cache_position.unsqueeze(0) causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position) - - # create position embeddings to be shared across the decoder layers - if self.config.use_mem_rope: - position_embeddings = self.rotary_emb(hidden_states, position_ids) - else: - position_embeddings = None + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None @@ -1328,6 +1356,7 @@ def forward( output_attentions, use_cache, position_embeddings, + position_ids, ) else: layer_outputs = layer( @@ -1340,6 +1369,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, position_embeddings=position_embeddings, + position_ids=position_ids, ) hidden_states = layer_outputs[0] diff --git a/src/transformers/models/zamba2/modular_zamba2.py b/src/transformers/models/zamba2/modular_zamba2.py index 0e8a32bbde09..b884e2b38e4a 100644 --- a/src/transformers/models/zamba2/modular_zamba2.py +++ b/src/transformers/models/zamba2/modular_zamba2.py @@ -27,9 +27,7 @@ from ...modeling_outputs import BaseModelOutputWithPast from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack -from ...utils import ( - logging, -) +from ...utils import logging from ...utils.import_utils import ( is_causal_conv1d_available, is_mamba_ssm_available, @@ -234,6 +232,7 @@ def forward( attention_mask: Optional[torch.Tensor] = None, past_key_values: Optional[Zamba2HybridDynamicCache] = None, position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None, + position_ids: Optional[torch.Tensor] = None, **kwargs: Unpack[FlashAttentionKwargs], ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]: input_shape = hidden_states.shape[:-1] @@ -837,6 +836,7 @@ def forward( output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, position_embeddings: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, 
torch.FloatTensor]]]: """ Args: @@ -866,6 +866,7 @@ def forward( past_key_values=past_key_values, output_attentions=output_attentions, position_embeddings=position_embeddings, + position_ids=position_ids, ) transformer_hidden_states = layer_outputs[0] @@ -1069,12 +1070,7 @@ def forward( position_ids = cache_position.unsqueeze(0) causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position) - - # create position embeddings to be shared across the decoder layers - if self.config.use_mem_rope: - position_embeddings = self.rotary_emb(hidden_states, position_ids) - else: - position_embeddings = None + position_embeddings = self.rotary_emb(hidden_states, position_ids=position_ids) all_hidden_states = () if output_hidden_states else None all_self_attns = () if output_attentions else None @@ -1095,6 +1091,7 @@ def forward( output_attentions, use_cache, position_embeddings, + position_ids, ) else: layer_outputs = layer( @@ -1107,6 +1104,7 @@ def forward( output_attentions=output_attentions, use_cache=use_cache, position_embeddings=position_embeddings, + position_ids=position_ids, ) hidden_states = layer_outputs[0] diff --git a/tests/causal_lm_tester.py b/tests/causal_lm_tester.py index 8d486c041090..5c25223428d6 100644 --- a/tests/causal_lm_tester.py +++ b/tests/causal_lm_tester.py @@ -436,7 +436,7 @@ def test_model_rope_scaling_from_config(self, scaling_type): long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"rope_type": "default"} + _set_config_rope_params(config, {"rope_type": "default", "rope_theta": 10_000.0}) original_model = self.model_tester_class.base_model_class(config) original_model.to(torch_device) original_model.eval() @@ -444,7 +444,7 @@ def test_model_rope_scaling_from_config(self, scaling_type): original_long_output = original_model(long_input).last_hidden_state set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"rope_type": scaling_type, "factor": 10.0} + _set_config_rope_params(config, {"rope_type": scaling_type, "factor": 10.0, "rope_theta": 10_000.0}) scaled_model = self.model_tester_class.base_model_class(config) scaled_model.to(torch_device) scaled_model.eval() @@ -496,7 +496,7 @@ def test_model_rope_scaling_frequencies(self): position_ids_long = position_ids_long.unsqueeze(0) # Sanity check original RoPE - config.rope_scaling = {"rope_type": "default"} + _set_config_rope_params(config, {"rope_type": "default", "rope_theta": 10_000.0}) original_rope = rope_class(config=config).to(torch_device) original_cos_short, original_sin_short = original_rope(x, position_ids_short) original_cos_long, original_sin_long = original_rope(x, position_ids_long) @@ -505,7 +505,7 @@ def test_model_rope_scaling_frequencies(self): # Sanity check linear RoPE scaling # New position "x" should match original position with index "x/scaling_factor" - config.rope_scaling = {"rope_type": "linear", "factor": scaling_factor} + _set_config_rope_params(config, {"rope_type": "linear", "factor": scaling_factor, "rope_theta": 10_000.0}) linear_scaling_rope = rope_class(config=config).to(torch_device) linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) @@ -519,7 +519,7 @@ def test_model_rope_scaling_frequencies(self): # Sanity check Dynamic NTK RoPE scaling # Scaling should only 
be observed after a long input is fed. We can observe that the frequencies increase # with scaling_factor (or that `inv_freq` decreases) - config.rope_scaling = {"rope_type": "dynamic", "factor": scaling_factor} + _set_config_rope_params(config, {"rope_type": "dynamic", "factor": scaling_factor, "rope_theta": 10_000.0}) ntk_scaling_rope = rope_class(config=config).to(torch_device) ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) @@ -533,7 +533,7 @@ def test_model_rope_scaling_frequencies(self): # Sanity check Yarn RoPE scaling # Scaling should be over the entire input - config.rope_scaling = {"rope_type": "yarn", "factor": scaling_factor} + _set_config_rope_params(config, {"rope_type": "yarn", "factor": scaling_factor, "rope_theta": 10_000.0}) yarn_scaling_rope = rope_class(config=config).to(torch_device) yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) @@ -601,9 +601,19 @@ def _config_supports_rope_scaling(config: PreTrainedConfig) -> bool: """Returns whether a certain model config supports RoPE scaling parameterization.""" # Has rope_scaling -> model was designed with rope scaling in mind # Has rope_theta (and no rope_scaling) -> probably an older model, but should support rope scaling as well - main_config_has_rope = hasattr(config, "rope_scaling") or hasattr(config, "rope_theta") + main_config_has_rope = hasattr(config, "rope_parameters") sub_config_has_rope = any( - hasattr(getattr(config, sub_config), "rope_scaling") or hasattr(getattr(config, sub_config), "rope_theta") - for sub_config in config.sub_configs.keys() + hasattr(getattr(config, sub_config), "rope_parameters") for sub_config in config.sub_configs.keys() ) return main_config_has_rope or sub_config_has_rope + + +def _set_config_rope_params(config: PreTrainedConfig, rope_params: dict) -> bool: + """Recursively sets RoPE parameters on configs and subconfigs, by duplicating the same RoPE values.""" + config.rope_parameters = rope_params + if any(name in config.__class__.__name__.lower() for name in ["gemma3", "modernbert"]): + config.rope_parameters = {layer_type: config.rope_parameters.copy() for layer_type in config.layer_types} + + for sub_config in config.sub_configs.keys(): + _set_config_rope_params(getattr(config, sub_config), rope_params) + return config diff --git a/tests/models/blt/test_modeling_blt.py b/tests/models/blt/test_modeling_blt.py index 562144dbe792..5f53bd2d4cfb 100644 --- a/tests/models/blt/test_modeling_blt.py +++ b/tests/models/blt/test_modeling_blt.py @@ -18,7 +18,7 @@ import pytest from parameterized import parameterized -from transformers import AutoTokenizer, is_torch_available, set_seed +from transformers import AutoTokenizer, is_torch_available from transformers.testing_utils import ( cleanup, require_read_token, @@ -33,7 +33,6 @@ from ...test_modeling_common import ( TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION, _test_eager_matches_sdpa_inference, - ids_tensor, ) @@ -71,7 +70,7 @@ def __init__( self.max_position_embeddings = 32 self.vocab_size = 32 self.rope_theta = 500000.0 - self.rope_scaling = {"rope_type": "default"} + self.rope_parameters = {"rope_type": "default"} self.rms_norm_eps = 1e-5 self.dropout = 0.0 self.encoder_hash_byte_group_size = [2, 3] @@ -86,7 +85,7 @@ def __init__( "intermediate_size": self.intermediate_size, "max_position_embeddings": self.max_position_embeddings, "rope_theta": 
self.rope_theta, - "rope_scaling": self.rope_scaling, + "rope_parameters": self.rope_parameters, "hidden_act": self.hidden_act, "rms_norm_eps": self.rms_norm_eps, "dropout": self.dropout, @@ -100,7 +99,7 @@ def __init__( "intermediate_size": self.intermediate_size, "max_position_embeddings": self.max_position_embeddings, "rope_theta": self.rope_theta, - "rope_scaling": self.rope_scaling, + "rope_parameters": self.rope_parameters, "hidden_act": self.hidden_act, "rms_norm_eps": self.rms_norm_eps, "dropout": self.dropout, @@ -116,7 +115,7 @@ def __init__( "intermediate_size": self.intermediate_size, "max_position_embeddings": self.max_position_embeddings, "rope_theta": self.rope_theta, - "rope_scaling": self.rope_scaling, + "rope_parameters": self.rope_parameters, "hidden_act": self.hidden_act, "rms_norm_eps": self.rms_norm_eps, "dropout": self.dropout, @@ -130,7 +129,7 @@ def __init__( "intermediate_size": self.intermediate_size, "max_position_embeddings": self.max_position_embeddings, "rope_theta": self.rope_theta, - "rope_scaling": self.rope_scaling, + "rope_parameters": self.rope_parameters, "hidden_act": self.hidden_act, "rms_norm_eps": self.rms_norm_eps, "dropout": self.dropout, @@ -156,7 +155,7 @@ def get_config(self): encoder_config=self.encoder_config, decoder_config=self.decoder_config, global_config=self.global_config, - rope_scaling=self.rope_scaling, + rope_parameters=self.rope_parameters, tie_word_embeddings=False, ) @@ -223,43 +222,6 @@ def test_eager_matches_sdpa_inference( self, name, torch_dtype, padding_side, use_attention_mask, output_attentions, enable_kernels, atols=atols ) - @parameterized.expand([("linear",), ("dynamic",), ("yarn",)]) - def test_model_rope_scaling_from_config(self, scaling_type): - """Override rope scaling from config test to handle Blt's sub-config structure.""" - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = self.model_tester_class.base_model_class(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"rope_type": scaling_type, "factor": 10.0} - # Propagate rope_scaling to sub-configs for Blt - config.encoder_config.rope_scaling = config.rope_scaling - config.decoder_config.rope_scaling = config.rope_scaling - config.global_config.rope_scaling = config.rope_scaling - config.patcher_config.rope_scaling = config.rope_scaling - - scaled_model = self.model_tester_class.base_model_class(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. 
- if scaling_type == "dynamic": - torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - @require_torch_accelerator class BltIntegrationTest(unittest.TestCase): diff --git a/tests/models/chameleon/test_modeling_chameleon.py b/tests/models/chameleon/test_modeling_chameleon.py index 5babdce86214..35ede458045d 100644 --- a/tests/models/chameleon/test_modeling_chameleon.py +++ b/tests/models/chameleon/test_modeling_chameleon.py @@ -17,9 +17,8 @@ import unittest import requests -from parameterized import parameterized -from transformers import BitsAndBytesConfig, ChameleonConfig, is_torch_available, is_vision_available, set_seed +from transformers import BitsAndBytesConfig, ChameleonConfig, is_torch_available, is_vision_available from transformers.testing_utils import ( Expectations, require_bitsandbytes, @@ -219,37 +218,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("linear",), ("dynamic",)]) - def test_model_rope_scaling(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = ChameleonModel(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = ChameleonModel(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. 
- if scaling_type == "dynamic": - torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - @unittest.skip("Chameleon forces some token ids to be -inf!") def test_batching_equivalence(self): pass diff --git a/tests/models/colqwen2/test_modeling_colqwen2.py b/tests/models/colqwen2/test_modeling_colqwen2.py index 515fee4ccf8d..e9d065ee22ef 100644 --- a/tests/models/colqwen2/test_modeling_colqwen2.py +++ b/tests/models/colqwen2/test_modeling_colqwen2.py @@ -70,7 +70,7 @@ def __init__( "num_hidden_layers": 2, "num_key_value_heads": 2, "rms_norm_eps": 1e-06, - "rope_scaling": {"mrope_section": [4, 6, 6], "rope_type": "default", "type": "default"}, + "rope_parameters": {"mrope_section": [4, 6, 6], "rope_type": "default", "type": "default"}, "sliding_window": 32768, "tie_word_embeddings": True, "vision_config": { diff --git a/tests/models/cwm/test_configuration_cwm.py b/tests/models/cwm/test_configuration_cwm.py index 3d6f49527875..344a653dd8ab 100644 --- a/tests/models/cwm/test_configuration_cwm.py +++ b/tests/models/cwm/test_configuration_cwm.py @@ -31,9 +31,8 @@ def test_default_config(self): # Llama3 defaults self.assertEqual(config.vocab_size, 128256) - self.assertEqual(config.rope_theta, 1_000_000.0) - self.assertIsNotNone(config.rope_scaling) - self.assertEqual(config.rope_scaling["rope_type"], "llama3") + self.assertIsNotNone(config.rope_parameters) + self.assertEqual(config.rope_parameters["rope_type"], "llama3") def test_custom_sliding_window_config(self): config = CwmConfig(sliding_window=4096) @@ -75,18 +74,19 @@ def test_automatic_layer_types_generation(self): self.assertEqual(config.layer_types, expected_types) - def test_rope_scaling_config(self): - custom_rope_scaling = { + def test_rope_parameters_config(self): + custom_rope_parameters = { "factor": 8.0, "high_freq_factor": 2.0, "low_freq_factor": 0.5, "original_max_position_embeddings": 4096, "rope_type": "llama3", + "rope_theta": 1_000_000.0, } - config = CwmConfig(rope_scaling=custom_rope_scaling) + config = CwmConfig(rope_parameters=custom_rope_parameters) - self.assertEqual(config.rope_scaling, custom_rope_scaling) + self.assertEqual(config.rope_parameters, custom_rope_parameters) def test_config_serialization(self): config = CwmConfig( @@ -111,7 +111,7 @@ def test_config_inheritance_from_llama(self): self.assertTrue(hasattr(config, "num_attention_heads")) self.assertTrue(hasattr(config, "num_key_value_heads")) self.assertTrue(hasattr(config, "intermediate_size")) - self.assertTrue(hasattr(config, "rope_theta")) + self.assertTrue(hasattr(config, "rope_parameters")) self.assertTrue(hasattr(config, "attention_dropout")) diff --git a/tests/models/cwm/test_modeling_cwm.py b/tests/models/cwm/test_modeling_cwm.py index e8936c8627dc..eaed2878bb33 100644 --- a/tests/models/cwm/test_modeling_cwm.py +++ b/tests/models/cwm/test_modeling_cwm.py @@ -46,12 +46,13 @@ def get_config(self): config = super().get_config() config.sliding_window = 8192 - config.rope_scaling = { + config.rope_parameters = { "factor": 16.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, "original_max_position_embeddings": 8192, "rope_type": "llama3", + "rope_theta": 1000000.0, } return config diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 
53ff6de5449f..5a9bdab52b6b 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -15,8 +15,6 @@ import unittest -from parameterized import parameterized - from transformers import is_torch_available from transformers.testing_utils import require_torch, slow @@ -108,15 +106,6 @@ def test_disk_offload_safetensors(self): def test_disk_offload_bin(self): pass - @unittest.skip("Dbrx doesn't have RoPE scaling implemented") - def test_model_rope_scaling_frequencies(self): - pass - - @parameterized.expand([("linear",), ("dynamic",), ("yarn",)]) - @unittest.skip("Dbrx doesn't have RoPE scaling implemented") - def test_model_rope_scaling_from_config(self, scaling_type): - pass - @require_torch class DbrxModelIntegrationTest(unittest.TestCase): diff --git a/tests/models/deepseek_v2/test_modeling_deepseek_v2.py b/tests/models/deepseek_v2/test_modeling_deepseek_v2.py index c28b7da71ee7..894b5207d537 100644 --- a/tests/models/deepseek_v2/test_modeling_deepseek_v2.py +++ b/tests/models/deepseek_v2/test_modeling_deepseek_v2.py @@ -107,7 +107,7 @@ def test_model_rope_scaling_frequencies(self): # Sanity check linear RoPE scaling # New position "x" should match original position with index "x/scaling_factor" - config.rope_scaling = {"rope_type": "linear", "factor": scaling_factor} + config.rope_parameters = {"rope_type": "linear", "rope_theta": 10000.0, "factor": scaling_factor} linear_scaling_rope = DeepseekV2RotaryEmbedding(config=config).to(torch_device) linear_freqs_cis_short = linear_scaling_rope(x, position_ids_short) linear_freqs_cis_long = linear_scaling_rope(x, position_ids_long) @@ -116,7 +116,7 @@ def test_model_rope_scaling_frequencies(self): # Sanity check Dynamic NTK RoPE scaling # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase # with scaling_factor (or that `inv_freq` decreases) - config.rope_scaling = {"rope_type": "dynamic", "factor": scaling_factor} + config.rope_parameters = {"rope_type": "dynamic", "rope_theta": 10000.0, "factor": scaling_factor} ntk_scaling_rope = DeepseekV2RotaryEmbedding(config=config).to(torch_device) ntk_freqs_cis_short = ntk_scaling_rope(x, position_ids_short) ntk_freqs_cis_long = ntk_scaling_rope(x, position_ids_long) @@ -127,7 +127,7 @@ def test_model_rope_scaling_frequencies(self): # Sanity check Yarn RoPE scaling # Scaling should be over the entire input - config.rope_scaling = {"rope_type": "yarn", "factor": scaling_factor} + config.rope_parameters = {"rope_type": "yarn", "rope_theta": 10000.0, "factor": scaling_factor} yarn_scaling_rope = DeepseekV2RotaryEmbedding(config=config).to(torch_device) yarn_freqs_cis_short = yarn_scaling_rope(x, position_ids_short) yarn_freqs_cis_long = yarn_scaling_rope(x, position_ids_long) diff --git a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py index e7a1de3b25ed..a5c696aed1de 100644 --- a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py +++ b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py @@ -19,7 +19,7 @@ from packaging import version from parameterized import parameterized -from transformers import AutoTokenizer, DeepseekV3Config, is_torch_available, set_seed +from transformers import AutoTokenizer, DeepseekV3Config, is_torch_available from transformers.testing_utils import ( cleanup, require_read_token, @@ -46,9 +46,6 @@ DeepseekV3ForTokenClassification, DeepseekV3Model, ) - from transformers.models.deepseek_v3.modeling_deepseek_v3 import ( - DeepseekV3RotaryEmbedding, - ) class DeepseekV3ModelTester: @@ -298,104 +295,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("yarn",)]) - def test_model_rope_scaling_from_config(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = DeepseekV3Model(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = DeepseekV3Model(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. 
- if scaling_type == "dynamic": - torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - - def test_model_rope_scaling(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - scaling_factor = 10 - short_input_length = 10 - long_input_length = int(config.max_position_embeddings * 1.5) - - # Inputs - x = torch.randn( - 1, dtype=torch.float32, device=torch_device - ) # used exclusively to get the dtype and the device - position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) - position_ids_short = position_ids_short.unsqueeze(0) - position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) - position_ids_long = position_ids_long.unsqueeze(0) - - # Sanity check original RoPE - original_rope = DeepseekV3RotaryEmbedding(config=config).to(torch_device) - original_cos_short, original_sin_short = original_rope(x, position_ids_short) - original_cos_long, original_sin_long = original_rope(x, position_ids_long) - torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) - - # Sanity check linear RoPE scaling - # New position "x" should match original position with index "x/scaling_factor" - config.rope_scaling = {"type": "linear", "factor": scaling_factor} - linear_scaling_rope = DeepseekV3RotaryEmbedding(config=config).to(torch_device) - linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) - linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) - torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) - for new_position in range(0, long_input_length, scaling_factor): - original_position = int(new_position // scaling_factor) - torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) - torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) - - # Sanity check Dynamic NTK RoPE scaling - # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase - # with scaling_factor (or that `inv_freq` decreases) - config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} - ntk_scaling_rope = DeepseekV3RotaryEmbedding(config=config).to(torch_device) - ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) - ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) - torch.testing.assert_close(ntk_cos_short, original_cos_short) - torch.testing.assert_close(ntk_sin_short, original_sin_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_cos_long, original_cos_long) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_sin_long, original_sin_long) - self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - - # Sanity check Yarn RoPE scaling - # Scaling should be over the entire input - config.rope_scaling = {"type": "yarn", "factor": scaling_factor} - yarn_scaling_rope = DeepseekV3RotaryEmbedding(config=config).to(torch_device) - yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) - yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) - torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_cos_short, original_cos_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_sin_short, original_sin_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_cos_long, original_cos_long) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_sin_long, original_sin_long) - @require_torch_large_accelerator @slow def test_eager_matches_sdpa_generate(self): diff --git a/tests/models/diffllama/test_modeling_diffllama.py b/tests/models/diffllama/test_modeling_diffllama.py index 1c98627b643e..90f72ab989c2 100644 --- a/tests/models/diffllama/test_modeling_diffllama.py +++ b/tests/models/diffllama/test_modeling_diffllama.py @@ -19,9 +19,8 @@ import pytest from packaging import version -from parameterized import parameterized -from transformers import AutoTokenizer, BitsAndBytesConfig, DiffLlamaConfig, StaticCache, is_torch_available, set_seed +from transformers import AutoTokenizer, BitsAndBytesConfig, DiffLlamaConfig, StaticCache, is_torch_available from transformers.testing_utils import ( backend_empty_cache, cleanup, @@ -51,9 +50,6 @@ DiffLlamaForTokenClassification, DiffLlamaModel, ) - from transformers.models.diffllama.modeling_diffllama import ( - DiffLlamaRotaryEmbedding, - ) class DiffLlamaModelTester: @@ -273,104 +269,6 @@ def test_diffllama_token_classification_model(self): (self.model_tester.batch_size, self.model_tester.seq_length, self.model_tester.num_labels), ) - @parameterized.expand([("linear",), ("dynamic",), ("yarn",)]) - def test_model_rope_scaling_from_config(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = DiffLlamaModel(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = 
original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = DiffLlamaModel(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. - if scaling_type == "dynamic": - torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - - def test_model_rope_scaling(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - scaling_factor = 10 - short_input_length = 10 - long_input_length = int(config.max_position_embeddings * 1.5) - - # Inputs - x = torch.randn( - 1, dtype=torch.float32, device=torch_device - ) # used exclusively to get the dtype and the device - position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) - position_ids_short = position_ids_short.unsqueeze(0) - position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) - position_ids_long = position_ids_long.unsqueeze(0) - - # Sanity check original RoPE - original_rope = DiffLlamaRotaryEmbedding(config=config).to(torch_device) - original_cos_short, original_sin_short = original_rope(x, position_ids_short) - original_cos_long, original_sin_long = original_rope(x, position_ids_long) - torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) - - # Sanity check linear RoPE scaling - # New position "x" should match original position with index "x/scaling_factor" - config.rope_scaling = {"type": "linear", "factor": scaling_factor} - linear_scaling_rope = DiffLlamaRotaryEmbedding(config=config).to(torch_device) - linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) - linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) - torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) - for new_position in range(0, long_input_length, scaling_factor): - original_position = int(new_position // scaling_factor) - torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) - torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) - - # Sanity check Dynamic NTK RoPE scaling - # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase - # with scaling_factor (or that `inv_freq` decreases) - config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} - ntk_scaling_rope = DiffLlamaRotaryEmbedding(config=config).to(torch_device) - ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) - ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) - torch.testing.assert_close(ntk_cos_short, original_cos_short) - torch.testing.assert_close(ntk_sin_short, original_sin_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_cos_long, original_cos_long) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_sin_long, original_sin_long) - self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - - # Sanity check Yarn RoPE scaling - # Scaling should be over the entire input - config.rope_scaling = {"type": "yarn", "factor": scaling_factor} - yarn_scaling_rope = DiffLlamaRotaryEmbedding(config=config).to(torch_device) - yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) - yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) - torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_cos_short, original_cos_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_sin_short, original_sin_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_cos_long, original_cos_long) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_sin_long, original_sin_long) - def test_model_loading_old_rope_configs(self): def _reinitialize_config(base_config, new_kwargs): # Reinitialize the config with the new kwargs, forcing the config to go through its __init__ validation @@ -385,27 +283,27 @@ def _reinitialize_config(base_config, new_kwargs): original_model(**model_inputs) # from a config with the expected rope configuration -> βœ… - config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear", "factor": 10.0}}) + config = _reinitialize_config(base_config, {"rope_parameters": {"rope_type": "linear", "factor": 10.0}}) original_model = DiffLlamaForCausalLM(config).to(torch_device) original_model(**model_inputs) # from a config with the old rope configuration ('type' instead of 'rope_type') -> βœ… we gracefully handle BC - config = _reinitialize_config(base_config, {"rope_scaling": {"type": "linear", "factor": 10.0}}) + config = _reinitialize_config(base_config, {"rope_parameters": {"type": "linear", "factor": 10.0}}) original_model = DiffLlamaForCausalLM(config).to(torch_device) original_model(**model_inputs) # from a config with both 'type' and 'rope_type' -> βœ… they can coexist (and both are present in the config) config = _reinitialize_config( - base_config, {"rope_scaling": {"type": "linear", "rope_type": "linear", "factor": 10.0}} + base_config, {"rope_parameters": {"type": "linear", "rope_type": "linear", "factor": 10.0}} ) - self.assertTrue(config.rope_scaling["type"] == "linear") - self.assertTrue(config.rope_scaling["rope_type"] == "linear") + self.assertTrue(config.rope_parameters["type"] == "linear") + self.assertTrue(config.rope_parameters["rope_type"] == "linear") original_model = DiffLlamaForCausalLM(config).to(torch_device) original_model(**model_inputs) # from a config with 
parameters in a bad range ('factor' should be >= 1.0) -> ⚠️ throws a warning with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: - config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear", "factor": -999.0}}) + config = _reinitialize_config(base_config, {"rope_parameters": {"rope_type": "linear", "factor": -999.0}}) original_model = DiffLlamaForCausalLM(config).to(torch_device) original_model(**model_inputs) self.assertEqual(len(logs.output), 1) @@ -414,7 +312,7 @@ def _reinitialize_config(base_config, new_kwargs): # from a config with unknown parameters ('foo' isn't a rope option) -> ⚠️ throws a warning with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: config = _reinitialize_config( - base_config, {"rope_scaling": {"rope_type": "linear", "factor": 10.0, "foo": "bar"}} + base_config, {"rope_parameters": {"rope_type": "linear", "factor": 10.0, "foo": "bar"}} ) original_model = DiffLlamaForCausalLM(config).to(torch_device) original_model(**model_inputs) @@ -423,7 +321,9 @@ def _reinitialize_config(base_config, new_kwargs): # from a config with specific rope type but missing one of its mandatory parameters -> ❌ throws exception with self.assertRaises(KeyError): - config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear"}}) # missing "factor" + config = _reinitialize_config( + base_config, {"rope_parameters": {"rope_type": "linear"}} + ) # missing "factor" @require_flash_attn @require_torch_gpu diff --git a/tests/models/emu3/test_modeling_emu3.py b/tests/models/emu3/test_modeling_emu3.py index 06465e15fd1e..013315894067 100644 --- a/tests/models/emu3/test_modeling_emu3.py +++ b/tests/models/emu3/test_modeling_emu3.py @@ -19,16 +19,8 @@ import pytest import requests from huggingface_hub import hf_hub_download -from parameterized import parameterized - -from transformers import ( - BitsAndBytesConfig, - Emu3Config, - Emu3TextConfig, - is_torch_available, - is_vision_available, - set_seed, -) + +from transformers import BitsAndBytesConfig, Emu3Config, Emu3TextConfig, is_torch_available, is_vision_available from transformers.testing_utils import ( Expectations, require_bitsandbytes, @@ -55,7 +47,6 @@ Emu3ForConditionalGeneration, Emu3Model, Emu3Processor, - Emu3TextModel, ) @@ -149,37 +140,6 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() - @parameterized.expand([("linear",), ("dynamic",)]) - def test_model_rope_scaling(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = Emu3TextModel(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = Emu3TextModel(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the 
original - # maximum sequence length, so the outputs for the short input should match. - if scaling_type == "dynamic": - torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - @unittest.skip("Doesn't work, tensors are not almost same") # TODO raushan fixme def test_custom_4d_attention_mask(self): pass diff --git a/tests/models/gemma3/test_modeling_gemma3.py b/tests/models/gemma3/test_modeling_gemma3.py index 9f466bcd298f..cc23cd8ac67c 100644 --- a/tests/models/gemma3/test_modeling_gemma3.py +++ b/tests/models/gemma3/test_modeling_gemma3.py @@ -110,6 +110,10 @@ def test_generation_beyond_sliding_window_tiny_model(self): config.layer_types = ["full_attention", "sliding_attention"] config.sliding_window = 8 config.max_position_embeddings = 128 + config.rope_parameters = { + "full_attention": {"rope_type": "default", "rope_theta": 1000000}, + "sliding_attention": {"rope_type": "default", "rope_theta": 10000}, + } model = AutoModelForCausalLM.from_pretrained( "hf-internal-testing/tiny-random-Gemma3ForCausalLM", config=config ).to(torch_device) @@ -131,6 +135,7 @@ def test_generation_beyond_sliding_window_tiny_model(self): do_sample=False, use_cache=True, cache_implementation="hybrid", + disable_compile=True, ) # 2 generations are needed to trigger https://github.com/huggingface/transformers/issues/39711 # Since it requires model._cache to have been previously initialized @@ -141,8 +146,9 @@ def test_generation_beyond_sliding_window_tiny_model(self): do_sample=False, use_cache=True, cache_implementation="hybrid", + disable_compile=True, ) generated_sequences = output[:, input_len:].cpu() EXPECTED_OUTPUT = torch.tensor([[90109, 90109, 90109, 83191, 83191], [246901, 69832, 69832, 69832, 62288]]) torch.testing.assert_close(generated_sequences, EXPECTED_OUTPUT) @@ -151,6 +158,96 @@ def test_model_rope_scaling_from_config(self): pass + def test_model_rope_scaling_frequencies(self): + """Tests the frequency properties of the different RoPE scaling types on the model RoPE layer.""" + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + # Retrieves the RoPE layer class from the base model class. Uses `.named_modules()` to avoid hardcoding the + # named location of the RoPE layer class. 
+ base_model = self.model_tester.base_model_class(config) + possible_rope_attributes = [ + "pos_emb", + "rotary_emb", # most common case + "global_rotary_emb", + "local_rotary_emb", + ] + for name, module in base_model.named_modules(): + if any(potential_name in name for potential_name in possible_rope_attributes): + rope_class = type(module) + break + + scaling_factor = 10 + short_input_length = 10 + long_input_length = int(config.max_position_embeddings * 1.5) + + # Inputs + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device + position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) + position_ids_short = position_ids_short.unsqueeze(0) + position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) + position_ids_long = position_ids_long.unsqueeze(0) + + # Sanity check original RoPE + rope_params = {"rope_type": "default", "rope_theta": 10_000.0} + config.rope_parameters = {"full_attention": rope_params, "sliding_attention": rope_params} + original_rope = rope_class(config=config).to(torch_device) + original_cos_short, original_sin_short = original_rope(x, position_ids_short, layer_type="sliding_attention") + original_cos_long, original_sin_long = original_rope(x, position_ids_long, layer_type="sliding_attention") + torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) + + # Sanity check linear RoPE scaling + # New position "x" should match original position with index "x/scaling_factor" + rope_params = {"rope_type": "linear", "factor": scaling_factor, "rope_theta": 10_000.0} + config.rope_parameters = {"full_attention": rope_params, "sliding_attention": rope_params} + linear_scaling_rope = rope_class(config=config).to(torch_device) + linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short, layer_type="sliding_attention") + linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long, layer_type="sliding_attention") + torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) + for new_position in range(0, long_input_length, scaling_factor): + original_position = int(new_position // scaling_factor) + torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) + torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) + + # Sanity check Dynamic NTK RoPE scaling + # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase + # with scaling_factor (or that `inv_freq` decreases) + rope_params = {"rope_type": "dynamic", "factor": scaling_factor, "rope_theta": 10_000.0} + config.rope_parameters = {"full_attention": rope_params, "sliding_attention": rope_params} + ntk_scaling_rope = rope_class(config=config).to(torch_device) + ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short, layer_type="sliding_attention") + ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long, layer_type="sliding_attention") + torch.testing.assert_close(ntk_cos_short, original_cos_short) + torch.testing.assert_close(ntk_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_sin_long, original_sin_long) + self.assertTrue( + (ntk_scaling_rope.sliding_attention_inv_freq <= original_rope.sliding_attention_inv_freq).all() + ) + + # Sanity check Yarn RoPE scaling + # Scaling should be over the entire input + rope_params = {"rope_type": "yarn", "factor": scaling_factor, "rope_theta": 10_000.0} + config.rope_parameters = {"full_attention": rope_params, "sliding_attention": rope_params} + yarn_scaling_rope = rope_class(config=config).to(torch_device) + yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short, layer_type="sliding_attention") + yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long, layer_type="sliding_attention") + torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_short, original_cos_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_long, original_sin_long) + class Gemma3Vision2TextModelTester: def __init__( diff --git a/tests/models/gemma3n/test_modeling_gemma3n.py b/tests/models/gemma3n/test_modeling_gemma3n.py index 0d3df2ba86ae..c279d98ac3c8 100644 --- a/tests/models/gemma3n/test_modeling_gemma3n.py +++ b/tests/models/gemma3n/test_modeling_gemma3n.py @@ -551,6 +551,97 @@ def test_generate_with_static_cache(self): dynamic_cache_generation = model.generate(**generation_kwargs, **inputs_dict) self.assertTrue(has_similar_generate_outputs(dynamic_cache_generation, static_cache_generation)) + def test_model_rope_scaling_frequencies(self): + """Tests the frequency properties of the different RoPE scaling types on the model RoPE layer.""" + # Gemma3n has different RoPE configs per layer type + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + # Retrieves the RoPE layer class from the base model class. Uses `.named_modules()` to avoid hardcoding the + # named location of the RoPE layer class. 
+ base_model = self.model_tester.base_model_class(config) + possible_rope_attributes = [ + "pos_emb", + "rotary_emb", # most common case + "global_rotary_emb", + "local_rotary_emb", + ] + for name, module in base_model.named_modules(): + if any(potential_name in name for potential_name in possible_rope_attributes): + rope_class = type(module) + break + + scaling_factor = 10 + short_input_length = 10 + long_input_length = int(config.max_position_embeddings * 1.5) + + # Inputs + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device + position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) + position_ids_short = position_ids_short.unsqueeze(0) + position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) + position_ids_long = position_ids_long.unsqueeze(0) + + # Sanity check original RoPE + rope_params = {"rope_type": "default", "rope_theta": 10_000.0} + config.rope_parameters = {"sliding_attention": rope_params, "full_attention": rope_params} + original_rope = rope_class(config=config).to(torch_device) + original_cos_short, original_sin_short = original_rope(x, position_ids_short, layer_type="sliding_attention") + original_cos_long, original_sin_long = original_rope(x, position_ids_long, layer_type="sliding_attention") + torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) + + # Sanity check linear RoPE scaling + # New position "x" should match original position with index "x/scaling_factor" + rope_params = {"rope_type": "linear", "factor": scaling_factor, "rope_theta": 10_000.0} + config.rope_parameters = {"sliding_attention": rope_params, "full_attention": rope_params} + linear_scaling_rope = rope_class(config=config).to(torch_device) + linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short, layer_type="sliding_attention") + linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long, layer_type="sliding_attention") + torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) + for new_position in range(0, long_input_length, scaling_factor): + original_position = int(new_position // scaling_factor) + torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) + torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) + + # Sanity check Dynamic NTK RoPE scaling + # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase + # with scaling_factor (or that `inv_freq` decreases) + rope_params = {"rope_type": "dynamic", "factor": scaling_factor, "rope_theta": 10_000.0} + config.rope_parameters = {"sliding_attention": rope_params, "full_attention": rope_params} + ntk_scaling_rope = rope_class(config=config).to(torch_device) + ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short, layer_type="sliding_attention") + ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long, layer_type="sliding_attention") + torch.testing.assert_close(ntk_cos_short, original_cos_short) + torch.testing.assert_close(ntk_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_sin_long, original_sin_long) + self.assertTrue( + (ntk_scaling_rope.sliding_attention_inv_freq <= original_rope.sliding_attention_inv_freq).all() + ) + + # Sanity check Yarn RoPE scaling + # Scaling should be over the entire input + rope_params = {"rope_type": "yarn", "factor": scaling_factor, "rope_theta": 10_000.0} + config.rope_parameters = {"sliding_attention": rope_params, "full_attention": rope_params} + yarn_scaling_rope = rope_class(config=config).to(torch_device) + yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short, layer_type="sliding_attention") + yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long, layer_type="sliding_attention") + torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_short, original_cos_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_long, original_sin_long) + class Gemma3nVision2TextModelTester: text_config = {"activation_sparsity_pattern": None} diff --git a/tests/models/glm4v/test_modeling_glm4v.py b/tests/models/glm4v/test_modeling_glm4v.py index 0e8d07b1c7df..a27b277ce36a 100644 --- a/tests/models/glm4v/test_modeling_glm4v.py +++ b/tests/models/glm4v/test_modeling_glm4v.py @@ -71,7 +71,7 @@ def __init__( "output_channels": 64, "hidden_act": "silu", "max_position_embeddings": 512, - "rope_scaling": {"type": "default", "mrope_section": [2, 1, 1]}, + "rope_parameters": {"type": "default", "mrope_section": [2, 1, 1]}, "rope_theta": 10000, "tie_word_embeddings": True, "bos_token_id": 0, diff --git a/tests/models/glm4v_moe/test_modeling_glm4v_moe.py b/tests/models/glm4v_moe/test_modeling_glm4v_moe.py index ad3986ce3641..9c764ee38507 100644 --- a/tests/models/glm4v_moe/test_modeling_glm4v_moe.py +++ b/tests/models/glm4v_moe/test_modeling_glm4v_moe.py @@ -72,7 +72,7 @@ def __init__( "output_channels": 64, "hidden_act": "silu", "max_position_embeddings": 512, - "rope_scaling": {"type": "default", "mrope_section": [1, 1]}, + "rope_parameters": {"type": "default", "mrope_section": [1, 1]}, "rope_theta": 10000, "tie_word_embeddings": True, "bos_token_id": 0, diff --git a/tests/models/gpt_neox/test_modeling_gpt_neox.py b/tests/models/gpt_neox/test_modeling_gpt_neox.py index 7d359b31cd60..b64e3d2f912d 100644 --- 
a/tests/models/gpt_neox/test_modeling_gpt_neox.py +++ b/tests/models/gpt_neox/test_modeling_gpt_neox.py @@ -15,9 +15,7 @@ import unittest -from parameterized import parameterized - -from transformers import AutoTokenizer, DynamicCache, GPTNeoXConfig, is_torch_available, set_seed +from transformers import AutoTokenizer, DynamicCache, GPTNeoXConfig, is_torch_available from transformers.testing_utils import require_torch, slow, torch_device from ...generation.test_utils import GenerationTesterMixin @@ -36,7 +34,6 @@ GPTNeoXForTokenClassification, GPTNeoXModel, ) - from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXRotaryEmbedding class GPTNeoXModelTester: @@ -338,87 +335,6 @@ def test_cached_forward_with_and_without_attention_mask(self): def test_feed_forward_chunking(self): pass - @parameterized.expand([("linear",), ("dynamic",)]) - def test_model_rope_scaling_from_config(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = GPTNeoXModel(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = GPTNeoXModel(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. 
- if scaling_type == "dynamic": - torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - - def test_model_rope_scaling(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - scaling_factor = 10 - short_input_length = 10 - long_input_length = int(config.max_position_embeddings * 1.5) - - # Inputs - x = torch.randn( - 1, dtype=torch.float32, device=torch_device - ) # used exclusively to get the dtype and the device - position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) - position_ids_short = position_ids_short.unsqueeze(0) - position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) - position_ids_long = position_ids_long.unsqueeze(0) - - # Sanity check original RoPE - original_rope = GPTNeoXRotaryEmbedding(config).to(torch_device) - original_cos_short, original_sin_short = original_rope(x, position_ids_short) - original_cos_long, original_sin_long = original_rope(x, position_ids_long) - torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) - - # Sanity check linear RoPE scaling - # New position "x" should match original position with index "x/scaling_factor" - config.rope_scaling = {"type": "linear", "factor": scaling_factor} - linear_scaling_rope = GPTNeoXRotaryEmbedding(config).to(torch_device) - linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) - linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) - torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) - for new_position in range(0, long_input_length, scaling_factor): - original_position = int(new_position // scaling_factor) - torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) - torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) - - # Sanity check Dynamic NTK RoPE scaling - # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase - # with scaling_factor (or that `inv_freq` decreases) - config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} - ntk_scaling_rope = GPTNeoXRotaryEmbedding(config).to(torch_device) - ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) - ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) - torch.testing.assert_close(ntk_cos_short, original_cos_short) - torch.testing.assert_close(ntk_sin_short, original_sin_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_cos_long, original_cos_long) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_sin_long, original_sin_long) - self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - @require_torch class GPTNeoXLanguageGenerationTest(unittest.TestCase): diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py index 2e1223394f20..4afa595d6a16 100644 --- a/tests/models/granite/test_modeling_granite.py +++ b/tests/models/granite/test_modeling_granite.py @@ -15,9 +15,7 @@ import unittest -from parameterized import parameterized - -from transformers import GraniteConfig, is_torch_available, set_seed +from transformers import GraniteConfig, is_torch_available from transformers.testing_utils import ( Expectations, require_read_token, @@ -40,9 +38,6 @@ GraniteForCausalLM, GraniteModel, ) - from transformers.models.granite.modeling_granite import ( - GraniteRotaryEmbedding, - ) class GraniteModelTester: @@ -197,104 +192,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("linear",), ("dynamic",)]) - def test_model_rope_scaling_from_config(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = GraniteModel(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = GraniteModel(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. 
- if scaling_type == "dynamic": - torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - - def test_model_rope_scaling(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - scaling_factor = 10 - short_input_length = 10 - long_input_length = int(config.max_position_embeddings * 1.5) - - # Inputs - x = torch.randn( - 1, dtype=torch.float32, device=torch_device - ) # used exclusively to get the dtype and the device - position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) - position_ids_short = position_ids_short.unsqueeze(0) - position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) - position_ids_long = position_ids_long.unsqueeze(0) - - # Sanity check original RoPE - original_rope = GraniteRotaryEmbedding(config=config).to(torch_device) - original_cos_short, original_sin_short = original_rope(x, position_ids_short) - original_cos_long, original_sin_long = original_rope(x, position_ids_long) - torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) - - # Sanity check linear RoPE scaling - # New position "x" should match original position with index "x/scaling_factor" - config.rope_scaling = {"type": "linear", "factor": scaling_factor} - linear_scaling_rope = GraniteRotaryEmbedding(config=config).to(torch_device) - linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) - linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) - torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) - for new_position in range(0, long_input_length, scaling_factor): - original_position = int(new_position // scaling_factor) - torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) - torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) - - # Sanity check Dynamic NTK RoPE scaling - # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase - # with scaling_factor (or that `inv_freq` decreases) - config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} - ntk_scaling_rope = GraniteRotaryEmbedding(config=config).to(torch_device) - ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) - ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) - torch.testing.assert_close(ntk_cos_short, original_cos_short) - torch.testing.assert_close(ntk_sin_short, original_sin_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_cos_long, original_cos_long) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_sin_long, original_sin_long) - self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - - # Sanity check Yarn RoPE scaling - # Scaling should be over the entire input - config.rope_scaling = {"type": "yarn", "factor": scaling_factor} - yarn_scaling_rope = GraniteRotaryEmbedding(config=config).to(torch_device) - yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) - yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) - torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_cos_short, original_cos_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_sin_short, original_sin_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_cos_long, original_cos_long) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_sin_long, original_sin_long) - @require_torch_accelerator class GraniteIntegrationTest(unittest.TestCase): diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py index fd5de1bcc369..7d6bc3f6d21c 100644 --- a/tests/models/granitemoe/test_modeling_granitemoe.py +++ b/tests/models/granitemoe/test_modeling_granitemoe.py @@ -15,9 +15,7 @@ import unittest -from parameterized import parameterized - -from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed +from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available from transformers.testing_utils import ( Expectations, require_read_token, @@ -39,9 +37,6 @@ GraniteMoeForCausalLM, GraniteMoeModel, ) - from transformers.models.granitemoe.modeling_granitemoe import ( - GraniteMoeRotaryEmbedding, - ) class GraniteMoeModelTester: @@ -196,104 +191,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("linear",), ("dynamic",)]) - def test_model_rope_scaling_from_config(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = GraniteMoeModel(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random 
weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = GraniteMoeModel(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. - if scaling_type == "dynamic": - torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - - def test_model_rope_scaling(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - scaling_factor = 10 - short_input_length = 10 - long_input_length = int(config.max_position_embeddings * 1.5) - - # Inputs - x = torch.randn( - 1, dtype=torch.float32, device=torch_device - ) # used exclusively to get the dtype and the device - position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) - position_ids_short = position_ids_short.unsqueeze(0) - position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) - position_ids_long = position_ids_long.unsqueeze(0) - - # Sanity check original RoPE - original_rope = GraniteMoeRotaryEmbedding(config=config).to(torch_device) - original_cos_short, original_sin_short = original_rope(x, position_ids_short) - original_cos_long, original_sin_long = original_rope(x, position_ids_long) - torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) - - # Sanity check linear RoPE scaling - # New position "x" should match original position with index "x/scaling_factor" - config.rope_scaling = {"type": "linear", "factor": scaling_factor} - linear_scaling_rope = GraniteMoeRotaryEmbedding(config=config).to(torch_device) - linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) - linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) - torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) - for new_position in range(0, long_input_length, scaling_factor): - original_position = int(new_position // scaling_factor) - torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) - torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) - - # Sanity check Dynamic NTK RoPE scaling - # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase - # with scaling_factor (or that `inv_freq` decreases) - config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} - ntk_scaling_rope = GraniteMoeRotaryEmbedding(config=config).to(torch_device) - ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) - ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) - torch.testing.assert_close(ntk_cos_short, original_cos_short) - torch.testing.assert_close(ntk_sin_short, original_sin_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_cos_long, original_cos_long) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_sin_long, original_sin_long) - self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - - # Sanity check Yarn RoPE scaling - # Scaling should be over the entire input - config.rope_scaling = {"type": "yarn", "factor": scaling_factor} - yarn_scaling_rope = GraniteMoeRotaryEmbedding(config=config).to(torch_device) - yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) - yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) - torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_cos_short, original_cos_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_sin_short, original_sin_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_cos_long, original_cos_long) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_sin_long, original_sin_long) - @require_torch_accelerator class GraniteMoeIntegrationTest(unittest.TestCase): diff --git a/tests/models/granitemoeshared/test_modeling_granitemoeshared.py b/tests/models/granitemoeshared/test_modeling_granitemoeshared.py index 7c19e05ab339..f99f555e4193 100644 --- a/tests/models/granitemoeshared/test_modeling_granitemoeshared.py +++ b/tests/models/granitemoeshared/test_modeling_granitemoeshared.py @@ -15,9 +15,7 @@ import unittest -from parameterized import parameterized - -from transformers import AutoTokenizer, GraniteMoeSharedConfig, is_torch_available, set_seed +from transformers import AutoTokenizer, GraniteMoeSharedConfig, is_torch_available from transformers.testing_utils import ( Expectations, require_read_token, @@ -39,9 +37,6 @@ GraniteMoeSharedForCausalLM, GraniteMoeSharedModel, ) - from transformers.models.granitemoeshared.modeling_granitemoeshared import ( - GraniteMoeSharedRotaryEmbedding, - ) class GraniteMoeSharedModelTester: @@ -199,104 +194,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("linear",), ("dynamic",)]) - def test_model_rope_scaling_from_config(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = GraniteMoeSharedModel(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = 
original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = GraniteMoeSharedModel(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. - if scaling_type == "dynamic": - torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - - def test_model_rope_scaling(self): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - scaling_factor = 10 - short_input_length = 10 - long_input_length = int(config.max_position_embeddings * 1.5) - - # Inputs - x = torch.randn( - 1, dtype=torch.float32, device=torch_device - ) # used exclusively to get the dtype and the device - position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) - position_ids_short = position_ids_short.unsqueeze(0) - position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) - position_ids_long = position_ids_long.unsqueeze(0) - - # Sanity check original RoPE - original_rope = GraniteMoeSharedRotaryEmbedding(config=config).to(torch_device) - original_cos_short, original_sin_short = original_rope(x, position_ids_short) - original_cos_long, original_sin_long = original_rope(x, position_ids_long) - torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) - - # Sanity check linear RoPE scaling - # New position "x" should match original position with index "x/scaling_factor" - config.rope_scaling = {"type": "linear", "factor": scaling_factor} - linear_scaling_rope = GraniteMoeSharedRotaryEmbedding(config=config).to(torch_device) - linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) - linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) - torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) - for new_position in range(0, long_input_length, scaling_factor): - original_position = int(new_position // scaling_factor) - torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) - torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) - - # Sanity check Dynamic NTK RoPE scaling - # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase - # with scaling_factor (or that `inv_freq` decreases) - config.rope_scaling = {"type": "dynamic", "factor": scaling_factor} - ntk_scaling_rope = GraniteMoeSharedRotaryEmbedding(config=config).to(torch_device) - ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) - ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) - torch.testing.assert_close(ntk_cos_short, original_cos_short) - torch.testing.assert_close(ntk_sin_short, original_sin_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_cos_long, original_cos_long) - with self.assertRaises(AssertionError): - torch.testing.assert_close(ntk_sin_long, original_sin_long) - self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - - # Sanity check Yarn RoPE scaling - # Scaling should be over the entire input - config.rope_scaling = {"type": "yarn", "factor": scaling_factor} - yarn_scaling_rope = GraniteMoeSharedRotaryEmbedding(config=config).to(torch_device) - yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) - yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) - torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) - torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_cos_short, original_cos_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_sin_short, original_sin_short) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_cos_long, original_cos_long) - with self.assertRaises(AssertionError): - torch.testing.assert_close(yarn_sin_long, original_sin_long) - @require_torch_accelerator class GraniteMoeSharedIntegrationTest(unittest.TestCase): diff --git a/tests/models/hunyuan_v1_dense/test_modeling_hunyuan_v1_dense.py b/tests/models/hunyuan_v1_dense/test_modeling_hunyuan_v1_dense.py index 614606ded0b0..cd578dd6c156 100644 --- a/tests/models/hunyuan_v1_dense/test_modeling_hunyuan_v1_dense.py +++ b/tests/models/hunyuan_v1_dense/test_modeling_hunyuan_v1_dense.py @@ -15,8 +15,6 @@ import unittest -from parameterized import parameterized - from transformers import is_torch_available from transformers.testing_utils import ( cleanup, @@ -54,15 +52,6 @@ def is_pipeline_test_to_skip( ): return True - @unittest.skip("HunYuanDenseV1's RoPE has custom parameterization") - def test_model_rope_scaling_frequencies(self): - pass - - @parameterized.expand([("linear",), ("dynamic",), ("yarn",)]) - @unittest.skip("HunYuanDenseV1's RoPE has custom parameterization") - def test_model_rope_scaling_from_config(self, scaling_type): - pass - @require_torch class HunYuanDenseV1IntegrationTest(unittest.TestCase): diff --git a/tests/models/hunyuan_v1_moe/test_modeling_hunyuan_v1_moe.py b/tests/models/hunyuan_v1_moe/test_modeling_hunyuan_v1_moe.py index eb99a71f78bc..f93e82e63d77 100644 --- a/tests/models/hunyuan_v1_moe/test_modeling_hunyuan_v1_moe.py +++ b/tests/models/hunyuan_v1_moe/test_modeling_hunyuan_v1_moe.py @@ -17,7 +17,6 @@ import pytest import torch -from parameterized import parameterized from transformers import is_torch_available from transformers.testing_utils import ( @@ -78,15 +77,6 @@ def test_generate_from_inputs_embeds_with_static_cache(self): def test_generate_with_static_cache(self): pass - @unittest.skip("HunYuanMoEV1's RoPE has custom parameterization") - def 
test_model_rope_scaling_frequencies(self): - pass - - @parameterized.expand([("linear",), ("dynamic",), ("yarn",)]) - @unittest.skip("HunYuanMoEV1's RoPE has custom parameterization") - def test_model_rope_scaling_from_config(self, scaling_type): - pass - @require_torch class HunYuanMoEV1IntegrationTest(unittest.TestCase): diff --git a/tests/models/longcat_flash/test_modeling_longcat_flash.py b/tests/models/longcat_flash/test_modeling_longcat_flash.py index 2310605357a4..981d0edb9702 100644 --- a/tests/models/longcat_flash/test_modeling_longcat_flash.py +++ b/tests/models/longcat_flash/test_modeling_longcat_flash.py @@ -17,10 +17,9 @@ import tempfile import unittest -from parameterized import parameterized from pytest import mark -from transformers import LongcatFlashConfig, is_torch_available, set_seed +from transformers import LongcatFlashConfig, is_torch_available from transformers.testing_utils import ( require_bitsandbytes, require_flash_attn, @@ -285,34 +284,6 @@ def _prepare_config_headdim(config, requested_dim): return config - @parameterized.expand([("linear",), ("dynamic",), ("yarn",)]) - def test_model_rope_scaling_from_config(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) - original_model = self.model_tester_class.base_model_class(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = self.model_tester_class.base_model_class(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - if scaling_type == "dynamic": - torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - @require_flash_attn @require_torch_gpu @require_bitsandbytes diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index 14dddac95410..e302c0da63d7 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -67,7 +67,7 @@ def is_pipeline_test_to_skip( @pytest.mark.flash_attn_test @slow def test_flash_attn_2_inference_equivalence_right_padding(self): - self.skipTest(reason="Mistral flash attention does not support right padding") + self.skipTest(reason="Mixtral flash attention does not support right padding") # Ignore copy def test_load_balancing_loss(self): diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index 06c46044f57a..77ca628231fa 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -72,7 +72,7 @@ def __init__( "hidden_act": "gelu", "max_position_embeddings": 512, "initializer_range": 0.02, - "rope_scaling": {"rope_type": "default"}, + "rope_parameters": {"rope_type": "default"}, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2, @@ -150,7 +150,7 @@ def __init__( 
"hidden_act": "gelu", "max_position_embeddings": 512, "initializer_range": 0.02, - "rope_scaling": {"rope_type": "default"}, + "rope_parameters": {"rope_type": "default"}, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2, diff --git a/tests/models/modernbert_decoder/test_modeling_modernbert_decoder.py b/tests/models/modernbert_decoder/test_modeling_modernbert_decoder.py index 8737f0fa2771..2e2e3cbb31b9 100644 --- a/tests/models/modernbert_decoder/test_modeling_modernbert_decoder.py +++ b/tests/models/modernbert_decoder/test_modeling_modernbert_decoder.py @@ -19,6 +19,7 @@ from transformers.testing_utils import ( require_torch, slow, + torch_device, ) from ...causal_lm_tester import CausalLMModelTest, CausalLMModelTester @@ -43,6 +44,97 @@ class ModernBertDecoderModelTester(CausalLMModelTester): class ModernBertDecoderModelTest(CausalLMModelTest, unittest.TestCase): model_tester_class = ModernBertDecoderModelTester + def test_model_rope_scaling_frequencies(self): + """Tests the frequency properties of the different RoPE scaling types on the model RoPE layer.""" + # ModernBertDecoder has different RoPE configs per layer type + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + # Retrieves the RoPE layer class from the base model class. Uses `.named_modules()` to avoid hardcoding the + # named location of the RoPE layer class. + base_model = self.model_tester.base_model_class(config) + possible_rope_attributes = [ + "pos_emb", + "rotary_emb", # most common case + "global_rotary_emb", + "local_rotary_emb", + ] + for name, module in base_model.named_modules(): + if any(potential_name in name for potential_name in possible_rope_attributes): + rope_class = type(module) + break + + scaling_factor = 10 + short_input_length = 10 + long_input_length = int(config.max_position_embeddings * 1.5) + + # Inputs + x = torch.randn( + 1, dtype=torch.float32, device=torch_device + ) # used exclusively to get the dtype and the device + position_ids_short = torch.arange(short_input_length, dtype=torch.long, device=torch_device) + position_ids_short = position_ids_short.unsqueeze(0) + position_ids_long = torch.arange(long_input_length, dtype=torch.long, device=torch_device) + position_ids_long = position_ids_long.unsqueeze(0) + + # Sanity check original RoPE + rope_params = {"rope_type": "default", "rope_theta": 10_000.0} + config.rope_parameters = {"sliding_attention": rope_params, "full_attention": rope_params} + original_rope = rope_class(config=config).to(torch_device) + original_cos_short, original_sin_short = original_rope(x, position_ids_short, layer_type="sliding_attention") + original_cos_long, original_sin_long = original_rope(x, position_ids_long, layer_type="sliding_attention") + torch.testing.assert_close(original_cos_short, original_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(original_sin_short, original_sin_long[:, :short_input_length, :]) + + # Sanity check linear RoPE scaling + # New position "x" should match original position with index "x/scaling_factor" + rope_params = {"rope_type": "linear", "factor": scaling_factor, "rope_theta": 10_000.0} + config.rope_parameters = {"sliding_attention": rope_params, "full_attention": rope_params} + linear_scaling_rope = rope_class(config=config).to(torch_device) + linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short, layer_type="sliding_attention") + linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long, layer_type="sliding_attention") + 
torch.testing.assert_close(linear_cos_short, linear_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(linear_sin_short, linear_sin_long[:, :short_input_length, :]) + for new_position in range(0, long_input_length, scaling_factor): + original_position = int(new_position // scaling_factor) + torch.testing.assert_close(linear_cos_long[:, new_position, :], original_cos_long[:, original_position, :]) + torch.testing.assert_close(linear_sin_long[:, new_position, :], original_sin_long[:, original_position, :]) + + # Sanity check Dynamic NTK RoPE scaling + # Scaling should only be observed after a long input is fed. We can observe that the frequencies increase + # with scaling_factor (or that `inv_freq` decreases) + rope_params = {"rope_type": "dynamic", "factor": scaling_factor, "rope_theta": 10_000.0} + config.rope_parameters = {"sliding_attention": rope_params, "full_attention": rope_params} + ntk_scaling_rope = rope_class(config=config).to(torch_device) + ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short, layer_type="sliding_attention") + ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long, layer_type="sliding_attention") + torch.testing.assert_close(ntk_cos_short, original_cos_short) + torch.testing.assert_close(ntk_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(ntk_sin_long, original_sin_long) + self.assertTrue( + (ntk_scaling_rope.sliding_attention_inv_freq <= original_rope.sliding_attention_inv_freq).all() + ) + + # Sanity check Yarn RoPE scaling + # Scaling should be over the entire input + rope_params = {"rope_type": "yarn", "factor": scaling_factor, "rope_theta": 10_000.0} + config.rope_parameters = {"sliding_attention": rope_params, "full_attention": rope_params} + yarn_scaling_rope = rope_class(config=config).to(torch_device) + yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short, layer_type="sliding_attention") + yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long, layer_type="sliding_attention") + torch.testing.assert_close(yarn_cos_short, yarn_cos_long[:, :short_input_length, :]) + torch.testing.assert_close(yarn_sin_short, yarn_sin_long[:, :short_input_length, :]) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_short, original_cos_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_short, original_sin_short) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_cos_long, original_cos_long) + with self.assertRaises(AssertionError): + torch.testing.assert_close(yarn_sin_long, original_sin_long) + @slow @require_torch diff --git a/tests/models/nemotron/test_modeling_nemotron.py b/tests/models/nemotron/test_modeling_nemotron.py index 42a9914541c5..a330dd92f83a 100644 --- a/tests/models/nemotron/test_modeling_nemotron.py +++ b/tests/models/nemotron/test_modeling_nemotron.py @@ -16,8 +16,6 @@ import unittest -from parameterized import parameterized - from transformers import is_torch_available from transformers.testing_utils import ( Expectations, @@ -61,15 +59,6 @@ class NemotronModelTest(CausalLMModelTest, unittest.TestCase): def test_model_outputs_equivalence(self, **kwargs): pass - @unittest.skip("Nemotron has a hardcoded `rope_type`, so we can't apply RoPE scaling") - def test_model_rope_scaling_frequencies(self): - pass - - 
@parameterized.expand([("linear",), ("dynamic",), ("yarn",)]) - @unittest.skip("Nemotron has a hardcoded `rope_type`, so we can't apply RoPE scaling") - def test_model_rope_scaling_from_config(self, scaling_type): - pass - @require_torch_accelerator class NemotronIntegrationTest(unittest.TestCase): diff --git a/tests/models/olmo/test_modeling_olmo.py b/tests/models/olmo/test_modeling_olmo.py index 8059da92a649..8b849977fcba 100644 --- a/tests/models/olmo/test_modeling_olmo.py +++ b/tests/models/olmo/test_modeling_olmo.py @@ -17,9 +17,8 @@ import pytest from packaging import version -from parameterized import parameterized -from transformers import OlmoConfig, is_torch_available, set_seed +from transformers import OlmoConfig, is_torch_available from transformers.generation.configuration_utils import GenerationConfig from transformers.models.auto.tokenization_auto import AutoTokenizer from transformers.models.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast @@ -190,37 +189,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("linear",), ("dynamic",)]) - def test_model_rope_scaling(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = OlmoModel(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = OlmoModel(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. 
- if scaling_type == "dynamic": - torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - @require_torch class OlmoIntegrationTest(unittest.TestCase): diff --git a/tests/models/olmo2/test_modeling_olmo2.py b/tests/models/olmo2/test_modeling_olmo2.py index 812e2ce15714..59443d4b83c2 100644 --- a/tests/models/olmo2/test_modeling_olmo2.py +++ b/tests/models/olmo2/test_modeling_olmo2.py @@ -17,9 +17,8 @@ import pytest from packaging import version -from parameterized import parameterized -from transformers import Olmo2Config, is_torch_available, set_seed +from transformers import Olmo2Config, is_torch_available from transformers.generation.configuration_utils import GenerationConfig from transformers.models.auto.tokenization_auto import AutoTokenizer from transformers.testing_utils import ( @@ -191,37 +190,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("linear",), ("dynamic",)]) - def test_model_rope_scaling(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = Olmo2Model(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = Olmo2Model(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. 
- if scaling_type == "dynamic": - torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - @require_torch class Olmo2IntegrationTest(unittest.TestCase): diff --git a/tests/models/olmo3/test_modeling_olmo3.py b/tests/models/olmo3/test_modeling_olmo3.py index c33b39282964..256bb638cf10 100644 --- a/tests/models/olmo3/test_modeling_olmo3.py +++ b/tests/models/olmo3/test_modeling_olmo3.py @@ -82,7 +82,7 @@ def test_model_rope_scaling_from_config(self, scaling_type): original_long_output = original_model(long_input).last_hidden_state set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} + config.rope_parameters = {"rope_type": scaling_type, "factor": 10.0, "rope_theta": 10_000.0} scaled_model = self.model_tester_class.base_model_class(config) scaled_model.to(torch_device) scaled_model.eval() @@ -120,7 +120,7 @@ def test_model_rope_scaling_frequencies(self): position_ids_long = position_ids_long.unsqueeze(0) # Sanity check original RoPE - config.rope_scaling = {"rope_type": "default"} + config.rope_parameters = {"rope_type": "default", "rope_theta": 10_000.0} original_rope = rope_class(config=config).to(torch_device) original_cos_short, original_sin_short = original_rope(x, position_ids_short) original_cos_long, original_sin_long = original_rope(x, position_ids_long) @@ -129,7 +129,7 @@ def test_model_rope_scaling_frequencies(self): # Sanity check linear RoPE scaling # New position "x" should match original position with index "x/scaling_factor" - config.rope_scaling = {"rope_type": "linear", "factor": scaling_factor} + config.rope_parameters = {"rope_type": "linear", "factor": scaling_factor, "rope_theta": 10_000.0} linear_scaling_rope = rope_class(config=config).to(torch_device) linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) @@ -143,7 +143,7 @@ def test_model_rope_scaling_frequencies(self): # Sanity check Dynamic NTK RoPE scaling # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase # with scaling_factor (or that `inv_freq` decreases) - config.rope_scaling = {"rope_type": "dynamic", "factor": scaling_factor} + config.rope_parameters = {"rope_type": "dynamic", "factor": scaling_factor, "rope_theta": 10_000.0} ntk_scaling_rope = rope_class(config=config).to(torch_device) ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) @@ -157,7 +157,7 @@ def test_model_rope_scaling_frequencies(self): # Sanity check Yarn RoPE scaling # Scaling should be over the entire input - config.rope_scaling = {"rope_type": "yarn", "factor": scaling_factor} + config.rope_parameters = {"rope_type": "yarn", "factor": scaling_factor, "rope_theta": 10_000.0} yarn_scaling_rope = rope_class(config=config).to(torch_device) yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) diff --git a/tests/models/olmoe/test_modeling_olmoe.py b/tests/models/olmoe/test_modeling_olmoe.py index 15bbe645654f..99287e714af6 100644 --- a/tests/models/olmoe/test_modeling_olmoe.py +++ b/tests/models/olmoe/test_modeling_olmoe.py @@ -15,9 +15,7 @@ import unittest -from parameterized import parameterized - -from transformers import OlmoeConfig, is_torch_available, set_seed +from transformers import OlmoeConfig, is_torch_available from transformers.models.auto.tokenization_auto import AutoTokenizer from transformers.models.gpt_neox.tokenization_gpt_neox_fast import GPTNeoXTokenizerFast from transformers.testing_utils import ( @@ -202,37 +200,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("linear",), ("dynamic",)]) - def test_model_rope_scaling(self, scaling_type): - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - short_input = ids_tensor([1, 10], config.vocab_size) - long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - original_model = OlmoeModel(config) - original_model.to(torch_device) - original_model.eval() - original_short_output = original_model(short_input).last_hidden_state - original_long_output = original_model(long_input).last_hidden_state - - set_seed(42) # Fixed seed at init time so the two models get the same random weights - config.rope_scaling = {"type": scaling_type, "factor": 10.0} - scaled_model = OlmoeModel(config) - scaled_model.to(torch_device) - scaled_model.eval() - scaled_short_output = scaled_model(short_input).last_hidden_state - scaled_long_output = scaled_model(long_input).last_hidden_state - - # Dynamic scaling does not change the RoPE embeddings until it receives an input longer than the original - # maximum sequence length, so the outputs for the short input should match. 
- if scaling_type == "dynamic": - torch.testing.assert_close(original_short_output, scaled_short_output, rtol=1e-5, atol=1e-5) - else: - self.assertFalse(torch.allclose(original_short_output, scaled_short_output, atol=1e-5)) - - # The output should be different for long inputs - self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - @require_torch class OlmoeIntegrationTest(unittest.TestCase): diff --git a/tests/models/phi3/test_modeling_phi3.py b/tests/models/phi3/test_modeling_phi3.py index 097c217b4aa7..2d6270a06abc 100644 --- a/tests/models/phi3/test_modeling_phi3.py +++ b/tests/models/phi3/test_modeling_phi3.py @@ -352,8 +352,8 @@ def test_export_static_cache(self): # NOTE: To make the model exportable we need to set the rope scaling to default to avoid hitting # the data-dependent control flow in _longrope_frequency_update. Alternatively, we can rewrite # that function to avoid the data-dependent control flow. - if hasattr(config, "rope_scaling") and config.rope_scaling is not None: - config.rope_scaling["type"] = "default" + if hasattr(config, "rope_parameters") and config.rope_parameters is not None: + config.rope_parameters["type"] = "default" # Load model device = "cpu" # TODO (joao / export experts): should be on `torch_device`, but causes GPU OOM diff --git a/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py index e35ab1f16d92..2b36f597de3b 100644 --- a/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py +++ b/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py @@ -95,7 +95,7 @@ def __init__( "output_dim": 32, }, text_config={ - "rope_scaling": {"mrope_section": [1, 1, 2], "rope_type": "default", "type": "default"}, + "rope_parameters": {"mrope_section": [1, 1, 2], "rope_type": "default", "type": "default"}, "vocab_size": 99, "hidden_size": 32, "intermediate_size": 37, diff --git a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py index fb1802577d29..f53e7422f168 100644 --- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py +++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py @@ -86,6 +86,7 @@ def __init__( tie_word_embeddings=True, is_training=True, vision_config=None, + rope_parameters=None, vision_start_token_id=3, image_token_id=4, video_token_id=5, @@ -139,7 +140,7 @@ def __init__( "rope_theta": rope_theta, "tie_word_embeddings": tie_word_embeddings, "vocab_size": vocab_size, - "rope_scaling": {"type": "mrope", "mrope_section": [2, 1, 1]}, + "rope_parameters": {"type": "mrope", "mrope_section": [2, 1, 1]}, } def get_config(self): diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index d8a31a777ba0..bab674115b83 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -79,7 +79,7 @@ def __init__( "num_key_value_heads": 2, "rope_theta": 10000, "tie_word_embeddings": True, - "rope_scaling": {"type": "mrope", "mrope_section": [2, 1, 1]}, + "rope_parameters": {"type": "mrope", "mrope_section": [2, 1, 1]}, }, vision_start_token_id=3, image_token_id=4, diff --git a/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py b/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py index 711451992d46..ebdb16bc1714 100644 --- a/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py +++ b/tests/models/qwen3_omni_moe/test_modeling_qwen3_omni_moe.py @@ -109,7 +109,7 @@ def __init__( 
"n_window_infer": 100, } self.text_config = { - "rope_scaling": { + "rope_parameters": { "mrope_section": [1, 1, 2], "rope_type": "default", "type": "default", diff --git a/tests/models/qwen3_vl/test_modeling_qwen3_vl.py b/tests/models/qwen3_vl/test_modeling_qwen3_vl.py index bb13a3fde76f..29bc8ea40aba 100644 --- a/tests/models/qwen3_vl/test_modeling_qwen3_vl.py +++ b/tests/models/qwen3_vl/test_modeling_qwen3_vl.py @@ -65,7 +65,7 @@ def __init__( "num_key_value_heads": 2, "rope_theta": 10000, "tie_word_embeddings": True, - "rope_scaling": {"rope_type": "default", "mrope_section": [16, 8, 8], "mrope_interleaved": True}, + "rope_parameters": {"rope_type": "default", "mrope_section": [16, 8, 8], "mrope_interleaved": True}, }, vision_config={ "depth": 2, @@ -106,7 +106,7 @@ def __init__( self.num_attention_heads = text_config["num_attention_heads"] self.num_key_value_heads = text_config["num_key_value_heads"] self.rope_theta = text_config["rope_theta"] - self.rope_scaling = text_config["rope_scaling"] + self.rope_parameters = text_config["rope_parameters"] self.hidden_act = text_config["hidden_act"] self.max_position_embeddings = text_config["max_position_embeddings"] self.model_type = text_config["model_type"] diff --git a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py index 4e165eae2e41..ccfbca9948e9 100644 --- a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py +++ b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py @@ -72,7 +72,7 @@ def __init__( "num_experts": 8, "rope_theta": 10000, "tie_word_embeddings": True, - "rope_scaling": {"rope_type": "default", "mrope_section": [16, 8, 8], "mrope_interleaved": True}, + "rope_parameters": {"rope_type": "default", "mrope_section": [16, 8, 8], "mrope_interleaved": True}, }, vision_config={ "depth": 2, @@ -112,7 +112,7 @@ def __init__( self.num_attention_heads = text_config["num_attention_heads"] self.num_key_value_heads = text_config["num_key_value_heads"] self.rope_theta = text_config["rope_theta"] - self.rope_scaling = text_config["rope_scaling"] + self.rope_parameters = text_config["rope_parameters"] self.hidden_act = text_config["hidden_act"] self.max_position_embeddings = text_config["max_position_embeddings"] self.model_type = text_config["model_type"] diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 0e0c444f8eec..6f7b86c75305 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3737,6 +3737,7 @@ def test_sliding_window_mask(self): config.use_sliding_window = True config_dict = config.to_diff_dict() config_dict.pop("layer_types", None) + config_dict.pop("rope_parameters", None) new_config = config.__class__(**config_dict) # We need to set eager as otherwise `output_attentions` is not supported model = model_class._from_config(new_config, attn_implementation="eager").to(torch_device) @@ -3754,6 +3755,7 @@ def test_sliding_window_mask(self): config.use_sliding_window = False config_dict = config.to_diff_dict() config_dict.pop("layer_types", None) + config_dict.pop("rope_parameters", None) new_config = config.__class__(**config_dict) # We need to set eager as otherwise `output_attentions` is not supported model = model_class._from_config(new_config, attn_implementation="eager").to(torch_device) @@ -3936,12 +3938,12 @@ def update_config_headdim(config, requested_dim): # 3d rope also depends on the head dim # (we assume easy shapes here where we get to the requested head dim at least) if ( - getattr(config, 
"rope_scaling", None) is not None - and len(config.rope_scaling.get("mrope_section", [])) > 0 + getattr(config, "rope_parameters", None) is not None + and len(config.rope_parameters.get("mrope_section", [])) > 0 ): - scaling_factor = max(requested_dim // (sum(config.rope_scaling["mrope_section"]) * 2), 1) - config.rope_scaling["mrope_section"] = [ - section * scaling_factor for section in config.rope_scaling["mrope_section"] + scaling_factor = max(requested_dim // (sum(config.rope_parameters["mrope_section"]) * 2), 1) + config.rope_parameters["mrope_section"] = [ + section * scaling_factor for section in config.rope_parameters["mrope_section"] ] # Update config values diff --git a/tests/utils/test_add_new_model_like.py b/tests/utils/test_add_new_model_like.py index c1eeed5b824c..5fcdf8e92fe9 100644 --- a/tests/utils/test_add_new_model_like.py +++ b/tests/utils/test_add_new_model_like.py @@ -621,11 +621,11 @@ class MyTest2FeatureEmbedding(Phi4MultimodalFeatureEmbedding): pass - class MyTest2RotaryEmbedding(Phi4MultimodalRotaryEmbedding): + class MyTest2PreTrainedModel(Phi4MultimodalPreTrainedModel): pass - class MyTest2PreTrainedModel(Phi4MultimodalPreTrainedModel): + class MyTest2RotaryEmbedding(Phi4MultimodalRotaryEmbedding): pass diff --git a/tests/utils/test_modeling_rope_utils.py b/tests/utils/test_modeling_rope_utils.py index 7e000e0ff1a1..3cf063ac116e 100644 --- a/tests/utils/test_modeling_rope_utils.py +++ b/tests/utils/test_modeling_rope_utils.py @@ -25,6 +25,7 @@ from transformers import ROPE_INIT_FUNCTIONS from transformers.modeling_rope_utils import rope_config_validation + from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding @require_torch @@ -38,10 +39,9 @@ def test_rope_validation(self): # If we explicitly set the other RoPE types, then validation should fail for rope_type in all_rope_types: - if rope_type != "default": - config.rope_scaling = {"rope_type": rope_type} - with self.assertRaises(KeyError): - rope_config_validation(config) + config.rope_parameters = {"rope_type": rope_type, "rope_theta": 10000.0} + with self.assertRaises(KeyError): + rope_config_validation(config) # Parameters are exclusive to their own RoPE type, and should raise an exception if incorrectly passed valid_param_mapping = { @@ -53,11 +53,9 @@ def test_rope_validation(self): "long_factor": ["longrope"], } for rope_type in all_rope_types: - if rope_type == "default": - continue # checked above for param, valid_rope_types in valid_param_mapping.items(): # Set `param` with a dummy value -- we want to test the dict key - config.rope_scaling = {"rope_type": rope_type, param: True} + config.rope_parameters = {"rope_type": rope_type, "rope_theta": 10000.0, param: True} if rope_type in valid_rope_types: continue else: @@ -68,14 +66,25 @@ def test_rope_validation(self): # But sometimes we can have model-specific RoPE kwargs and bypass warning with `ignore_keys` model_specific_kwarg = "mrope_sections" # e,g in Qwen2-VL - for rope_type in all_rope_types: - if rope_type == "default": - config.rope_scaling = {"rope_type": rope_type, model_specific_kwarg: True} - rope_config_validation(config, ignore_keys={model_specific_kwarg}) - with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: - rope_config_validation(config) - self.assertEqual(len(logs.output), 1) - self.assertIn(model_specific_kwarg, logs.output[0]) + config.rope_parameters = {"rope_type": "default", "rope_theta": 10000.0, model_specific_kwarg: True} + rope_config_validation(config, 
ignore_keys={model_specific_kwarg}) + with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: + rope_config_validation(config) + self.assertEqual(len(logs.output), 1) + self.assertIn(model_specific_kwarg, logs.output[0]) + + # We can indicate different RoPE params for each attention type + # We can also have a single set of RoPE params defined for all layers; no error is raised + # because separate RoPE params per layer type are not required + config.layer_types = ["global_attn", "local_attn"] + config.rope_parameters = { + "global_attn": {"rope_type": "default", "rope_theta": 10000}, + "local_attn": {"rope_type": "linear", "rope_theta": 10000, "factor": 2.0}, + } + rope_config_validation(config) + + config.rope_parameters = config.rope_parameters["local_attn"] + rope_config_validation(config) def test_yarn_original_original_max_position_embeddings_validation(self): """Tests that models with no/bad `original_max_position_embeddings` raise a warning""" @@ -84,10 +93,11 @@ def test_yarn_original_original_max_position_embeddings_validation(self): # good rope config: has a factor AND original_max_position_embeddings -> no warnings rope_config = { "rope_type": "yarn", + "rope_theta": 10000.0, "factor": 2.0, "original_max_position_embeddings": int(config.max_position_embeddings / 2.0), } - config.rope_scaling = rope_config + config.rope_parameters = rope_config with self.assertRaises(AssertionError): # confirm that no warnings are thrown with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: rope_config_validation(config) @@ -95,9 +105,10 @@ def test_yarn_original_original_max_position_embeddings_validation(self): # bad rope config, no `original_max_position_embeddings` -> warning rope_config = { "rope_type": "yarn", + "rope_theta": 10000.0, "factor": 2.0, } - config.rope_scaling = rope_config + config.rope_parameters = rope_config with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: rope_config_validation(config) self.assertEqual(len(logs.output), 1) @@ -106,10 +117,11 @@ def test_yarn_original_original_max_position_embeddings_validation(self): # bad rope config, bad implicit fator -> warning rope_config = { "rope_type": "yarn", + "rope_theta": 10000.0, "factor": 2.0, "original_max_position_embeddings": 1, } - config.rope_scaling = rope_config + config.rope_parameters = rope_config with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: rope_config_validation(config) self.assertEqual(len(logs.output), 1) @@ -138,13 +150,12 @@ def test_default_rope_numerically(self): # input sanity checks: if these change, the output will also change config = LlamaConfig() - self.assertEqual(config.rope_scaling, None) + self.assertEqual(config.rope_parameters, {"rope_type": "default", "rope_theta": 10000.0}) self.assertEqual(config.hidden_size, 4096) self.assertEqual(config.num_attention_heads, 32) - self.assertEqual(config.rope_theta, 10000.0) self.assertFalse(hasattr(config, "partial_rotary_factor")) - rope_fn = ROPE_INIT_FUNCTIONS["default"] + rope_fn = LlamaRotaryEmbedding.compute_default_rope_parameters inv_freq, attention_scale = rope_fn(config=config, device=torch_device) self.assertEqual(attention_scale, 1.0) # attention scale is always 1 for default RoPE @@ -154,12 +165,12 @@ def test_linear_rope_numerically(self): # This is a linear scaling strategy, the **frequencies** are scaled linearly with respect to the default # frequencies (= the inverse frequencies are scaled **inversely**) config =
LlamaConfig() - default_rope_fn = ROPE_INIT_FUNCTIONS["default"] + default_rope_fn = LlamaRotaryEmbedding.compute_default_rope_parameters default_inv_freq, _ = default_rope_fn(config=config, device=torch_device) rope_fn = ROPE_INIT_FUNCTIONS["linear"] for factor in (2.0, 10.0, 20.0): - config.rope_scaling = {"rope_type": "linear", "factor": factor} + config.rope_parameters = {"rope_type": "linear", "rope_theta": 10000.0, "factor": factor} inv_freq, attention_scale = rope_fn(config=config, device=torch_device) self.assertEqual(attention_scale, 1.0) # attention scale is always 1 for linear RoPE torch.testing.assert_close(inv_freq, default_inv_freq / factor) @@ -185,20 +196,19 @@ def test_dynamic_rope_numerically(self): # input sanity checks: if these change, the output will also change config = LlamaConfig() - self.assertEqual(config.rope_scaling, None) + self.assertEqual(config.rope_parameters, {"rope_type": "default", "rope_theta": 10000.0}) self.assertEqual(config.hidden_size, 4096) self.assertEqual(config.num_attention_heads, 32) - self.assertEqual(config.rope_theta, 10000.0) self.assertFalse(hasattr(config, "partial_rotary_factor")) - rope_fn = ROPE_INIT_FUNCTIONS["default"] + rope_fn = LlamaRotaryEmbedding.compute_default_rope_parameters default_inv_freq, _ = rope_fn(config=config, device=torch_device) # Check 1: this is a dynamic scaling strategy, it will not scale unless we provide `seq_len` larger than the # model's original training sequence length rope_fn = ROPE_INIT_FUNCTIONS["dynamic"] for factor in (2.0, 10.0, 20.0): - config.rope_scaling = {"rope_type": "dynamic", "factor": factor} + config.rope_parameters = {"rope_type": "dynamic", "rope_theta": 10000.0, "factor": factor} inv_freq, attention_scale = rope_fn(config=config, device=torch_device) self.assertEqual(attention_scale, 1.0) # attention scale is always 1 for dynamic RoPE torch.testing.assert_close(inv_freq, default_inv_freq) @@ -212,7 +222,7 @@ def test_dynamic_rope_numerically(self): # Check 2: if we provide `seq_len` larger than the model's original training sequence length, the frequencies # will scale up (i.e., the inverse frequencies will scale down). 
factor = 10.0 - config.rope_scaling = {"rope_type": "dynamic", "factor": factor} + config.rope_parameters = {"rope_type": "dynamic", "rope_theta": 10000.0, "factor": factor} inv_freq, _ = rope_fn(config=config, device=torch_device, seq_len=16384) with self.assertRaises(AssertionError): # It is NOT a linear factor torch.testing.assert_close(inv_freq, default_inv_freq / factor) @@ -239,24 +249,28 @@ def test_yarn_rope_numerically(self): # input sanity checks: if these change, the output will also change config = LlamaConfig() - self.assertEqual(config.rope_scaling, None) + self.assertEqual(config.rope_parameters, {"rope_type": "default", "rope_theta": 10000.0}) self.assertEqual(config.hidden_size, 4096) self.assertEqual(config.num_attention_heads, 32) - self.assertEqual(config.rope_theta, 10000.0) self.assertFalse(hasattr(config, "partial_rotary_factor")) - rope_fn = ROPE_INIT_FUNCTIONS["default"] + rope_fn = LlamaRotaryEmbedding.compute_default_rope_parameters default_inv_freq, _ = rope_fn(config=config, device=torch_device) # Check 1: according to the paper, if `attention_factor` is not specified, then it has a specific default -- # `0.1 * math.log(factor) + 1.0` rope_fn = ROPE_INIT_FUNCTIONS["yarn"] for factor in (2.0, 10.0, 20.0): - config.rope_scaling = {"rope_type": "yarn", "factor": factor} + config.rope_parameters = {"rope_type": "yarn", "rope_theta": 10000.0, "factor": factor} _, attention_scale = rope_fn(config=config, device=torch_device) self.assertEqual(attention_scale, 0.1 * math.log(factor) + 1.0) - config.rope_scaling = {"rope_type": "yarn", "factor": factor, "attention_factor": 0.5} + config.rope_parameters = { + "rope_type": "yarn", + "rope_theta": 10000.0, + "factor": factor, + "attention_factor": 0.5, + } _, attention_scale = rope_fn(config=config, device=torch_device, seq_len=1) self.assertEqual(attention_scale, 0.5) @@ -266,7 +280,13 @@ def test_yarn_rope_numerically(self): # (note: adds a margin to the test for numerical stability) factor = 10.0 margin = 1e-8 - config.rope_scaling = {"rope_type": "yarn", "factor": factor, "beta_fast": 32, "beta_slow": 1} + config.rope_parameters = { + "rope_type": "yarn", + "rope_theta": 10000.0, + "factor": factor, + "beta_fast": 32, + "beta_slow": 1, + } inv_freq, _ = rope_fn(config=config, device=torch_device) is_bounded_by_factor = [ ((default_inv_freq[idx] / factor) - margin) <= yarn_inv_freq_value <= (default_inv_freq[idx] + margin) @@ -276,7 +296,13 @@ def test_yarn_rope_numerically(self): # super high beta_fast = interpolation (i.e. scaling) in all but the first inverse frequency. 
The last ~20 # values (empirically checked for `beta_fast` = 1000) should be very small to linear scaling - config.rope_scaling = {"rope_type": "yarn", "factor": factor, "beta_fast": 1000, "beta_slow": 1} + config.rope_parameters = { + "rope_type": "yarn", + "rope_theta": 10000.0, + "factor": factor, + "beta_fast": 1000, + "beta_slow": 1, + } inv_freq, _ = rope_fn(config=config, device=torch_device) is_interpolating = [ yarn_inv_freq_value < (default_inv_freq[idx] + margin) for idx, yarn_inv_freq_value in enumerate(inv_freq) @@ -286,17 +312,22 @@ def test_yarn_rope_numerically(self): torch.testing.assert_close(inv_freq[-20:], default_inv_freq[-20:] / factor) # Check 3: numerical snapshot to avoid regressions - config.rope_scaling = {"rope_type": "yarn", "factor": factor, "beta_fast": 32, "beta_slow": 1} + config.rope_parameters = { + "rope_type": "yarn", + "rope_theta": 10000.0, + "factor": factor, + "beta_fast": 32, + "beta_slow": 1, + } inv_freq, _ = rope_fn(config=config, device=torch_device) torch.testing.assert_close(inv_freq, EXPECTED_INV_FREQ) def test_longrope_rope_numerically(self): # input sanity checks: if these change, the output will also change config = LlamaConfig() - self.assertEqual(config.rope_scaling, None) + self.assertEqual(config.rope_parameters, {"rope_type": "default", "rope_theta": 10000.0}) self.assertEqual(config.hidden_size, 4096) self.assertEqual(config.num_attention_heads, 32) - self.assertEqual(config.rope_theta, 10000.0) self.assertFalse(hasattr(config, "partial_rotary_factor")) # longrope applies scaling on EACH inv frequency, `short_factor` or `long_factor`, depending on the seq_len @@ -304,7 +335,7 @@ def test_longrope_rope_numerically(self): short_factor = [2.0] * (dim // 2) # scaling applied when seq_len <= max_position_embeddings long_factor = torch.ones(dim // 2).cumsum(0).tolist() # scaling applied when seq_len > max_position_embeddings - rope_fn = ROPE_INIT_FUNCTIONS["default"] + rope_fn = LlamaRotaryEmbedding.compute_default_rope_parameters default_inv_freq, _ = rope_fn(config=config, device=torch_device) # Check 1: according to the paper, if `attention_factor` is not specified, then it has a specific default -- @@ -312,8 +343,9 @@ def test_longrope_rope_numerically(self): rope_fn = ROPE_INIT_FUNCTIONS["longrope"] max_position_embeddings = config.max_position_embeddings for factor in (2.0, 10.0, 20.0): - config.rope_scaling = { + config.rope_parameters = { "rope_type": "longrope", + "rope_theta": 10000.0, "factor": factor, "short_factor": short_factor, "long_factor": long_factor, @@ -321,8 +353,9 @@ def test_longrope_rope_numerically(self): _, attention_scale = rope_fn(config=config, device=torch_device) self.assertEqual(attention_scale, math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings))) - config.rope_scaling = { + config.rope_parameters = { "rope_type": "longrope", + "rope_theta": 10000.0, "factor": factor, "short_factor": short_factor, "long_factor": long_factor, @@ -331,19 +364,21 @@ def test_longrope_rope_numerically(self): _, attention_scale = rope_fn(config=config, device=torch_device, seq_len=1) self.assertEqual(attention_scale, 0.5) - config.rope_scaling = { + config.rope_parameters = { "rope_type": "longrope", + "rope_theta": 10000.0, "factor": factor, "short_factor": short_factor, "long_factor": long_factor, } - self.assertEqual(config.rope_scaling.get("attention_factor"), None) + self.assertEqual(config.rope_parameters.get("attention_factor"), None) # Verify that "TypeError: '<' not supported between instances of 
'NoneType' and 'int'" is not raised. rope_config_validation(config) # Check 2: seq_len == 0 -> short factor is applied to the default frequencies - config.rope_scaling = { + config.rope_parameters = { "rope_type": "longrope", + "rope_theta": 10000.0, "factor": 1.0, "short_factor": short_factor, "long_factor": long_factor, @@ -376,20 +411,20 @@ def test_llama3_rope_numerically(self): # input sanity checks: if these change, the output will also change config = LlamaConfig() - self.assertEqual(config.rope_scaling, None) + self.assertEqual(config.rope_parameters, {"rope_type": "default", "rope_theta": 10000.0}) self.assertEqual(config.hidden_size, 4096) self.assertEqual(config.num_attention_heads, 32) - self.assertEqual(config.rope_theta, 10000.0) self.assertFalse(hasattr(config, "partial_rotary_factor")) - rope_fn = ROPE_INIT_FUNCTIONS["default"] + rope_fn = LlamaRotaryEmbedding.compute_default_rope_parameters default_inv_freq, _ = rope_fn(config=config, device=torch_device) # Check 1: `attention_factor` is always 1 rope_fn = ROPE_INIT_FUNCTIONS["llama3"] for factor in (2.0, 10.0, 20.0): - config.rope_scaling = { + config.rope_parameters = { "rope_type": "llama3", + "rope_theta": 10000.0, "factor": factor, "original_max_position_embeddings": 2048, "low_freq_factor": 1, @@ -403,8 +438,9 @@ def test_llama3_rope_numerically(self): # frequencies are scaled by a value in between. Changing `low_freq_factor` and `high_freq_factor` changes what # is considered low, medium, and high frequencies. factor = 10.0 - config.rope_scaling = { + config.rope_parameters = { "rope_type": "llama3", + "rope_theta": 10000.0, "factor": factor, "original_max_position_embeddings": 2048, "low_freq_factor": 1, @@ -419,8 +455,9 @@ def test_llama3_rope_numerically(self): # if we change `high_freq_factor` to a very high value, none is considered high-frequency -> ALL values will be # scaled - config.rope_scaling = config.rope_scaling = { + config.rope_parameters = config.rope_parameters = { "rope_type": "llama3", + "rope_theta": 10000.0, "factor": factor, "original_max_position_embeddings": 2048, "low_freq_factor": 1, @@ -431,8 +468,9 @@ def test_llama3_rope_numerically(self): self.assertTrue(all(is_scaled)) # Check 3: numerical snapshot to avoid regressions - config.rope_scaling = { + config.rope_parameters = { "rope_type": "llama3", + "rope_theta": 10000.0, "factor": factor, "original_max_position_embeddings": 2048, "low_freq_factor": 1, diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 613368e1477e..6d777fd77082 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -711,7 +711,11 @@ def match_docstring_with_signature(obj: Any) -> Optional[tuple[str, str]]: elif re.search(r"^\s*#\s*ignore-order\s*$", line_before_docstring): ignore_order = True - # Read the signature + # Read the signature. Skip on `TypedDict` objects for now. Inspect cannot + # parse their signature ("no signature found for builtin type ") + if issubclass(obj, dict) and hasattr(obj, "__annotations__"): + return + signature = inspect.signature(obj).parameters obj_doc_lines = obj.__doc__.split("\n")
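The linear-scaling assertions added above (in the ModernBertDecoder test and kept in the Olmo3 test) rest on a simple identity: dividing the inverse frequencies by `factor` makes position `p` of the scaled cos/sin table equal position `p / factor` of the default table. A minimal standalone sketch of that identity, written against plain PyTorch rather than the library's RoPE layers (the `rope_tables` helper below is hypothetical, for illustration only):

```python
import torch

def rope_tables(position_ids, dim=64, theta=10_000.0, factor=1.0):
    # Default RoPE inverse frequencies; the "linear" type divides them by `factor`.
    inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float64) / dim)) / factor
    angles = torch.outer(position_ids.double(), inv_freq)
    emb = torch.cat((angles, angles), dim=-1)
    return emb.cos(), emb.sin()

factor = 10
positions = torch.arange(160)
default_cos, default_sin = rope_tables(positions)
linear_cos, linear_sin = rope_tables(positions, factor=factor)

# Position `p` of the scaled table matches position `p / factor` of the default table,
# which is exactly what the new test loops over.
for new_position in range(0, len(positions), factor):
    original_position = new_position // factor
    torch.testing.assert_close(linear_cos[new_position], default_cos[original_position])
    torch.testing.assert_close(linear_sin[new_position], default_sin[original_position])
```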
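The dynamic checks, in turn, assert that scaling leaves the inverse frequencies untouched for short inputs and only lowers them once the sequence outgrows the original maximum. A sketch of that behaviour, assuming the standard NTK-aware rescaling of the base frequency (presented as an illustration of the property the tests assert, not as the library's exact code path):

```python
import torch

def dynamic_ntk_inv_freq(seq_len, dim=64, theta=10_000.0, factor=10.0, original_max=2048):
    # NTK-aware scaling: rescale the base only once the input outgrows the original
    # training length; otherwise keep the default inverse frequencies.
    if seq_len > original_max:
        theta = theta * ((factor * seq_len / original_max) - (factor - 1)) ** (dim / (dim - 2))
    return 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float64) / dim))

default_inv_freq = dynamic_ntk_inv_freq(seq_len=10)      # short input: unchanged
long_inv_freq = dynamic_ntk_inv_freq(seq_len=3 * 2048)   # long input: rescaled

torch.testing.assert_close(default_inv_freq, dynamic_ntk_inv_freq(seq_len=2048))
assert (long_inv_freq <= default_inv_freq).all()          # `inv_freq` only goes down
assert not torch.equal(long_inv_freq, default_inv_freq)
```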
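Finally, the per-layer-type validation exercised by the new assertions in `test_rope_validation` can be reproduced as a standalone snippet. It assumes an installation that already includes this branch, i.e. a nested `rope_parameters` dict keyed by the entries of `config.layer_types`:

```python
from transformers import LlamaConfig
from transformers.modeling_rope_utils import rope_config_validation

config = LlamaConfig()
config.layer_types = ["global_attn", "local_attn"]

# One RoPE parameter set per layer type: each nested dict is validated on its own.
config.rope_parameters = {
    "global_attn": {"rope_type": "default", "rope_theta": 10_000.0},
    "local_attn": {"rope_type": "linear", "rope_theta": 10_000.0, "factor": 2.0},
}
rope_config_validation(config)

# A single flat dict shared by all layer types is accepted as well.
config.rope_parameters = {"rope_type": "linear", "rope_theta": 10_000.0, "factor": 2.0}
rope_config_validation(config)
```

Neither call is expected to raise: per-layer parameters are optional, which is what the comment added to the test spells out.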