huggingface · ArthurZucker · Apr 8, 2026 · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026
diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
@@ -1271,3 +1271,7 @@ def is_sliding(self):
     @property
     def is_compileable(self) -> bool:
         return self.self_attention_cache.is_compileable
+
+
+# Deprecated alias: SlidingWindowCache was removed in transformers v5. StaticCache is the replacement.
+SlidingWindowCache = StaticCache  # plain rebinding: both names refer to the same class object
diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py
@@ -555,10 +555,47 @@ def _compute_llama3_parameters(
     return inv_freq_llama, attention_factor
 
 
+def _compute_default_rope_parameters(
+    config: Optional["PreTrainedConfig"] = None,
+    device: Optional["torch.device"] = None,
+    seq_len: int | None = None,
+    layer_type: str | None = None,
+) -> tuple["torch.Tensor", float]:
+    """
+    Computes the inverse frequencies for the default RoPE implementation (no scaling).
+
+    Args:
+        config ([`~transformers.PreTrainedConfig`]):
+            The model configuration.
+        device (`torch.device`):
+            The device to use for initialization of the inverse frequencies.
+        seq_len (`int`, *optional*):
+            The current sequence length. Unused for this type of RoPE.
+        layer_type (`str`, *optional*):
+            The layer type for per-layer rope configs.
+
+    Returns:
+        Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
+        post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
+    """
+    config.standardize_rope_params()  # dereferenced unconditionally: `config` must not be None despite its default
+    rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters
+
+    base = rope_parameters_dict.get("rope_theta", getattr(config, "rope_theta", config.default_theta))
+    partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0)
+    head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+    dim = int(head_dim * partial_rotary_factor)  # int() truncates any fractional part of the product
+    attention_factor = 1.0  # Unused in this type of RoPE
+
+    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim))
+    return inv_freq, attention_factor
+
+
 # This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
 # from the model config. You can append new {'rope_type': callable} pairs to this rope_parameters to enable custom RoPE
 # parameterizations, as long as the callable has the same signature.
 ROPE_INIT_FUNCTIONS = {
+    "default": _compute_default_rope_parameters,
     "linear": _compute_linear_scaling_rope_parameters,
     "dynamic": _compute_dynamic_ntk_parameters,
     "yarn": _compute_yarn_parameters,
@@ -699,10 +736,17 @@ def standardize_rope_params(self):
 
         self.rope_parameters = rope_parameters
 
-    def validate_rope(self: "PreTrainedConfig"):
+    def validate_rope(self: "PreTrainedConfig", ignore_keys: set | None = None):
         """
         Validate the RoPE config arguments, given a `"PreTrainedConfig"` object
+
+        Args:
+            ignore_keys (`set`, *optional*):
+                Keys to ignore during validation. If provided, sets `ignore_keys_at_rope_validation` on the config.
+                Deprecated: set `config.ignore_keys_at_rope_validation` directly instead.
         """
+        if ignore_keys is not None:
+            self.ignore_keys_at_rope_validation = self.ignore_keys_at_rope_validation | ignore_keys
         # Don't validate if no rope_parameters found (`None`) or if it's an empty dict
         # Note that validation runs every time a new config is created, even if config is non-RoPE
         rope_parameters_dict = getattr(self, "rope_parameters", None)
@@ -729,10 +773,13 @@ def validate_rope(self: "PreTrainedConfig"):
                 )
 
     def _validate_default_rope_parameters(self, rope_parameters: dict, ignore_keys: set | None = None):
-        required_keys = {"rope_type", "rope_theta"}
+        required_keys = {"rope_type"}
+        optional_keys = {"rope_theta"}  # moved out of required_keys; passed to the key check as optional
         received_keys = set(rope_parameters.keys())
         rope_type = rope_parameters["rope_type"]
-        self._check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)
+        self._check_received_keys(
+            rope_type, received_keys, required_keys, optional_keys=optional_keys, ignore_keys=ignore_keys
+        )
 
     def _validate_linear_rope_parameters(self, rope_parameters: dict, ignore_keys: set | None = None):
         required_keys = {"rope_type", "factor", "rope_theta"}

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
@@ -4610,7 +4610,7 @@ def mark_tied_weights_as_initialized(self, loading_info):
         later as they will be tied (overwritten) anyway.
         This is very important as most embeddings are tied, and they are huge params (vocabularies are often 256k), so
         running inits on them is very costly."""
-        for tied_param in self.all_tied_weights_keys.keys():
+        for tied_param in getattr(self, "all_tied_weights_keys", {}).keys():
             param = self.get_parameter(tied_param)
             param._is_hf_initialized = True
 

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
@@ -1277,7 +1277,9 @@ def __getattr__(self, key):
 
         # Named special tokens (bos_token, eos_token, etc.)
         if key_without_id in self.SPECIAL_TOKENS_ATTRIBUTES:
-            token_value = self._special_tokens_map.get(key_without_id)
+            # Use __dict__.get to avoid recursive __getattr__ when _special_tokens_map
+            # is not yet initialized (e.g. during fast tokenizer __init__)
+            token_value = self.__dict__.get("_special_tokens_map", {}).get(key_without_id)
             if token_value is None:
                 if self.verbose:
                     logger.error(f"Using {key}, but it is not set yet.")
@@ -1286,7 +1288,7 @@ def __getattr__(self, key):
 
         # Extra special tokens
         if key_without_id == "extra_special_tokens":
-            tokens = [str(tok) for tok in self._extra_special_tokens]
+            tokens = [str(tok) for tok in self.__dict__.get("_extra_special_tokens", [])]
             return self.convert_tokens_to_ids(tokens) if key != key_without_id else tokens
 
         if key not in self.__dict__:

diff --git a/src/transformers/utils/auto_docstring.py b/src/transformers/utils/auto_docstring.py
@@ -3579,6 +3579,11 @@ def _process_kwargs_parameters(sig, func, parent_class, documented_kwargs, inden
         if kwarg_param.annotation == inspect.Parameter.empty:
             continue
 
+        if not hasattr(kwarg_param.annotation, "__args__") or not hasattr(
+            kwarg_param.annotation.__args__[0], "__name__"
+        ):
+            continue
+
         if kwarg_param.annotation.__args__[0].__name__ not in BASIC_KWARGS_TYPES:
             # Extract documentation for kwargs
             kwargs_documentation = kwarg_param.annotation.__args__[0].__doc__