Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/transformers/cache_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1271,3 +1271,7 @@ def is_sliding(self):
# Read-only flag: reports whatever `self.self_attention_cache.is_compileable` reports.
@property
def is_compileable(self) -> bool:
return self.self_attention_cache.is_compileable


# Deprecated alias: SlidingWindowCache was removed in transformers v5. StaticCache is the replacement.
# The old name is bound to the very same class object (no subclass, no wrapper), so
# `SlidingWindowCache is StaticCache` holds and no deprecation warning is emitted on use.
SlidingWindowCache = StaticCache
53 changes: 50 additions & 3 deletions src/transformers/modeling_rope_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -555,10 +555,47 @@ def _compute_llama3_parameters(
return inv_freq_llama, attention_factor


def _compute_default_rope_parameters(
config: Optional["PreTrainedConfig"] = None,
device: Optional["torch.device"] = None,
seq_len: int | None = None,
layer_type: str | None = None,
) -> tuple["torch.Tensor", float]:
"""
Computes the inverse frequencies for the default RoPE implementation (no scaling).

Args:
config ([`~transformers.PreTrainedConfig`]):
The model configuration.
device (`torch.device`):
The device to use for initialization of the inverse frequencies.
seq_len (`int`, *optional*):
The current sequence length. Unused for this type of RoPE.
layer_type (`str`, *optional*):
The layer type for per-layer rope configs.

Returns:
Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
"""
config.standardize_rope_params()
rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters

base = rope_parameters_dict.get("rope_theta", getattr(config, "rope_theta", config.default_theta))
partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0)
head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
dim = int(head_dim * partial_rotary_factor)
attention_factor = 1.0 # Unused in this type of RoPE

inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim))
return inv_freq, attention_factor


# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE
# parameterizations, as long as the callable has the same signature.
ROPE_INIT_FUNCTIONS = {
"default": _compute_default_rope_parameters,

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remote code BC?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @zucchini-nlp

It is true that remote code won't have it, but we would likely also need to refactor a lot of models, which seems risky, especially for models that have a different default init; we would need to check whether such code exists first and then use it as a fallback.

"linear": _compute_linear_scaling_rope_parameters,
"dynamic": _compute_dynamic_ntk_parameters,
"yarn": _compute_yarn_parameters,
Expand Down Expand Up @@ -699,10 +736,17 @@ def standardize_rope_params(self):

self.rope_parameters = rope_parameters

def validate_rope(self: "PreTrainedConfig"):
def validate_rope(self: "PreTrainedConfig", ignore_keys: set | None = None):
"""
Validate the RoPE config arguments, given a `"PreTrainedConfig"` object

Args:
ignore_keys (`set`, *optional*):
Keys to ignore during validation. If provided, sets `ignore_keys_at_rope_validation` on the config.
Deprecated: set `config.ignore_keys_at_rope_validation` directly instead.
"""
if ignore_keys is not None:
self.ignore_keys_at_rope_validation = self.ignore_keys_at_rope_validation | ignore_keys
# Don't validate if no rope_parameters found (`None`) or if it's an empty dict
# Note that validation runs every time a new config is created, even if config is non-RoPE
rope_parameters_dict = getattr(self, "rope_parameters", None)
Expand All @@ -729,10 +773,13 @@ def validate_rope(self: "PreTrainedConfig"):
)

def _validate_default_rope_parameters(self, rope_parameters: dict, ignore_keys: set | None = None):
required_keys = {"rope_type", "rope_theta"}
required_keys = {"rope_type"}
optional_keys = {"rope_theta"}

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

likely needs to be done elsewhere then too? I think all need rope theta?

Comment on lines +805 to +806

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep, this kinda defeats the point of validation because a RoPE dict with no theta isn't valid for our modules

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, but we always default to default_theta if it's not there, no?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Validation always happens after the defaults are set, so ideally it shouldn't raise an error. Do we know why the theta was missing?

received_keys = set(rope_parameters.keys())
rope_type = rope_parameters["rope_type"]
self._check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys)
self._check_received_keys(
rope_type, received_keys, required_keys, optional_keys=optional_keys, ignore_keys=ignore_keys
)

def _validate_linear_rope_parameters(self, rope_parameters: dict, ignore_keys: set | None = None):
required_keys = {"rope_type", "factor", "rope_theta"}
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/modeling_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4610,7 +4610,7 @@ def mark_tied_weights_as_initialized(self, loading_info):
later as they will be tied (overwritten) anyway.
This is very important as most embeddings are tied, and they are huge params (vocabularies are often 256k), so
running inits on them is very costly."""
for tied_param in self.all_tied_weights_keys.keys():
for tied_param in getattr(self, "all_tied_weights_keys", {}).keys():

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fix remote code

param = self.get_parameter(tied_param)
param._is_hf_initialized = True

Expand Down
6 changes: 4 additions & 2 deletions src/transformers/tokenization_utils_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1277,7 +1277,9 @@ def __getattr__(self, key):

# Named special tokens (bos_token, eos_token, etc.)
if key_without_id in self.SPECIAL_TOKENS_ATTRIBUTES:
token_value = self._special_tokens_map.get(key_without_id)
# Use __dict__.get to avoid recursive __getattr__ when _special_tokens_map
# is not yet initialized (e.g. during fast tokenizer __init__)
token_value = self.__dict__.get("_special_tokens_map", {}).get(key_without_id)

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same easy fix for remote code

if token_value is None:
if self.verbose:
logger.error(f"Using {key}, but it is not set yet.")
Expand All @@ -1286,7 +1288,7 @@ def __getattr__(self, key):

# Extra special tokens
if key_without_id == "extra_special_tokens":
tokens = [str(tok) for tok in self._extra_special_tokens]
tokens = [str(tok) for tok in self.__dict__.get("_extra_special_tokens", [])]
return self.convert_tokens_to_ids(tokens) if key != key_without_id else tokens

if key not in self.__dict__:
Expand Down
5 changes: 5 additions & 0 deletions src/transformers/utils/auto_docstring.py
Original file line number Diff line number Diff line change
Expand Up @@ -3579,6 +3579,11 @@ def _process_kwargs_parameters(sig, func, parent_class, documented_kwargs, inden
if kwarg_param.annotation == inspect.Parameter.empty:
continue

if not hasattr(kwarg_param.annotation, "__args__") or not hasattr(
kwarg_param.annotation.__args__[0], "__name__"
):
continue

if kwarg_param.annotation.__args__[0].__name__ not in BASIC_KWARGS_TYPES:
# Extract documentation for kwargs
kwargs_documentation = kwarg_param.annotation.__args__[0].__doc__
Expand Down
Loading