Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/transformers/modeling_rope_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -876,7 +876,7 @@ def _validate_yarn_rope_parameters(self, rope_parameters: dict, ignore_keys: set
# Double-check: `factor` should be the ratio between the pre-yarn and post-yarn context lengths.
# NOTE: we might get `implicit_factor == 1` if config's `original_max_position_embeddings` was
# inferred from `max_position_embeddings` during standardization
original_max_position_embeddings = self.rope_parameters["original_max_position_embeddings"]
original_max_position_embeddings = rope_parameters["original_max_position_embeddings"]
implicit_factor = self.max_position_embeddings / original_max_position_embeddings
if implicit_factor != factor and implicit_factor != 1:
logger.warning_once(
Expand Down
56 changes: 56 additions & 0 deletions tests/utils/test_modeling_rope_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,62 @@ def test_yarn_original_original_max_position_embeddings_validation(self):
self.assertEqual(len(logs.output), 1)
self.assertIn("implicit factor", logs.output[0])

def test_rope_validation_with_per_attention_type_nested_rope(self):
    """Run the `test_rope_validation` checks against nested `rope_parameters`.

    `config.layer_types` is set and `rope_parameters` is given as a dict keyed by
    attention type, so each check exercises the per-attention-type shape.
    """
    config = LlamaConfig()
    config.layer_types = ["full_attention", "sliding_attention"]

    # The "default" and "proportional" types are excluded from the negative checks below.
    scaled_rope_types = [
        name for name in ROPE_INIT_FUNCTIONS.keys() if name not in ("default", "proportional")
    ]

    def as_nested(full_attention_entry):
        # A fresh sliding-attention entry is built on every call so entries are never shared.
        sliding_attention_entry = {"rope_type": "default", "rope_theta": 10000.0}
        return {
            "full_attention": full_attention_entry,
            "sliding_attention": sliding_attention_entry,
        }

    # 1) Only `rope_theta` supplied: every remaining RoPE type is expected to raise
    #    KeyError, even when wrapped in the nested shape.
    for rope_type in scaled_rope_types:
        config.rope_parameters = as_nested({"rope_type": rope_type, "rope_theta": 10000.0})
        with self.assertRaises(KeyError):
            config.validate_rope()

    # 2) A parameter that belongs to other RoPE types is expected to raise KeyError
    #    when attached to a type that is not listed for it.
    exclusive_params = {
        "factor": ["linear", "dynamic", "yarn", "longrope"],
        "attention_factor": ["yarn", "longrope"],
        "beta_fast": ["yarn"],
        "beta_slow": ["yarn"],
        "short_factor": ["longrope"],
        "long_factor": ["longrope"],
    }
    for rope_type in scaled_rope_types:
        for param, accepting_types in exclusive_params.items():
            # The assignment happens for every combination, including the ones that are
            # not validated afterwards.
            config.rope_parameters = as_nested(
                {"rope_type": rope_type, "rope_theta": 10000.0, param: True}
            )
            if rope_type not in accepting_types:
                with self.assertRaises(KeyError):
                    config.validate_rope()

    # 3) Regression check: a complete yarn entry in the nested shape must validate
    #    without raising. The implicit-factor check inside the yarn validator used to
    #    read `self.rope_parameters` (the whole nested dict) instead of the per-type
    #    `rope_parameters` argument it was given.
    yarn_factor = 2.0
    pre_yarn_length = int(config.max_position_embeddings / yarn_factor)
    yarn_entry = {
        "rope_type": "yarn",
        "rope_theta": 10000.0,
        "factor": yarn_factor,
        "original_max_position_embeddings": pre_yarn_length,
    }
    config.rope_parameters = as_nested(yarn_entry)
    config.validate_rope()

def test_default_rope_numerically(self):
# Note: some RoPE scaling methods start off by calling the default RoPE frequencies. If this test fails, then
# multiple RoPE strategies will fail.
Expand Down
Loading