From 31cd5593c5718a1e0071c4930464135b46784cda Mon Sep 17 00:00:00 2001 From: Minho Ryu Date: Mon, 11 May 2026 17:07:19 +0900 Subject: [PATCH 1/3] fix(rope): read original_max_position_embeddings from yarn validator's argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_validate_yarn_rope_parameters` is called by `validate_rope` once for each per-attention-type sub-dict, with the sub-dict passed as the `rope_parameters` argument. The `factor` consistency check inside the function, however, reads `original_max_position_embeddings` from `self.rope_parameters[...]` instead of from the argument, which raises `KeyError` for any config that keeps the nested `{full_attention, sliding_attention, ...}` shape — in that shape `original_max_position_embeddings` lives inside the per-type sub-dicts under those keys, not at the top level. Other rope validators in the same file (`_validate_default_rope_parameters`, `_validate_linear_rope_parameters`, etc.) all read from the function argument, so this matches their pattern. --- src/transformers/modeling_rope_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index 3a0dcf345280..c487de89cace 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -876,7 +876,7 @@ def _validate_yarn_rope_parameters(self, rope_parameters: dict, ignore_keys: set # Double-check: `factor` should be the ratio between the pre-yarn and post-yarn context lengths. 
# NOTE: we might get `implicit_factor == 1` if config's `original_max_position_embeddings` was # inferred from `max_position_embeddings` during standardization - original_max_position_embeddings = self.rope_parameters["original_max_position_embeddings"] + original_max_position_embeddings = rope_parameters["original_max_position_embeddings"] implicit_factor = self.max_position_embeddings / original_max_position_embeddings if implicit_factor != factor and implicit_factor != 1: logger.warning_once( From 4a08efc75bdd19723ba7a4c56474e3e9d87395be Mon Sep 17 00:00:00 2001 From: Minho Ryu Date: Mon, 11 May 2026 21:58:34 +0900 Subject: [PATCH 2/3] test(rope): mirror test_rope_validation for per-attention-type nested rope_parameters --- tests/utils/test_modeling_rope_utils.py | 58 +++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/utils/test_modeling_rope_utils.py b/tests/utils/test_modeling_rope_utils.py index 79ab37d9e2f3..eadb287a9fe3 100644 --- a/tests/utils/test_modeling_rope_utils.py +++ b/tests/utils/test_modeling_rope_utils.py @@ -136,6 +136,64 @@ def test_yarn_original_original_max_position_embeddings_validation(self): self.assertEqual(len(logs.output), 1) self.assertIn("implicit factor", logs.output[0]) + def test_rope_validation_with_per_attention_type_nested_rope(self): + """Mirrors `test_rope_validation` with `config.layer_types` set, so that + `rope_parameters` takes the per-attention-type nested shape.""" + config = LlamaConfig() + all_rope_types = ROPE_INIT_FUNCTIONS.keys() + config.layer_types = ["full_attention", "sliding_attention"] + + def nest(full_attention_params): + return { + "full_attention": full_attention_params, + "sliding_attention": {"rope_type": "default", "rope_theta": 10000.0}, + } + + # Each non-default RoPE type with only `rope_theta` should still raise + # KeyError (missing required keys) when wrapped in the nested shape. 
+ for rope_type in all_rope_types: + if rope_type in ("default", "proportional"): + continue + config.rope_parameters = nest({"rope_type": rope_type, "rope_theta": 10000.0}) + with self.assertRaises(KeyError): + config.validate_rope() + + # Parameters exclusive to a RoPE type should still raise when passed to + # the wrong type while in the nested shape. + valid_param_mapping = { + "factor": ["linear", "dynamic", "yarn", "longrope"], + "attention_factor": ["yarn", "longrope"], + "beta_fast": ["yarn"], + "beta_slow": ["yarn"], + "short_factor": ["longrope"], + "long_factor": ["longrope"], + } + for rope_type in all_rope_types: + if rope_type in ("default", "proportional"): + continue + for param, valid_rope_types in valid_param_mapping.items(): + config.rope_parameters = nest( + {"rope_type": rope_type, "rope_theta": 10000.0, param: True} + ) + if rope_type in valid_rope_types: + continue + with self.assertRaises(KeyError): + config.validate_rope() + + # A complete yarn entry under the nested shape should validate cleanly. + # Regression: previously the implicit-factor check inside the yarn + # validator dereferenced `self.rope_parameters` (the full nested dict) + # rather than its per-type `rope_parameters` argument. + config.rope_parameters = nest( + { + "rope_type": "yarn", + "rope_theta": 10000.0, + "factor": 2.0, + "original_max_position_embeddings": int(config.max_position_embeddings / 2.0), + } + ) + config.validate_rope() + def test_default_rope_numerically(self): # Note: some RoPE scaling methods start off by calling the default RoPE frequencies. If this test fails, then # multiple RoPE strategies will fail. 
From 5bd9811b5227f0e65b8a2257150ee360448f5637 Mon Sep 17 00:00:00 2001 From: Minho Ryu Date: Tue, 12 May 2026 15:04:32 +0900 Subject: [PATCH 3/3] test(rope): apply ruff format to nested-rope test --- tests/utils/test_modeling_rope_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/utils/test_modeling_rope_utils.py b/tests/utils/test_modeling_rope_utils.py index eadb287a9fe3..3240a74bf838 100644 --- a/tests/utils/test_modeling_rope_utils.py +++ b/tests/utils/test_modeling_rope_utils.py @@ -172,9 +172,7 @@ def nest(full_attention_params): if rope_type in ("default", "proportional"): continue for param, valid_rope_types in valid_param_mapping.items(): - config.rope_parameters = nest( - {"rope_type": rope_type, "rope_theta": 10000.0, param: True} - ) + config.rope_parameters = nest({"rope_type": rope_type, "rope_theta": 10000.0, param: True}) if rope_type in valid_rope_types: continue with self.assertRaises(KeyError):