From 31cd5593c5718a1e0071c4930464135b46784cda Mon Sep 17 00:00:00 2001
From: Minho Ryu <ryumin93@gmail.com>
Date: Mon, 11 May 2026 17:07:19 +0900
Subject: [PATCH 1/3] fix(rope): read original_max_position_embeddings from
 yarn validator's argument
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`_validate_yarn_rope_parameters` is called by `validate_rope` once for each
per-attention-type sub-dict, with the sub-dict passed as the `rope_parameters`
argument. However, the `factor` consistency check inside the function reads
`original_max_position_embeddings` from `self.rope_parameters[...]` instead
of from the argument. This raises `KeyError` for any config that keeps the
nested `{full_attention, sliding_attention, ...}` shape, because there
`original_max_position_embeddings` lives inside the per-type sub-dicts, not
at the top level of `self.rope_parameters`.

The other rope validators in the same file (`_validate_default_rope_parameters`,
`_validate_linear_rope_parameters`, etc.) all read from the function argument,
so this change makes the yarn validator consistent with their pattern.
---
 src/transformers/modeling_rope_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py
index 3a0dcf345280..c487de89cace 100644
--- a/src/transformers/modeling_rope_utils.py
+++ b/src/transformers/modeling_rope_utils.py
@@ -876,7 +876,7 @@ def _validate_yarn_rope_parameters(self, rope_parameters: dict, ignore_keys: set
         # Double-check: `factor` should be the ratio between the pre-yarn and post-yarn context lengths.
         # NOTE: we might get `implicit_factor == 1` if config's `original_max_position_embeddings` was
         # inferred from `max_position_embeddings` during standardization
-        original_max_position_embeddings = self.rope_parameters["original_max_position_embeddings"]
+        original_max_position_embeddings = rope_parameters["original_max_position_embeddings"]
         implicit_factor = self.max_position_embeddings / original_max_position_embeddings
         if implicit_factor != factor and implicit_factor != 1:
             logger.warning_once(

From 4a08efc75bdd19723ba7a4c56474e3e9d87395be Mon Sep 17 00:00:00 2001
From: Minho Ryu <ryumin93@gmail.com>
Date: Mon, 11 May 2026 21:58:34 +0900
Subject: [PATCH 2/3] test(rope): mirror test_rope_validation for
 per-attention-type nested rope_parameters

---
 tests/utils/test_modeling_rope_utils.py | 58 +++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/tests/utils/test_modeling_rope_utils.py b/tests/utils/test_modeling_rope_utils.py
index 79ab37d9e2f3..eadb287a9fe3 100644
--- a/tests/utils/test_modeling_rope_utils.py
+++ b/tests/utils/test_modeling_rope_utils.py
@@ -136,6 +136,64 @@ def test_yarn_original_original_max_position_embeddings_validation(self):
             self.assertEqual(len(logs.output), 1)
             self.assertIn("implicit factor", logs.output[0])
 
+    def test_rope_validation_with_per_attention_type_nested_rope(self):
+        """Mirrors `test_rope_validation` with `config.layer_types` set, so that
+        `rope_parameters` takes the per-attention-type nested shape."""
+        config = LlamaConfig()
+        all_rope_types = ROPE_INIT_FUNCTIONS.keys()
+        config.layer_types = ["full_attention", "sliding_attention"]
+
+        def nest(full_attention_params):
+            return {
+                "full_attention": full_attention_params,
+                "sliding_attention": {"rope_type": "default", "rope_theta": 10000.0},
+            }
+
+        # Each non-default RoPE type with only `rope_theta` should still raise
+        # KeyError (missing required keys) when wrapped in the nested shape.
+        for rope_type in all_rope_types:
+            if rope_type in ("default", "proportional"):
+                continue
+            config.rope_parameters = nest({"rope_type": rope_type, "rope_theta": 10000.0})
+            with self.assertRaises(KeyError):
+                config.validate_rope()
+
+        # Parameters exclusive to a RoPE type should still raise when passed to
+        # the wrong type while in the nested shape.
+        valid_param_mapping = {
+            "factor": ["linear", "dynamic", "yarn", "longrope"],
+            "attention_factor": ["yarn", "longrope"],
+            "beta_fast": ["yarn"],
+            "beta_slow": ["yarn"],
+            "short_factor": ["longrope"],
+            "long_factor": ["longrope"],
+        }
+        for rope_type in all_rope_types:
+            if rope_type in ("default", "proportional"):
+                continue
+            for param, valid_rope_types in valid_param_mapping.items():
+                config.rope_parameters = nest(
+                    {"rope_type": rope_type, "rope_theta": 10000.0, param: True}
+                )
+                if rope_type in valid_rope_types:
+                    continue
+                with self.assertRaises(KeyError):
+                    config.validate_rope()
+
+        # A complete yarn entry under the nested shape should validate cleanly.
+        # Regression: previously the implicit-factor check inside the yarn
+        # validator dereferenced `self.rope_parameters` (the full nested dict)
+        # rather than its per-type `rope_parameters` argument.
+        config.rope_parameters = nest(
+            {
+                "rope_type": "yarn",
+                "rope_theta": 10000.0,
+                "factor": 2.0,
+                "original_max_position_embeddings": int(config.max_position_embeddings / 2.0),
+            }
+        )
+        config.validate_rope()
+
     def test_default_rope_numerically(self):
         # Note: some RoPE scaling methods start off by calling the default RoPE frequencies. If this test fails, then
         # multiple RoPE strategies will fail.

From 5bd9811b5227f0e65b8a2257150ee360448f5637 Mon Sep 17 00:00:00 2001
From: Minho Ryu <ryumin93@gmail.com>
Date: Tue, 12 May 2026 15:04:32 +0900
Subject: [PATCH 3/3] test(rope): apply ruff format to nested-rope test

---
 tests/utils/test_modeling_rope_utils.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/utils/test_modeling_rope_utils.py b/tests/utils/test_modeling_rope_utils.py
index eadb287a9fe3..3240a74bf838 100644
--- a/tests/utils/test_modeling_rope_utils.py
+++ b/tests/utils/test_modeling_rope_utils.py
@@ -172,9 +172,7 @@ def nest(full_attention_params):
             if rope_type in ("default", "proportional"):
                 continue
             for param, valid_rope_types in valid_param_mapping.items():
-                config.rope_parameters = nest(
-                    {"rope_type": rope_type, "rope_theta": 10000.0, param: True}
-                )
+                config.rope_parameters = nest({"rope_type": rope_type, "rope_theta": 10000.0, param: True})
                 if rope_type in valid_rope_types:
                     continue
                 with self.assertRaises(KeyError):