vllm-project · wangxiyuan · Jan 5, 2026 · Dec 30, 2025 · gemini-code-assist · Dec 30, 2025
@@ -129,6 +129,18 @@ def _record_cos_and_sin_cache(cos_cache, sin_cache):
     _sin_cache = sin_cache
 
 
+def _record_cos_and_sin_cache_interleaved(cos_sin_cache):
+    """Populate the module-level cos/sin caches from a fused cache tensor.
+
+    The last dimension of ``cos_sin_cache`` is split into two halves; the
+    first half is recorded as cos and the second as sin. Each half is then
+    tiled twice along the last dimension, so (for an even last dimension)
+    both recorded caches are 2-D tensors as wide as ``cos_sin_cache``.
+
+    This is a no-op when either ``_cos_cache`` or ``_sin_cache`` has
+    already been set, so only the first recorded cache is kept.
+
+    Args:
+        cos_sin_cache: Tensor whose last dimension holds the cos values
+            followed by the sin values (assumed layout -- confirm against
+            the code that builds the cache). Leading dimensions are
+            flattened into one.
+    """
+    global _cos_cache
+    global _sin_cache
+    if _cos_cache is not None or _sin_cache is not None:
+        return
+    # reshape (not view) so a non-contiguous cache is accepted as well.
+    flat_cache = cos_sin_cache.reshape(-1, cos_sin_cache.shape[-1])
+    cos_half, sin_half = flat_cache.chunk(2, dim=-1)
+    # Tile each half on its own so the recorded caches are separate,
+    # contiguous tensors rather than strided slices of one shared buffer.
+    _cos_cache = cos_half.repeat(1, 2)
+    _sin_cache = sin_half.repeat(1, 2)
-def _record_cos_and_sin_cache_interleaved(cos_sin_cache):
-    global _cos_cache
-    global _sin_cache
-    if _cos_cache is not None or _sin_cache is not None:
-        return
-    hidden_dim = cos_sin_cache.shape[-1] // 2
-    cos_cache, sin_cache = cos_sin_cache.view(-1, 2, hidden_dim).repeat(
-        1, 1, 2).chunk(2, dim=1)
-    _cos_cache = cos_cache.squeeze(1)
-    _sin_cache = sin_cache.squeeze(1)
+def _record_cos_and_sin_cache_interleaved(cos_sin_cache):
+    """Record separate cos and sin caches from a fused cache, once.
+
+    Does nothing if either module-level cache has already been set.
+    Otherwise the last dimension of ``cos_sin_cache`` is cut into two
+    chunks (cos first, then sin) and each chunk is tiled twice along
+    dimension 1 before being stored in ``_cos_cache`` / ``_sin_cache``.
+
+    NOTE(review): ``repeat(1, 2)`` presumes a 2-D cache -- confirm.
+    """
+    global _cos_cache, _sin_cache
+    already_recorded = not (_cos_cache is None and _sin_cache is None)
+    if already_recorded:
+        return
+
+    # The fused cache stores cos and sin back to back, rotary_dim/2 each.
+    first_half, second_half = cos_sin_cache.chunk(2, dim=-1)
+
+    # Neox style: duplicate each half so it spans the full rotary_dim.
+    tiling = (1, 2)
+    _cos_cache = first_half.repeat(*tiling)
+    _sin_cache = second_half.repeat(*tiling)
-def _record_cos_and_sin_cache_interleaved(cos_sin_cache):
-    global _cos_cache
-    global _sin_cache
-    if _cos_cache is not None or _sin_cache is not None:
-        return
-    hidden_dim = cos_sin_cache.shape[-1] // 2
-    cos_cache, sin_cache = cos_sin_cache.view(-1, 2, hidden_dim).repeat(
-        1, 1, 2).chunk(2, dim=1)
-    _cos_cache = cos_cache.squeeze(1)
-    _sin_cache = sin_cache.squeeze(1)
+def _record_cos_and_sin_cache_interleaved(cos_sin_cache):
+    """Record separate cos and sin caches from a fused cache, once.
+
+    Does nothing if either module-level cache has already been set.
+    Otherwise the last dimension of ``cos_sin_cache`` is cut into two
+    chunks (cos first, then sin) and each chunk is tiled twice along
+    dimension 1 before being stored in ``_cos_cache`` / ``_sin_cache``.
+
+    NOTE(review): ``repeat(1, 2)`` presumes a 2-D cache -- confirm.
+    """
+    global _cos_cache, _sin_cache
+    already_recorded = not (_cos_cache is None and _sin_cache is None)
+    if already_recorded:
+        return
+
+    # The fused cache stores cos and sin back to back, rotary_dim/2 each.
+    first_half, second_half = cos_sin_cache.chunk(2, dim=-1)
+
+    # Neox style: duplicate each half so it spans the full rotary_dim.
+    tiling = (1, 2)
+    _cos_cache = first_half.repeat(*tiling)
+    _sin_cache = second_half.repeat(*tiling)
+
+
 def update_cos_sin(positions):
     global _cos
     global _sin
@@ -252,6 +264,7 @@ def __init__(
         super().__init__(head_size, rotary_dim, max_position_embeddings, base,
                          is_neox_style, dtype)
         _record_cos_sin_cache(self.cos_sin_cache)
+        _record_cos_and_sin_cache_interleaved(self.cos_sin_cache)
 
     def forward_oot(
         self,