vllm-project · HollowMan6 · Mar 15, 2026
@@ -263,16 +263,12 @@ def __init__(
                 if isinstance(group.kv_cache_spec, AttentionSpec):
                     self.routed_experts_attn_gid = gid
                     break
-            min_block_size = min(
-                [
-                    group.kv_cache_spec.block_size
-                    for group in kv_cache_config.kv_cache_groups
-                ]
-            )
-            num_groups = len(kv_cache_config.kv_cache_groups)
+            attn_group = kv_cache_config.kv_cache_groups[self.routed_experts_attn_gid]
+            # Routed experts are read back with the attention group's slot_mapping,
+            # so the shared-memory view must match that group's full KV address space.
             self.max_num_kv_tokens = (
-                kv_cache_config.num_blocks // num_groups
-            ) * min_block_size
+                kv_cache_config.num_blocks * attn_group.kv_cache_spec.block_size
+            )
             dcp_size = self.vllm_config.parallel_config.decode_context_parallel_size
             pcp_size = self.vllm_config.parallel_config.prefill_context_parallel_size
             if pcp_size * dcp_size > 1:

@@ -6543,16 +6543,12 @@ def init_routed_experts_capturer(self):
         )
         routed_experts_capturer = RoutedExpertsCapturer.create()
         self.routed_experts_attn_gid = self._get_attention_kv_cache_gid()
-        min_block_size = min(
-            [
-                group.kv_cache_spec.block_size
-                for group in self.kv_cache_config.kv_cache_groups
-            ]
-        )
-        num_groups = len(self.kv_cache_config.kv_cache_groups)
+        attn_group = self.kv_cache_config.kv_cache_groups[self.routed_experts_attn_gid]
+        # Routed experts are indexed with the attention group's slot_mapping,
+        # so the side buffer must match that group's full KV address space.
         self.max_num_kv_tokens = (
-            self.kv_cache_config.num_blocks // num_groups
-        ) * min_block_size
+            self.kv_cache_config.num_blocks * attn_group.kv_cache_spec.block_size
+        )
         dcp_size = self.vllm_config.parallel_config.decode_context_parallel_size
         pcp_size = self.vllm_config.parallel_config.prefill_context_parallel_size
         if pcp_size * dcp_size > 1: