From ae35220fa2faedd9b7ad1e57842ad0de06e09172 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk@inferact.ai>
Date: Wed, 4 Mar 2026 00:01:29 +0000
Subject: [PATCH 1/3] [Model Runner V2] Use dictionary instead of tuple for
 execute_model_state

Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
---
 vllm/v1/worker/gpu/model_runner.py | 62 ++++++++++++++----------------
 1 file changed, 28 insertions(+), 34 deletions(-)

diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 9267e187415f..a73e203cda38 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -217,7 +217,7 @@ def __init__(
         self.pooling_runner: PoolingRunner | None = None
 
         # For transferring state from execute_model to subsequent sample_tokens call.
-        self.execute_model_state: tuple | None = None
+        self.execute_model_state: dict | None = None
 
     def update_max_model_len(self, max_model_len: int) -> None:
         self.max_model_len = max_model_len
@@ -379,16 +379,12 @@ def _dummy_run(
             return None, None
 
         assert self.execute_model_state is not None
-        (
-            input_batch,
-            model_inputs,
-            attn_metadata,
-            slot_mappings_by_layer,
-            hidden_states,
-            aux_hidden_states,
-            kv_connector_output,
-            num_tokens_across_dp,
-        ) = self.execute_model_state
+        input_batch = self.execute_model_state["input_batch"]
+        attn_metadata = self.execute_model_state["attn_metadata"]
+        slot_mappings_by_layer = self.execute_model_state["slot_mappings_by_layer"]
+        hidden_states = self.execute_model_state["hidden_states"]
+        aux_hidden_states = self.execute_model_state["aux_hidden_states"]
+        num_tokens_across_dp = self.execute_model_state["num_tokens_across_dp"]
         self.execute_model_state = None
 
         # dummy run the eagle speculator's propose to ensure DP/EP sync.
@@ -990,16 +986,16 @@ def execute_model(
                     aux_hidden_states = None
 
         kv_connector_output = self.kv_connector.post_forward(scheduler_output)
-        self.execute_model_state = (
-            input_batch,
-            model_inputs,
-            attn_metadata,
-            slot_mappings_by_layer,
-            hidden_states,
-            aux_hidden_states,
-            kv_connector_output,
-            num_tokens_across_dp,
-        )
+        self.execute_model_state = {
+            "input_batch": input_batch,
+            "model_inputs": model_inputs,
+            "attn_metadata": attn_metadata,
+            "slot_mappings_by_layer": slot_mappings_by_layer,
+            "hidden_states": hidden_states,
+            "aux_hidden_states": aux_hidden_states,
+            "kv_connector_output": kv_connector_output,
+            "num_tokens_across_dp": num_tokens_across_dp,
+        }
 
         if not self.is_last_pp_rank:
             # Non-last PP rank: return IntermediateTensors for sending.
@@ -1017,16 +1013,14 @@ def sample_tokens(
         if self.execute_model_state is None:
             # The prior execute_model call must have failed.
             return None
-        (
-            input_batch,
-            model_inputs,
-            attn_metadata,
-            slot_mappings_by_layer,
-            hidden_states,
-            aux_hidden_states,
-            kv_connector_output,
-            num_tokens_across_dp,
-        ) = self.execute_model_state
+
+        input_batch = self.execute_model_state["input_batch"]
+        attn_metadata = self.execute_model_state["attn_metadata"]
+        slot_mappings_by_layer = self.execute_model_state["slot_mappings_by_layer"]
+        hidden_states = self.execute_model_state["hidden_states"]
+        aux_hidden_states = self.execute_model_state["aux_hidden_states"]
+        kv_connector_output = self.execute_model_state["kv_connector_output"]
+        num_tokens_across_dp = self.execute_model_state["num_tokens_across_dp"]
         self.execute_model_state = None
 
         if not self.is_last_pp_rank:
@@ -1117,9 +1111,9 @@ def pool(self) -> AsyncPoolingOutput | ModelRunnerOutput | None:
             # The prior execute_model call must have failed.
             return None
 
-        input_batch, _, _, _, hidden_states, _, kv_connector_output = (
-            self.execute_model_state
-        )
+        input_batch = self.execute_model_state["input_batch"]
+        hidden_states = self.execute_model_state["hidden_states"]
+        kv_connector_output = self.execute_model_state["kv_connector_output"]
         self.execute_model_state = None
 
         if not self.is_last_pp_rank:

From 81d1bffc56d7de2767f91790d013e618608afb86 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk@inferact.ai>
Date: Mon, 9 Mar 2026 16:43:54 +0000
Subject: [PATCH 2/3] [Model Runner V2] Add dummy profile_cudagraph_memory API

Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
---
 vllm/v1/worker/gpu/model_runner.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index 8cb65c4d2647..c26fe9d6793d 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -473,6 +473,10 @@ def _get_num_input_tokens(self, num_scheduled_tokens: int) -> int:
         # SP is not supported yet.
         return num_scheduled_tokens
 
+    def profile_cudagraph_memory(self) -> int:
+        # NOTE(woosuk): It is TBD whether we keep this API or not.
+        return 0
+
     @torch.inference_mode()
     def capture_model(self) -> int:
         if not self.cudagraph_manager.needs_capture():

From aba9c8526f79d9cee07e92dfad34e12a40b17268 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk@inferact.ai>
Date: Mon, 9 Mar 2026 16:48:55 +0000
Subject: [PATCH 3/3] [Model Runner V2] Use NamedTuple for execute_model_state

Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
---
 vllm/v1/worker/gpu/model_runner.py | 66 +++++++++++++++++-------------
 1 file changed, 38 insertions(+), 28 deletions(-)

diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index b136f53801a9..b373b873aaf3 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -21,6 +21,7 @@
 import gc
 import time
 from copy import deepcopy
+from typing import Any, NamedTuple
 
 import numpy as np
 import torch
@@ -44,7 +45,7 @@
 from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig
-from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
+from vllm.v1.outputs import DraftTokenIds, KVConnectorOutput, ModelRunnerOutput
 from vllm.v1.worker.cp_utils import check_attention_cp_compatibility
 from vllm.v1.worker.gpu.async_utils import AsyncOutput, AsyncPoolingOutput
 from vllm.v1.worker.gpu.attn_utils import (
@@ -213,7 +214,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
         self.pooling_runner: PoolingRunner | None = None
 
         # For transferring state from execute_model to subsequent sample_tokens call.
-        self.execute_model_state: dict | None = None
+        self.execute_model_state: ExecuteModelState | None = None
 
     def update_max_model_len(self, max_model_len: int) -> None:
         self.max_model_len = max_model_len
@@ -375,12 +376,12 @@ def _dummy_run(
             return None, None
 
         assert self.execute_model_state is not None
-        input_batch = self.execute_model_state["input_batch"]
-        attn_metadata = self.execute_model_state["attn_metadata"]
-        slot_mappings_by_layer = self.execute_model_state["slot_mappings_by_layer"]
-        hidden_states = self.execute_model_state["hidden_states"]
-        aux_hidden_states = self.execute_model_state["aux_hidden_states"]
-        num_tokens_across_dp = self.execute_model_state["num_tokens_across_dp"]
+        input_batch = self.execute_model_state.input_batch
+        attn_metadata = self.execute_model_state.attn_metadata
+        slot_mappings_by_layer = self.execute_model_state.slot_mappings_by_layer
+        hidden_states = self.execute_model_state.hidden_states
+        aux_hidden_states = self.execute_model_state.aux_hidden_states
+        num_tokens_across_dp = self.execute_model_state.num_tokens_across_dp
         self.execute_model_state = None
 
         # dummy run the eagle speculator's propose to ensure DP/EP sync.
@@ -981,16 +982,15 @@ def execute_model(
                     aux_hidden_states = None
 
         kv_connector_output = self.kv_connector.post_forward(scheduler_output)
-        self.execute_model_state = {
-            "input_batch": input_batch,
-            "model_inputs": model_inputs,
-            "attn_metadata": attn_metadata,
-            "slot_mappings_by_layer": slot_mappings_by_layer,
-            "hidden_states": hidden_states,
-            "aux_hidden_states": aux_hidden_states,
-            "kv_connector_output": kv_connector_output,
-            "num_tokens_across_dp": num_tokens_across_dp,
-        }
+        self.execute_model_state = ExecuteModelState(
+            input_batch=input_batch,
+            attn_metadata=attn_metadata,
+            slot_mappings_by_layer=slot_mappings_by_layer,
+            hidden_states=hidden_states,
+            aux_hidden_states=aux_hidden_states,
+            kv_connector_output=kv_connector_output,
+            num_tokens_across_dp=num_tokens_across_dp,
+        )
 
         if not self.is_last_pp_rank:
             # Non-last PP rank: return IntermediateTensors for sending.
@@ -1009,13 +1009,13 @@ def sample_tokens(
             # The prior execute_model call must have failed.
             return None
 
-        input_batch = self.execute_model_state["input_batch"]
-        attn_metadata = self.execute_model_state["attn_metadata"]
-        slot_mappings_by_layer = self.execute_model_state["slot_mappings_by_layer"]
-        hidden_states = self.execute_model_state["hidden_states"]
-        aux_hidden_states = self.execute_model_state["aux_hidden_states"]
-        kv_connector_output = self.execute_model_state["kv_connector_output"]
-        num_tokens_across_dp = self.execute_model_state["num_tokens_across_dp"]
+        input_batch = self.execute_model_state.input_batch
+        attn_metadata = self.execute_model_state.attn_metadata
+        slot_mappings_by_layer = self.execute_model_state.slot_mappings_by_layer
+        hidden_states = self.execute_model_state.hidden_states
+        aux_hidden_states = self.execute_model_state.aux_hidden_states
+        kv_connector_output = self.execute_model_state.kv_connector_output
+        num_tokens_across_dp = self.execute_model_state.num_tokens_across_dp
         self.execute_model_state = None
 
         if not self.is_last_pp_rank:
@@ -1106,9 +1106,9 @@ def pool(self) -> AsyncPoolingOutput | ModelRunnerOutput | None:
             # The prior execute_model call must have failed.
             return None
 
-        input_batch = self.execute_model_state["input_batch"]
-        hidden_states = self.execute_model_state["hidden_states"]
-        kv_connector_output = self.execute_model_state["kv_connector_output"]
+        input_batch = self.execute_model_state.input_batch
+        hidden_states = self.execute_model_state.hidden_states
+        kv_connector_output = self.execute_model_state.kv_connector_output
         self.execute_model_state = None
 
         if not self.is_last_pp_rank:
@@ -1154,3 +1154,13 @@ def postprocess_pool(self, input_batch: InputBatch) -> None:
         np.minimum(
             computed_prefill, self.req_states.prefill_len.np, out=computed_prefill
         )
+
+
+class ExecuteModelState(NamedTuple):
+    input_batch: InputBatch
+    attn_metadata: dict[str, Any] | None
+    slot_mappings_by_layer: dict[str, torch.Tensor] | None
+    hidden_states: torch.Tensor | IntermediateTensors
+    aux_hidden_states: list[torch.Tensor] | None
+    kv_connector_output: KVConnectorOutput | None
+    num_tokens_across_dp: torch.Tensor | None