From ae35220fa2faedd9b7ad1e57842ad0de06e09172 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 4 Mar 2026 00:01:29 +0000 Subject: [PATCH 1/3] [Model Runner V2] Use dictionary instead of tuple for execute_model_state Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/model_runner.py | 62 ++++++++++++++---------------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index 9267e187415f..a73e203cda38 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -217,7 +217,7 @@ def __init__( self.pooling_runner: PoolingRunner | None = None # For transferring state from execute_model to subsequent sample_tokens call. - self.execute_model_state: tuple | None = None + self.execute_model_state: dict | None = None def update_max_model_len(self, max_model_len: int) -> None: self.max_model_len = max_model_len @@ -379,16 +379,12 @@ def _dummy_run( return None, None assert self.execute_model_state is not None - ( - input_batch, - model_inputs, - attn_metadata, - slot_mappings_by_layer, - hidden_states, - aux_hidden_states, - kv_connector_output, - num_tokens_across_dp, - ) = self.execute_model_state + input_batch = self.execute_model_state["input_batch"] + attn_metadata = self.execute_model_state["attn_metadata"] + slot_mappings_by_layer = self.execute_model_state["slot_mappings_by_layer"] + hidden_states = self.execute_model_state["hidden_states"] + aux_hidden_states = self.execute_model_state["aux_hidden_states"] + num_tokens_across_dp = self.execute_model_state["num_tokens_across_dp"] self.execute_model_state = None # dummy run the eagle speculator's propose to ensure DP/EP sync. @@ -990,16 +986,16 @@ def execute_model( aux_hidden_states = None kv_connector_output = self.kv_connector.post_forward(scheduler_output) - self.execute_model_state = ( - input_batch, - model_inputs, - attn_metadata, - slot_mappings_by_layer, - hidden_states, - aux_hidden_states, - kv_connector_output, - num_tokens_across_dp, - ) + self.execute_model_state = { + "input_batch": input_batch, + "model_inputs": model_inputs, + "attn_metadata": attn_metadata, + "slot_mappings_by_layer": slot_mappings_by_layer, + "hidden_states": hidden_states, + "aux_hidden_states": aux_hidden_states, + "kv_connector_output": kv_connector_output, + "num_tokens_across_dp": num_tokens_across_dp, + } if not self.is_last_pp_rank: # Non-last PP rank: return IntermediateTensors for sending. @@ -1017,16 +1013,14 @@ def sample_tokens( if self.execute_model_state is None: # The prior execute_model call must have failed. return None - ( - input_batch, - model_inputs, - attn_metadata, - slot_mappings_by_layer, - hidden_states, - aux_hidden_states, - kv_connector_output, - num_tokens_across_dp, - ) = self.execute_model_state + + input_batch = self.execute_model_state["input_batch"] + attn_metadata = self.execute_model_state["attn_metadata"] + slot_mappings_by_layer = self.execute_model_state["slot_mappings_by_layer"] + hidden_states = self.execute_model_state["hidden_states"] + aux_hidden_states = self.execute_model_state["aux_hidden_states"] + kv_connector_output = self.execute_model_state["kv_connector_output"] + num_tokens_across_dp = self.execute_model_state["num_tokens_across_dp"] self.execute_model_state = None if not self.is_last_pp_rank: @@ -1117,9 +1111,9 @@ def pool(self) -> AsyncPoolingOutput | ModelRunnerOutput | None: # The prior execute_model call must have failed. return None - input_batch, _, _, _, hidden_states, _, kv_connector_output = ( - self.execute_model_state - ) + input_batch = self.execute_model_state["input_batch"] + hidden_states = self.execute_model_state["hidden_states"] + kv_connector_output = self.execute_model_state["kv_connector_output"] self.execute_model_state = None if not self.is_last_pp_rank: From 81d1bffc56d7de2767f91790d013e618608afb86 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 9 Mar 2026 16:43:54 +0000 Subject: [PATCH 2/3] [Model Runner V2] Add dummy profile_cudagraph_memory API Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/model_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index 8cb65c4d2647..c26fe9d6793d 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -473,6 +473,10 @@ def _get_num_input_tokens(self, num_scheduled_tokens: int) -> int: # SP is not supported yet. return num_scheduled_tokens + def profile_cudagraph_memory(self) -> int: + # NOTE(woosuk): It is TBD whether we keep this API or not. + return 0 + @torch.inference_mode() def capture_model(self) -> int: if not self.cudagraph_manager.needs_capture(): From aba9c8526f79d9cee07e92dfad34e12a40b17268 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 9 Mar 2026 16:48:55 +0000 Subject: [PATCH 3/3] NamedTuple Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/model_runner.py | 66 +++++++++++++++++------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index b136f53801a9..b373b873aaf3 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -21,6 +21,7 @@ import gc import time from copy import deepcopy +from typing import Any, NamedTuple import numpy as np import torch @@ -44,7 +45,7 @@ from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.kv_cache_interface import KVCacheConfig -from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput +from vllm.v1.outputs import DraftTokenIds, KVConnectorOutput, ModelRunnerOutput from vllm.v1.worker.cp_utils import check_attention_cp_compatibility from vllm.v1.worker.gpu.async_utils import AsyncOutput, AsyncPoolingOutput from vllm.v1.worker.gpu.attn_utils import ( @@ -213,7 +214,7 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): self.pooling_runner: PoolingRunner | None = None # For transferring state from execute_model to subsequent sample_tokens call. - self.execute_model_state: dict | None = None + self.execute_model_state: ExecuteModelState | None = None def update_max_model_len(self, max_model_len: int) -> None: self.max_model_len = max_model_len @@ -375,12 +376,12 @@ def _dummy_run( return None, None assert self.execute_model_state is not None - input_batch = self.execute_model_state["input_batch"] - attn_metadata = self.execute_model_state["attn_metadata"] - slot_mappings_by_layer = self.execute_model_state["slot_mappings_by_layer"] - hidden_states = self.execute_model_state["hidden_states"] - aux_hidden_states = self.execute_model_state["aux_hidden_states"] - num_tokens_across_dp = self.execute_model_state["num_tokens_across_dp"] + input_batch = self.execute_model_state.input_batch + attn_metadata = self.execute_model_state.attn_metadata + slot_mappings_by_layer = self.execute_model_state.slot_mappings_by_layer + hidden_states = self.execute_model_state.hidden_states + aux_hidden_states = self.execute_model_state.aux_hidden_states + num_tokens_across_dp = self.execute_model_state.num_tokens_across_dp self.execute_model_state = None # dummy run the eagle speculator's propose to ensure DP/EP sync. @@ -981,16 +982,15 @@ def execute_model( aux_hidden_states = None kv_connector_output = self.kv_connector.post_forward(scheduler_output) - self.execute_model_state = { - "input_batch": input_batch, - "model_inputs": model_inputs, - "attn_metadata": attn_metadata, - "slot_mappings_by_layer": slot_mappings_by_layer, - "hidden_states": hidden_states, - "aux_hidden_states": aux_hidden_states, - "kv_connector_output": kv_connector_output, - "num_tokens_across_dp": num_tokens_across_dp, - } + self.execute_model_state = ExecuteModelState( + input_batch=input_batch, + attn_metadata=attn_metadata, + slot_mappings_by_layer=slot_mappings_by_layer, + hidden_states=hidden_states, + aux_hidden_states=aux_hidden_states, + kv_connector_output=kv_connector_output, + num_tokens_across_dp=num_tokens_across_dp, + ) if not self.is_last_pp_rank: # Non-last PP rank: return IntermediateTensors for sending. @@ -1009,13 +1009,13 @@ def sample_tokens( # The prior execute_model call must have failed. return None - input_batch = self.execute_model_state["input_batch"] - attn_metadata = self.execute_model_state["attn_metadata"] - slot_mappings_by_layer = self.execute_model_state["slot_mappings_by_layer"] - hidden_states = self.execute_model_state["hidden_states"] - aux_hidden_states = self.execute_model_state["aux_hidden_states"] - kv_connector_output = self.execute_model_state["kv_connector_output"] - num_tokens_across_dp = self.execute_model_state["num_tokens_across_dp"] + input_batch = self.execute_model_state.input_batch + attn_metadata = self.execute_model_state.attn_metadata + slot_mappings_by_layer = self.execute_model_state.slot_mappings_by_layer + hidden_states = self.execute_model_state.hidden_states + aux_hidden_states = self.execute_model_state.aux_hidden_states + kv_connector_output = self.execute_model_state.kv_connector_output + num_tokens_across_dp = self.execute_model_state.num_tokens_across_dp self.execute_model_state = None if not self.is_last_pp_rank: @@ -1106,9 +1106,9 @@ def pool(self) -> AsyncPoolingOutput | ModelRunnerOutput | None: # The prior execute_model call must have failed. return None - input_batch = self.execute_model_state["input_batch"] - hidden_states = self.execute_model_state["hidden_states"] - kv_connector_output = self.execute_model_state["kv_connector_output"] + input_batch = self.execute_model_state.input_batch + hidden_states = self.execute_model_state.hidden_states + kv_connector_output = self.execute_model_state.kv_connector_output self.execute_model_state = None if not self.is_last_pp_rank: @@ -1154,3 +1154,13 @@ def postprocess_pool(self, input_batch: InputBatch) -> None: np.minimum( computed_prefill, self.req_states.prefill_len.np, out=computed_prefill ) + + +class ExecuteModelState(NamedTuple): + input_batch: InputBatch + attn_metadata: dict[str, Any] | None + slot_mappings_by_layer: dict[str, torch.Tensor] | None + hidden_states: torch.Tensor | IntermediateTensors + aux_hidden_states: list[torch.Tensor] | None + kv_connector_output: KVConnectorOutput | None + num_tokens_across_dp: torch.Tensor | None