From 143b2f32a8564e161727a3e22d02570878b58eb4 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 9 Dec 2025 00:12:33 -0800 Subject: [PATCH 1/3] fix bug Signed-off-by: Chen Zhang --- examples/offline_inference/spec_decode.py | 2 ++ vllm/v1/engine/core_client.py | 1 + 2 files changed, 3 insertions(+) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 29b2e95d262f..7458140c02f8 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -71,6 +71,7 @@ def parse_args(): parser.add_argument("--model-dir", type=str, default=None) parser.add_argument("--eagle-dir", type=str, default=None) parser.add_argument("--custom-mm-prompts", action="store_true") + parser.add_argument("--common-prefix-len", type=int, default=0) return parser.parse_args() @@ -215,6 +216,7 @@ def main(args): assert args.top_p == 1.0 assert args.top_k == -1 assert args.enable_chunked_prefill + assert args.common_prefix_len == 0 # check acceptance length is within 2% of expected value rtol = 0.02 diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index c936646aa799..770d1360ea86 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -269,6 +269,7 @@ def __init__(self, *args, **kwargs): def get_output(self) -> EngineCoreOutputs: outputs, _ = self.engine_core.step_fn() + self.engine_core.post_step(model_executed=True) return outputs and outputs.get(0) or EngineCoreOutputs() def get_supported_tasks(self) -> tuple[SupportedTask, ...]: From cb9eb63e8da80efc50f2437ec3a9e31c89d5a210 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 9 Dec 2025 00:24:15 -0800 Subject: [PATCH 2/3] revert Signed-off-by: Chen Zhang --- examples/offline_inference/spec_decode.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/offline_inference/spec_decode.py b/examples/offline_inference/spec_decode.py index 7458140c02f8..29b2e95d262f 100644 --- a/examples/offline_inference/spec_decode.py +++ b/examples/offline_inference/spec_decode.py @@ -71,7 +71,6 @@ def parse_args(): parser.add_argument("--model-dir", type=str, default=None) parser.add_argument("--eagle-dir", type=str, default=None) parser.add_argument("--custom-mm-prompts", action="store_true") - parser.add_argument("--common-prefix-len", type=int, default=0) return parser.parse_args() @@ -216,7 +215,6 @@ def main(args): assert args.top_p == 1.0 assert args.top_k == -1 assert args.enable_chunked_prefill - assert args.common_prefix_len == 0 # check acceptance length is within 2% of expected value rtol = 0.02 From 5c726887eb59214ca489f2a4222d085f16dbb51d Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 9 Dec 2025 00:36:49 -0800 Subject: [PATCH 3/3] fix Signed-off-by: Chen Zhang --- vllm/v1/engine/core_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 770d1360ea86..807db8275fbf 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -268,8 +268,8 @@ def __init__(self, *args, **kwargs): self.engine_core = EngineCore(*args, **kwargs) def get_output(self) -> EngineCoreOutputs: - outputs, _ = self.engine_core.step_fn() - self.engine_core.post_step(model_executed=True) + outputs, model_executed = self.engine_core.step_fn() + self.engine_core.post_step(model_executed=model_executed) return outputs and outputs.get(0) or EngineCoreOutputs() def get_supported_tasks(self) -> tuple[SupportedTask, ...]: