From 7ea7fb645db93dc379407d190b149557e86ed30e Mon Sep 17 00:00:00 2001 From: Yueqian Lin Date: Mon, 13 Apr 2026 12:38:52 -0400 Subject: [PATCH] fix(runner): pass request_id to model.preprocess() for per-request state OmniGPUModelRunner._preprocess() calls model.preprocess() per request but never passes the request_id. Models that maintain per-request state (e.g. VoxCPM2TalkerForConditionalGeneration) fall back to a hardcoded "default" id, causing all concurrent requests to share a single state. This produces two bugs in batched inference: - Stop logic failure: shared state mixes stop signals across requests, so requests never terminate (58s audio for 4s sentences) - Prefill shape mismatch: second preprocess() overwrites first's masks, causing RuntimeError when forward() reads stale dimensions Fix: inject req_id into req_infos before the preprocess() call. Tested on H20 (single GPU, enforce_eager): - 2 concurrent requests: audio duration 2.72s + 5.28s (was 57s + 58s) - Single request: unchanged (RTF ~0.21) Signed-off-by: Yueqian Lin --- vllm_omni/worker/gpu_model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py index 1f678b579fa..5ff62c11b40 100644 --- a/vllm_omni/worker/gpu_model_runner.py +++ b/vllm_omni/worker/gpu_model_runner.py @@ -1241,6 +1241,7 @@ def _preprocess( span_len = int(e) - int(s) # call the custom process function + req_infos["request_id"] = req_id embed_slice = inputs_embeds[s:e] if inputs_embeds is not None else None req_input_ids, req_embeds, update_dict = self.model.preprocess( input_ids=input_ids[s:e], input_embeds=embed_slice, **req_infos