diff --git a/vllm_omni/benchmarks/patch/patch.py b/vllm_omni/benchmarks/patch/patch.py
index 4e13dfaced2..539af11f868 100644
--- a/vllm_omni/benchmarks/patch/patch.py
+++ b/vllm_omni/benchmarks/patch/patch.py
@@ -376,9 +376,8 @@ async def benchmark(
         limit_per_host=max_concurrency or 0,
         ttl_dns_cache=300,
         use_dns_cache=True,
-        keepalive_timeout=60,
         enable_cleanup_closed=True,
-        force_close=False,
+        force_close=True,
         ssl=ssl_setting,
     )
 
diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py
index 2b5bcd60a03..5a22ce024ab 100644
--- a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py
+++ b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py
@@ -1047,9 +1047,7 @@ def _get_talker_assistant_parts(
             dim=0,
         )
     else:
-        trailing_text_hidden = torch.zeros(
-            tts_eos_embed.shape, device=tts_eos_embed.device, dtype=tts_eos_embed.dtype
-        )
+        trailing_text_hidden = tts_eos_embed
 
     input_embeds = assistant_text_hidden + assistant_codec_hidden
     input_ids = torch.full(