diff --git a/vllm_omni/model_executor/models/qwen3_tts/cuda_graph_decoder_wrapper.py b/vllm_omni/model_executor/models/qwen3_tts/cuda_graph_decoder_wrapper.py index 96f8c799c13..0e1df2aa7db 100644 --- a/vllm_omni/model_executor/models/qwen3_tts/cuda_graph_decoder_wrapper.py +++ b/vllm_omni/model_executor/models/qwen3_tts/cuda_graph_decoder_wrapper.py @@ -140,6 +140,15 @@ def decode(self, codes: torch.Tensor) -> torch.Tensor: if not self.enabled or not self._warmed_up or codes.shape[0] != 1: return self.decoder(codes) + # Inner CUDA graph replay is illegal while an outer stream capture is + # active (e.g. vLLM's cudagraph_mode=FULL warmup on Stage 1). Fall back + # to eager in that case so the outer capture can complete. The guard is + # a no-op at runtime: is_current_stream_capturing() returns False + # outside the startup capture window, so normal inference still hits + # the graph fast path. + if torch.cuda.is_current_stream_capturing(): + return self.decoder(codes) + actual_size = codes.shape[-1] padded_size = self._get_padded_size(actual_size)