diff --git a/vllm_omni/model_executor/models/qwen3_tts/cuda_graph_decoder_wrapper.py b/vllm_omni/model_executor/models/qwen3_tts/cuda_graph_decoder_wrapper.py
index 96f8c799c13..0e1df2aa7db 100644
--- a/vllm_omni/model_executor/models/qwen3_tts/cuda_graph_decoder_wrapper.py
+++ b/vllm_omni/model_executor/models/qwen3_tts/cuda_graph_decoder_wrapper.py
@@ -140,6 +140,15 @@ def decode(self, codes: torch.Tensor) -> torch.Tensor:
         if not self.enabled or not self._warmed_up or codes.shape[0] != 1:
             return self.decoder(codes)
 
+        # Replaying the inner CUDA graph is illegal while an outer stream
+        # capture is active (e.g. vLLM's cudagraph_mode=FULL warmup on Stage 1),
+        # so fall back to eager and let the outer capture complete. The guard
+        # does not change steady-state behavior:
+        # torch.cuda.is_current_stream_capturing() returns False outside the
+        # startup capture window, so normal inference still hits the fast path.
+        if torch.cuda.is_current_stream_capturing():
+            return self.decoder(codes)
+
         actual_size = codes.shape[-1]
         padded_size = self._get_padded_size(actual_size)