Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,15 @@ def decode(self, codes: torch.Tensor) -> torch.Tensor:
if not self.enabled or not self._warmed_up or codes.shape[0] != 1:
return self.decoder(codes)

# Inner CUDA graph replay is illegal while an outer stream capture is
# active (e.g. vLLM's cudagraph_mode=FULL warmup on Stage 1). Fall back
# to eager in that case so the outer capture can complete. The guard does
# not change normal inference: is_current_stream_capturing() returns False
# outside the startup capture window, so steady-state calls still take
# the graph fast path.
if torch.cuda.is_current_stream_capturing():
return self.decoder(codes)

actual_size = codes.shape[-1]
padded_size = self._get_padded_size(actual_size)

Expand Down
Loading