From 73285db5f8e68ba8b9c4ff32e99382c0243b9e11 Mon Sep 17 00:00:00 2001
From: 22quinn <33176974+22quinn@users.noreply.github.com>
Date: Thu, 30 Oct 2025 14:14:00 -0700
Subject: [PATCH] Remove max cudagraph size limit of 992

Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
---
 vllm/v1/attention/backends/flash_attn.py        | 7 -------
 vllm/v1/attention/backends/mla/flashattn_mla.py | 7 -------
 2 files changed, 14 deletions(-)

diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index 1eac94940e78..45835caa9e70 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -239,13 +239,6 @@ def __init__(
         self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size
 
         if self.use_full_cuda_graph and self.aot_schedule:
-            if self.max_cudagraph_size > 992:
-                # This condition derives from FA3's internal heuristic.
-                # TODO(woosuk): Support larger cudagraph sizes.
-                raise ValueError(
-                    "Capture size larger than 992 is not supported for full cuda graph."
-                )
-
             self.scheduler_metadata = torch.zeros(
                 vllm_config.scheduler_config.max_num_seqs + 1,
                 dtype=torch.int32,
diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py
index a6aac701b784..1a7033be9044 100644
--- a/vllm/v1/attention/backends/mla/flashattn_mla.py
+++ b/vllm/v1/attention/backends/mla/flashattn_mla.py
@@ -92,13 +92,6 @@ def __init__(
         self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size
 
         if self.use_full_cuda_graph and self.fa_aot_schedule:
-            if self.max_cudagraph_size > 992:
-                # This condition derives from FA3's internal heuristic.
-                # TODO(woosuk): Support larger cudagraph sizes.
-                raise ValueError(
-                    "Capture size larger than 992 is not supported for full cuda graph."
-                )
-
             self.scheduler_metadata = torch.zeros(
                 vllm_config.scheduler_config.max_num_seqs + 1,
                 dtype=torch.int32,
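
For reviewers, a minimal sketch of the behavioral change: before this patch, both FlashAttention metadata builders rejected full-CUDA-graph capture sizes above 992 in `__init__`; after it, the scheduler metadata buffer is allocated regardless of the configured size. The helper names below (`CompilationConfig`, `init_scheduler_metadata`, `enforce_992_limit`) are illustrative assumptions, not part of vLLM.

```python
# Illustrative sketch only -- simplified from the builders touched by this
# patch; names like CompilationConfig/init_scheduler_metadata are assumptions.
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class CompilationConfig:
    max_cudagraph_capture_size: int


def init_scheduler_metadata(
    compilation_config: CompilationConfig,
    max_num_seqs: int,
    use_full_cuda_graph: bool,
    aot_schedule: bool,
    enforce_992_limit: bool,
) -> Optional[torch.Tensor]:
    """Mimic the builder __init__ before (enforce_992_limit=True) and
    after (enforce_992_limit=False) this patch."""
    max_cudagraph_size = compilation_config.max_cudagraph_capture_size
    if use_full_cuda_graph and aot_schedule:
        if enforce_992_limit and max_cudagraph_size > 992:
            # Pre-patch behavior: capture sizes above 992 were rejected
            # (the 992 limit came from an FA3 heuristic).
            raise ValueError(
                "Capture size larger than 992 is not supported for full cuda graph."
            )
        # Post-patch behavior: the metadata buffer is allocated regardless
        # of the configured capture size.
        return torch.zeros(max_num_seqs + 1, dtype=torch.int32)
    return None


if __name__ == "__main__":
    cfg = CompilationConfig(max_cudagraph_capture_size=1024)
    try:
        init_scheduler_metadata(cfg, max_num_seqs=8, use_full_cuda_graph=True,
                                aot_schedule=True, enforce_992_limit=True)
    except ValueError as err:
        print("before patch:", err)
    meta = init_scheduler_metadata(cfg, max_num_seqs=8, use_full_cuda_graph=True,
                                   aot_schedule=True, enforce_992_limit=False)
    print("after patch: allocated scheduler metadata of shape", tuple(meta.shape))
```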