 from vllm.attention import AttentionType, get_attn_backend
 from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.layer import Attention
-from vllm.attention.utils.fa_utils import get_flash_attn_version
 from vllm.config import (CompilationLevel, VllmConfig,
                          get_layers_from_vllm_config)
 from vllm.distributed.kv_transfer import (get_kv_transfer_group,
@@ -555,7 +554,15 @@ def _get_cumsum_and_arange( |
     def _prepare_inputs(
         self,
         scheduler_output: "SchedulerOutput",
-    ) -> tuple[dict[str, Any], torch.Tensor, Optional[SpecDecodeMetadata]]:
+    ) -> tuple[dict[str, Any], bool, torch.Tensor,
+               Optional[SpecDecodeMetadata]]:
+        """
+        :return: tuple[
+            attn_metadata: layer-to-attention_metadata mapping,
+            attention_cuda_graphs: whether attention can run in captured cudagraph,
+            logits_indices, spec_decode_metadata
+        ]
+        """
         total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         assert total_num_scheduled_tokens > 0
         num_reqs = self.input_batch.num_reqs
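Reviewer note: the hunk above widens `_prepare_inputs` from a 3-tuple to a 4-tuple, inserting the new `bool` in the second slot, so every call site has to change. A minimal sketch of the new unpacking order is below; the stub function and its dummy values are mine, only the four names and their order come from the diff.

```python
from typing import Any, Optional


def _fake_prepare_inputs() -> tuple[dict[str, Any], bool, Any, Optional[Any]]:
    """Stand-in with the same shape as the new _prepare_inputs return value."""
    attn_metadata = {"model.layers.0.self_attn": object()}  # layer -> metadata
    attention_cuda_graphs = True  # every KV cache group can use the full graph
    logits_indices = [0]          # placeholder for the torch.Tensor
    spec_decode_metadata = None
    return attn_metadata, attention_cuda_graphs, logits_indices, spec_decode_metadata


# The new flag sits second; unpacking with the old three names would now fail.
(attn_metadata, attention_cuda_graphs, logits_indices,
 spec_decode_metadata) = _fake_prepare_inputs()
assert isinstance(attention_cuda_graphs, bool)
```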
@@ -677,27 +684,31 @@ def _prepare_inputs( |
             )
 
         attn_metadata: dict[str, Any] = {}
+        attention_cuda_graphs = []
         # Prepare the attention metadata for each KV cache group and make layers
         # in the same group share the same metadata.
         for kv_cache_group_id, kv_cache_group_spec in enumerate(
                 self.kv_cache_config.kv_cache_groups):
 
             # Prepare for cascade attention if enabled & beneficial.
             common_prefix_len = 0
+            builder = self.attn_metadata_builders[kv_cache_group_id]
             if self.cascade_attn_enabled:
                 common_prefix_len = self._compute_cascade_attn_prefix_len(
                     num_scheduled_tokens,
                     scheduler_output.
                     num_common_prefix_blocks[kv_cache_group_id],
                     kv_cache_group_spec.kv_cache_spec,
-                    self.attn_metadata_builders[kv_cache_group_id],
+                    builder,
                 )
 
-            attn_metadata_i = (
-                self.attn_metadata_builders[kv_cache_group_id].build(
-                    common_prefix_len=common_prefix_len,
-                    common_attn_metadata=common_attn_metadata,
-                ))
+            attn_metadata_i = (builder.build(
+                common_prefix_len=common_prefix_len,
+                common_attn_metadata=common_attn_metadata,
+            ))
+            attention_cuda_graphs.append(
+                builder.can_run_in_cudagraph(common_attn_metadata))
+
             for layer_name in kv_cache_group_spec.layer_names:
                 attn_metadata[layer_name] = attn_metadata_i
 
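The loop above now asks each KV cache group's metadata builder whether the current batch can run inside a captured CUDA graph. The builders themselves are not part of this diff, so the following is only a sketch of the likely shape, under the assumption that "pure decode" means one scheduled token per request; `CommonMeta` and `ToyDecodeOnlyBuilder` are hypothetical names, while `full_cudagraph_supported` and `can_run_in_cudagraph` mirror the attributes used in the patch.

```python
from dataclasses import dataclass


@dataclass
class CommonMeta:
    """Hypothetical stand-in for the common attention metadata."""
    num_reqs: int
    num_actual_tokens: int


class ToyDecodeOnlyBuilder:
    """Illustrative builder: supports full CUDA graphs, but only for
    pure-decode batches (exactly one new token per request)."""
    full_cudagraph_supported: bool = True

    def can_run_in_cudagraph(self, common_attn_metadata: CommonMeta) -> bool:
        return (common_attn_metadata.num_actual_tokens ==
                common_attn_metadata.num_reqs)


# Mirrors the aggregation in _prepare_inputs: every group must agree.
builders = [ToyDecodeOnlyBuilder()]
decode_batch = CommonMeta(num_reqs=8, num_actual_tokens=8)
mixed_batch = CommonMeta(num_reqs=8, num_actual_tokens=50)  # contains prefills
assert all(b.can_run_in_cudagraph(decode_batch) for b in builders)
assert not all(b.can_run_in_cudagraph(mixed_batch) for b in builders)
```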
@@ -729,7 +740,8 @@ def _prepare_inputs( |
         if self.lora_config:
             self.set_active_loras(self.input_batch, num_scheduled_tokens)
 
-        return attn_metadata, logits_indices, spec_decode_metadata
+        return attn_metadata, all(
+            attention_cuda_graphs), logits_indices, spec_decode_metadata
 
     def _compute_cascade_attn_prefix_len(
         self,
@@ -1189,8 +1201,8 @@ def execute_model( |
             return self.kv_connector_no_forward(scheduler_output)
 
         # Prepare the decoder inputs.
-        attn_metadata, logits_indices, spec_decode_metadata = (
-            self._prepare_inputs(scheduler_output))
+        (attn_metadata, attention_cuda_graphs, logits_indices,
+         spec_decode_metadata) = (self._prepare_inputs(scheduler_output))
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         if (self.use_cuda_graph
                 and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):
@@ -1255,11 +1267,9 @@ def execute_model( |
             intermediate_tensors = self.sync_and_slice_intermediate_tensors(
                 num_input_tokens, intermediate_tensors, True)
 
-        # Some attention backends only support CUDA graphs in pure decode.
-        # Assume cuda_graph_supported is false if it does not exist.
-        attention_cuda_graphs = all(
-            getattr(m, "cuda_graph_supported", False)
-            for _, m in attn_metadata.items())
+        # Some attention backends only support CUDA Graphs in pure decode.
+        # If attention doesn't support CUDA Graphs for this batch, but we
+        # compiled with full CUDA graphs, we have to skip them entirely.
         skip_cuda_graphs = self.full_cuda_graph and not attention_cuda_graphs
 
         # Run the decoder.
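Taken together with the `_prepare_inputs` change, the decision reduces to one line of boolean logic; here is a self-contained restatement (the helper name and the list-of-flags argument are mine, not vLLM's):

```python
def should_skip_cuda_graphs(full_cuda_graph: bool,
                            per_group_can_run: list[bool]) -> bool:
    """Toy restatement of: skip = full_cuda_graph and not attention_cuda_graphs."""
    attention_cuda_graphs = all(per_group_can_run)
    return full_cuda_graph and not attention_cuda_graphs


# Piecewise compilation never skips here; the flag only matters for full graphs.
assert should_skip_cuda_graphs(False, [False, True]) is False
# Full graphs + a batch some attention group can't capture -> run eagerly.
assert should_skip_cuda_graphs(True, [False, True]) is True
# Full graphs + every group happy -> replay the captured graph.
assert should_skip_cuda_graphs(True, [True, True]) is False
```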
@@ -2100,20 +2110,20 @@ def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: |
2100 | 2110 | "Non-Attention backend is not supported by V1 " |
2101 | 2111 | "GPUModelRunner.") |
2102 | 2112 |
|
2103 | | - if self.compilation_config.full_cuda_graph: |
2104 | | - attn_backend_name = attn_backend_i.__name__ |
2105 | | - flash_attn_version = get_flash_attn_version() |
2106 | | - if ((attn_backend_name != "FlashAttentionBackend" |
2107 | | - or flash_attn_version != 3) |
2108 | | - and attn_backend_name != "FlashMLABackend"): |
2109 | | - raise ValueError( |
2110 | | - f"Full CUDAGraph is only supported with FA3 or FlashMLA" |
2111 | | - f". Current attention backend is {attn_backend_name}, " |
2112 | | - f"FlashAttention version is {flash_attn_version}.") |
2113 | | - |
2114 | 2113 | block_table_i = self.input_batch.block_table[i] |
2115 | 2114 | attn_metadata_builder_i = attn_backend_i.get_builder_cls()( |
2116 | | - weakref.proxy(self), kv_cache_spec, block_table_i) |
| 2115 | + weakref.proxy(self), |
| 2116 | + kv_cache_spec, |
| 2117 | + block_table_i, |
| 2118 | + ) |
| 2119 | + |
| 2120 | + if (self.full_cuda_graph |
| 2121 | + and not attn_metadata_builder_i.full_cudagraph_supported): |
| 2122 | + raise ValueError( |
| 2123 | + f"Full CUDAGraph not supported for " |
| 2124 | + f"{attn_backend_i.__name__}. Turn off CompilationConfig." |
| 2125 | + f"full_cuda_graph or use a different attention backend.") |
| 2126 | + |
2117 | 2127 | self.attn_backends.append(attn_backend_i) |
2118 | 2128 | self.attn_metadata_builders.append(attn_metadata_builder_i) |
2119 | 2129 |
|
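With this change the init-time gate keys off a `full_cudagraph_supported` attribute on the builder instead of a hard-coded FA3/FlashMLA allow-list. Below is a minimal sketch of that gate, assuming the attribute is a plain class-level bool; the dummy builders and the helper function are hypothetical.

```python
class _FullGraphBuilder:
    full_cudagraph_supported = True


class _PiecewiseOnlyBuilder:
    full_cudagraph_supported = False


def check_full_cuda_graph(full_cuda_graph: bool, builder, backend_name: str) -> None:
    """Fail fast at initialization, mirroring the check in the hunk above."""
    if full_cuda_graph and not builder.full_cudagraph_supported:
        raise ValueError(
            f"Full CUDAGraph not supported for {backend_name}. Turn off "
            f"CompilationConfig.full_cuda_graph or use a different "
            f"attention backend.")


check_full_cuda_graph(True, _FullGraphBuilder(), "FlashAttentionBackend")  # passes
try:
    check_full_cuda_graph(True, _PiecewiseOnlyBuilder(), "SomeOtherBackend")
except ValueError as exc:
    print(exc)  # tells the user how to disable the feature or switch backends
```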