
Commit a63e6d6

Superclass for attention metadata builder.
Signed-off-by: luka <[email protected]>
1 parent 682c6b6 commit a63e6d6

7 files changed: +76 lines, -38 lines

vllm/v1/attention/backends/cpu_attn.py

Lines changed: 3 additions & 2 deletions

@@ -7,7 +7,8 @@
                              TorchSDPAMetadata)
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.attention.ops.ipex_attn import PagedAttention
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
+                                              CommonAttentionMetadata)
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm.v1.worker.block_table import BlockTable
@@ -53,7 +54,7 @@ def use_cascade_attention(*args, **kwargs) -> bool:
         return False


-class TorchSDPAMetadataBuilderV1:
+class TorchSDPAMetadataBuilderV1(AttentionMetadataBuilder[TorchSDPAMetadata]):

     def __init__(self, runner: CPUModelRunner, kv_cache_spec: AttentionSpec,
                  block_table: BlockTable) -> None:

vllm/v1/attention/backends/flash_attn.py

Lines changed: 4 additions & 15 deletions

@@ -21,13 +21,12 @@
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils import cdiv
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
+                                              CommonAttentionMetadata)
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm.v1.worker.block_table import BlockTable

 if TYPE_CHECKING:
-    from vllm.v1.core.sched.output import SchedulerOutput
-    from vllm.v1.worker.gpu_input_batch import InputBatch
     from vllm.v1.worker.gpu_model_runner import GPUModelRunner

 if current_platform.is_cuda():
@@ -310,7 +309,8 @@ def _get_sliding_window_configs(
     return sliding_window_configs


-class FlashAttentionMetadataBuilder:
+class FlashAttentionMetadataBuilder(
+        AttentionMetadataBuilder[FlashAttentionMetadata]):

     def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec,
                  block_table: BlockTable):
@@ -340,17 +340,6 @@ def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec,
         # populated on first build() call.
         self.aot_sliding_window: Optional[tuple[int, int]] = None

-    def reorder_batch(self, input_batch: "InputBatch",
-                      scheduler_output: "SchedulerOutput") -> bool:
-        return False
-
-    def build_for_cudagraph_capture(
-        self, num_reqs: int, num_tokens: int,
-        common_attn_metadata: CommonAttentionMetadata
-    ) -> FlashAttentionMetadata:
-        return self.build(num_reqs, num_tokens, num_tokens, 0,
-                          common_attn_metadata)
-
     def build(
         self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
         common_prefix_len: int, common_attn_metadata: CommonAttentionMetadata

vllm/v1/attention/backends/flashinfer.py

Lines changed: 3 additions & 2 deletions

@@ -18,7 +18,8 @@
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
 from vllm.v1.attention.backends.flash_attn import use_cascade_attention
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
+                                              CommonAttentionMetadata)
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm.v1.worker.block_table import BlockTable

@@ -202,7 +203,7 @@ def __post_init__(self):
                          f" received {self.head_dim}.")


-class FlashInferMetadataBuilder:
+class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):

     def __init__(self, runner: GPUModelRunner, kv_cache_spec: AttentionSpec,
                  block_table: BlockTable):

vllm/v1/attention/backends/flex_attention.py

Lines changed: 4 additions & 11 deletions

@@ -15,7 +15,8 @@
                          is_quantized_kv_cache)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
+                                              CommonAttentionMetadata)
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm.v1.worker.block_table import BlockTable

@@ -25,8 +26,6 @@
 logger = init_logger(__name__)

 if TYPE_CHECKING:
-    from vllm.v1.core.sched.output import SchedulerOutput
-    from vllm.v1.worker.gpu_input_batch import InputBatch
     from vllm.v1.worker.gpu_model_runner import GPUModelRunner

 create_block_mask_compiled = torch.compile(create_block_mask,
@@ -256,7 +255,8 @@ def __post_init__(self):
         self.block_mask = self.build_block_mask()


-class FlexAttentionMetadataBuilder:
+class FlexAttentionMetadataBuilder(
+        AttentionMetadataBuilder[FlexAttentionMetadata]):

     def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec,
                  block_table: BlockTable):
@@ -272,10 +272,6 @@ def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec,
         self.kv_cache_spec = kv_cache_spec
         self.block_table = block_table

-    def reorder_batch(self, input_batch: "InputBatch",
-                      scheduler_output: "SchedulerOutput") -> bool:
-        return False
-
     def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
               common_prefix_len: int,
               common_attn_metadata: CommonAttentionMetadata):
@@ -332,9 +328,6 @@ def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
         )
         return out

-    def use_cascade_attention(self, *args, **kwargs) -> bool:
-        return False
-

 class FlexAttentionImpl(AttentionImpl):
     sliding_window: Optional[tuple[int, int]]

vllm/v1/attention/backends/mla/common.py

Lines changed: 5 additions & 5 deletions

@@ -207,7 +207,8 @@
                                                UnquantizedLinearMethod)
 from vllm.platforms import current_platform
 from vllm.utils import cdiv, round_down
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
+                                              CommonAttentionMetadata)
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm.v1.worker.block_table import BlockTable

@@ -336,7 +337,7 @@ def cuda_graph_supported(self):
 M = TypeVar("M", bound=MLACommonMetadata)


-class MLACommonMetadataBuilder(Generic[M]):
+class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
     """
     NOTE: Please read the comment at the top of the file before trying to
     understand this class
@@ -467,6 +468,8 @@ def build_for_cudagraph_capture(
         assert num_reqs == num_tokens, \
             "MLA only supports decode-only full CUDAGraph capture. " \
             "Make sure all cudagraph capture sizes <= max_num_seq."
+
+        # Update state usually set in reorder_batch.
         self._num_decodes = num_tokens
         self._num_decode_tokens = num_tokens
         self._num_prefills = 0
@@ -590,9 +593,6 @@ def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
             decode=decode_metadata,
         )

-    def use_cascade_attention(self, *args, **kwargs) -> bool:
-        return False
-

 class MLACommonImpl(MLAAttentionImpl[M], Generic[M]):
     """

vllm/v1/attention/backends/utils.py

Lines changed: 54 additions & 0 deletions

@@ -1,9 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import abc
+from abc import abstractmethod
 from dataclasses import dataclass
+from typing import TYPE_CHECKING, Generic, TypeVar

+import numpy as np
 import torch

+if TYPE_CHECKING:
+    from vllm.v1.core.sched.output import SchedulerOutput
+    from vllm.v1.worker.gpu_input_batch import InputBatch
+

 @dataclass
 class CommonAttentionMetadata:
@@ -19,6 +27,52 @@ class CommonAttentionMetadata:
     and newly scheduled tokens"""


+M = TypeVar("M")
+
+
+class AttentionMetadataBuilder(abc.ABC, Generic[M]):
+
+    @abstractmethod
+    def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
+              common_prefix_len: int,
+              common_attn_metadata: CommonAttentionMetadata) -> M:
+        """
+        Central method that builds attention metadata.
+        Some builders (MLA) require reorder_batch to be called prior to build.
+        """
+        raise NotImplementedError
+
+    def build_for_cudagraph_capture(
+            self, num_reqs: int, num_tokens: int,
+            common_attn_metadata: CommonAttentionMetadata) -> M:
+        """
+        Build attention metadata for CUDA graph capture. Uses build by default.
+        Subclasses that override this method should call self.build.
+        """
+        return self.build(num_reqs, num_tokens, num_tokens, 0,
+                          common_attn_metadata)
+
+    def use_cascade_attention(
+        self,
+        common_prefix_len: int,
+        query_lens: np.ndarray,
+        num_query_heads: int,
+        num_kv_heads: int,
+        use_alibi: bool,
+        use_sliding_window: bool,
+        num_sms: int,
+    ) -> bool:
+        return False
+
+    def reorder_batch(self, input_batch: "InputBatch",
+                      scheduler_output: "SchedulerOutput") -> bool:
+        """
+        This method can reorder the batch if desired by the backend.
+        :return: Has the batch been reordered (default False).
+        """
+        return False
+
+
 def validate_kv_sharing_target(current_layer_name, target_layer_name,
                                static_forward_context):
     error_msg = (f"Specified KV sharing target layer for {current_layer_name} "
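
The new base class above only requires build to be implemented; reorder_batch, use_cascade_attention, and build_for_cudagraph_capture all ship with defaults, which is what lets the FlashAttention and FlexAttention builders delete their boilerplate in the hunks earlier in this commit. A minimal sketch of a concrete subclass, assuming vLLM is importable; MinimalMetadata and MinimalMetadataBuilder are hypothetical illustrations, not part of this commit.

# Illustrative only: a hypothetical backend builder that relies entirely on
# the inherited defaults and implements just the abstract build() method.
from dataclasses import dataclass

from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
                                              CommonAttentionMetadata)


@dataclass
class MinimalMetadata:
    # Stand-in for a backend-specific metadata type (e.g. FlashAttentionMetadata).
    num_reqs: int
    num_actual_tokens: int


class MinimalMetadataBuilder(AttentionMetadataBuilder[MinimalMetadata]):

    def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int,
              common_prefix_len: int,
              common_attn_metadata: CommonAttentionMetadata) -> MinimalMetadata:
        # A real builder would derive device tensors from common_attn_metadata.
        return MinimalMetadata(num_reqs, num_actual_tokens)

With this, reorder_batch and use_cascade_attention return False unless overridden, and build_for_cudagraph_capture(num_reqs, num_tokens, m) reduces to self.build(num_reqs, num_tokens, num_tokens, 0, m), matching the behavior the per-backend copies used to implement by hand.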

vllm/v1/worker/gpu_model_runner.py

Lines changed: 3 additions & 3 deletions

@@ -16,8 +16,7 @@

 import vllm.envs as envs
 from vllm.attention import AttentionType, get_attn_backend
-from vllm.attention.backends.abstract import (AttentionBackend,
-                                              AttentionMetadataBuilder)
+from vllm.attention.backends.abstract import AttentionBackend
 from vllm.attention.layer import Attention
 from vllm.attention.utils.fa_utils import get_flash_attn_version
 from vllm.config import (CompilationLevel, VllmConfig,
@@ -41,7 +40,8 @@
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
                         GiB_bytes, LazyLoader, async_tensor_h2d, cdiv,
                         check_use_alibi, is_pin_memory_available)
-from vllm.v1.attention.backends.utils import CommonAttentionMetadata
+from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
+                                              CommonAttentionMetadata)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
 from vllm.v1.kv_cache_interface import (AttentionSpec, FullAttentionSpec,
                                         KVCacheConfig, KVCacheSpec,
