8 changes: 4 additions & 4 deletions tests/e2e/singlecard/test_aclgraph_mem.py
@@ -54,10 +54,10 @@ def wrapped(self):

return wrapped

original_capture = NPUModelRunner._capture_model
original_capture = NPUModelRunner.capture_model

with patch.object(NPUModelRunner,
'_capture_model',
'capture_model',
new=capture_model_wrapper(original_capture)):
prompts = [
"Hello, my name is", "The president of the United States is",
@@ -73,7 +73,7 @@ def wrapped(self):
vllm_model = VllmRunner(snapshot_download(model))
_ = vllm_model.generate(prompts, sampling_params)

assert capture_called.value == 1, "_capture_model was not called during test"
assert capture_called.value == 1, "capture_model was not called during test"
assert capture_mem_before.value != -1, "capture_mem_before not set"
assert capture_mem_after.value != -1, "capture_mem_after not set"

@@ -93,7 +93,7 @@ def wrapped(self):
max_capture_mem_gib = baseline_capture_mem * capture_mem_tolerance
max_mem_expected = max_capture_mem_gib * (1024**3)
assert mem_used_by_capture < max_mem_expected, (
f"_capture_model used more memory than expected. "
f"capture_model used more memory than expected. "
f"Used: {mem_used_by_capture / (1024**3):.2f} GiB, "
f"Expected: < {max_capture_mem_gib:.2f} GiB")
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = 'spawn'
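Only part of the wrapper body is visible in this hunk. Below is a minimal sketch, not the test's exact code, of how `capture_model` can be wrapped to record device memory around ACL graph capture; the `torch.npu` memory probe and the `multiprocessing.Value` counters are assumptions standing in for whatever the real test uses.

```python
# Sketch only: wrap NPUModelRunner.capture_model so the test can assert it ran
# exactly once and bound the memory consumed by graph capture.
from multiprocessing import Value

import torch
import torch_npu  # noqa: F401  # assumption: available in the Ascend test env

# Counters matching the names asserted on above; -1 marks "not set yet".
capture_called = Value("i", 0)
capture_mem_before = Value("d", -1.0)
capture_mem_after = Value("d", -1.0)


def npu_memory_allocated() -> float:
    # Assumption: torch.npu mirrors torch.cuda's memory_allocated(); the real
    # test's memory probe may differ.
    return float(torch.npu.memory_allocated())


def capture_model_wrapper(original):

    def wrapped(self):
        capture_mem_before.value = npu_memory_allocated()
        result = original(self)
        capture_mem_after.value = npu_memory_allocated()
        capture_called.value += 1
        return result

    return wrapped
```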
2 changes: 0 additions & 2 deletions tests/ut/attention/test_sfa_v1.py
@@ -2,7 +2,6 @@
from unittest.mock import MagicMock, patch

import torch
from vllm.v1.attention.backends.utils import AttentionCGSupport

from tests.ut.base import TestBase
from vllm_ascend.attention.attention_v1 import AscendAttentionState
@@ -98,7 +97,6 @@ def test_ascend_sfa_metadata_builder_default(self):
vllm_config=vllm_config,
device=device)

assert builder.aclgraph_support == AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
assert builder.device == device
assert builder.vllm_config == vllm_config

13 changes: 10 additions & 3 deletions vllm_ascend/attention/attention_cp.py
@@ -44,9 +44,6 @@


class AscendAttentionCPMetadataBuilder(AscendAttentionMetadataBuilder):
# Does this backend/builder support ACL Graphs for attention (default: no).
aclgraph_support: ClassVar[AttentionCGSupport] = \
AttentionCGSupport.ALWAYS
# AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
# Does this backend/builder reorder the batch?
# If not, set this to None. Otherwise set it to the query
@@ -72,6 +69,16 @@ def __init__(
self.dcp_rank = get_decode_context_model_parallel_rank(
) if self.dcp_size > 1 else 0

@classmethod
def get_cudagraph_support(
cls: type["AscendAttentionCPMetadataBuilder"],
vllm_config: VllmConfig,
kv_cache_spec: AttentionSpec,
) -> AttentionCGSupport:
# Explicit override in case the underlying builder specializes this getter.
# @override omitted only because of a mypy limitation with the type variable.
return AttentionCGSupport.ALWAYS

def _get_chunked_req_mask(self, local_context_lens_allranks) -> List[bool]:
"""
given 4-d list [req][pcp][dcp], return:
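This file, together with the parallel hunks in attention_v1.py, mla_cp.py, mla_v1.py, and sfa_v1.py below, moves the ACL graph capability declaration from the class-level `aclgraph_support` attribute to a `get_cudagraph_support()` classmethod. A minimal sketch of the shape of that change (the class names here are placeholders, and the caller-side query is an assumption about how the runner consumes it, not something shown in this diff):

```python
from typing import ClassVar

from vllm.config import VllmConfig
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.kv_cache_interface import AttentionSpec


class BuilderBefore:
    # Removed pattern: support level pinned as a class attribute.
    aclgraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.ALWAYS


class BuilderAfter:
    # Added pattern: support level reported through a classmethod that receives
    # the config and KV-cache spec, so subclasses can specialize it.
    @classmethod
    def get_cudagraph_support(cls, vllm_config: VllmConfig,
                              kv_cache_spec: AttentionSpec) -> AttentionCGSupport:
        return AttentionCGSupport.ALWAYS


# Hypothetical caller-side query (assumed, not part of this diff):
# support = builder_cls.get_cudagraph_support(vllm_config, kv_cache_spec)
```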
13 changes: 10 additions & 3 deletions vllm_ascend/attention/attention_v1.py
@@ -182,9 +182,6 @@ class AscendMetadata:


class AscendAttentionMetadataBuilder(AttentionMetadataBuilder[AscendMetadata]):
# Does this backend/builder support ACL Graphs for attention (default: no).
aclgraph_support: ClassVar[AttentionCGSupport] = \
AttentionCGSupport.ALWAYS
# AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
# Does this backend/builder reorder the batch?
# If not, set this to None. Otherwise set it to the query
@@ -220,6 +217,16 @@ def __init__(
scheduler_config = vllm_config.scheduler_config
self.chunked_prefill_enabled = scheduler_config.enable_chunked_prefill

@classmethod
def get_cudagraph_support(
cls: type["AscendAttentionMetadataBuilder"],
vllm_config: VllmConfig,
kv_cache_spec: AttentionSpec,
) -> AttentionCGSupport:
# Explicit override in case the underlying builder specializes this getter.
# @override omitted only because of a mypy limitation with the type variable.
return AttentionCGSupport.ALWAYS

def reorder_batch(self, input_batch,
scheduler_output: "SchedulerOutput") -> bool:
return False
17 changes: 12 additions & 5 deletions vllm_ascend/attention/mla_cp.py
@@ -1,4 +1,4 @@
from typing import ClassVar, Optional, Tuple, TypeVar
from typing import Optional, Tuple, TypeVar

import numpy as np
import torch
@@ -12,7 +12,7 @@
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.kv_cache_interface import MLAAttentionSpec
from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec

# isort: off
from vllm_ascend.attention.mla_v1 import (AscendMLADecodeMetadata,
@@ -37,9 +37,6 @@


class AscendMlaCPMetadataBuilder(AscendMLAMetadataBuilder):
# Does this backend/builder support ACL Graphs for attention (default: no).
aclgraph_support: ClassVar[AttentionCGSupport] = \
AttentionCGSupport.UNIFORM_BATCH
"""
NOTE: Please read the comment at the top of the file before trying to
understand this class
@@ -74,6 +71,16 @@ def __init__(
dtype=torch.uint8,
device=device)

@classmethod
def get_cudagraph_support(
cls: type["AscendMlaCPMetadataBuilder"],
vllm_config: VllmConfig,
kv_cache_spec: AttentionSpec,
) -> AttentionCGSupport:
# Explicit override in case the underlying builder specializes this getter.
# @override omitted only because of a mypy limitation with the type variable.
return AttentionCGSupport.UNIFORM_BATCH

def set_num_actual_tokens(
self,
common_attn_metadata: AscendCommonAttentionMetadata,
18 changes: 12 additions & 6 deletions vllm_ascend/attention/mla_v1.py
@@ -1,6 +1,5 @@
from dataclasses import dataclass
from typing import (TYPE_CHECKING, ClassVar, NamedTuple, Optional, Tuple, Type,
TypeVar)
from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Type, TypeVar

import numpy as np
import torch
@@ -15,7 +14,7 @@
from vllm.utils.math_utils import cdiv, round_down
from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.kv_cache_interface import MLAAttentionSpec
from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec

from vllm_ascend import envs
from vllm_ascend.ascend_config import get_ascend_config
@@ -182,9 +181,6 @@ def __post_init__(self):


class AscendMLAMetadataBuilder(MLACommonMetadataBuilder[AscendMLAMetadata]):
# Does this backend/builder support ACL Graphs for attention (default: no).
aclgraph_support: ClassVar[AttentionCGSupport] = \
AttentionCGSupport.UNIFORM_BATCH
"""
NOTE: Please read the comment at the top of the file before trying to
understand this class
@@ -263,6 +259,16 @@ def __init__(
self.query_lens: torch.Tensor = None
self.seq_lens: torch.Tensor = None

@classmethod
def get_cudagraph_support(
cls: type["AscendMLAMetadataBuilder"],
vllm_config: VllmConfig,
kv_cache_spec: AttentionSpec,
) -> AttentionCGSupport:
# Explicit override in case the underlying builder specializes this getter.
# @override omitted only because of a mypy limitation with the type variable.
return AttentionCGSupport.UNIFORM_BATCH

def reorder_batch(self, input_batch: "NPUInputBatch",
scheduler_output: "SchedulerOutput") -> bool:
# We now want to reorder the batch so that the "decode" requests are at
16 changes: 12 additions & 4 deletions vllm_ascend/attention/sfa_v1.py
@@ -1,5 +1,5 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, ClassVar, Optional, Tuple, Type, TypeVar
from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar

import torch
import torch_npu
@@ -15,6 +15,7 @@
from vllm.triton_utils import HAS_TRITON
from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.kv_cache_interface import AttentionSpec

from vllm_ascend import envs
from vllm_ascend.ascend_config import get_ascend_config
@@ -113,9 +114,6 @@ class AscendSFAMetadata:


class AscendSFAMetadataBuilder(MLACommonMetadataBuilder[AscendSFAMetadata]):
# Does this backend/builder support ACL Graphs for attention (default: no).
aclgraph_support: ClassVar[AttentionCGSupport] = \
AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
"""
NOTE: Please read the comment at the top of the file before trying to
understand this class
@@ -159,6 +157,16 @@ def __init__(
== CUDAGraphMode.FULL_DECODE_ONLY
), "FlashComm1 is not compatible with FULL_DECODE_ONLY. Please set graph_mode to 'piecewise' or disable FlashComm1."

@classmethod
def get_cudagraph_support(
cls: type["AscendSFAMetadataBuilder"],
vllm_config: VllmConfig,
kv_cache_spec: AttentionSpec,
) -> AttentionCGSupport:
# Explicit override in case the underlying builder specializes this getter.
# @override omitted only because of a mypy limitation with the type variable.
return AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE

def reorder_batch(self, input_batch: "NPUInputBatch",
scheduler_output: "SchedulerOutput") -> bool:
# No need to reorder for Ascend SFA
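As a quick reference compiled from the hunks above (this mapping is not a structure that exists in vllm-ascend), the support level each builder's `get_cudagraph_support()` reports after this change:

```python
from vllm.v1.attention.backends.utils import AttentionCGSupport

ASCEND_ACLGRAPH_SUPPORT = {
    "AscendAttentionMetadataBuilder": AttentionCGSupport.ALWAYS,
    "AscendAttentionCPMetadataBuilder": AttentionCGSupport.ALWAYS,
    "AscendMLAMetadataBuilder": AttentionCGSupport.UNIFORM_BATCH,
    "AscendMlaCPMetadataBuilder": AttentionCGSupport.UNIFORM_BATCH,
    "AscendSFAMetadataBuilder": AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE,
}
```

If the enum semantics follow their names, ALWAYS permits full-graph capture for any batch, UNIFORM_BATCH only for batches with uniform query lengths, and UNIFORM_SINGLE_TOKEN_DECODE only for pure single-token decode batches; that reading is an interpretation, not something this diff asserts.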
8 changes: 8 additions & 0 deletions vllm_ascend/platform.py
@@ -26,6 +26,8 @@

# todo: please remove it once the CUDA hard-coding in vllm is resolved
os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1"
# todo: please remove it once controlling garbage collection during CUDA graph capture is supported.
os.environ["VLLM_ENABLE_CUDAGRAPH_GC"] = "1"

from vllm_ascend.ascend_config import init_ascend_config
from vllm_ascend.utils import refresh_block_size
@@ -244,6 +246,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
data_parallel_size,
)
compilation_config.use_inductor = False
# NOTE: Theoretically, vllm::mla_forward should also be added to the attention ops.
# However, because worker processes are created with the spawn method, the class
# attribute holding the attention ops is transmitted with its pre-modification value,
# so the change never reaches them. As a result, when piecewise compilation and
# splitting ops are configured together and splitting_ops does not contain
# vllm::mla_forward, the misconfiguration is not caught early by the assert.
compilation_config.splitting_ops.extend(["vllm::mla_forward"])
update_aclgraph_sizes(vllm_config)
ascend_config.enable_npugraph_ex = False
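On the two module-level workarounds at the top of platform.py: a minimal alternative sketch, assuming users might export these variables themselves before importing vllm_ascend, would set them only when absent. The diff assigns them unconditionally, which always enforces the workaround but silently overrides a user's choice.

```python
import os

# Sketch only (the diff assigns these unconditionally): setdefault preserves a
# value the user exported before vllm_ascend.platform is imported.
os.environ.setdefault("VLLM_DISABLE_SHARED_EXPERTS_STREAM", "1")
os.environ.setdefault("VLLM_ENABLE_CUDAGRAPH_GC", "1")
```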