8 changes: 4 additions & 4 deletions tests/e2e/singlecard/test_aclgraph_mem.py
@@ -54,10 +54,10 @@ def wrapped(self):

return wrapped

original_capture = NPUModelRunner._capture_model
original_capture = NPUModelRunner.capture_model

with patch.object(NPUModelRunner,
'_capture_model',
'capture_model',
new=capture_model_wrapper(original_capture)):
prompts = [
"Hello, my name is", "The president of the United States is",
@@ -73,7 +73,7 @@ def wrapped(self):
vllm_model = VllmRunner(snapshot_download(model))
_ = vllm_model.generate(prompts, sampling_params)

assert capture_called.value == 1, "_capture_model was not called during test"
assert capture_called.value == 1, "capture_model was not called during test"
assert capture_mem_before.value != -1, "capture_mem_before not set"
assert capture_mem_after.value != -1, "capture_mem_after not set"

@@ -93,7 +93,7 @@ def wrapped(self):
max_capture_mem_gib = baseline_capture_mem * capture_mem_tolerance
max_mem_expected = max_capture_mem_gib * (1024**3)
assert mem_used_by_capture < max_mem_expected, (
f"_capture_model used more memory than expected. "
f"capture_model used more memory than expected. "
f"Used: {mem_used_by_capture / (1024**3):.2f} GiB, "
f"Expected: < {max_capture_mem_gib:.2f} GiB")
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = 'spawn'
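Only part of the wrapper body is visible in this hunk. Below is a minimal sketch, not the test's exact code, of how `capture_model` can be wrapped to record device memory around ACL graph capture; the `torch.npu` memory probe and the `multiprocessing.Value` counters are assumptions standing in for whatever the real test uses.

```python
# Sketch only: wrap NPUModelRunner.capture_model so the test can assert it ran
# exactly once and bound the memory consumed by graph capture.
from multiprocessing import Value

import torch
import torch_npu  # noqa: F401  # assumption: available in the Ascend test env

# Counters matching the names asserted on above; -1 marks "not set yet".
capture_called = Value("i", 0)
capture_mem_before = Value("d", -1.0)
capture_mem_after = Value("d", -1.0)


def npu_memory_allocated() -> float:
    # Assumption: torch.npu mirrors torch.cuda's memory_allocated(); the real
    # test's memory probe may differ.
    return float(torch.npu.memory_allocated())


def capture_model_wrapper(original):

    def wrapped(self):
        capture_mem_before.value = npu_memory_allocated()
        result = original(self)
        capture_mem_after.value = npu_memory_allocated()
        capture_called.value += 1
        return result

    return wrapped
```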
2 changes: 0 additions & 2 deletions tests/ut/attention/test_sfa_v1.py
@@ -2,7 +2,6 @@
from unittest.mock import MagicMock, patch

import torch
from vllm.v1.attention.backends.utils import AttentionCGSupport

from tests.ut.base import TestBase
from vllm_ascend.attention.attention_v1 import AscendAttentionState
@@ -98,7 +97,6 @@ def test_ascend_sfa_metadata_builder_default(self):
vllm_config=vllm_config,
device=device)

assert builder.aclgraph_support == AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
assert builder.device == device
assert builder.vllm_config == vllm_config

13 changes: 10 additions & 3 deletions vllm_ascend/attention/attention_cp.py
@@ -44,9 +44,6 @@


class AscendAttentionCPMetadataBuilder(AscendAttentionMetadataBuilder):
# Does this backend/builder support ACL Graphs for attention (default: no).
aclgraph_support: ClassVar[AttentionCGSupport] = \
AttentionCGSupport.ALWAYS
# AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
# Does this backend/builder reorder the batch?
# If not, set this to None. Otherwise set it to the query
@@ -72,6 +69,16 @@ def __init__(
self.dcp_rank = get_decode_context_model_parallel_rank(
) if self.dcp_size > 1 else 0

@classmethod
def get_cudagraph_support(
cls: type["AscendAttentionCPMetadataBuilder"],
vllm_config: VllmConfig,
kv_cache_spec: AttentionSpec,
) -> AttentionCGSupport:
# Explicit override in case the underlying builder specializes this getter.
# @override omitted only because of a mypy limitation with the type variable.
return AttentionCGSupport.ALWAYS

def _get_chunked_req_mask(self, local_context_lens_allranks) -> List[bool]:
"""
given 4-d list [req][pcp][dcp], return:
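This file, together with the parallel hunks in attention_v1.py, mla_cp.py, mla_v1.py, and sfa_v1.py below, moves the ACL graph capability declaration from the class-level `aclgraph_support` attribute to a `get_cudagraph_support()` classmethod. A minimal sketch of the shape of that change (the class names here are placeholders, and the caller-side query is an assumption about how the runner consumes it, not something shown in this diff):

```python
from typing import ClassVar

from vllm.config import VllmConfig
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.kv_cache_interface import AttentionSpec


class BuilderBefore:
    # Removed pattern: support level pinned as a class attribute.
    aclgraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.ALWAYS


class BuilderAfter:
    # Added pattern: support level reported through a classmethod that receives
    # the config and KV-cache spec, so subclasses can specialize it.
    @classmethod
    def get_cudagraph_support(cls, vllm_config: VllmConfig,
                              kv_cache_spec: AttentionSpec) -> AttentionCGSupport:
        return AttentionCGSupport.ALWAYS


# Hypothetical caller-side query (assumed, not part of this diff):
# support = builder_cls.get_cudagraph_support(vllm_config, kv_cache_spec)
```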
13 changes: 10 additions & 3 deletions vllm_ascend/attention/attention_v1.py
@@ -182,9 +182,6 @@ class AscendMetadata:


class AscendAttentionMetadataBuilder(AttentionMetadataBuilder[AscendMetadata]):
# Does this backend/builder support ACL Graphs for attention (default: no).
aclgraph_support: ClassVar[AttentionCGSupport] = \
AttentionCGSupport.ALWAYS
# AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
# Does this backend/builder reorder the batch?
# If not, set this to None. Otherwise set it to the query
@@ -220,6 +217,16 @@ def __init__(
scheduler_config = vllm_config.scheduler_config
self.chunked_prefill_enabled = scheduler_config.enable_chunked_prefill

@classmethod
def get_cudagraph_support(
cls: type["AscendAttentionMetadataBuilder"],
vllm_config: VllmConfig,
kv_cache_spec: AttentionSpec,
) -> AttentionCGSupport:
# Explicit override in case the underlying builder specializes this getter.
# @override omitted only because of a mypy limitation with the type variable.
return AttentionCGSupport.ALWAYS

def reorder_batch(self, input_batch,
scheduler_output: "SchedulerOutput") -> bool:
return False
17 changes: 12 additions & 5 deletions vllm_ascend/attention/mla_cp.py
@@ -1,4 +1,4 @@
from typing import ClassVar, Optional, Tuple, TypeVar
from typing import Optional, Tuple, TypeVar

import numpy as np
import torch
@@ -12,7 +12,7 @@
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.kv_cache_interface import MLAAttentionSpec
from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec

# isort: off
from vllm_ascend.attention.mla_v1 import (AscendMLADecodeMetadata,
@@ -37,9 +37,6 @@


class AscendMlaCPMetadataBuilder(AscendMLAMetadataBuilder):
# Does this backend/builder support ACL Graphs for attention (default: no).
aclgraph_support: ClassVar[AttentionCGSupport] = \
AttentionCGSupport.UNIFORM_BATCH
"""
NOTE: Please read the comment at the top of the file before trying to
understand this class
@@ -74,6 +71,16 @@ def __init__(
dtype=torch.uint8,
device=device)

@classmethod
def get_cudagraph_support(
cls: type["AscendMlaCPMetadataBuilder"],
vllm_config: VllmConfig,
kv_cache_spec: AttentionSpec,
) -> AttentionCGSupport:
# Explicit override in case the underlying builder specializes this getter.
# @override omitted only because of a mypy limitation with the type variable.
return AttentionCGSupport.UNIFORM_BATCH

def set_num_actual_tokens(
self,
common_attn_metadata: AscendCommonAttentionMetadata,
18 changes: 12 additions & 6 deletions vllm_ascend/attention/mla_v1.py
@@ -1,6 +1,5 @@
from dataclasses import dataclass
from typing import (TYPE_CHECKING, ClassVar, NamedTuple, Optional, Tuple, Type,
TypeVar)
from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Type, TypeVar

import numpy as np
import torch
@@ -15,7 +14,7 @@
from vllm.utils.math_utils import cdiv, round_down
from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.kv_cache_interface import MLAAttentionSpec
from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec

from vllm_ascend import envs
from vllm_ascend.ascend_config import get_ascend_config
@@ -182,9 +181,6 @@ def __post_init__(self):


class AscendMLAMetadataBuilder(MLACommonMetadataBuilder[AscendMLAMetadata]):
# Does this backend/builder support ACL Graphs for attention (default: no).
aclgraph_support: ClassVar[AttentionCGSupport] = \
AttentionCGSupport.UNIFORM_BATCH
"""
NOTE: Please read the comment at the top of the file before trying to
understand this class
@@ -263,6 +259,16 @@ def __init__(
self.query_lens: torch.Tensor = None
self.seq_lens: torch.Tensor = None

@classmethod
def get_cudagraph_support(
cls: type["AscendMLAMetadataBuilder"],
vllm_config: VllmConfig,
kv_cache_spec: AttentionSpec,
) -> AttentionCGSupport:
# Explicit override in case the underlying builder specializes this getter.
# @override omitted only because of a mypy limitation with the type variable.
return AttentionCGSupport.UNIFORM_BATCH

def reorder_batch(self, input_batch: "NPUInputBatch",
scheduler_output: "SchedulerOutput") -> bool:
# We now want to reorder the batch so that the "decode" requests are at
16 changes: 12 additions & 4 deletions vllm_ascend/attention/sfa_v1.py
@@ -1,5 +1,5 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, ClassVar, Optional, Tuple, Type, TypeVar
from typing import TYPE_CHECKING, Optional, Tuple, Type, TypeVar

import torch
import torch_npu
@@ -15,6 +15,7 @@
from vllm.triton_utils import HAS_TRITON
from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.kv_cache_interface import AttentionSpec

from vllm_ascend import envs
from vllm_ascend.ascend_config import get_ascend_config
@@ -113,9 +114,6 @@ class AscendSFAMetadata:


class AscendSFAMetadataBuilder(MLACommonMetadataBuilder[AscendSFAMetadata]):
# Does this backend/builder support ACL Graphs for attention (default: no).
aclgraph_support: ClassVar[AttentionCGSupport] = \
AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
"""
NOTE: Please read the comment at the top of the file before trying to
understand this class
@@ -159,6 +157,16 @@ def __init__(
== CUDAGraphMode.FULL_DECODE_ONLY
), "FlashComm1 is not compatible with FULL_DECODE_ONLY. Please set graph_mode to 'piecewise' or disable FlashComm1."

@classmethod
def get_cudagraph_support(
cls: type["AscendSFAMetadataBuilder"],
vllm_config: VllmConfig,
kv_cache_spec: AttentionSpec,
) -> AttentionCGSupport:
# Explicit override in case the underlying builder specializes this getter.
# @override omitted only because of a mypy limitation with the type variable.
return AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE

def reorder_batch(self, input_batch: "NPUInputBatch",
scheduler_output: "SchedulerOutput") -> bool:
# No need to reorder for Ascend SFA
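As a quick reference compiled from the hunks above (this mapping is not a structure that exists in vllm-ascend), the support level each builder's `get_cudagraph_support()` reports after this change:

```python
from vllm.v1.attention.backends.utils import AttentionCGSupport

ASCEND_ACLGRAPH_SUPPORT = {
    "AscendAttentionMetadataBuilder": AttentionCGSupport.ALWAYS,
    "AscendAttentionCPMetadataBuilder": AttentionCGSupport.ALWAYS,
    "AscendMLAMetadataBuilder": AttentionCGSupport.UNIFORM_BATCH,
    "AscendMlaCPMetadataBuilder": AttentionCGSupport.UNIFORM_BATCH,
    "AscendSFAMetadataBuilder": AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE,
}
```

If the enum semantics follow their names, ALWAYS permits full-graph capture for any batch, UNIFORM_BATCH only for batches with uniform query lengths, and UNIFORM_SINGLE_TOKEN_DECODE only for pure single-token decode batches; that reading is an interpretation, not something this diff asserts.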
8 changes: 8 additions & 0 deletions vllm_ascend/platform.py
@@ -26,6 +26,8 @@

# todo: please remove it once the CUDA hard-coding in vllm is resolved
os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "1"
# todo: please remove it once controlling garbage collection during CUDA graph capture is supported.
os.environ["VLLM_ENABLE_CUDAGRAPH_GC"] = "1"

from vllm_ascend.ascend_config import init_ascend_config
from vllm_ascend.utils import refresh_block_size
@@ -244,6 +246,12 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
data_parallel_size,
)
compilation_config.use_inductor = False
# NOTE: Theoretically, vllm::mla_forward should also be added to the attention ops.
# However, because worker processes are created with the spawn method, the class
# attribute holding the attention ops is transmitted with its pre-modification value,
# so the change never reaches them. As a result, when piecewise compilation and
# splitting ops are configured together and splitting_ops does not contain
# vllm::mla_forward, the misconfiguration is not caught early by the assert.
compilation_config.splitting_ops.extend(["vllm::mla_forward"])
update_aclgraph_sizes(vllm_config)
ascend_config.enable_npugraph_ex = False
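On the two module-level workarounds at the top of platform.py: a minimal alternative sketch, assuming users might export these variables themselves before importing vllm_ascend, would set them only when absent. The diff assigns them unconditionally, which always enforces the workaround but silently overrides a user's choice.

```python
import os

# Sketch only (the diff assigns these unconditionally): setdefault preserves a
# value the user exported before vllm_ascend.platform is imported.
os.environ.setdefault("VLLM_DISABLE_SHARED_EXPERTS_STREAM", "1")
os.environ.setdefault("VLLM_ENABLE_CUDAGRAPH_GC", "1")
```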