3 changes: 3 additions & 0 deletions .github/workflows/_pre_commit.yml
@@ -38,6 +38,7 @@ jobs:
repository: vllm-project/vllm
path: ./vllm-empty
ref: ${{ inputs.vllm }}

- uses: dorny/paths-filter@v3
id: filter
with:
@@ -62,10 +63,12 @@ jobs:
run: |
git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend
pre-commit run --all-files --hook-stage manual --show-diff-on-failure

- name: Run mypy
run: |
PYTHONPATH="$PYTHONPATH:$(pwd)/vllm-empty"
export PYTHONPATH
env
git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend
# Run mypy for Python 3.10, 3.11, 3.12 manually
# Note: We are now separating mypy from pre-commit hooks for performance reasons.
2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
steps:
- name: Get vLLM version
run: |
VLLM_COMMIT=v0.15.0
VLLM_COMMIT=80f921ba4bab2ea251d149305ea0f912c6fc218a
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

- name: Checkout repository
2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \

ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purpose, actually we need make a main2main matching.
ARG VLLM_COMMIT=v0.15.0
ARG VLLM_COMMIT=80f921ba4bab2ea251d149305ea0f912c6fc218a
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
cd /vllm-workspace/vllm && \
git checkout $VLLM_COMMIT
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [v0.15.0]
vllm_version: [80f921ba4bab2ea251d149305ea0f912c6fc218a, v0.15.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: v0.15.0
vllm: 80f921ba4bab2ea251d149305ea0f912c6fc218a
changes:
runs-on: linux-aarch64-a2-0
outputs:
@@ -87,7 +87,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [v0.15.0]
vllm_version: [80f921ba4bab2ea251d149305ea0f912c6fc218a, v0.15.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
@@ -99,7 +99,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [v0.15.0]
vllm_version: [80f921ba4bab2ea251d149305ea0f912c6fc218a, v0.15.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion .github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
name: refresh codecov
strategy:
matrix:
vllm_version: [v0.15.0]
vllm_version: [80f921ba4bab2ea251d149305ea0f912c6fc218a]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
@@ -55,7 +55,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
| main | v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
| main | 80f921ba4bab2ea251d149305ea0f912c6fc218a, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |

## Release cadence

9 changes: 8 additions & 1 deletion tests/ut/eplb/core/test_eplb_utils.py
@@ -9,6 +9,7 @@

from vllm_ascend.ascend_config import init_ascend_config
from vllm_ascend.eplb.core.eplb_utils import init_eplb_config
from vllm_ascend.utils import vllm_version_is
# isort: on


@@ -21,7 +22,13 @@ def setUp(self, mock_fix_incompatible_config):
"eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2},
}
from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
moe_parallel_config = FusedMoEParallelConfig(2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True)
if vllm_version_is("0.15.0"):
moe_parallel_config = FusedMoEParallelConfig(
2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True)
else:
moe_parallel_config = FusedMoEParallelConfig(
2, 0, 1, 2, 1, 1, 1, 1, True, "hccl",
is_sequence_parallel=False, enable_eplb=True)
moe_config = FusedMoEConfig(
num_experts=8,
experts_per_token=8,
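
The version gate above keys off vllm_version_is, imported from vllm_ascend.utils: under vLLM v0.15.0 FusedMoEParallelConfig is constructed as before, while the newer pinned commit additionally expects is_sequence_parallel. Below is a minimal sketch of what such a version check can look like, purely for illustration; the real helper lives in vllm_ascend/utils.py and may treat dev or commit builds differently.

    # Illustrative sketch only -- NOT the vllm_ascend implementation. It assumes
    # the helper simply compares the installed vLLM release string to the target.
    import vllm


    def vllm_version_is(target: str) -> bool:
        # Accept either "0.15.0" or "v0.15.0" style targets (an assumption; the
        # real helper may be stricter or handle post/dev releases explicitly).
        return vllm.__version__.lstrip("v") == target.lstrip("v")


    if vllm_version_is("0.15.0"):
        print("running against the v0.15.0 code paths")
    else:
        print("running against the newer, commit-pinned code paths")

Gating on the pinned version string, rather than feature-probing, keeps the test aligned with whichever vLLM revision the CI matrix installs.
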
16 changes: 14 additions & 2 deletions tests/ut/ops/test_mla.py
@@ -82,8 +82,13 @@ def setUp(self):
@patch("vllm_ascend.ops.mla.get_tensor_model_parallel_world_size")
def test_initialization(self, mock_tp_size, mock_ascend_config,
mock_get_vllm_config):
# Create a proper mock for MLAAttention that has the required attributes
mock_mla_attn = MagicMock()
mock_mla_attn.process_weights_after_loading = MagicMock()
mock_mla_attn.impl = MagicMock()
mock_mla_attn.impl.process_weights_after_loading = MagicMock()

with patch("vllm_ascend.ops.mla.MLAAttention", return_value=True):
with patch("vllm_ascend.ops.mla.MLAAttention", return_value=mock_mla_attn):
mock_tp_size.return_value = 2
mock_ascend_config.return_value.enable_shared_expert_dp = True
mock_vllm_config = MagicMock(spec=VllmConfig)
@@ -126,7 +131,14 @@ def test_forward(self, mock_get_forward_context, mock_tp_size,
num_hidden_layers=32, first_k_dense_replace=False)
mock_get_vllm_config.return_value = mock_vllm_config
mock_vllm_config.compilation_config = CompilationConfig()
with patch("vllm_ascend.ops.mla.MLAAttention", return_value=True):

# Create a proper mock for MLAAttention that has the required attributes
mock_mla_attn = MagicMock()
mock_mla_attn.process_weights_after_loading = MagicMock()
mock_mla_attn.impl = MagicMock()
mock_mla_attn.impl.process_weights_after_loading = MagicMock()

with patch("vllm_ascend.ops.mla.MLAAttention", return_value=mock_mla_attn):
attn = AscendMultiHeadLatentAttention(
hidden_size=self.hidden_size,
num_heads=self.num_heads,
8 changes: 6 additions & 2 deletions tests/ut/quantization/test_modelslim_config.py
@@ -1,14 +1,18 @@
from unittest.mock import MagicMock, patch

from vllm.attention.layer import Attention
from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.linear import LinearBase

from tests.ut.base import TestBase
from vllm_ascend.ops.linear import AscendUnquantizedLinearMethod
from vllm_ascend.quantization.modelslim_config import AscendModelSlimConfig
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is

if vllm_version_is("v0.15.0"):
from vllm.attention.layer import Attention # type: ignore
else:
from vllm.model_executor.layers.attention import Attention


class TestAscendModelSlimConfig(TestBase):
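
The test now switches the Attention import on the vLLM version, since the class apparently moved from vllm.attention.layer to vllm.model_executor.layers.attention after v0.15.0. A purely import-based fallback is another way to express the same compatibility shim; the sketch below shows that alternative and is not what the PR does.

    # Sketch of an import-based fallback; the PR instead keys off
    # vllm_version_is so behaviour stays tied to the pinned vLLM version.
    try:
        from vllm.model_executor.layers.attention import Attention  # newer layout
    except ImportError:
        from vllm.attention.layer import Attention  # v0.15.0 layout

The explicit version gate fails loudly if the pinned version and the module layout ever drift apart, whereas the try/except quietly accepts either layout.
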
12 changes: 12 additions & 0 deletions tests/ut/spec_decode/test_eagle_proposer.py
@@ -28,12 +28,15 @@ def setUp(self):
self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048
self.vllm_config.model_config.uses_mrope = False
self.vllm_config.model_config.uses_xdrope_dim = 0
self.vllm_config.parallel_config.tensor_parallel_size = 1
self.vllm_config.parallel_config.data_parallel_rank = 0
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
self.vllm_config.speculative_config.num_speculative_tokens = 2
self.vllm_config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(2)
])
self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
self.vllm_config.additional_config = None

self.mock_cpugpubuffer = patch(
@@ -141,12 +144,15 @@ def setUp(self):
self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048
self.vllm_config.model_config.uses_mrope = False
self.vllm_config.model_config.uses_xdrope_dim = 0
self.vllm_config.parallel_config.tensor_parallel_size = 1
self.vllm_config.parallel_config.data_parallel_rank = 0
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
self.vllm_config.speculative_config.num_speculative_tokens = 2
self.vllm_config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(2)
])
self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

@@ -285,12 +291,15 @@ def setUp(self):
self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048
self.vllm_config.model_config.uses_mrope = False
self.vllm_config.model_config.uses_xdrope_dim = 0
self.vllm_config.model_config.use_mla = False
self.vllm_config.parallel_config.tensor_parallel_size = 1
self.vllm_config.parallel_config.data_parallel_rank = 0
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
self.vllm_config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(4)
])
self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

@@ -404,12 +413,15 @@ def setUp(self):
self.vllm_config.model_config.dtype = torch.float16
self.vllm_config.model_config.max_model_len = 2048
self.vllm_config.model_config.uses_mrope = False
self.vllm_config.model_config.uses_xdrope_dim = 0
self.vllm_config.parallel_config.tensor_parallel_size = 1
self.vllm_config.parallel_config.data_parallel_rank = 0
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
self.vllm_config.speculative_config.num_speculative_tokens = 2
self.vllm_config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(2)
])
self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

3 changes: 3 additions & 0 deletions tests/ut/spec_decode/test_mtp_proposer.py
@@ -34,6 +34,7 @@ def vllm_config(self):
config.speculative_config.draft_model_config = MagicMock()
config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
config.speculative_config.draft_model_config.uses_mrope = False
config.speculative_config.draft_model_config.uses_xdrope_dim = 0
config.speculative_config.speculative_token_tree = str([
(i + 1) * (0, ) for i in range(2)
])
@@ -42,9 +43,11 @@ def vllm_config(self):
config.model_config.dtype = torch.float16
config.model_config.max_model_len = 2048
config.model_config.uses_mrope = False
config.model_config.uses_xdrope_dim = 0
config.model_config.hf_text_config = None
config.model_config.hf_config = None
config.parallel_config.tensor_parallel_size = 1
config.parallel_config.data_parallel_rank = 0
config.speculative_config.draft_tensor_parallel_size = 1

config.load_config = None
22 changes: 22 additions & 0 deletions vllm_ascend/attention/mla_v1.py
@@ -1450,6 +1450,28 @@ def _mla_preprocess(self, layer_name, hidden_states, kv_cache, attn_metadata, ne
def get_num_actual_tokens(self, attn_metadata: M):
return attn_metadata.num_actual_tokens

def forward_mha(
self,
layer_name: str,
hidden_states: torch.Tensor,
kv_cache: tuple[torch.Tensor],
attn_metadata: M,
need_gather_q_kv: bool = False,
output: torch.Tensor | None = None,
) -> torch.Tensor:
raise NotImplementedError("forward_mha is not supported for MLA attention. Use forward() instead.")

def forward_mqa(
self,
layer_name: str,
hidden_states: torch.Tensor,
kv_cache: tuple[torch.Tensor],
attn_metadata: M,
need_gather_q_kv: bool = False,
output: torch.Tensor | None = None,
) -> torch.Tensor:
raise NotImplementedError("forward_mqa is not supported for MLA attention. Use forward() instead.")

def forward(
self,
layer_name,
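
Here mla_v1.py (and sfa_v1.py just below) satisfy what looks like a widened attention-backend interface by stubbing forward_mha and forward_mqa with NotImplementedError, leaving forward() as the only real entry point. The toy sketch below is independent of vLLM, with made-up names, and only illustrates how a caller can cope with such stubs.

    # Toy example, not vLLM code: a backend stubs out entry points it does not
    # support, and the caller falls back to the unified forward() path.
    class FakeMLABackend:
        def forward(self, hidden_states):
            return hidden_states  # stand-in for the real attention computation

        def forward_mqa(self, hidden_states):
            raise NotImplementedError("forward_mqa is not supported; use forward().")


    def run_attention(backend, hidden_states):
        try:
            return backend.forward_mqa(hidden_states)
        except NotImplementedError:
            return backend.forward(hidden_states)


    print(run_attention(FakeMLABackend(), [1.0, 2.0]))  # falls back to forward()
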
21 changes: 21 additions & 0 deletions vllm_ascend/attention/sfa_v1.py
@@ -1062,3 +1062,24 @@ def _handle_o_proj_weight_switch_and_forward(
torch.distributed.all_to_all_single(attn_output, send, group=get_tp_group().device_group)

return attn_output, True

def forward_mha(
self,
q: torch.Tensor,
kv_c_normed: torch.Tensor,
k_pe: torch.Tensor,
kv_c_and_k_pe_cache: torch.Tensor,
attn_metadata: M,
k_scale: torch.Tensor,
output: torch.Tensor,
) -> None:
raise NotImplementedError("forward_mha is not supported for SFA attention. Use forward() instead.")

def forward_mqa(
self,
q: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
kv_c_and_k_pe_cache: torch.Tensor,
attn_metadata: M,
layer,
) -> tuple[torch.Tensor, torch.Tensor | None]:
raise NotImplementedError("forward_mqa is not supported for SFA attention. Use forward() instead.")
(additional file: name not shown in the captured diff)
@@ -18,7 +18,6 @@

import torch
import torchair
from vllm.attention.layer import Attention
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.config.compilation import Range
from vllm.logger import logger
@@ -27,6 +26,12 @@
check_and_register_fusion_pass,
extra_stream_scope_check,
)
from vllm_ascend.utils import vllm_version_is

if vllm_version_is("v0.15.0"):
from vllm.attention.layer import Attention # type: ignore
else:
from vllm.model_executor.layers.attention import Attention


class GraphEXQKNormRopeFusionPattern:
8 changes: 7 additions & 1 deletion vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py
@@ -18,12 +18,18 @@
import torch
import torch._inductor.pattern_matcher as pm
from torch._inductor.pattern_matcher import PatternMatcherPass, PatternPrettyPrinter
from vllm.attention.layer import Attention
from vllm.compilation.vllm_inductor_pass import VllmInductorPass
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.config.compilation import Range
from vllm.logger import logger

from vllm_ascend.utils import vllm_version_is

if vllm_version_is("v0.15.0"):
from vllm.attention.layer import Attention # type: ignore
else:
from vllm.model_executor.layers.attention import Attention


class QKNormRopeFusionPattern:
def __init__(self, vllm_config, head_dim, num_heads, num_kv_heads, eps=1e-6):
(additional file: name not shown in the captured diff)
@@ -10,7 +10,6 @@
from typing import TYPE_CHECKING, Any, Optional

import torch
from vllm.attention.layer import Attention, MLAAttention
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.distributed.ec_transfer import get_ec_transfer, has_ec_transfer
from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole
@@ -27,6 +26,7 @@
MetadataServerProc,
MLAConfig,
)
from vllm_ascend.utils import vllm_version_is

if TYPE_CHECKING:
from vllm.forward_context import ForwardContext
@@ -35,6 +35,11 @@
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import Request

if vllm_version_is("v0.15.0"):
from vllm.attention.layer import Attention, MLAAttention # type: ignore
else:
from vllm.model_executor.layers.attention import Attention, MLAAttention


@dataclass
class ReqMeta: