Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/_e2e_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ jobs:
# spec_decode
pytest -sv --durations=0 tests/e2e/singlecard/spec_decode/test_mtp_eagle_correctness.py
pytest -sv --durations=0 tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py

e2e-2-cards:
name: multicard-2
runs-on: linux-aarch64-a3-2
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
steps:
- name: Get vLLM version
run: |
VLLM_COMMIT=bde38c11df0ea066a740efe9b77fff5418be45df
VLLM_COMMIT=11b6af5280d6d6dfb8953af16e67b25f819b3be9
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV

- name: Checkout repository
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0]
vllm_version: [11b6af5280d6d6dfb8953af16e67b25f819b3be9, v0.13.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: bde38c11df0ea066a740efe9b77fff5418be45df
vllm: 11b6af5280d6d6dfb8953af16e67b25f819b3be9
changes:
runs-on: linux-aarch64-a2-0
outputs:
Expand Down Expand Up @@ -81,7 +81,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0]
vllm_version: [11b6af5280d6d6dfb8953af16e67b25f819b3be9, v0.13.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand All @@ -93,7 +93,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0]
vllm_version: [11b6af5280d6d6dfb8953af16e67b25f819b3be9, v0.13.0]
# Note (yikun): If CI resources are limited, we can split this job into two chained jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/schedule_codecov_refresh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
name: refresh codecov
strategy:
matrix:
vllm_version: [bde38c11df0ea066a740efe9b77fff5418be45df]
vllm_version: [11b6af5280d6d6dfb8953af16e67b25f819b3be9]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand Down
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
| main | bde38c11df0ea066a740efe9b77fff5418be45df, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |
| main | 11b6af5280d6d6dfb8953af16e67b25f819b3be9, v0.13.0 tag | >= 3.10, < 3.12 | 8.3.RC2 | 2.8.0 / 2.8.0 |

## Release cadence

Expand Down
19 changes: 19 additions & 0 deletions tests/e2e/multicard/4-cards/long_sequence/test_mtp.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@
#

import os
import pytest

from tests.e2e.conftest import VllmRunner
from vllm_ascend.utils import vllm_version_is

os.environ["HCCL_BUFFSIZE"] = "512"

Expand All @@ -44,10 +46,15 @@ def test_pcp_dcp_mtp1_eager():
"method": "deepseek_mtp",
},
enforce_eager=True,
async_scheduling=False,
) as runner:
runner.generate_greedy(prompts, 32)


@pytest.mark.skipif(
not vllm_version_is('0.13.0'),
reason="vLLM PR-32118 break this",
)
def test_pcp_dcp_mtp3_eager():
prompts = [
"The capital of France is", "Hello, my name is Tom, I am",
Expand All @@ -68,10 +75,15 @@ def test_pcp_dcp_mtp3_eager():
"method": "deepseek_mtp",
},
enforce_eager=True,
async_scheduling=False,
) as runner:
runner.generate_greedy(prompts, 32)


@pytest.mark.skipif(
not vllm_version_is('0.13.0'),
reason="vLLM PR-32118 break this",
)
def test_pcp_dcp_mtp3_piecewise_graph():
prompts = [
"The capital of France is", "Hello, my name is Tom, I am",
Expand All @@ -95,10 +107,15 @@ def test_pcp_dcp_mtp3_piecewise_graph():
"cudagraph_mode": "PIECEWISE",
"cudagraph_capture_sizes": [4, 8, 16],
},
async_scheduling=False,
) as runner:
runner.generate_greedy(prompts, 32)


@pytest.mark.skipif(
not vllm_version_is('0.13.0'),
reason="vLLM PR-32118 break this",
)
def test_pcp_dcp_mtp3_full_graph():
prompts = [
"The capital of France is", "Hello, my name is Tom, I am",
Expand All @@ -122,6 +139,7 @@ def test_pcp_dcp_mtp3_full_graph():
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [4, 8, 16],
},
async_scheduling=False,
) as runner:
runner.generate_greedy(prompts, 32)

Expand All @@ -148,5 +166,6 @@ def test_dcp_mtp3_full_graph():
"cudagraph_mode": "FULL_DECODE_ONLY",
"cudagraph_capture_sizes": [4, 8, 16],
},
async_scheduling=False,
) as runner:
runner.generate_greedy(prompts, 32)
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def test_qwen3_next_mtp_acceptance_tp4(model_name):
for num_accepted_tokens in num_accepted_tokens_per_pos
]

match = all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))
match = all((a >= b) or (b - a < 0.06) for a, b in zip(acceptance_per_pos, golden))
if not match:
print(f"acceptance_per_pos: {acceptance_per_pos}")
print(f"golden: {golden}")
Expand Down
15 changes: 14 additions & 1 deletion tests/e2e/singlecard/pooling/test_scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import torch.nn.functional as F
from modelscope import snapshot_download # type: ignore[import-untyped]

from vllm_ascend.utils import vllm_version_is

from tests.e2e.conftest import HfRunner, VllmRunner

CROSS_ENCODER_MODELS = [
Expand Down Expand Up @@ -33,7 +35,10 @@
def model_name(request):
yield snapshot_download(request.param)


@pytest.mark.skipif(
not vllm_version_is('0.13.0'),
reason="vLLM PR-32148 changed the behavior of cross scoring",
)
def test_cross_encoder_score_1_to_1(model_name):
text_pair = [TEXTS_1[0], TEXTS_2[0]]

Expand All @@ -53,6 +58,10 @@ def test_cross_encoder_score_1_to_1(model_name):
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)


@pytest.mark.skipif(
not vllm_version_is('0.13.0'),
reason="vLLM PR-32148 changed the behavior of cross scoring",
)
def test_cross_encoder_score_1_to_N(model_name):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
Expand All @@ -76,6 +85,10 @@ def test_cross_encoder_score_1_to_N(model_name):
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)


@pytest.mark.skipif(
not vllm_version_is('0.13.0'),
reason="vLLM PR-32148 changed the behavior of cross scoring",
)
def test_cross_encoder_score_N_to_N(model_name):
text_pairs = [
[TEXTS_1[0], TEXTS_2[0]],
Expand Down
3 changes: 0 additions & 3 deletions tests/ut/attention/test_attention_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,14 +136,11 @@ def setUp(self):
self.layer.layer_name = "test_layer"
self.layer._k_scale_float = 1.0
self.layer._v_scale_float = 1.0

self.attention_type = MagicMock()
self.attention_type.DECODER = "decoder"
self.attention_type.ENCODER = "encoder"

self.attn_metadata = MagicMock()
self.attn_metadata.return_value = "1"

self.layer_no_quant = MagicMock(
spec=['layer_name', '_k_scale_float', '_v_scale_float'])
self.layer_no_quant.layer_name = "test_layer"
Expand Down
1 change: 1 addition & 0 deletions tests/ut/ops/test_rotary_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ def test_yarn_get_mscale(self, mock_npuplatform):
class TestAscendMRotaryEmbedding(unittest.TestCase):

def setUp(self):
# Common setup for tests
self.config_patcher = patch('vllm.config.vllm.get_current_vllm_config')
self.mock_get_config = self.config_patcher.start()
mock_config = MagicMock()
Expand Down
14 changes: 12 additions & 2 deletions tests/ut/test_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,21 @@

import pytest
import torch
from vllm.attention.selector import AttentionSelectorConfig
from vllm.config.compilation import CompilationMode, CUDAGraphMode
from vllm.platforms import PlatformEnum

from tests.ut.base import TestBase
from vllm_ascend.platform import NPUPlatform
from vllm_ascend.utils import (ASCEND_QUANTIZATION_METHOD,
COMPRESSED_TENSORS_METHOD, AscendDeviceType)
COMPRESSED_TENSORS_METHOD, AscendDeviceType,
vllm_version_is)

# isort: off
if vllm_version_is('0.13.0'):
from vllm.attention.selector import AttentionSelectorConfig # type: ignore
else:
from vllm.v1.attention.selector import AttentionSelectorConfig # type: ignore
# isort: on


class TestNPUPlatform(TestBase):
Expand All @@ -37,6 +44,9 @@ def mock_vllm_ascend_config():

def setUp(self):
self.platform = NPUPlatform()
self.platform.supported_quantization[:] = [
"ascend", "compressed-tensors"
]

def test_class_variables(self):
self.assertEqual(NPUPlatform._enum, PlatformEnum.OOT)
Expand Down
24 changes: 17 additions & 7 deletions vllm_ascend/attention/attention_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,9 @@
import torch
import torch_npu
import vllm.envs as envs_vllm
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionLayer, AttentionType)
from vllm.attention.backends.registry import (AttentionBackendEnum,
register_backend)
from vllm.config import VllmConfig, get_current_vllm_config
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backends.utils import (AttentionCGSupport,
AttentionMetadataBuilder)
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import AttentionSpec, CrossAttentionSpec

Expand All @@ -45,7 +39,23 @@
update_draft_graph_params_workspaces, update_graph_params_workspaces)
from vllm_ascend.device.device_op import DeviceOperator
from vllm_ascend.ops.flashcomm2_oshard_manager import flashcomm2_oshard_manager
from vllm_ascend.utils import weak_ref_tensors
from vllm_ascend.utils import vllm_version_is, weak_ref_tensors

# isort: off
if vllm_version_is('0.13.0'):
from vllm.v1.attention.backends.utils import (AttentionCGSupport,
AttentionMetadataBuilder)
from vllm.attention.backends.abstract import ( # type: ignore
AttentionBackend, AttentionImpl, AttentionLayer, AttentionType)
from vllm.attention.backends.registry import ( # type: ignore
AttentionBackendEnum, register_backend)
else:
from vllm.v1.attention.backend import ( # type: ignore
AttentionBackend, AttentionCGSupport, AttentionImpl, AttentionLayer,
AttentionType, AttentionMetadataBuilder)
from vllm.v1.attention.backends.registry import ( # type: ignore
AttentionBackendEnum, register_backend)
# isort: on

# default max value of sliding window size
SWA_INT_MAX = 2147483647
Expand Down
9 changes: 7 additions & 2 deletions vllm_ascend/attention/context_parallel/attention_cp.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
get_decode_context_model_parallel_world_size,
get_pcp_group)
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.kv_cache_interface import AttentionSpec

from vllm_ascend.attention.attention_v1 import (AscendAttentionBackendImpl,
Expand All @@ -41,7 +40,13 @@
split_decodes_and_prefills)
from vllm_ascend.compilation.acl_graph import (get_graph_params,
update_graph_params_workspaces)
from vllm_ascend.utils import cp_chunkedprefill_comm_stream, weak_ref_tensors
from vllm_ascend.utils import (cp_chunkedprefill_comm_stream, vllm_version_is,
weak_ref_tensors)

if vllm_version_is('0.13.0'):
from vllm.v1.attention.backends.utils import AttentionCGSupport
else:
from vllm.v1.attention.backend import AttentionCGSupport


class AscendAttentionCPMetadataBuilder(AscendAttentionMetadataBuilder):
Expand Down
8 changes: 6 additions & 2 deletions vllm_ascend/attention/context_parallel/mla_cp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
get_pcp_group)
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.utils.math_utils import cdiv
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec

# isort: off
Expand All @@ -28,7 +27,12 @@
from vllm_ascend.compilation.acl_graph import (get_draft_graph_params,
get_graph_params,
update_graph_params_workspaces)
from vllm_ascend.utils import weak_ref_tensors
from vllm_ascend.utils import weak_ref_tensors, vllm_version_is

if vllm_version_is('0.13.0'):
from vllm.v1.attention.backends.utils import AttentionCGSupport
else:
from vllm.v1.attention.backend import AttentionCGSupport

MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024

Expand Down
9 changes: 7 additions & 2 deletions vllm_ascend/attention/mla_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@
import torch
import torch_npu
import vllm.envs as envs_vllm
from vllm.attention.backends.abstract import AttentionBackend, MLAAttentionImpl
from vllm.config import VllmConfig, get_current_vllm_config
from vllm.forward_context import ForwardContext, get_forward_context
from vllm.logger import logger
from vllm.model_executor.layers.linear import UnquantizedLinearMethod
from vllm.utils.math_utils import cdiv, round_down
from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.v1.kv_cache_interface import AttentionSpec, MLAAttentionSpec

from vllm_ascend import envs
Expand Down Expand Up @@ -44,10 +42,17 @@
if TYPE_CHECKING:
from vllm.v1.core.sched.output import SchedulerOutput

# isort: off
if vllm_version_is('0.13.0'):
from vllm.v1.attention.backends.utils import AttentionCGSupport
from vllm.attention.backends.abstract import ( # type: ignore
AttentionBackend, MLAAttentionImpl)
from vllm.attention.backends.utils import PAD_SLOT_ID # type: ignore
else:
from vllm.v1.attention.backend import ( # type: ignore
AttentionBackend, AttentionCGSupport, MLAAttentionImpl)
from vllm.v1.attention.backends.utils import PAD_SLOT_ID # type: ignore
# isort: on

MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024
BUILD_METADATA_STEP_PREFILL = 0
Expand Down
Loading
Loading