Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/_e2e_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ jobs:
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
VLLM_WORKER_MULTIPROC_METHOD: spawn
run: |
python3 .github/workflows/scripts/run_suite.py --suite e2e-singlecard-light --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
python3 .github/workflows/scripts/run_suite.py --suite e2e-singlecard-light --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --continue-on-error

e2e-full:
name: singlecard-full
Expand Down Expand Up @@ -145,7 +145,7 @@ jobs:
VLLM_WORKER_MULTIPROC_METHOD: spawn
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
run: |
python3 .github/workflows/scripts/run_suite.py --suite e2e-singlecard --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
python3 .github/workflows/scripts/run_suite.py --suite e2e-singlecard --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --continue-on-error

e2e-2-cards-light:
name: multicard-2-light
Expand Down Expand Up @@ -209,7 +209,7 @@ jobs:
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
run: |
python3 .github/workflows/scripts/run_suite.py --suite e2e-2card-light --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
python3 .github/workflows/scripts/run_suite.py --suite e2e-2card-light --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --continue-on-error

e2e-2-cards-full:
name: multicard-2-full
Expand Down Expand Up @@ -273,7 +273,7 @@ jobs:
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
run: |
python3 .github/workflows/scripts/run_suite.py --suite e2e-multicard-2-cards --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
python3 .github/workflows/scripts/run_suite.py --suite e2e-multicard-2-cards --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --continue-on-error

- name: Run vllm-project/vllm-ascend test (non triton)
if: ${{ inputs.type == 'full' && matrix.part == 0 }}
Expand Down Expand Up @@ -345,7 +345,7 @@ jobs:
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
run: |
python3 .github/workflows/scripts/run_suite.py --suite e2e-multicard-4-cards --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
python3 .github/workflows/scripts/run_suite.py --suite e2e-multicard-4-cards --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --continue-on-error

e2e_310p:
name: 310p singlecard
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
steps:
- name: Get vLLM version
run: |
VLLM_COMMIT=d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a
VLLM_COMMIT=c4df59ad43037a846eed353ce4c17dc264d18f4a
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"

- name: Checkout repository
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ RUN apt-get update -y && \

ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purpose, actually we need make a main2main matching.
ARG VLLM_COMMIT=d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a
ARG VLLM_COMMIT=c4df59ad43037a846eed353ce4c17dc264d18f4a
RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
cd /vllm-workspace/vllm && \
git checkout $VLLM_COMMIT
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0]
vllm_version: [c4df59ad43037a846eed353ce4c17dc264d18f4a, v0.15.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a
vllm: c4df59ad43037a846eed353ce4c17dc264d18f4a
changes:
runs-on: linux-aarch64-a2b3-0
outputs:
Expand Down Expand Up @@ -87,7 +87,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
strategy:
matrix:
vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0]
vllm_version: [c4df59ad43037a846eed353ce4c17dc264d18f4a, v0.15.0]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand All @@ -99,7 +99,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0]
vllm_version: [c4df59ad43037a846eed353ce4c17dc264d18f4a, v0.15.0]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/schedule_codecov_refresh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
name: refresh codecov
strategy:
matrix:
vllm_version: [d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a]
vllm_version: [c4df59ad43037a846eed353ce4c17dc264d18f4a]
uses: ./.github/workflows/_unit_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand Down
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL

| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
|-------------|--------------|------------------|-------------|--------------------|
| main | d7e17aaacd5ed1b4b4be6bcfef3a1b7cbc84fc9a, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
| main | c4df59ad43037a846eed353ce4c17dc264d18f4a, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |

## Release cadence

Expand Down
14 changes: 7 additions & 7 deletions tests/e2e/multicard/2-cards/test_aclgraph_capture_replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
import torch
from vllm.utils.network_utils import get_open_port

from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type
from tests.e2e.conftest import wait_until_npu_memory_free
from vllm_ascend.utils import AscendDeviceType, get_ascend_device_type

MODELS = [
# Offline data parallel mode will be not supported/useful for dense models
Expand Down Expand Up @@ -85,8 +85,7 @@ def _run_worker_process(

# Import vLLM only after environment setup
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import (
destroy_distributed_environment, destroy_model_parallel)
from vllm.distributed.parallel_state import destroy_distributed_environment, destroy_model_parallel

# Apply hooks and run inference
with _install_spies(counters):
Expand Down Expand Up @@ -208,8 +207,9 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
expected_exec_model = (total_steps + 1 + 1) * dp_size

assert (
num_execute_model == expected_exec_model
), f"Model execution count mismatch. Expected: {expected_exec_model}, Got: {num_execute_model}"
expected_exec_model - dp_size < num_execute_model <= expected_exec_model
), f"Model execution count mismatch. Expected range: [{expected_exec_model - dp_size}, \
{expected_exec_model}], Got: {num_execute_model}"

# Metric 3: Dummy Runs (Warmup & Alignment)
# vLLM synchronizes globally every 32 steps.
Expand All @@ -229,8 +229,8 @@ def test_models_aclgraph_capture_replay_metrics_dp2(
expected_dummy_run = (warmup_runs + padding_runs) * dp_size

assert (
num_dummy_run == expected_dummy_run
), f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}"
expected_dummy_run <= num_dummy_run <= expected_dummy_run + dp_size
), f"Dummy run count mismatch. Expected: {expected_dummy_run}, Got: {num_dummy_run}, Tolerance: ±{dp_size}"

# Metric 4: Graph Replay (Inference Execution)
# Replays happen for every aligned step across all graphs.
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/singlecard/compile/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

import torch.fx as fx
from torch._inductor.decomposition import select_decomp_table
from vllm.compilation.fx_utils import OpOverload
from torch._ops import OpOverload
from vllm.config import get_current_vllm_config

from vllm_ascend.compilation.compiler_interface import compile_fx
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/singlecard/compile/test_norm_quant_fusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import torch.nn as nn
import torch_npu
import vllm.config
from vllm.compilation.fx_utils import OpOverload
from torch._ops import OpOverload
from vllm.config import ModelConfig, VllmConfig
from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment)
Expand Down
25 changes: 17 additions & 8 deletions tests/e2e/singlecard/test_llama32_lora.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from unittest.mock import patch

import vllm
import vllm.config
from vllm.lora.request import LoRARequest
from unittest.mock import patch

from tests.e2e.conftest import VllmRunner
from vllm_ascend.utils import enable_custom_op
from vllm_ascend.utils import enable_custom_op, vllm_version_is

enable_custom_op()

Expand All @@ -23,12 +24,20 @@
###Response:<|eot_id|><|start_header_id|>assistant<|end_header_id|>
""" # noqa: E501

EXPECTED_LORA_OUTPUT = [
"SELECT count(*) FROM candidate",
"SELECT count(*) FROM candidate",
"SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501
"SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501
]
if vllm_version_is("0.15.0"):
EXPECTED_LORA_OUTPUT = [
"SELECT count(*) FROM candidate",
"SELECT count(*) FROM candidate",
"SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501
"SELECT poll_source FROM candidate GROUP BY poll_source ORDER BY count(*) DESC LIMIT 1", # noqa: E501
]
else:
EXPECTED_LORA_OUTPUT = [
"SELECT COUNT(*) FROM candidate",
"SELECT COUNT(*) FROM candidate",
"SELECT Poll_Source FROM candidate GROUP BY Poll_Source ORDER BY COUNT(*) DESC LIMIT 1;",
"SELECT t1.Poll_Source FROM candidate AS t1 JOIN people AS t2 ON t1.People_ID = t2.People_ID GROUP BY t1.Poll_Source ORDER BY COUNT(*) DESC LIMIT 1", # noqa: E501
]

EXPECTED_BASE_MODEL_OUTPUT = [
"SELECT COUNT(*) FROM candidate",
Expand Down
Loading
Loading