2 changes: 1 addition & 1 deletion .github/workflows/_e2e_nightly_multi_node.yaml
@@ -32,7 +32,7 @@ on:
       description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
     vllm_version:
       required: false
-      default: "v0.15.0"
+      default: "v0.16.0"
       type: string
       description: vllm version to use
     vllm_ascend_remote_url:
2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd
+          VLLM_COMMIT=b3c14229b032a8bbf93d450a52c9a404ddaea429
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> "$GITHUB_ENV"
 
       - name: Checkout repository
2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \
 
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=83b47f67b1dfad505606070ae4d9f83e50ad4ebd
+ARG VLLM_COMMIT=b3c14229b032a8bbf93d450a52c9a404ddaea429
 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
     cd /vllm-workspace/vllm && \
     git checkout $VLLM_COMMIT
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
+        vllm_version: [b3c14229b032a8bbf93d450a52c9a404ddaea429, v0.16.0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: 83b47f67b1dfad505606070ae4d9f83e50ad4ebd
+      vllm: b3c14229b032a8bbf93d450a52c9a404ddaea429
   changes:
     runs-on: linux-aarch64-a2b3-0
     outputs:
@@ -87,7 +87,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
+        vllm_version: [b3c14229b032a8bbf93d450a52c9a404ddaea429, v0.16.0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -99,7 +99,7 @@
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0]
+        vllm_version: [b3c14229b032a8bbf93d450a52c9a404ddaea429, v0.16.0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion .github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
       matrix:
-        vllm_version: [83b47f67b1dfad505606070ae4d9f83e50ad4ebd]
+        vllm_version: [b3c14229b032a8bbf93d450a52c9a404ddaea429]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
2 changes: 1 addition & 1 deletion .github/workflows/schedule_nightly_test_a2.yaml
@@ -133,7 +133,7 @@ jobs:
           - Qwen3-Omni-30B-A3B-Instruct
     uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
     with:
-      vllm: v0.15.0
+      vllm: v0.16.0
       runner: ${{ matrix.test_config.os }}
       model_list: ${{ toJson(matrix.test_config.model_list) }}
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-910b-ubuntu22.04-py3.11'
2 changes: 1 addition & 1 deletion .github/workflows/schedule_test_benchmarks.yaml
@@ -51,7 +51,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - vllm_branch: v0.15.0
+          - vllm_branch: v0.16.0
             vllm_ascend_branch: main
       max-parallel: 1
     container:
2 changes: 1 addition & 1 deletion Dockerfile
@@ -50,7 +50,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.15.0
+ARG VLLM_TAG=v0.16.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
2 changes: 1 addition & 1 deletion Dockerfile.310p
@@ -40,7 +40,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.15.0
+ARG VLLM_TAG=v0.16.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
2 changes: 1 addition & 1 deletion Dockerfile.310p.openEuler
@@ -36,7 +36,7 @@ COPY . /vllm-workspace/vllm-ascend/
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.15.0
+ARG VLLM_TAG=v0.16.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
2 changes: 1 addition & 1 deletion Dockerfile.a3
@@ -49,7 +49,7 @@ RUN apt-get update -y && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.15.0
+ARG VLLM_TAG=v0.16.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
2 changes: 1 addition & 1 deletion Dockerfile.a3.openEuler
@@ -50,7 +50,7 @@ RUN yum update -y && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.15.0
+ARG VLLM_TAG=v0.16.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
2 changes: 1 addition & 1 deletion Dockerfile.openEuler
@@ -50,7 +50,7 @@ RUN yum update -y && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.15.0
+ARG VLLM_TAG=v0.16.0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
@@ -56,7 +56,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
 
 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | 83b47f67b1dfad505606070ae4d9f83e50ad4ebd, v0.15.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
+| main | b3c14229b032a8bbf93d450a52c9a404ddaea429, v0.16.0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
 
 ## Release cadence
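
Aside: to sanity-check a local environment against the updated support matrix, a minimal sketch in Python (assuming only the standard __version__ attributes these packages expose):

# Sketch: verify an environment against the support matrix above.
import sys

import torch
import vllm

assert (3, 10) <= sys.version_info[:2] < (3, 12), "Python must be >= 3.10, < 3.12"
print("torch:", torch.__version__)  # expected: 2.9.0 per the matrix
print("vllm:", vllm.__version__)    # expected: 0.16.0, or a build of the pinned main commit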
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -77,7 +77,7 @@
     # CANN image tag
     "cann_image_tag": "8.5.0-910b-ubuntu22.04-py3.11",
     # vllm version in ci
-    "ci_vllm_version": "v0.15.0",
+    "ci_vllm_version": "v0.16.0",
 }
 
 # For cross-file header anchors
7 changes: 1 addition & 6 deletions tests/e2e/singlecard/compile/backend.py
@@ -20,15 +20,10 @@
 
 import torch.fx as fx
 from torch._inductor.decomposition import select_decomp_table
+from vllm.compilation.passes.fx_utils import OpOverload
 from vllm.config import get_current_vllm_config
 
 from vllm_ascend.compilation.compiler_interface import compile_fx
-from vllm_ascend.utils import vllm_version_is
-
-if vllm_version_is("0.15.0"):
-    from vllm.compilation.fx_utils import OpOverload  # type: ignore
-else:
-    from vllm.compilation.passes.fx_utils import OpOverload
 
 
 class TestBackend:
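
The same cleanup repeats across the Python files below: with v0.15.0 support dropped from main, the vllm_version_is import gates collapse into plain imports of the new module paths. For context, a minimal sketch of what such a helper typically looks like — the real one lives in vllm_ascend/utils.py and its exact implementation is an assumption here:

# Sketch only: vllm_ascend.utils.vllm_version_is may be implemented
# differently (e.g. it could also honor an override environment variable).
import vllm


def vllm_version_is(target: str) -> bool:
    # Compare the installed vLLM release against a target version string.
    return vllm.__version__ == target.removeprefix("v")

Note that the removed call sites were inconsistent about the leading "v" ("0.15.0" in some files, "v0.15.0" in others), which is one more argument for deleting the gates outright.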
9 changes: 2 additions & 7 deletions tests/e2e/singlecard/compile/test_norm_quant_fusion.py
@@ -19,6 +19,7 @@
 import torch
 import torch.nn as nn
 import vllm.config
+from vllm.compilation.passes.fx_utils import OpOverload
 from vllm.config import ModelConfig, VllmConfig
 from vllm.distributed import ensure_model_parallel_initialized, init_distributed_environment
 from vllm.utils.system_utils import update_environment_variables
@@ -27,13 +28,7 @@
 from tests.e2e.singlecard.compile.backend import TestBackend
 from vllm_ascend.ascend_forward_context import set_ascend_forward_context
 from vllm_ascend.compilation.passes.norm_quant_fusion_pass import AddRMSNormQuantFusionPass
-from vllm_ascend.utils import enable_custom_op, vllm_version_is
-
-if vllm_version_is("0.15.0"):
-    from vllm.compilation.fx_utils import OpOverload  # type: ignore
-else:
-    from vllm.compilation.passes.fx_utils import OpOverload
-
+from vllm_ascend.utils import enable_custom_op
 
 # Cache backend to avoid duplicate pattern registration
 _backend_cache = None
4 changes: 2 additions & 2 deletions tests/ut/eplb/core/test_eplb_utils.py
@@ -22,9 +22,9 @@ def setUp(self, mock_fix_incompatible_config):
             "eplb_config": {"dynamic_eplb": True, "num_redundant_experts": 2},
         }
         from vllm.model_executor.layers.fused_moe.config import RoutingMethodType
-        if vllm_version_is("0.15.0"):
+        if vllm_version_is("0.16.0"):
             moe_parallel_config = FusedMoEParallelConfig(
-                2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", enable_eplb=True)
+                2, 0, 1, 2, 1, 1, 1, 1, True, "hccl", is_sequence_parallel=True, enable_eplb=True)
             moe_config = FusedMoEConfig(
                 num_experts=8,
                 experts_per_token=8,
5 changes: 1 addition & 4 deletions tests/ut/quantization/test_modelslim_config.py
@@ -9,10 +9,7 @@
 from vllm_ascend.quantization.modelslim_config import AscendModelSlimConfig
 from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, vllm_version_is
 
-if vllm_version_is("v0.15.0"):
-    from vllm.attention.layer import Attention  # type: ignore
-else:
-    from vllm.model_executor.layers.attention import Attention
+from vllm.model_executor.layers.attention import Attention
 
 
 class TestAscendModelSlimConfig(TestBase):
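
The Attention import here tracks a module move in vLLM: vllm.attention.layer in the old layout, vllm.model_executor.layers.attention in the new one. An alternative idiom, not used by this PR but shown for contrast, is to feature-detect the location instead of comparing version strings:

# Probe the import location rather than the version (sketch; this PR
# instead drops the old path entirely).
try:
    from vllm.model_executor.layers.attention import Attention  # newer layout
except ImportError:
    from vllm.attention.layer import Attention  # type: ignore  # older layout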
5 changes: 0 additions & 5 deletions vllm_ascend/ascend_forward_context.py
@@ -19,7 +19,6 @@
     is_drafter_moe_model,
     is_moe_model,
     speculative_enable_dispatch_gmm_combine_decode,
-    vllm_version_is,
 )
 
 
@@ -152,10 +151,6 @@ def set_ascend_forward_context(
         mc2_mask[:num_actual_tokens] = True
         mc2_mask[num_actual_tokens:] = False
         forward_context.mc2_mask = mc2_mask
-
-    if is_draft_model and vllm_version_is("0.15.0"):
-        forward_context.remaining_moe_layers = None
-
     try:
         yield
     finally:
11 changes: 2 additions & 9 deletions vllm_ascend/compilation/graph_fusion_pass_manager.py
@@ -17,17 +17,10 @@
 #
 
 from torch import fx as fx
+from vllm.compilation.passes.inductor_pass import get_pass_context
+from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
 from vllm.config import VllmConfig
-
-from vllm_ascend.utils import vllm_version_is
-
-if vllm_version_is("0.15.0"):
-    from vllm.compilation.inductor_pass import get_pass_context  # type: ignore
-    from vllm.compilation.vllm_inductor_pass import VllmInductorPass  # type: ignore
-else:
-    from vllm.compilation.passes.inductor_pass import get_pass_context
-    from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
 
 
 class GraphFusionPassManager:
     """
@@ -16,6 +16,8 @@
 #
 import torch
 from torch._inductor.pattern_matcher import Match, PatternMatcherPass, PatternPrettyPrinter
+from vllm.compilation.passes.inductor_pass import get_pass_context
+from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
 from vllm.config import VllmConfig
 from vllm.config.compilation import Range
 from vllm.distributed import get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce
@@ -24,14 +26,6 @@
 
 from vllm_ascend.compilation.passes.base_pattern import BasePattern
 from vllm_ascend.compilation.passes.utils.npugraph_ex_utils_check import extra_stream_scope_check
-from vllm_ascend.utils import vllm_version_is
-
-if vllm_version_is("0.15.0"):
-    from vllm.compilation.inductor_pass import get_pass_context  # type: ignore
-    from vllm.compilation.vllm_inductor_pass import VllmInductorPass  # type: ignore
-else:
-    from vllm.compilation.passes.inductor_pass import get_pass_context
-    from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
 
 # computation-communication tiling block is 512
 ALLREDUCE_NORM_FUSE_THRESHOLD = 512
8 changes: 2 additions & 6 deletions vllm_ascend/compilation/passes/norm_quant_fusion_pass.py
@@ -17,17 +17,13 @@
 #
 import torch
 from torch._inductor.pattern_matcher import PatternMatcherPass
+from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
 from vllm.config import VllmConfig
 from vllm.config.compilation import Range
 from vllm.logger import logger
 
 from vllm_ascend.compilation.passes.base_pattern import BasePattern
-from vllm_ascend.utils import enable_custom_op, vllm_version_is
-
-if vllm_version_is("0.15.0"):
-    from vllm.compilation.vllm_inductor_pass import VllmInductorPass  # type: ignore
-else:
-    from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
+from vllm_ascend.utils import enable_custom_op
 
 
 class AddRMSNormQuantPattern(BasePattern):
10 changes: 2 additions & 8 deletions vllm_ascend/compilation/passes/qknorm_rope_fusion_pass.py
@@ -17,19 +17,13 @@
 #
 import torch
 from torch._inductor.pattern_matcher import PatternMatcherPass, PatternPrettyPrinter
+from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
 from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.config.compilation import Range
 from vllm.logger import logger
+from vllm.model_executor.layers.attention import Attention
 
 from vllm_ascend.compilation.passes.base_pattern import BasePattern
-from vllm_ascend.utils import vllm_version_is
-
-if vllm_version_is("v0.15.0"):
-    from vllm.attention.layer import Attention  # type: ignore
-    from vllm.compilation.vllm_inductor_pass import VllmInductorPass  # type: ignore
-else:
-    from vllm.compilation.passes.vllm_inductor_pass import VllmInductorPass
-    from vllm.model_executor.layers.attention import Attention
 
 
 class QKNormRopeFusionPattern(BasePattern):
@@ -26,7 +26,6 @@
     MetadataServerProc,
     MLAConfig,
 )
-from vllm_ascend.utils import vllm_version_is
 
 if TYPE_CHECKING:
     from vllm.forward_context import ForwardContext
@@ -35,10 +34,7 @@
     from vllm.v1.kv_cache_interface import KVCacheConfig
     from vllm.v1.request import Request
 
-if vllm_version_is("v0.15.0"):
-    from vllm.attention.layer import Attention, MLAAttention  # type: ignore
-else:
-    from vllm.model_executor.layers.attention import Attention, MLAAttention
+from vllm.model_executor.layers.attention import Attention, MLAAttention
 
 
 @dataclass