2 changes: 1 addition & 1 deletion .github/workflows/_e2e_nightly_multi_node.yaml
@@ -32,7 +32,7 @@ on:
       description: how many pods will be pulled up via lws.yaml, indicates number of nodes we need
     vllm_version:
       required: false
-      default: "v0.14.1"
+      default: "v0.15.0rc0"
       type: string
       description: vllm version to use
     vllm_ascend_remote_url:

2 changes: 1 addition & 1 deletion .github/workflows/bot_pr_create.yaml
@@ -37,7 +37,7 @@ jobs:
     steps:
       - name: Get vLLM version
         run: |
-          VLLM_COMMIT=dc917cceb877dfd13f98c538c4c96158047d98bd
+          VLLM_COMMIT=cf1167e50b809f18efd21fb3418dd75d2805b14f
           echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
 
       - name: Checkout repository

2 changes: 1 addition & 1 deletion .github/workflows/dockerfiles/Dockerfile.lint
@@ -27,7 +27,7 @@ RUN apt-get update -y && \
 
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
 # For lint purpose, actually we need make a main2main matching.
-ARG VLLM_COMMIT=dc917cceb877dfd13f98c538c4c96158047d98bd
+ARG VLLM_COMMIT=cf1167e50b809f18efd21fb3418dd75d2805b14f
 RUN git clone $VLLM_REPO /vllm-workspace/vllm && \
     cd /vllm-workspace/vllm && \
     git checkout $VLLM_COMMIT

2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -75,7 +75,7 @@ jobs:
     name: e2e-full
     strategy:
       matrix:
-        vllm_version: [dc917cceb877dfd13f98c538c4c96158047d98bd, v0.14.1]
+        vllm_version: [cf1167e50b809f18efd21fb3418dd75d2805b14f, v0.15.0rc0]
     needs: [changes]
     if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
     uses: ./.github/workflows/_e2e_test.yaml

6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
   lint:
     uses: ./.github/workflows/_pre_commit.yml
     with:
-      vllm: dc917cceb877dfd13f98c538c4c96158047d98bd
+      vllm: cf1167e50b809f18efd21fb3418dd75d2805b14f
   changes:
     runs-on: linux-aarch64-a2-0
     outputs:
@@ -87,7 +87,7 @@ jobs:
     if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
     strategy:
       matrix:
-        vllm_version: [dc917cceb877dfd13f98c538c4c96158047d98bd, v0.14.1]
+        vllm_version: [cf1167e50b809f18efd21fb3418dd75d2805b14f, v0.15.0rc0]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}
@@ -99,7 +99,7 @@
     name: e2e-light
     strategy:
       matrix:
-        vllm_version: [dc917cceb877dfd13f98c538c4c96158047d98bd, v0.14.1]
+        vllm_version: [cf1167e50b809f18efd21fb3418dd75d2805b14f, v0.15.0rc0]
     # Note (yikun): If CI resource are limited we can split job into two chain jobs
     needs: [lint, changes]
     # only trigger e2e test after lint passed and the change is e2e related with pull request.

2 changes: 1 addition & 1 deletion .github/workflows/schedule_codecov_refresh.yaml
@@ -33,7 +33,7 @@ jobs:
     name: refresh codecov
     strategy:
       matrix:
-        vllm_version: [dc917cceb877dfd13f98c538c4c96158047d98bd]
+        vllm_version: [cf1167e50b809f18efd21fb3418dd75d2805b14f]
     uses: ./.github/workflows/_unit_test.yaml
     with:
       vllm: ${{ matrix.vllm_version }}

2 changes: 1 addition & 1 deletion .github/workflows/schedule_nightly_test_a2.yaml
@@ -133,7 +133,7 @@ jobs:
           - Qwen3-Omni-30B-A3B-Instruct
     uses: ./.github/workflows/_e2e_nightly_single_node_models.yaml
     with:
-      vllm: v0.14.1
+      vllm: v0.15.0rc0
       runner: ${{ matrix.test_config.os }}
       model_list: ${{ toJson(matrix.test_config.model_list) }}
       image: 'swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-910b-ubuntu22.04-py3.11'

2 changes: 1 addition & 1 deletion .github/workflows/schedule_test_benchmarks.yaml
@@ -51,7 +51,7 @@ jobs:
     strategy:
       matrix:
         include:
-          - vllm_branch: v0.14.1
+          - vllm_branch: v0.15.0rc0
             vllm_ascend_branch: main
       max-parallel: 1
     container:

2 changes: 1 addition & 1 deletion Dockerfile
@@ -48,7 +48,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.14.1
+ARG VLLM_TAG=v0.15.0rc0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

2 changes: 1 addition & 1 deletion Dockerfile.310p
@@ -40,7 +40,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.14.1
+ARG VLLM_TAG=v0.15.0rc0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

2 changes: 1 addition & 1 deletion Dockerfile.310p.openEuler
@@ -36,7 +36,7 @@ COPY . /vllm-workspace/vllm-ascend/
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.14.1
+ARG VLLM_TAG=v0.15.0rc0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

2 changes: 1 addition & 1 deletion Dockerfile.a3
@@ -47,7 +47,7 @@ RUN apt-get update -y && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.14.1
+ARG VLLM_TAG=v0.15.0rc0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

2 changes: 1 addition & 1 deletion Dockerfile.a3.openEuler
@@ -50,7 +50,7 @@ RUN yum update -y && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.14.1
+ARG VLLM_TAG=v0.15.0rc0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

2 changes: 1 addition & 1 deletion Dockerfile.openEuler
@@ -50,7 +50,7 @@ RUN yum update -y && \
 
 # Install vLLM
 ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
-ARG VLLM_TAG=v0.14.1
+ARG VLLM_TAG=v0.15.0rc0
 RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
 # In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
 RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \

2 changes: 1 addition & 1 deletion docs/source/community/versioning_policy.md
@@ -55,7 +55,7 @@ For main branch of vLLM Ascend, we usually make it compatible with the latest vL
 
 | vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu |
 |-------------|--------------|------------------|-------------|--------------------|
-| main | dc917cceb877dfd13f98c538c4c96158047d98bd, v0.14.1 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
+| main | cf1167e50b809f18efd21fb3418dd75d2805b14f, v0.15.0rc0 tag | >= 3.10, < 3.12 | 8.5.0 | 2.9.0 / 2.9.0 |
 
 ## Release cadence
 

4 changes: 2 additions & 2 deletions docs/source/conf.py
@@ -65,7 +65,7 @@
     # the branch of vllm, used in vllm clone
     # - main branch: 'main'
     # - vX.Y.Z branch: 'vX.Y.Z'
-    "vllm_version": "v0.14.1",
+    "vllm_version": "v0.15.0rc0",
     # the branch of vllm-ascend, used in vllm-ascend clone and image tag
     # - main branch: 'main'
     # - vX.Y.Z branch: latest vllm-ascend release tag
@@ -77,7 +77,7 @@
     # CANN image tag
     "cann_image_tag": "8.5.0-910b-ubuntu22.04-py3.11",
     # vllm version in ci
-    "ci_vllm_version": "v0.14.1",
+    "ci_vllm_version": "v0.15.0rc0",
 }
 
 # For cross-file header anchors

4 changes: 1 addition & 3 deletions vllm_ascend/ascend_forward_context.py
@@ -19,7 +19,6 @@
     is_drafter_moe_model,
     is_moe_model,
     speculative_enable_dispatch_gmm_combine_decode,
-    vllm_version_is,
 )
 
 
@@ -59,8 +58,7 @@ def set_ascend_forward_context(
         "batch_descriptor": batch_descriptor,
     }
 
-    if not vllm_version_is("0.14.1"):
-        forward_context_kwargs["skip_compiled"] = skip_compiled
+    forward_context_kwargs["skip_compiled"] = skip_compiled
 
     with set_forward_context(**forward_context_kwargs):
         forward_context = get_forward_context()

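Note: `vllm_version_is` is this repo's helper for branching on the installed vLLM version, and with v0.14.1 support dropped the guard above collapses into an unconditional assignment. A self-contained sketch of the retired pattern (the stand-in helper and placeholder values below are illustrative, not the real implementations):

```python
# Toy stand-in for vllm_ascend.utils.vllm_version_is; the real helper
# compares against the installed vLLM version.
def vllm_version_is(version: str) -> bool:
    installed = "0.15.0rc0"  # assumption: main now tracks v0.15.0rc0
    return installed == version


forward_context_kwargs = {"batch_descriptor": None}  # placeholder value
skip_compiled = False

# Before: as the removed guard implies, skip_compiled was only passed on
# vLLM versions other than 0.14.1.
if not vllm_version_is("0.14.1"):
    forward_context_kwargs["skip_compiled"] = skip_compiled

# After: with 0.14.1 support dropped, the keyword is passed unconditionally.
forward_context_kwargs["skip_compiled"] = skip_compiled
print(forward_context_kwargs)
```
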
7 changes: 7 additions & 0 deletions vllm_ascend/attention/attention_v1.py
@@ -66,6 +66,8 @@
 class AscendAttentionBackend(AttentionBackend):
     accept_output_buffer: bool = True
 
+    forward_includes_kv_cache_update: bool = True
+
     @staticmethod
     def get_name() -> str:
         # HACK(Ronald1995): vllm `initialize_kv_cache` method in model runner v2 make
@@ -868,6 +870,9 @@ def reshape_and_cache(
         self.key_cache, self.value_cache = kv_cache[0], kv_cache[1]
         slots = attn_metadata.slot_mapping
         encoder_decoder = self.attn_type == AttentionType.ENCODER_DECODER
+
+        slots = slots.to(torch.int32)
+
         DeviceOperator.reshape_and_cache(
             key=key[: attn_metadata.num_actual_tokens] if not encoder_decoder else key,
             value=value[: attn_metadata.num_actual_tokens] if not encoder_decoder else value,
@@ -932,8 +937,10 @@ def forward(
         num_tokens = query.shape[0]
         if attn_metadata is None:
             return output.fill_(0)
+
         if key is not None and value is not None:
             key, value = self.reshape_and_cache(key, value, kv_cache, attn_metadata)
+
         # pooling model branch
         if attn_metadata.model_runner_type == "pooling":
             attn_output = self._forward_encoder_attention(query, key, value, attn_metadata, output)

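Besides the `forward_includes_kv_cache_update` capability flag and some blank-line spacing, the functional change here is the dtype narrowing of `slot_mapping` before the cache-scatter call, which suggests the Ascend `reshape_and_cache` kernel expects int32 slot indices. A standalone sketch of that cast (the tensor values and the bound check are illustrative assumptions, not taken from the PR):

```python
import torch

# slot_mapping as produced upstream: one flat KV-cache slot index per token.
slot_mapping = torch.tensor([0, 1, 2, 4096, 8191], dtype=torch.int64)

# Narrow to int32 before handing the indices to the device scatter kernel.
# int32 covers ~2.1e9 slots, far beyond any single-device KV cache, so the
# cast does not truncate in practice.
assert int(slot_mapping.max()) <= torch.iinfo(torch.int32).max
slots = slot_mapping.to(torch.int32)
print(slots.dtype)  # torch.int32
```
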
7 changes: 2 additions & 5 deletions vllm_ascend/attention/mla_v1.py
@@ -44,17 +44,14 @@
 from vllm_ascend.ops.rotary_embedding import get_cos_and_sin_mla
 from vllm_ascend.ops.weight_prefetch import maybe_npu_prefetch
 from vllm_ascend.quantization.methods import AscendW8A8LinearMethod
-from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, maybe_trans_nz, vllm_version_is, weak_ref_tensors
+from vllm_ascend.utils import ACL_FORMAT_FRACTAL_ND, maybe_trans_nz, weak_ref_tensors
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
 
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
 
 # isort: off
-if vllm_version_is("0.14.1"):
-    from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder  # type: ignore
-else:
-    from vllm.model_executor.layers.attention.mla_attention import MLACommonMetadataBuilder
+from vllm.model_executor.layers.attention.mla_attention import MLACommonMetadataBuilder
 # isort: on
 
 MAX_O_PROJ_PREFETCH_SIZE = 16 * 1024 * 1024

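Since the PR now assumes the v0.15 module layout outright, code that still needs to straddle the upstream move could use an import shim instead of a version check; a sketch of that alternative (not what this PR does), with both module paths taken from the diff above:

```python
try:
    # vLLM >= 0.15 layout, as used by this PR.
    from vllm.model_executor.layers.attention.mla_attention import (
        MLACommonMetadataBuilder,
    )
except ImportError:
    # vLLM 0.14.x layout, now dropped.
    from vllm.v1.attention.backends.mla.common import (  # type: ignore
        MLACommonMetadataBuilder,
    )
```
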
7 changes: 2 additions & 5 deletions vllm_ascend/attention/sfa_v1.py
@@ -45,16 +45,13 @@
     enable_dsa_cp,
     enable_dsa_cp_with_layer_shard,
     maybe_trans_nz,
-    vllm_version_is,
 )
 from vllm_ascend.worker.npu_input_batch import NPUInputBatch
 
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
-if vllm_version_is("0.14.1"):
-    from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder  # type: ignore
-else:
-    from vllm.model_executor.layers.attention.mla_attention import MLACommonMetadataBuilder
+from vllm.model_executor.layers.attention.mla_attention import MLACommonMetadataBuilder
+
 # isort: on
 
 # token count limits within bmm_transpose operator

12 changes: 12 additions & 0 deletions vllm_ascend/ops/fused_moe/fused_moe.py
@@ -512,6 +512,15 @@ def forward(
         hidden_states: torch.Tensor,
         router_logits: torch.Tensor,
     ) -> tuple[torch.Tensor, torch.Tensor]:
+        if self._shared_experts is None:
+            fused_out = AscendFusedMoE.forward(
+                self,
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+            )
+            shared_out = None
+            return shared_out, fused_out
+
         shared_out, fused_out = AscendFusedMoE.forward(
             self,
             hidden_states=hidden_states,
@@ -571,6 +580,9 @@ def forward_impl( # type: ignore[override]
         )
         routed_out = fused_moe_results.routed_out
 
+        if self._shared_experts is None:
+            return routed_out
+
         if self.multistream_overlap_gate:
             fc3_context = get_flash_common3_context()
             assert fc3_context is not None

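Both hunks add the same guard: when no shared experts are configured, return the routed output alone rather than running (or unpacking) a shared-expert path that does not exist. A self-contained toy of the pattern (the class and tensor math are illustrative, not the real `AscendFusedMoE` API):

```python
import torch


class ToyMoE:
    """Toy fused-MoE layer with an optional shared-expert branch."""

    def __init__(self, shared_experts=None):
        self._shared_experts = shared_experts

    def forward(self, hidden_states: torch.Tensor):
        routed_out = hidden_states * 2  # stand-in for routed-expert output
        # Early return in the spirit of this PR: no shared experts, so skip
        # the shared branch entirely instead of dereferencing None.
        if self._shared_experts is None:
            return None, routed_out
        return self._shared_experts(hidden_states), routed_out


shared_out, routed_out = ToyMoE().forward(torch.ones(2, 4))
assert shared_out is None and routed_out.shape == (2, 4)
```
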
16 changes: 6 additions & 10 deletions vllm_ascend/ops/mm_encoder_attention.py
@@ -38,7 +38,6 @@ def __init__(
         scale: float | None = None,
         num_kv_heads: int | None = None,
         prefix: str = "",
-        multimodal_config: MultiModalConfig | None = None,
     ) -> None:
         """
         Args:
@@ -48,15 +47,13 @@
             num_kv_heads: number of kv heads.
             prefix: This has no effect, it is only here to make it easier to
                 swap between Attention and MMEncoderAttention.
-            multimodal_config: configs for multi-modal.
         """
         super().__init__(
             num_heads=num_heads,
             head_size=head_size,
             scale=scale,
             num_kv_heads=num_kv_heads,
             prefix=prefix,
-            multimodal_config=multimodal_config,
         )
 
     def reshape_qkv_to_3d(
@@ -84,13 +81,12 @@ def reshape_qkv_to_3d(
         return query, key, value
 
     def forward_oot(
-            self,
-            query: torch.Tensor,
-            key: torch.Tensor,
-            value: torch.Tensor,
-            cu_seqlens: torch.Tensor | None = None,
-            max_seqlen: torch.Tensor
-            | None = None,  # Only used for Flash Attention
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        cu_seqlens: torch.Tensor | None = None,
+        max_seqlen: torch.Tensor | None = None,  # Only used for Flash Attention
     ):
         bsz, q_len = query.size()[:2]
         kv_len = key.size(1)

5 changes: 1 addition & 4 deletions vllm_ascend/patch/platform/patch_multiproc_executor.py
@@ -19,8 +19,6 @@
     set_multiprocessing_worker_envs,
 )
 
-from vllm_ascend.utils import vllm_version_is
-
 
 class AscendMultiprocExecutor(MultiprocExecutor):
     def _init_executor(self) -> None:
@@ -178,8 +176,7 @@ def make_worker_process(
             "death_pipe": death_reader,
             "shared_worker_lock": shared_worker_lock,
         }
-        if not vllm_version_is("0.14.1"):
-            process_kwargs["is_driver_worker"] = is_driver_worker
+        process_kwargs["is_driver_worker"] = is_driver_worker
         # Run EngineCore busy loop in background process.
         proc = context.Process(
             target=WorkerProc.worker_main,

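When a version gate is removed wholesale like this, a quick scan over the tree confirms no stale `vllm_version_is("0.14.1")` call sites remain; a hypothetical checker (not part of this PR or the repo's tooling):

```python
import re
from pathlib import Path

# Flag any leftover guards on the dropped vLLM version.
pattern = re.compile(r"vllm_version_is\(\s*['\"]0\.14\.1['\"]\s*\)")

for path in Path("vllm_ascend").rglob("*.py"):
    for lineno, line in enumerate(path.read_text().splitlines(), start=1):
        if pattern.search(line):
            print(f"{path}:{lineno}: {line.strip()}")
```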