Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/workflows/dockerfiles/Dockerfile.lint
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@ RUN apt-get update -y && \

ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
# For lint purposes; strictly speaking, we need a main2main match.
ARG VLLM_TAG=v0.20.1
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
ARG VLLM_COMMIT=4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef
RUN git init /vllm-workspace/vllm && \
git -C /vllm-workspace/vllm fetch --depth 1 $VLLM_REPO $VLLM_COMMIT && \
git -C /vllm-workspace/vllm checkout FETCH_HEAD

# # Install vLLM common dependencies
RUN python3 -m pip install -r /vllm-workspace/vllm/requirements/common.txt --extra-index https://download.pytorch.org/whl/cpu/ && \
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/pr_test_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1]
vllm_version: [4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef, v0.20.2]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
Expand All @@ -102,7 +102,7 @@ jobs:
strategy:
fail-fast: false
matrix:
vllm_version: [v0.20.1]
vllm_version: [v0.20.2]
needs: [parse-trigger]
if: ${{ needs.parse-trigger.outputs.allowed == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: c7aa186d67b6f051680831418e957c67f34ba7a2
vllm: 4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef
changes:
runs-on: linux-aarch64-a2b3-0
container:
Expand Down Expand Up @@ -155,7 +155,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }}
strategy:
matrix:
vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1]
vllm_version: [4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef, v0.20.2]
uses: ./.github/workflows/_optional_smart_e2e.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand All @@ -165,7 +165,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [c7aa186d67b6f051680831418e957c67f34ba7a2, v0.20.1]
vllm_version: [4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef, v0.20.2]
# Note (yikun): If CI resources are limited, we can split this job into two chained jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
Expand Down
8 changes: 0 additions & 8 deletions .github/workflows/schedule_lint_image_build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,6 @@ on:
# Runs at 00:00 UTC+8 every day
- cron: '0 20 * * *'
workflow_dispatch:
inputs:
vllm_hash:
description: 'vLLM base hash'
default: main
required: true
type: string
push:
paths:
- '.github/workflows/dockerfiles/Dockerfile.lint'
Expand Down Expand Up @@ -85,5 +79,3 @@ jobs:
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.meta.outputs.tags }}
provenance: false
build-args: |
VLLM_HASH=${{ inputs.vllm_hash }}
2 changes: 1 addition & 1 deletion .github/workflows/schedule_update_estimated_time.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ jobs:
name: e2e-test
strategy:
matrix:
vllm_version: [v0.20.1]
vllm_version: [v0.20.2]
type: [full, light]
uses: ./.github/workflows/_e2e_test.yaml
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/schedule_vllm_e2e_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
fail-fast: false
matrix:
part: [0, 1, 2, 3]
vllm: [v0.20.1]
vllm: [v0.20.2]
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:9.0.0-910b-ubuntu22.04-py3.11
env:
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.20.1
ARG VLLM_TAG=v0.20.2
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.310p
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.20.1
ARG VLLM_TAG=v0.20.2
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.310p.openEuler
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.20.1
ARG VLLM_TAG=v0.20.2
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.a3
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.20.1
ARG VLLM_TAG=v0.20.2
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.a3.openEuler
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.20.1
ARG VLLM_TAG=v0.20.2
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.openEuler
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL} && \

# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.20.1
ARG VLLM_TAG=v0.20.2
RUN git clone --depth 1 -b $VLLM_TAG $VLLM_REPO /vllm-workspace/vllm
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/[audio] --extra-index https://download.pytorch.org/whl/cpu/ && \
Expand Down
4 changes: 2 additions & 2 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,9 @@
# CANN image tag
"cann_image_tag": "9.0.0-910b-ubuntu22.04-py3.11",
# vLLM commit hash for main branch
"main_vllm_commit": "c7aa186d67b6f051680831418e957c67f34ba7a2",
"main_vllm_commit": "4e498b5e5c07480cfb8c046128f0ef8d9a60d8ef",
# vLLM tag for main branch
"main_vllm_tag": "v0.20.1",
"main_vllm_tag": "v0.20.2",
# Python version for main branch
"main_python_version": ">= 3.10, < 3.12",
# CANN version for main branch
Expand Down
2 changes: 1 addition & 1 deletion tests/e2e/multicard/2-cards/test_qwen3_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def test_qwen3_moe_distributed_aiv_tp2():
vllm_model.generate_greedy(example_prompts, max_tokens)


@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
@pytest.mark.skipif(vllm_version_is("0.20.2"), reason="no need to support model_runner for v0.20.2")
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [True])
@patch.dict(os.environ, {"VLLM_USE_V2_MODEL_RUNNER": "1"})
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def test_qwen3_moe_routing_replay():
cudagraph_capture_sizes=[1, 2, 4, 8],
distributed_executor_backend="mp",
enable_return_routed_experts=True,
async_scheduling=False,
) as vllm_model:
sampling_params = SamplingParams(
max_tokens=5, temperature=0.8, top_p=0.95, output_kind=RequestOutputKind.FINAL_ONLY
Expand Down
4 changes: 2 additions & 2 deletions tests/e2e/singlecard/model_runner_v2/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def test_qwen3_dense_eager_mode(
runner.model.generate(prompts, sampling_params)


@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
@pytest.mark.skipif(vllm_version_is("0.20.2"), reason="no need to support model_runner for v0.20.2")
@pytest.mark.parametrize("model", MAIN_MODELS)
@pytest.mark.parametrize("eagle_model", EGALE_MODELS)
@pytest.mark.parametrize("max_tokens", [32])
Expand Down Expand Up @@ -104,7 +104,7 @@ def test_egale_spec_decoding(
runner.model.generate(prompts, sampling_params)


@pytest.mark.skipif(vllm_version_is("0.20.1"), reason="no need to support model_runner for v0.20.1")
@pytest.mark.skipif(vllm_version_is("0.20.2"), reason="no need to support model_runner for v0.20.2")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("enforce_eager", [False])
Expand Down
10 changes: 8 additions & 2 deletions tests/ut/spec_decode/test_eagle_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1385,10 +1385,13 @@ def check_mock(self):
"method",
"parallel_drafting",
"draft_tensor_parallel_size",
"speculative_token_tree",
"draft_model_config",
"disable_padded_drafter_batch",
}
# speculative_token_tree was removed in newer vllm (Remove tree attention #42121);
# only check for it when the installed version still carries the field.
if "speculative_token_tree" in vllm.config.SpeculativeConfig.__dataclass_fields__:
fields.add("speculative_token_tree")

actual = set(vllm.config.SpeculativeConfig.__dataclass_fields__)
missing = fields - actual
Expand Down Expand Up @@ -2260,10 +2263,13 @@ def check_mock(self):
"enforce_eager",
"use_local_argmax_reduction",
"draft_tensor_parallel_size",
"speculative_token_tree",
"draft_model_config",
"disable_padded_drafter_batch",
}
# speculative_token_tree was removed in newer vllm (Remove tree attention #42121);
# only check for it when the installed version still carries the field.
if "speculative_token_tree" in vllm.config.SpeculativeConfig.__dataclass_fields__:
fields.add("speculative_token_tree")
actual = set(vllm.config.SpeculativeConfig.__dataclass_fields__)
missing = fields - actual
assert not missing, f"Missing dataclass fields: {missing}"
Expand Down
4 changes: 2 additions & 2 deletions vllm_ascend/core/scheduler_profiling_chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ def schedule(self) -> SchedulerOutput: # noqa: C901
num_encoder_tokens = sum(request.get_num_encoder_embeds(i) for i in encoder_inputs_to_schedule)

if (
vllm_version_is("0.20.1")
vllm_version_is("0.20.2")
and self.scheduler_reserve_full_isl
and not self.kv_cache_manager.can_fit_full_sequence(
request,
Expand All @@ -601,7 +601,7 @@ def schedule(self) -> SchedulerOutput: # noqa: C901
delay_cache_blocks=load_kv_async,
num_encoder_tokens=num_encoder_tokens,
**(
{} if vllm_version_is("0.20.1") else {"full_sequence_must_fit": self.scheduler_reserve_full_isl}
{} if vllm_version_is("0.20.2") else {"full_sequence_must_fit": self.scheduler_reserve_full_isl}
),
)

Expand Down
14 changes: 7 additions & 7 deletions vllm_ascend/ops/fused_moe/fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@
from vllm.logger import logger
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod, get_compressed_expert_map
from vllm.model_executor.layers.fused_moe.routed_experts_capturer import RoutedExpertsCapturer
from vllm.model_executor.layers.fused_moe.runner.moe_runner import MoERunner # type: ignore

import vllm_ascend.envs as envs_ascend
Expand All @@ -36,6 +35,7 @@
from vllm_ascend.distributed.parallel_state import get_mc2_group
from vllm_ascend.eplb.core.eplb_utils import init_eplb_config
from vllm_ascend.flash_common3_context import get_flash_common3_context, set_flash_common3_context
from vllm_ascend.ops.fused_moe import routed_experts_compat
from vllm_ascend.ops.fused_moe.experts_selector import select_experts, zero_experts_compute
from vllm_ascend.ops.fused_moe.moe_comm_method import AllGatherCommImpl, FusedExpertsResult, setup_moe_comm_method
from vllm_ascend.ops.fused_moe.moe_runtime_args import build_fused_experts_input
Expand Down Expand Up @@ -159,12 +159,12 @@ def apply(
num_experts=num_logical_experts,
)
if layer.vllm_config.model_config is not None and layer.vllm_config.model_config.enable_return_routed_experts:
capturer = RoutedExpertsCapturer.get_instance()
if capturer is not None:
capturer.capture(
layer_id=layer.layer_id,
topk_ids=topk_ids,
)
capturer = routed_experts_compat.get_capturer()
routed_experts_compat.call_capture(
capturer,
layer_id=layer.layer_id,
topk_ids=topk_ids,
)

if zero_expert_num > 0 and zero_expert_type is not None:
topk_ids, topk_weights, zero_expert_result = zero_experts_compute(
Expand Down
120 changes: 120 additions & 0 deletions vllm_ascend/ops/fused_moe/routed_experts_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
"""Compatibility shim around vLLM's RoutedExpertsCapturer.
- 0.20.2 exposed `RoutedExpertsCapturer.get_instance()` plus
`clear_buffer()` / `save_captured_experts(indices=...)` methods.
- main moved to module-level helpers (`get_global_experts_capturer`,
`issue_routing_d2h_copy`, `extract_routed_experts_for_current_batch`,
`free_routing_buffers`, `init_routed_experts_capturer_with_shared_cache`).
"""

from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np
import torch
from vllm.model_executor.layers.fused_moe import routed_experts_capturer as _rec

from vllm_ascend.utils import vllm_version_is

if TYPE_CHECKING:
from vllm.v1.core.sched.output import SchedulerOutput

USE_LEGACY_API = vllm_version_is("0.20.2")


def get_capturer():
    """Look up vLLM's global routed-experts capturer.

    Which lookup is used depends on ``USE_LEGACY_API``:

    - legacy (0.20.2): ``RoutedExpertsCapturer.get_instance()``
    - otherwise (main): module-level ``get_global_experts_capturer()``

    Returns:
        The capturer object, or ``None`` when no capturer has been set up.
    """
    if not USE_LEGACY_API:
        # Newer vLLM exposes the lookup as a module-level helper.
        return _rec.get_global_experts_capturer()
    return _rec.RoutedExpertsCapturer.get_instance()


def clear_step_buffers(scheduler_output: SchedulerOutput) -> None:
    """Release routing buffers for the step described by ``scheduler_output``.

    - main: forwards the finished request ids, plus the preempted request
      ids when the scheduler output carries them (``None`` otherwise), to
      ``free_routing_buffers``.
    - 0.20.2: calls ``clear_buffer()`` on the global capturer, if one
      exists; ``scheduler_output`` is not consulted on this path.

    Args:
        scheduler_output: Scheduler output for the current step.
    """
    if not USE_LEGACY_API:
        finished_ids = scheduler_output.finished_req_ids
        # Not every SchedulerOutput version has this attribute, hence getattr.
        preempted_ids = getattr(scheduler_output, "preempted_req_ids", None)
        _rec.free_routing_buffers(finished_ids, preempted_ids)
        return

    legacy_capturer = get_capturer()
    if legacy_capturer is None:
        # Nothing was initialized, so there is nothing to reset.
        return
    legacy_capturer.clear_buffer()


def issue_d2h_copy(
    *,
    input_batch_req_ids: list[str],
    num_scheduled_tokens: dict[str, int],
    positions: torch.Tensor,
    positions_cpu: torch.Tensor | None,
    legacy_indices: torch.Tensor | None = None,
) -> None:
    """Kick off this step's device-to-host copy of routed experts.

    - main: forwards the batch description to ``issue_routing_d2h_copy``.
    - 0.20.2: calls ``save_captured_experts(indices=legacy_indices)`` on the
      global capturer, if one exists.

    Args:
        input_batch_req_ids: Forwarded to vLLM on the main path only.
        num_scheduled_tokens: Forwarded to vLLM on the main path only.
        positions: Forwarded to vLLM on the main path only.
        positions_cpu: Forwarded to vLLM on the main path only.
        legacy_indices: Passed as ``indices`` on the 0.20.2 path only.
    """
    if not USE_LEGACY_API:
        _rec.issue_routing_d2h_copy(
            input_batch_req_ids=input_batch_req_ids,
            num_scheduled_tokens=num_scheduled_tokens,
            positions=positions,
            positions_cpu=positions_cpu,
        )
        return

    legacy_capturer = get_capturer()
    if legacy_capturer is None:
        # No capturer initialized: silently skip the copy.
        return
    legacy_capturer.save_captured_experts(indices=legacy_indices)


def extract_for_current_batch(
    *,
    req_ids: list[str],
    requests: dict,
    req_id_to_index: dict[str, int],
    num_tokens_no_spec: np.ndarray,
    max_model_len: int,
) -> dict[str, np.ndarray] | None:
    """Collect routing data for the requests in the current batch.

    - main: delegates to ``extract_routed_experts_for_current_batch`` with
      all arguments passed through by keyword.
    - 0.20.2: always returns ``None``; on that version the routing data is
      delivered through ``save_captured_experts`` instead.

    Returns:
        Whatever vLLM's extractor returns on main, ``None`` on 0.20.2.
    """
    if not USE_LEGACY_API:
        return _rec.extract_routed_experts_for_current_batch(
            req_ids=req_ids,
            requests=requests,
            req_id_to_index=req_id_to_index,
            num_tokens_no_spec=num_tokens_no_spec,
            max_model_len=max_model_len,
        )
    # Legacy API has no per-batch extraction step.
    return None


def call_capture(capturer, *, layer_id: int, topk_ids: torch.Tensor) -> None:
    """Record ``topk_ids`` for ``layer_id`` on ``capturer``, if there is one.

    ``capture(layer_id=..., topk_ids=...)`` has the same shape on both
    0.20.2 and main, so no version switch is needed; the helper exists so
    callers do not have to repeat the ``None`` check.

    Args:
        capturer: Capturer instance, or ``None`` to make this a no-op.
        layer_id: Passed to ``capture`` as ``layer_id``.
        topk_ids: Passed to ``capture`` as ``topk_ids``.
    """
    if capturer is not None:
        capturer.capture(layer_id=layer_id, topk_ids=topk_ids)
Loading
Loading