Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/_e2e_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ on:
continue_on_error:
required: false
type: boolean
default: false
default: true
# The following inputs are used by comment-triggered E2E tests (/e2e <tests>).
# They carry space-separated pytest paths, categorized by runner type.
# Leave empty (default) when running label-triggered full/light suites.
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
vllm_version: [132765e3560659ff63ebd236203672e991b70e08]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: d886c26d4d4fef7d079696beb4ece1cfb4b008a8
vllm: 132765e3560659ff63ebd236203672e991b70e08
changes:
runs-on: linux-aarch64-a2b3-0
container:
Expand Down Expand Up @@ -154,7 +154,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }}
strategy:
matrix:
vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
vllm_version: [132765e3560659ff63ebd236203672e991b70e08]
uses: ./.github/workflows/_optional_smart_e2e.yaml
with:
vllm: ${{ matrix.vllm_version }}
Expand All @@ -164,7 +164,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
vllm_version: [132765e3560659ff63ebd236203672e991b70e08]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
Expand Down
52 changes: 44 additions & 8 deletions .github/workflows/scripts/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,11 @@ e2e-singlecard:
estimated_time: 222
- name: tests/e2e/singlecard/test_qwen3_multi_loras.py
estimated_time: 100
- name: tests/e2e/singlecard/test_models.py
estimated_time: 315
- name: tests/e2e/singlecard/test_models.py::test_minicpm
estimated_time: 158
- name: tests/e2e/singlecard/test_models.py::test_whisper
estimated_time: 157
is_skipped: true
- name: tests/e2e/singlecard/test_multistream_overlap_shared_expert.py
estimated_time: 253
- name: tests/e2e/singlecard/test_quantization.py
Expand Down Expand Up @@ -110,6 +113,7 @@ e2e-multicard-2-cards:
estimated_time: 178
- name: tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_w4a8_accuracy_tp2
estimated_time: 127
is_skipped: true
- name: tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_moe_fc2_tp2
estimated_time: 149
- name: tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek_v2_lite_fc1_tp2
Expand All @@ -128,8 +132,17 @@ e2e-multicard-2-cards:
estimated_time: 400
- name: tests/e2e/multicard/2-cards/test_quantization.py
estimated_time: 482
- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py
estimated_time: 974
- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_distributed_mp_tp2_ep
estimated_time: 195
- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_w8a8_distributed_tp2
estimated_time: 195
- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_distributed_aiv_tp2
estimated_time: 195
- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_distributed_tp2_ep2_mrv2
estimated_time: 195
is_skipped: true
- name: tests/e2e/multicard/2-cards/test_qwen3_moe.py::test_qwen3_moe_w8a8_distributed_tp2_ep_dynamic_eplb
estimated_time: 194
- name: tests/e2e/multicard/2-cards/test_qwen3_moe_routing_replay.py
estimated_time: 193
- name: tests/e2e/multicard/2-cards/test_single_request_aclgraph.py
Expand All @@ -149,12 +162,35 @@ e2e-multicard-4-cards:
estimated_time: 322
- name: tests/e2e/multicard/4-cards/test_kimi_k2.py
estimated_time: 37
- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py
estimated_time: 1287
- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py::test_models_long_sequence_output_between_tp_and_cp
estimated_time: 257
is_skipped: true
- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py::test_accuracy_dcp_only_graph
estimated_time: 257
is_skipped: true
- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py::test_accuracy_dcp_only_eager
estimated_time: 257
is_skipped: true
- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py::test_accuracy_pcp_only
estimated_time: 257
is_skipped: true
- name: tests/e2e/multicard/4-cards/long_sequence/test_accuracy.py::test_models_long_sequence_cp_kv_interleave_size_output_between_tp_and_cp
estimated_time: 259
- name: tests/e2e/multicard/4-cards/long_sequence/test_basic.py
estimated_time: 2179
- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py
estimated_time: 1173
- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py::test_models_chunked_prefill_mixed_length_prompts_including_1_token
estimated_time: 235
- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py::test_models_chunked_prefill_with_empty_kvcache
estimated_time: 235
- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py::test_models_chunked_prefill_with_cp_basic
estimated_time: 235
is_skipped: true
- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py::test_models_chunked_prefill_with_cp_piecewise
estimated_time: 235
is_skipped: true
- name: tests/e2e/multicard/4-cards/long_sequence/test_chunked_prefill_cp.py::test_models_chunked_prefill_with_cp_full_graph
estimated_time: 233
is_skipped: true
- name: tests/e2e/multicard/4-cards/long_sequence/test_prefix_caching_cp.py
estimated_time: 850
- name: tests/e2e/multicard/4-cards/long_sequence/test_mtp.py
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
# CANN image tag
"cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11",
# vLLM commit hash for main branch
"main_vllm_commit": "d886c26d4d4fef7d079696beb4ece1cfb4b008a8",
"main_vllm_commit": "132765e3560659ff63ebd236203672e991b70e08",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The Pull Request title and summary do not adhere to the repository style guide. Please update them to follow the required format.

Suggested PR Title:

[Main2Main][Misc][Misc] Upgrade vLLM to 0.20.1

Suggested PR Summary:

### What this PR does / why we need it?
This PR upgrades the vLLM dependency to version 0.20.1. It includes necessary adaptations for Ascend-specific operators (FusedMoE, LayerNorm), worker logic, and speculative decoding components to maintain compatibility with the updated vLLM core.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Tested with existing unit tests and new tests for MoE logical experts and Eagle proposer.
References
  1. The PR Title and Summary must follow specific formats defined in the Repository Style Guide. (link)

# vLLM tag for main branch
"main_vllm_tag": "v0.19.1",
# Python version for main branch
Expand Down
9 changes: 4 additions & 5 deletions tests/ut/_310p/fused_moe/test_shared_fused_moe_310.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

from vllm_ascend._310p.fused_moe.fused_moe import (
AscendFusedMoE310,
AscendSharedFusedMoE310,
)


Expand Down Expand Up @@ -48,8 +47,8 @@ def forward(self, hidden_states: torch.Tensor):
return out


def _build_layer(shared_experts: torch.nn.Module | None) -> AscendSharedFusedMoE310:
layer = AscendSharedFusedMoE310.__new__(AscendSharedFusedMoE310)
def _build_layer(shared_experts: torch.nn.Module | None) -> AscendFusedMoE310:
layer = AscendFusedMoE310.__new__(AscendFusedMoE310)
# The test bypasses full layer init with __new__, so we must initialize
# nn.Module internals before assigning child modules.
torch.nn.Module.__init__(layer)
Expand Down Expand Up @@ -80,7 +79,7 @@ def test_forward_impl_with_shared_experts_returns_tuple_310():
routed_out = torch.randn(3, 8)

with patch.object(AscendFusedMoE310, "forward_impl", return_value=routed_out):
shared_out, routed = layer.forward_impl(hidden_states, router_logits)
shared_out, routed = layer.shared_forward_impl(hidden_states, router_logits)

expected_shared = 0.5 * (hidden_states * 2.0 + 1.0)
torch.testing.assert_close(shared_out, expected_shared)
Expand All @@ -100,7 +99,7 @@ def test_forward_impl_without_shared_experts_returns_routed_only_310():
routed_out = torch.randn(3, 8)

with patch.object(AscendFusedMoE310, "forward_impl", return_value=routed_out):
output = layer.forward_impl(hidden_states, router_logits)
output = layer.shared_forward_impl(hidden_states, router_logits)

torch.testing.assert_close(output, routed_out)

Expand Down
6 changes: 6 additions & 0 deletions tests/ut/ops/test_fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,12 @@ def moe_method(mock_dist_env):
return AscendUnquantizedFusedMoEMethod(moe)


def test_ascend_unquantized_skips_upstream_modular_kernel_init():
method = AscendUnquantizedFusedMoEMethod.maybe_make_prepare_finalize

assert method(object()) is None


class Device(TypedDict):
device_id: int
device_expert: list[int]
Expand Down
41 changes: 33 additions & 8 deletions tests/ut/spec_decode/test_eagle_proposer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,17 @@
from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.spec_decode.draft_proposer import AscendDraftModelProposer
from vllm_ascend.spec_decode.eagle_proposer import AscendEagleProposer
from vllm_ascend.utils import vllm_version_is

# vLLM #40732 moved `SpecDecodeBaseProposer` (and its `CpuGpuBuffer` import)
# out of `vllm.v1.spec_decode.eagle` into `vllm.v1.spec_decode.llm_base_proposer`.
# Pick the right patch path depending on the installed vllm version so the
# tests can mock the buffer factory.
_CPU_GPU_BUFFER_TARGET = (
"vllm.v1.spec_decode.eagle.CpuGpuBuffer"
if vllm_version_is("0.19.1")
else "vllm.v1.spec_decode.llm_base_proposer.CpuGpuBuffer"
)


class TestEagleProposerInitialization(TestBase):
Expand Down Expand Up @@ -51,13 +62,15 @@ def setUp(self):
self.vllm_config.parallel_config.enable_expert_parallel = False
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
self.vllm_config.speculative_config.num_speculative_tokens = 2
self.vllm_config.speculative_config.parallel_drafting = False
self.vllm_config.speculative_config.speculative_token_tree = str([(i + 1) * (0,) for i in range(2)])
self.vllm_config.speculative_config.draft_model_config.hf_config = MagicMock(spec=[])
self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
self.vllm_config.speculative_config.disable_padded_drafter_batch = False
self.vllm_config.additional_config = None

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
Expand All @@ -76,6 +89,7 @@ def tearDown(self):
def test_initialization_eagle_graph(self):
self.vllm_config.speculative_config.method = "eagle"
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 4096
self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
self.vllm_config.model_config.enforce_eager = False
Expand All @@ -99,6 +113,7 @@ def test_initialization_eagle_graph(self):
def test_initialization_eagle3_enforce_eager(self):
self.vllm_config.speculative_config.method = "eagle3"
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 2048
self.vllm_config.compilation_config.mode = CompilationMode.NONE
self.vllm_config.compilation_config.pass_config = MagicMock()
self.vllm_config.compilation_config.pass_config.enable_sp = False
Expand All @@ -116,6 +131,7 @@ def test_initialization_eagle3_enforce_eager(self):
def test_initialization_eagle3_full_graph_async(self):
self.vllm_config.speculative_config.method = "eagle3"
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 2048
self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
self.vllm_config.model_config.enforce_eager = False
self.vllm_config.speculative_config.enforce_eager = False
Expand All @@ -133,6 +149,7 @@ def test_initialization_eagle3_full_graph_async(self):
def test_initialization_mtp_full_graph_async(self):
self.vllm_config.speculative_config.method = "mtp"
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 2048
self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
self.vllm_config.model_config.enforce_eager = False
self.vllm_config.speculative_config.enforce_eager = False
Expand Down Expand Up @@ -196,7 +213,7 @@ def setUp(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
Expand Down Expand Up @@ -332,7 +349,7 @@ def setUp(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
Expand Down Expand Up @@ -483,7 +500,7 @@ def setUp(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
Expand Down Expand Up @@ -558,7 +575,7 @@ def setUp_and_tearDown(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
Expand Down Expand Up @@ -1263,7 +1280,7 @@ def setUp(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer", MockCpuGpuBuffer)
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET, MockCpuGpuBuffer)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
Expand Down Expand Up @@ -1747,6 +1764,7 @@ def setUp(self):
self.vllm_config.speculative_config.use_local_argmax_reduction = False
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
self.vllm_config.speculative_config.speculative_token_tree = str([(i + 1) * (0,) for i in range(3)])
self.vllm_config.speculative_config.draft_model_config.hf_config = MagicMock(spec=[])
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4
self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 4
self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
Expand All @@ -1755,7 +1773,7 @@ def setUp(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer", MockCpuGpuBuffer)
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET, MockCpuGpuBuffer)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
Expand Down Expand Up @@ -1876,7 +1894,14 @@ def check_mock(self):

import vllm.v1.spec_decode.eagle

assert hasattr(vllm.v1.spec_decode.eagle, "CpuGpuBuffer")
# `CpuGpuBuffer` was re-exported from `eagle` until vLLM #40732 moved
# `SpecDecodeBaseProposer` (and the import) into `llm_base_proposer`.
if vllm_version_is("0.19.1"):
assert hasattr(vllm.v1.spec_decode.eagle, "CpuGpuBuffer")
else:
import vllm.v1.spec_decode.llm_base_proposer

assert hasattr(vllm.v1.spec_decode.llm_base_proposer, "CpuGpuBuffer")
RunnerCls = vllm.v1.spec_decode.eagle.SpecDecodeBaseProposer
for attr in ("_get_positions", "_set_positions"):
assert hasattr(RunnerCls, attr), f"SpecDecodeBaseProposer.{attr} not found"
Expand Down
Loading
Loading