Merged
39 commits
70567d7 remove shared_fused_moe (wxsIcey, Apr 29, 2026)
1c55710 remove shared FusedMoE (wxsIcey, Apr 30, 2026)
01382c2 main2main (shen-shanshan, Apr 30, 2026)
db3f6a9 fix lint (gcanlin, May 1, 2026)
1bf04dc test (gcanlin, May 1, 2026)
f9f7a50 test (gcanlin, May 1, 2026)
487e968 fix (gcanlin, May 1, 2026)
435473d fix (gcanlin, May 1, 2026)
497d720 fix (gcanlin, May 2, 2026)
7779044 fix lint (gcanlin, May 2, 2026)
48c868c fix CpuGpuBuffer ut error (gcanlin, May 2, 2026)
cf4b9a9 [BugFix] Fix Ascend MoE routing expert count with EPLB (gcanlin, May 2, 2026)
24a0ee3 fix (gcanlin, May 2, 2026)
13065cd fix: #36286 breaks test_qwen3_ome_tp2_ep2_mrv2 (gcanlin, May 2, 2026)
1432105 fix ut (gcanlin, May 2, 2026)
39dca23 fix(spec_decode): adapt EagleAclGraphManager.capture() for vLLM PR #4… (gcanlin, May 2, 2026)
eed1df5 fix(model_runner): add seq_lens_cpu_upper_bound to AscendInputBatch f… (gcanlin, May 2, 2026)
cf5afec fix(kv_cache): support hybrid KV cache groups + context parallelism o… (gcanlin, May 2, 2026)
5da94b2 revert to 4d51588e2 (gcanlin, May 2, 2026)
dfb11e2 revert 40410 (gcanlin, May 2, 2026)
11dac22 remove 0.19.1 in CI tests (gcanlin, May 2, 2026)
3bd8aa0 remove chinese comments (gcanlin, May 2, 2026)
0e0b8dd fix conf.py (gcanlin, May 2, 2026)
29c6bf1 fix 310p shared fused moe (gcanlin, May 2, 2026)
3217c26 fix base_quant_method bug (gcanlin, May 2, 2026)
a8ca2de patch kv_utils (gcanlin, May 3, 2026)
ed5fb9d fix seq_lens_cpu_upper_bound (shen-shanshan, May 6, 2026)
41b57b0 update (shen-shanshan, May 6, 2026)
43c952f fix moe (shen-shanshan, May 6, 2026)
d60381a remove 0.19.1 wrapper (shen-shanshan, May 6, 2026)
d21e2c7 update commit (shen-shanshan, May 7, 2026)
760b885 update commit (shen-shanshan, May 7, 2026)
1a7f295 update commit (shen-shanshan, May 7, 2026)
b5ee4f3 fix qwen3-moe acc (shen-shanshan, May 7, 2026)
9c9f45d update commit (shen-shanshan, May 7, 2026)
70ffd0d regtriger ci (shen-shanshan, May 8, 2026)
a20363a update (shen-shanshan, May 8, 2026)
8311d1f disable continue_on_error (shen-shanshan, May 8, 2026)
64649fe update (shen-shanshan, May 8, 2026)
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -80,7 +80,7 @@ jobs:
name: e2e-full
strategy:
matrix:
vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
vllm: d886c26d4d4fef7d079696beb4ece1cfb4b008a8
vllm: 4d51588e2381018348f1022dfa3a7698899805b7
changes:
runs-on: linux-aarch64-a2b3-0
container:
@@ -154,7 +154,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }}
strategy:
matrix:
vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
uses: ./.github/workflows/_optional_smart_e2e.yaml
with:
vllm: ${{ matrix.vllm_version }}
@@ -164,7 +164,7 @@ jobs:
name: e2e-light
strategy:
matrix:
vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8]
vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
# Note (yikun): If CI resources are limited we can split the job into two chained jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
1 change: 1 addition & 0 deletions .gitignore
@@ -210,6 +210,7 @@ kernel_meta/
# generated by CANN
fusion_result.json
csrc/output/
csrc/third_party/

# claude code skills
.claude/skills/*
2 changes: 1 addition & 1 deletion csrc/third_party/catlass
Submodule catlass updated from b50cad to 716fd7
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -81,7 +81,7 @@
# CANN image tag
"cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11",
# vLLM commit hash for main branch
"main_vllm_commit": "d886c26d4d4fef7d079696beb4ece1cfb4b008a8",
"main_vllm_commit": "4d51588e2381018348f1022dfa3a7698899805b7",
# vLLM tag for main branch
"main_vllm_tag": "v0.19.1",
# Python version for main branch
9 changes: 4 additions & 5 deletions tests/ut/_310p/fused_moe/test_shared_fused_moe_310.py
@@ -20,7 +20,6 @@

from vllm_ascend._310p.fused_moe.fused_moe import (
AscendFusedMoE310,
AscendSharedFusedMoE310,
)


@@ -48,8 +47,8 @@ def forward(self, hidden_states: torch.Tensor):
return out


def _build_layer(shared_experts: torch.nn.Module | None) -> AscendSharedFusedMoE310:
layer = AscendSharedFusedMoE310.__new__(AscendSharedFusedMoE310)
def _build_layer(shared_experts: torch.nn.Module | None) -> AscendFusedMoE310:
layer = AscendFusedMoE310.__new__(AscendFusedMoE310)
# The test bypasses full layer init with __new__, so we must initialize
# nn.Module internals before assigning child modules.
torch.nn.Module.__init__(layer)
@@ -80,7 +79,7 @@ def test_forward_impl_with_shared_experts_returns_tuple_310():
routed_out = torch.randn(3, 8)

with patch.object(AscendFusedMoE310, "forward_impl", return_value=routed_out):
shared_out, routed = layer.forward_impl(hidden_states, router_logits)
shared_out, routed = layer.shared_forward_impl(hidden_states, router_logits)

expected_shared = 0.5 * (hidden_states * 2.0 + 1.0)
torch.testing.assert_close(shared_out, expected_shared)
@@ -100,7 +99,7 @@ def test_forward_impl_without_shared_experts_returns_routed_only_310():
routed_out = torch.randn(3, 8)

with patch.object(AscendFusedMoE310, "forward_impl", return_value=routed_out):
output = layer.forward_impl(hidden_states, router_logits)
output = layer.shared_forward_impl(hidden_states, router_logits)

torch.testing.assert_close(output, routed_out)

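A minimal, self-contained sketch (toy code, not from this PR) of the construction trick the tests above rely on: __new__ bypasses the expensive layer __init__, so torch.nn.Module.__init__ must be called by hand before any child module is assigned.

import torch

class Heavy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        raise RuntimeError("expensive init we want to bypass in unit tests")

# __new__ allocates the instance without running Heavy.__init__.
layer = Heavy.__new__(Heavy)
# Module.__init__ sets up _parameters/_buffers/_modules; without it,
# assigning a child module below would raise an AttributeError.
torch.nn.Module.__init__(layer)
layer.gate = torch.nn.Linear(8, 4)
assert "gate" in layer._modules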
6 changes: 6 additions & 0 deletions tests/ut/ops/test_fused_moe.py
@@ -236,6 +236,12 @@ def moe_method(mock_dist_env):
return AscendUnquantizedFusedMoEMethod(moe)


def test_ascend_unquantized_skips_upstream_modular_kernel_init():
method = AscendUnquantizedFusedMoEMethod.maybe_make_prepare_finalize

assert method(object()) is None


class Device(TypedDict):
device_id: int
device_expert: list[int]
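A hedged sketch of what the new test above guards. Assuming FusedMoE.maybe_init_modular_kernel (named in the fused_moe.py comments later in this diff) only builds the upstream modular kernel when the quant method hands back a prepare/finalize object, returning None from maybe_make_prepare_finalize keeps the Ascend forward path in charge. Toy code under that assumption, not vLLM's actual implementation:

class ToyMethod:
    # Mirrors AscendUnquantizedFusedMoEMethod's override: never supply a
    # prepare/finalize object to the upstream modular-kernel machinery.
    def maybe_make_prepare_finalize(self, routing_tables=None):
        return None

def maybe_init_modular_kernel(method):
    # Assumed simplification of the upstream guard: no prepare/finalize
    # object, no modular kernel.
    prepare_finalize = method.maybe_make_prepare_finalize()
    return None if prepare_finalize is None else ("modular_kernel", prepare_finalize)

assert maybe_init_modular_kernel(ToyMethod()) is None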
41 changes: 33 additions & 8 deletions tests/ut/spec_decode/test_eagle_proposer.py
@@ -17,6 +17,17 @@
from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.spec_decode.draft_proposer import AscendDraftModelProposer
from vllm_ascend.spec_decode.eagle_proposer import AscendEagleProposer
from vllm_ascend.utils import vllm_version_is

# vLLM #40732 moved `SpecDecodeBaseProposer` (and its `CpuGpuBuffer` import)
# out of `vllm.v1.spec_decode.eagle` into `vllm.v1.spec_decode.llm_base_proposer`.
# Pick the right patch path depending on the installed vllm version so the
# tests can mock the buffer factory.
_CPU_GPU_BUFFER_TARGET = (
"vllm.v1.spec_decode.eagle.CpuGpuBuffer"
if vllm_version_is("0.19.1")
else "vllm.v1.spec_decode.llm_base_proposer.CpuGpuBuffer"
)


class TestEagleProposerInitialization(TestBase):
@@ -51,13 +62,15 @@ def setUp(self):
self.vllm_config.parallel_config.enable_expert_parallel = False
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
self.vllm_config.speculative_config.num_speculative_tokens = 2
self.vllm_config.speculative_config.parallel_drafting = False
self.vllm_config.speculative_config.speculative_token_tree = str([(i + 1) * (0,) for i in range(2)])
self.vllm_config.speculative_config.draft_model_config.hf_config = MagicMock(spec=[])
self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
self.vllm_config.speculative_config.disable_padded_drafter_batch = False
self.vllm_config.additional_config = None

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
@@ -76,6 +89,7 @@ def tearDown(self):
def test_initialization_eagle_graph(self):
self.vllm_config.speculative_config.method = "eagle"
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 4096
self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
self.vllm_config.model_config.enforce_eager = False
@@ -99,6 +113,7 @@ def test_initialization_eagle3_enforce_eager(self):
def test_initialization_eagle3_enforce_eager(self):
self.vllm_config.speculative_config.method = "eagle3"
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 2048
self.vllm_config.compilation_config.mode = CompilationMode.NONE
self.vllm_config.compilation_config.pass_config = MagicMock()
self.vllm_config.compilation_config.pass_config.enable_sp = False
@@ -116,6 +131,7 @@ def test_initialization_eagle3_full_graph_async(self):
def test_initialization_eagle3_full_graph_async(self):
self.vllm_config.speculative_config.method = "eagle3"
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 2048
self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
self.vllm_config.model_config.enforce_eager = False
self.vllm_config.speculative_config.enforce_eager = False
@@ -133,6 +149,7 @@ def test_initialization_mtp_full_graph_async(self):
def test_initialization_mtp_full_graph_async(self):
self.vllm_config.speculative_config.method = "mtp"
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 2048
self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
self.vllm_config.model_config.enforce_eager = False
self.vllm_config.speculative_config.enforce_eager = False
@@ -196,7 +213,7 @@ def setUp(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
@@ -332,7 +349,7 @@ def setUp(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
@@ -483,7 +500,7 @@ def setUp(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
@@ -558,7 +575,7 @@ def setUp_and_tearDown(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
@@ -1263,7 +1280,7 @@ def setUp(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer", MockCpuGpuBuffer)
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET, MockCpuGpuBuffer)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
@@ -1747,6 +1764,7 @@ def setUp(self):
self.vllm_config.speculative_config.use_local_argmax_reduction = False
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
self.vllm_config.speculative_config.speculative_token_tree = str([(i + 1) * (0,) for i in range(3)])
self.vllm_config.speculative_config.draft_model_config.hf_config = MagicMock(spec=[])
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4
self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 4
self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
@@ -1755,7 +1773,7 @@ def setUp(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer", MockCpuGpuBuffer)
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET, MockCpuGpuBuffer)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
@@ -1876,7 +1894,14 @@ def check_mock(self):

import vllm.v1.spec_decode.eagle

assert hasattr(vllm.v1.spec_decode.eagle, "CpuGpuBuffer")
# `CpuGpuBuffer` was re-exported from `eagle` until vLLM #40732 moved
# `SpecDecodeBaseProposer` (and the import) into `llm_base_proposer`.
if vllm_version_is("0.19.1"):
assert hasattr(vllm.v1.spec_decode.eagle, "CpuGpuBuffer")
else:
import vllm.v1.spec_decode.llm_base_proposer

assert hasattr(vllm.v1.spec_decode.llm_base_proposer, "CpuGpuBuffer")
RunnerCls = vllm.v1.spec_decode.eagle.SpecDecodeBaseProposer
for attr in ("_get_positions", "_set_positions"):
assert hasattr(RunnerCls, attr), f"SpecDecodeBaseProposer.{attr} not found"
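A short sketch of the version-gated patch-target pattern this file now uses. cpu_gpu_buffer_target is a hypothetical helper for illustration, not code from this PR; the point is that unittest.mock.patch replaces a name where it is looked up, so the target string has to follow CpuGpuBuffer across the vLLM #40732 move.

def cpu_gpu_buffer_target(installed_version: str) -> str:
    # Pre-#40732 releases re-export CpuGpuBuffer from spec_decode.eagle;
    # later ones hold it in spec_decode.llm_base_proposer.
    if installed_version == "0.19.1":
        return "vllm.v1.spec_decode.eagle.CpuGpuBuffer"
    return "vllm.v1.spec_decode.llm_base_proposer.CpuGpuBuffer"

# Usage mirrors the setUp() methods above:
#     buffer_patch = patch(cpu_gpu_buffer_target(installed_version))
#     buffer_patch.start()   # ...and buffer_patch.stop() in tearDown()
assert cpu_gpu_buffer_target("0.19.1").endswith("eagle.CpuGpuBuffer")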
75 changes: 17 additions & 58 deletions vllm_ascend/_310p/fused_moe/fused_moe.py
@@ -20,7 +20,6 @@
from vllm.distributed import get_dp_group, get_ep_group, get_tp_group
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod
from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE

from vllm_ascend.ascend_forward_context import _EXTRA_CTX, MoECommType
from vllm_ascend.ops.fused_moe.experts_selector import zero_experts_compute
@@ -40,6 +39,11 @@ def __init__(self, moe: FusedMoEConfig = None):
def is_monolithic(self) -> bool:
return False

def maybe_make_prepare_finalize(self, routing_tables=None):
# Ascend 310P uses its own MoE communication and forward_impl path.
# Do not let upstream modular-kernel initialization replace it.
return None

def process_weights_after_loading(self, layer):
super().process_weights_after_loading(layer)

@@ -119,6 +123,8 @@ class AscendFusedMoE310(FusedMoE):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

self._routed_input_transform = kwargs.get("routed_input_transform")
self._shared_experts = kwargs.get("shared_experts")
self.global_num_experts = kwargs["num_experts"]

if self.quant_config is None:
@@ -127,6 +133,10 @@ def __init__(self, *args, **kwargs):
self.quant_method = self.quant_config.get_quant_method(self, self.layer_name)

assert self.quant_method is not None
# Keep base_quant_method aligned with the Ascend-replaced quant_method
# so FusedMoE.maybe_init_modular_kernel doesn't dispatch into the
# upstream UnquantizedFusedMoEMethod.maybe_make_prepare_finalize.
self.base_quant_method = self.quant_method

self.moe_config.tp_group = get_tp_group()
self.moe_config.dp_group = get_dp_group()
@@ -175,6 +185,11 @@ def __init__(self, *args, **kwargs):
self.vllm_config.parallel_config.enable_dbo,
)

@property
def is_internal_router(self) -> bool:
# 310P Ascend path expects router logits from the model forward path.
return False

def init_experts_map(self, moe_config):
"""
Initialize expert mapping for MoE (Mixture of Experts) model.
@@ -260,68 +275,12 @@

return routed_out


class AscendSharedFusedMoE310(SharedFusedMoE, AscendFusedMoE310):
def __init__(
self,
shared_experts: torch.nn.Module,
gate: torch.nn.Module | None = None,
use_overlapped: bool = True,
routed_input_transform: torch.nn.Module | None = None,
**kwargs,
):
AscendFusedMoE310.__init__(self, **kwargs)
self._routed_input_transform = routed_input_transform
self._shared_experts = shared_experts
self.use_overlapped = use_overlapped
self.shared_expert_stream = None
self._gate = gate
# Recreate runner after shared_experts/gate are set so custom op dispatch
# goes through moe_forward_shared.
# NOTE: must use self._shared_experts here, not self.shared_experts —
# FusedMoE.shared_experts is a property that reads self.runner.shared_experts,
# which at this point is still the stale runner built with shared_experts=None.
from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner

self.runner = AscendMoERunner(
self.layer_name,
self.moe_config,
self.router,
self._routed_input_transform,
self._gate,
self._shared_experts,
self.quant_method,
self.reduce_results,
self.vllm_config.parallel_config.enable_dbo,
)

@property
def is_internal_router(self) -> bool:
# 310P Ascend path expects router logits from the model forward path.
return False

def forward(
self,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
result = AscendFusedMoE310.forward(
self,
hidden_states=hidden_states,
router_logits=router_logits,
)
# When shared experts are absent, the parent returns only fused_out;
# otherwise it returns a (shared_out, fused_out) tuple.
if self._shared_experts is None:
return None, result
return result

def _forward_shared_experts(self, hidden_states: torch.Tensor):
if self._shared_experts is None:
return None
return self._shared_experts(hidden_states)

def forward_impl( # type: ignore[override]
def shared_forward_impl( # type: ignore[override]
self, hidden_states: torch.Tensor, router_logits: torch.Tensor
):
routed_out = AscendFusedMoE310.forward_impl(
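With AscendSharedFusedMoE310 removed, AscendFusedMoE310 itself owns the optional _shared_experts module and a shared_forward_impl. A toy sketch (assumptions, not vllm-ascend code) of the contract the 310P tests earlier in this diff exercise: a (shared_out, routed_out) tuple when shared experts exist, the routed output alone otherwise.

import torch

class ToyFusedMoE(torch.nn.Module):
    def __init__(self, shared_experts=None):
        super().__init__()
        self._shared_experts = shared_experts

    def forward_impl(self, hidden_states, router_logits):
        # Stand-in for the routed-expert computation.
        return hidden_states * 2.0

    def _forward_shared_experts(self, hidden_states):
        if self._shared_experts is None:
            return None
        return self._shared_experts(hidden_states)

    def shared_forward_impl(self, hidden_states, router_logits):
        routed = self.forward_impl(hidden_states, router_logits)
        shared = self._forward_shared_experts(hidden_states)
        # Without shared experts only the routed output is returned.
        return routed if shared is None else (shared, routed)

x = torch.randn(3, 8)
assert isinstance(ToyFusedMoE(torch.nn.Identity()).shared_forward_impl(x, None), tuple)
assert torch.equal(ToyFusedMoE().shared_forward_impl(x, None), x * 2.0)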