30 commits
edeeb6f remove shared_fused_moe (wxsIcey, Apr 29, 2026)
67a1011 remove shared FusedMoE (wxsIcey, Apr 30, 2026)
0852239 main2main (shen-shanshan, Apr 30, 2026)
109bb18 fix lint (gcanlin, May 1, 2026)
c0102e5 test (gcanlin, May 1, 2026)
a3dda42 test (gcanlin, May 1, 2026)
9a42bbb fix (gcanlin, May 1, 2026)
c8c5013 fix (gcanlin, May 1, 2026)
4499ed4 fix (gcanlin, May 2, 2026)
68baeeb fix lint (gcanlin, May 2, 2026)
3e4592f fix CpuGpuBuffer ut error (gcanlin, May 2, 2026)
7052dc3 [BugFix] Fix Ascend MoE routing expert count with EPLB (gcanlin, May 2, 2026)
d31d4c4 fix (gcanlin, May 2, 2026)
2f7e3ec Merge branch 'moe-bugfix' into pr-8841 (gcanlin, May 2, 2026)
ec00026 fix: #36286 breaks test_qwen3_ome_tp2_ep2_mrv2 (gcanlin, May 2, 2026)
830e7df fix ut (gcanlin, May 2, 2026)
dc7aa0b Merge branch 'main' into pr-8841 (gcanlin, May 2, 2026)
724a341 fix (gcanlin, May 2, 2026)
55059ee Merge branch 'patch-fix' into pr-8841 (gcanlin, May 2, 2026)
aa67fb0 fix(spec_decode): adapt EagleAclGraphManager.capture() for vLLM PR #4… (gcanlin, May 2, 2026)
28c4f69 fix(model_runner): add seq_lens_cpu_upper_bound to AscendInputBatch f… (gcanlin, May 2, 2026)
843da5c fix(kv_cache): support hybrid KV cache groups + context parallelism o… (gcanlin, May 2, 2026)
6391bbb revert to 4d51588e2 (gcanlin, May 2, 2026)
6e8e7f9 revert 40410 (gcanlin, May 2, 2026)
325e72e remove 0.19.1 in CI tests (gcanlin, May 2, 2026)
2d208fa remove chinese comments (gcanlin, May 2, 2026)
cb70a85 fix conf.py (gcanlin, May 2, 2026)
60424fc fix 310p shared fused moe (gcanlin, May 2, 2026)
f5523b9 fix base_quant_method bug (gcanlin, May 2, 2026)
a0ae6cd patch kv_utils (gcanlin, May 3, 2026)
2 changes: 1 addition & 1 deletion .github/workflows/_e2e_test.yaml
@@ -27,7 +27,7 @@ on:
continue_on_error:
required: false
type: boolean
- default: false
+ default: true
# The following inputs are used by comment-triggered E2E tests (/e2e <tests>).
# They carry space-separated pytest paths, categorized by runner type.
# Leave empty (default) when running label-triggered full/light suites.
2 changes: 1 addition & 1 deletion .github/workflows/pr_test_full.yaml
@@ -80,7 +80,7 @@ jobs:
name: e2e-full
strategy:
matrix:
- vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8, v0.19.1]
+ vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.e2e_tracker == true }}
uses: ./.github/workflows/_e2e_test.yaml
6 changes: 3 additions & 3 deletions .github/workflows/pr_test_light.yaml
@@ -41,7 +41,7 @@ jobs:
lint:
uses: ./.github/workflows/_pre_commit.yml
with:
- vllm: d886c26d4d4fef7d079696beb4ece1cfb4b008a8
+ vllm: 4d51588e2381018348f1022dfa3a7698899805b7
changes:
runs-on: linux-aarch64-a2b3-0
container:
@@ -154,7 +154,7 @@ jobs:
if: ${{ needs.lint.result == 'success' && needs.changes.outputs.has_tests == 'true' }}
strategy:
matrix:
- vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8, v0.19.1]
+ vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
uses: ./.github/workflows/_optional_smart_e2e.yaml
with:
vllm: ${{ matrix.vllm_version }}
@@ -164,7 +164,7 @@
name: e2e-light
strategy:
matrix:
- vllm_version: [d886c26d4d4fef7d079696beb4ece1cfb4b008a8, v0.19.1]
+ vllm_version: [4d51588e2381018348f1022dfa3a7698899805b7]
# Note (yikun): If CI resource are limited we can split job into two chain jobs
needs: [lint, changes]
# only trigger e2e test after lint passed and the change is e2e related with pull request.
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -81,7 +81,7 @@
# CANN image tag
"cann_image_tag": "8.5.1-910b-ubuntu22.04-py3.11",
# vLLM commit hash for main branch
"main_vllm_commit": "d886c26d4d4fef7d079696beb4ece1cfb4b008a8",
"main_vllm_commit": "4d51588e2381018348f1022dfa3a7698899805b7",
# vLLM tag for main branch
"main_vllm_tag": "v0.19.1",
# Python version for main branch
9 changes: 4 additions & 5 deletions tests/ut/_310p/fused_moe/test_shared_fused_moe_310.py
@@ -20,7 +20,6 @@

from vllm_ascend._310p.fused_moe.fused_moe import (
AscendFusedMoE310,
- AscendSharedFusedMoE310,
)


@@ -48,8 +47,8 @@ def forward(self, hidden_states: torch.Tensor):
return out


- def _build_layer(shared_experts: torch.nn.Module | None) -> AscendSharedFusedMoE310:
- layer = AscendSharedFusedMoE310.__new__(AscendSharedFusedMoE310)
+ def _build_layer(shared_experts: torch.nn.Module | None) -> AscendFusedMoE310:
+ layer = AscendFusedMoE310.__new__(AscendFusedMoE310)
# The test bypasses full layer init with __new__, so we must initialize
# nn.Module internals before assigning child modules.
torch.nn.Module.__init__(layer)
@@ -80,7 +79,7 @@ def test_forward_impl_with_shared_experts_returns_tuple_310():
routed_out = torch.randn(3, 8)

with patch.object(AscendFusedMoE310, "forward_impl", return_value=routed_out):
- shared_out, routed = layer.forward_impl(hidden_states, router_logits)
+ shared_out, routed = layer.shared_forward_impl(hidden_states, router_logits)

expected_shared = 0.5 * (hidden_states * 2.0 + 1.0)
torch.testing.assert_close(shared_out, expected_shared)
@@ -100,7 +99,7 @@ def test_forward_impl_without_shared_experts_returns_routed_only_310():
routed_out = torch.randn(3, 8)

with patch.object(AscendFusedMoE310, "forward_impl", return_value=routed_out):
- output = layer.forward_impl(hidden_states, router_logits)
+ output = layer.shared_forward_impl(hidden_states, router_logits)

torch.testing.assert_close(output, routed_out)

6 changes: 6 additions & 0 deletions tests/ut/ops/test_fused_moe.py
@@ -236,6 +236,12 @@ def moe_method(mock_dist_env):
return AscendUnquantizedFusedMoEMethod(moe)


+ def test_ascend_unquantized_skips_upstream_modular_kernel_init():
+ method = AscendUnquantizedFusedMoEMethod.maybe_make_prepare_finalize
+
+ assert method(object()) is None


class Device(TypedDict):
device_id: int
device_expert: list[int]
23 changes: 23 additions & 0 deletions tests/ut/quantization/methods/test_moe_logical_experts.py
@@ -0,0 +1,23 @@
from types import SimpleNamespace

from vllm_ascend.quantization.methods.base import get_moe_num_logical_experts


def test_get_moe_num_logical_experts_uses_vllm_config_field():
layer = SimpleNamespace(moe_config=SimpleNamespace(num_logical_experts=128))

assert get_moe_num_logical_experts(layer, num_experts=130, global_redundant_expert_num=2) == 128


def test_get_moe_num_logical_experts_falls_back_for_older_configs():
layer = SimpleNamespace(moe_config=SimpleNamespace())

assert (
get_moe_num_logical_experts(
layer,
num_experts=133,
global_redundant_expert_num=2,
num_shared_experts=3,
)
== 128
)
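
For orientation, a minimal sketch of the helper these tests exercise (an assumption — the real implementation in vllm_ascend.quantization.methods.base is not part of this diff): prefer the logical-expert count that newer vLLM configs publish on moe_config, and for older configs derive it by subtracting EPLB-redundant and shared experts from the physical count.

def get_moe_num_logical_experts(layer, num_experts, global_redundant_expert_num, num_shared_experts=0):
    # Newer vLLM configs expose the logical expert count directly.
    num_logical = getattr(layer.moe_config, "num_logical_experts", None)
    if num_logical is not None:
        return num_logical
    # Fallback for older configs: strip EPLB-redundant and shared experts,
    # e.g. 133 - 2 - 3 == 128 in the second test above.
    return num_experts - global_redundant_expert_num - num_shared_experts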
41 changes: 33 additions & 8 deletions tests/ut/spec_decode/test_eagle_proposer.py
@@ -17,6 +17,17 @@
from vllm_ascend.attention.attention_v1 import AscendAttentionState
from vllm_ascend.spec_decode.draft_proposer import AscendDraftModelProposer
from vllm_ascend.spec_decode.eagle_proposer import AscendEagleProposer
+ from vllm_ascend.utils import vllm_version_is
+
+ # vLLM #40732 moved `SpecDecodeBaseProposer` (and its `CpuGpuBuffer` import)
+ # out of `vllm.v1.spec_decode.eagle` into `vllm.v1.spec_decode.llm_base_proposer`.
+ # Pick the right patch path depending on the installed vllm version so the
+ # tests can mock the buffer factory.
+ _CPU_GPU_BUFFER_TARGET = (
+ "vllm.v1.spec_decode.eagle.CpuGpuBuffer"
+ if vllm_version_is("0.19.1")
+ else "vllm.v1.spec_decode.llm_base_proposer.CpuGpuBuffer"
+ )


class TestEagleProposerInitialization(TestBase):
@@ -51,13 +62,15 @@ def setUp(self):
self.vllm_config.parallel_config.enable_expert_parallel = False
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
self.vllm_config.speculative_config.num_speculative_tokens = 2
+ self.vllm_config.speculative_config.parallel_drafting = False
self.vllm_config.speculative_config.speculative_token_tree = str([(i + 1) * (0,) for i in range(2)])
+ self.vllm_config.speculative_config.draft_model_config.hf_config = MagicMock(spec=[])
self.vllm_config.speculative_config.draft_model_config.uses_xdrope_dim = 0
self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
self.vllm_config.speculative_config.disable_padded_drafter_batch = False
self.vllm_config.additional_config = None

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
@@ -76,6 +89,7 @@ def tearDown(self):
def test_initialization_eagle_graph(self):
self.vllm_config.speculative_config.method = "eagle"
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4096
+ self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 4096
self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
self.vllm_config.model_config.enforce_eager = False
@@ -99,6 +113,7 @@ def test_initialization_eagle3_enforce_eager(self):
def test_initialization_eagle3_enforce_eager(self):
self.vllm_config.speculative_config.method = "eagle3"
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
+ self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 2048
self.vllm_config.compilation_config.mode = CompilationMode.NONE
self.vllm_config.compilation_config.pass_config = MagicMock()
self.vllm_config.compilation_config.pass_config.enable_sp = False
@@ -116,6 +131,7 @@ def test_initialization_eagle3_full_graph_async(self):
def test_initialization_eagle3_full_graph_async(self):
self.vllm_config.speculative_config.method = "eagle3"
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
+ self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 2048
self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
self.vllm_config.model_config.enforce_eager = False
self.vllm_config.speculative_config.enforce_eager = False
@@ -133,6 +149,7 @@ def test_initialization_mtp_full_graph_async(self):
def test_initialization_mtp_full_graph_async(self):
self.vllm_config.speculative_config.method = "mtp"
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 2048
+ self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 2048
self.vllm_config.compilation_config.mode = CompilationMode.VLLM_COMPILE
self.vllm_config.model_config.enforce_eager = False
self.vllm_config.speculative_config.enforce_eager = False
@@ -196,7 +213,7 @@ def setUp(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
@@ -332,7 +349,7 @@ def setUp(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
@@ -483,7 +500,7 @@ def setUp(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
@@ -558,7 +575,7 @@ def setUp_and_tearDown(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer")
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
@@ -1263,7 +1280,7 @@ def setUp(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer", MockCpuGpuBuffer)
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET, MockCpuGpuBuffer)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
@@ -1747,6 +1764,7 @@ def setUp(self):
self.vllm_config.speculative_config.use_local_argmax_reduction = False
self.vllm_config.speculative_config.draft_tensor_parallel_size = 1
self.vllm_config.speculative_config.speculative_token_tree = str([(i + 1) * (0,) for i in range(3)])
+ self.vllm_config.speculative_config.draft_model_config.hf_config = MagicMock(spec=[])
self.vllm_config.speculative_config.draft_model_config.get_hidden_size.return_value = 4
self.vllm_config.speculative_config.draft_model_config.get_inputs_embeds_size.return_value = 4
self.vllm_config.speculative_config.draft_model_config.uses_mrope = False
@@ -1755,7 +1773,7 @@ def setUp(self):
self.vllm_config.additional_config = None
init_ascend_config(self.vllm_config)

self.mock_cpugpubuffer = patch("vllm.v1.spec_decode.eagle.CpuGpuBuffer", MockCpuGpuBuffer)
self.mock_cpugpubuffer = patch(_CPU_GPU_BUFFER_TARGET, MockCpuGpuBuffer)
self.mock_cpugpubuffer.start()
self.mock_supports_multimodal_inputs = patch(
"vllm.multimodal.registry.MultiModalRegistry.supports_multimodal_inputs", return_value=False
@@ -1876,7 +1894,14 @@ def check_mock(self):

import vllm.v1.spec_decode.eagle

- assert hasattr(vllm.v1.spec_decode.eagle, "CpuGpuBuffer")
+ # `CpuGpuBuffer` was re-exported from `eagle` until vLLM #40732 moved
+ # `SpecDecodeBaseProposer` (and the import) into `llm_base_proposer`.
+ if vllm_version_is("0.19.1"):
+ assert hasattr(vllm.v1.spec_decode.eagle, "CpuGpuBuffer")
+ else:
+ import vllm.v1.spec_decode.llm_base_proposer
+
+ assert hasattr(vllm.v1.spec_decode.llm_base_proposer, "CpuGpuBuffer")
RunnerCls = vllm.v1.spec_decode.eagle.SpecDecodeBaseProposer
for attr in ("_get_positions", "_set_positions"):
assert hasattr(RunnerCls, attr), f"SpecDecodeBaseProposer.{attr} not found"
76 changes: 17 additions & 59 deletions vllm_ascend/_310p/fused_moe/fused_moe.py
@@ -20,7 +20,6 @@
from vllm.distributed import get_dp_group, get_ep_group, get_tp_group
from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod
- from vllm.model_executor.layers.fused_moe.shared_fused_moe import SharedFusedMoE

from vllm_ascend.ascend_forward_context import _EXTRA_CTX, MoECommType
from vllm_ascend.ops.fused_moe.experts_selector import zero_experts_compute
@@ -41,6 +40,11 @@ def __init__(self, moe: FusedMoEConfig = None):
def is_monolithic(self) -> bool:
return False

+ def maybe_make_prepare_finalize(self, routing_tables=None):
+ # Ascend 310P uses its own MoE communication and forward_impl path.
+ # Do not let upstream modular-kernel initialization replace it.
+ return None

def process_weights_after_loading(self, layer):
super().process_weights_after_loading(layer)

@@ -120,6 +124,8 @@ class AscendFusedMoE310(FusedMoE):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

+ self._routed_input_transform = kwargs.get("routed_input_transform")
+ self._shared_experts = kwargs.get("shared_experts")
self.global_num_experts = kwargs["num_experts"]

if self.quant_config is None:
@@ -128,6 +134,10 @@ def __init__(self, *args, **kwargs):
self.quant_method = self.quant_config.get_quant_method(self, self.layer_name)

assert self.quant_method is not None
+ # Keep base_quant_method aligned with the Ascend-replaced quant_method
+ # so FusedMoE.maybe_init_modular_kernel doesn't dispatch into the
+ # upstream UnquantizedFusedMoEMethod.maybe_make_prepare_finalize.
+ self.base_quant_method = self.quant_method

self.moe_config.tp_group = get_tp_group()
self.moe_config.dp_group = get_dp_group()
@@ -177,6 +187,11 @@ def __init__(self, *args, **kwargs):
self.vllm_config.parallel_config.enable_dbo,
)

+ @property
+ def is_internal_router(self) -> bool:
+ # 310P Ascend path expects router logits from the model forward path.
+ return False

def init_experts_map(self, moe_config):
"""
Initialize expert mapping for MoE (Mixture of Experts) model.
@@ -262,69 +277,12 @@ def forward_impl( # type: ignore[override]

return routed_out


- class AscendSharedFusedMoE310(SharedFusedMoE, AscendFusedMoE310):
- def __init__(
- self,
- shared_experts: torch.nn.Module,
- gate: torch.nn.Module | None = None,
- use_overlapped: bool = True,
- routed_input_transform: torch.nn.Module | None = None,
- **kwargs,
- ):
- AscendFusedMoE310.__init__(self, **kwargs)
- self._routed_input_transform = routed_input_transform
- self._shared_experts = shared_experts
- self.use_overlapped = use_overlapped
- self.shared_expert_stream = None
- self._gate = gate
- # Recreate runner after shared_experts/gate are set so custom op dispatch
- # goes through moe_forward_shared.
- # NOTE: must use self._shared_experts here, not self.shared_experts —
- # FusedMoE.shared_experts is a property that reads self.runner.shared_experts,
- # which at this point is still the stale runner built with shared_experts=None.
- from vllm_ascend.ops.fused_moe.fused_moe import AscendMoERunner
-
- is_legacy = vllm_version_is("0.19.1")
- self.runner = AscendMoERunner(
- self if is_legacy else self.layer_name,
- self.moe_config,
- self.router,
- self._routed_input_transform,
- self._gate,
- self._shared_experts,
- self.quant_method,
- self.reduce_results,
- self.vllm_config.parallel_config.enable_dbo,
- )
-
- @property
- def is_internal_router(self) -> bool:
- # 310P Ascend path expects router logits from the model forward path.
- return False
-
- def forward(
- self,
- hidden_states: torch.Tensor,
- router_logits: torch.Tensor,
- ) -> tuple[torch.Tensor, torch.Tensor]:
- result = AscendFusedMoE310.forward(
- self,
- hidden_states=hidden_states,
- router_logits=router_logits,
- )
- # When shared experts are absent, the parent returns only fused_out;
- # otherwise it returns a (shared_out, fused_out) tuple.
- if self._shared_experts is None:
- return None, result
- return result

def _forward_shared_experts(self, hidden_states: torch.Tensor):
if self._shared_experts is None:
return None
return self._shared_experts(hidden_states)

- def forward_impl( # type: ignore[override]
+ def shared_forward_impl( # type: ignore[override]
self, hidden_states: torch.Tensor, router_logits: torch.Tensor
):
routed_out = AscendFusedMoE310.forward_impl(
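
As a closing note, a hedged usage sketch of the merged layer's call contract (illustrative only, not part of the diff — `_build_layer` refers to the helper in the updated 310P test above; `some_shared_mlp`, `hidden_states`, and `router_logits` are placeholders):

# With shared experts attached via the `shared_experts` kwarg, the layer
# returns a (shared_out, routed_out) pair from shared_forward_impl.
layer = _build_layer(shared_experts=some_shared_mlp)  # some_shared_mlp: any nn.Module
shared_out, routed_out = layer.shared_forward_impl(hidden_states, router_logits)

# Without shared experts, shared_forward_impl returns only the routed output.
layer = _build_layer(shared_experts=None)
routed_only = layer.shared_forward_impl(hidden_states, router_logits)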