Changes from all commits (33 commits)
120384c  initial MoERunner refactor (bnellnm, Jan 13, 2026)
8deab88  fix lint (bnellnm, Feb 12, 2026)
42aea01  rebase (bnellnm, Feb 24, 2026)
82c8ae2  fix broken test (bnellnm, Feb 24, 2026)
09f13a4  wip (bnellnm, Feb 4, 2026)
a137855  fix (bnellnm, Feb 9, 2026)
fdd0e43  WIP DOUBLE CHECK THIS (bnellnm, Feb 11, 2026)
d113427  wip more refactoring (bnellnm, Feb 19, 2026)
d9a92c4  wip (bnellnm, Feb 19, 2026)
be300b0  SharedExperts wip (bnellnm, Feb 23, 2026)
22ed6e8  cleanups (bnellnm, Feb 23, 2026)
603b818  fix circular import (bnellnm, Feb 23, 2026)
8ce8615  add back default arg (bnellnm, Feb 24, 2026)
cec1dba  fixes (bnellnm, Feb 24, 2026)
2409a93  renames (bnellnm, Feb 24, 2026)
a2510e2  add comment (bnellnm, Feb 24, 2026)
a1d758f  more renames (bnellnm, Feb 24, 2026)
26facf5  cleanup (bnellnm, Feb 25, 2026)
5d5debd  use workspace for DP chunk slices (bnellnm, Feb 25, 2026)
93002ef  split up default runner (bnellnm, Feb 25, 2026)
0399170  update comments (bnellnm, Feb 25, 2026)
8f710b8  attempt to fix zero experts (bnellnm, Feb 26, 2026)
f219666  simplify ZeroExpertFusedMoE and add ZeroExpertRouter (bnellnm, Feb 27, 2026)
cc207df  add value test (bnellnm, Feb 27, 2026)
6b88bb0  move ZeroExpertRouter construction into router factory (bnellnm, Feb 27, 2026)
4a0e70f  move zero expert handling into MoERunnerBase (bnellnm, Feb 27, 2026)
c65b099  slightly improved test (bnellnm, Feb 27, 2026)
b36c8c4  simplifications (bnellnm, Feb 27, 2026)
786206e  better test (bnellnm, Feb 27, 2026)
28e9058  remove ZeroExpertFusedMoE (bnellnm, Feb 27, 2026)
6061dec  turn ChunkingMoERunner into a wrapper around MoERunner (bnellnm, Feb 27, 2026)
8cc5bd3  add some asserts (bnellnm, Feb 27, 2026)
25e27f7  remove assert (bnellnm, Feb 27, 2026)
@@ -0,0 +1,10 @@
model_name: "meituan-longcat/LongCat-Flash-Chat-FP8"
accuracy_threshold: 0.70
num_questions: 1319
num_fewshot: 5
startup_max_wait_seconds: 1200
server_args: >-
--enforce-eager
--max-model-len 4096
--tensor-parallel-size 8
--enable-expert-parallel
3 changes: 2 additions & 1 deletion tests/evals/gsm8k/configs/moe-refactor/config-h100.txt
@@ -12,4 +12,5 @@ Llama-4-Scout-Fp8-ModelOpt-fi-cutlass.yaml
Llama-4-Scout-Fp8-ModelOpt-marlin.yaml
Llama-4-Scout-Fp8-ModelOpt-triton.yaml
Qwen3-30B-A3B-BF16-fi-cutlass.yaml
Qwen3-30B-A3B-BF16-triton.yaml
Qwen3-30B-A3B-BF16-triton.yaml
LongCat-Flash-Chat-FP8.yaml
4 changes: 2 additions & 2 deletions tests/kernels/moe/modular_kernel_tools/mk_objects.py
@@ -253,14 +253,14 @@ def expert_info(kind) -> ExpertInfo:

if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize import ( # noqa: E501
FlashInferCutlassMoEPrepareAndFinalize,
FlashInferA2APrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts,
)

register_prepare_and_finalize(
FlashInferCutlassMoEPrepareAndFinalize,
FlashInferA2APrepareAndFinalize,
standard_format,
nvfp4_types + fp8_types,
blocked_quantization_support=True,
280 changes: 280 additions & 0 deletions tests/kernels/moe/test_zero_expert_moe.py
@@ -0,0 +1,280 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for FusedMoE with zero experts.

Verifies that:
- The ZeroExpertRouter is properly created and used as the layer router.
- A forward pass through FusedMoE with zero experts produces correct output.
- The output decomposes correctly into real expert + zero expert contributions.
"""

import pytest
import torch

from vllm.config import VllmConfig, set_current_vllm_config
from vllm.forward_context import get_forward_context, set_forward_context
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from vllm.model_executor.layers.fused_moe.router.zero_expert_router import (
ZeroExpertRouter,
)
from vllm.v1.worker.workspace import init_workspace_manager


@pytest.fixture
def zero_expert_moe(dist_init, default_vllm_config):
"""Create a FusedMoE layer with zero experts."""
num_experts = 4
top_k = 2
# hidden_size must be >= 256 for the zero expert identity kernel to
# produce output (its BLOCK_SIZE=256 causes grid=0 when hidden_dim<256).
hidden_size = 256
intermediate_size = 512
zero_expert_num = 1

e_score_correction_bias = torch.zeros(
num_experts + zero_expert_num,
dtype=torch.float32,
device="cuda",
)

vllm_config = VllmConfig()
vllm_config.compilation_config.static_forward_context = dict()

with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config):
init_workspace_manager(torch.cuda.current_device())

layer = FusedMoE(
zero_expert_type="identity",
e_score_correction_bias=e_score_correction_bias,
num_experts=num_experts,
top_k=top_k,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
params_dtype=torch.bfloat16,
prefix="test_zero_expert_moe",
renormalize=False,
routed_scaling_factor=1.0,
scoring_func="softmax",
).cuda()

layer.quant_method.process_weights_after_loading(layer)

yield layer, vllm_config


@pytest.mark.parametrize("num_tokens", [1, 32])
def test_zero_expert_moe_router_is_zero_expert_router(zero_expert_moe, num_tokens):
"""Verify that FusedMoE with zero_expert_type creates a ZeroExpertRouter."""
layer, _ = zero_expert_moe
assert isinstance(layer.router, ZeroExpertRouter), (
f"Expected ZeroExpertRouter but got {type(layer.router).__name__}."
)


@pytest.mark.parametrize("num_tokens", [1, 32])
def test_zero_expert_moe_no_custom_routing_fn(zero_expert_moe, num_tokens):
"""Verify that custom_routing_function is not set (routing is handled
by ZeroExpertRouter, not a memoizing closure)."""
layer, _ = zero_expert_moe
assert layer.custom_routing_function is None


@pytest.mark.parametrize("num_tokens", [1, 32])
def test_zero_expert_moe_forward(zero_expert_moe, num_tokens):
"""Run a forward pass through FusedMoE with zero experts and verify output shape."""
layer, vllm_config = zero_expert_moe

hidden_size = layer.hidden_size
num_experts = 4
zero_expert_num = 1
total_experts = num_experts + zero_expert_num

hidden_states = torch.randn(
num_tokens, hidden_size, dtype=torch.bfloat16, device="cuda"
)
router_logits = torch.randn(
num_tokens, total_experts, dtype=torch.float32, device="cuda"
)

# Initialize weights to small random values to avoid NaN from
# uninitialized memory.
with torch.no_grad():
for param in layer.parameters():
if param.dtype.is_floating_point:
param.normal_(0, 0.01)

with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config):
get_forward_context().all_moe_layers = None
output = layer.forward(hidden_states, router_logits)

assert output.shape == hidden_states.shape, (
f"Expected output shape {hidden_states.shape}, got {output.shape}"
)
assert output.dtype == hidden_states.dtype
assert not torch.isnan(output).any(), "Output contains NaN values"


@pytest.mark.parametrize("num_tokens", [1, 32])
def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens):
"""Validate that the FusedMoE output equals a plain FusedMoE
output (real experts only) plus the zero expert contribution.

The key invariant is:
zero_layer.forward(h, r_full) == plain_layer.forward(h, r_real)
+ zero_expert_output

We create a plain FusedMoE layer with the same weights and real-expert-only
router logits, compute the zero expert output via the ZeroExpertRouter, and
verify the sum matches the FusedMoE output.
"""
layer, vllm_config = zero_expert_moe
num_experts = 4
zero_expert_num = 1
total_experts = num_experts + zero_expert_num

hidden_states = torch.randn(
num_tokens, layer.hidden_size, dtype=torch.bfloat16, device="cuda"
)
router_logits = torch.randn(
num_tokens, total_experts, dtype=torch.float32, device="cuda"
)

with torch.no_grad():
for param in layer.parameters():
if param.dtype.is_floating_point:
param.normal_(0, 0.01)

with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config):
get_forward_context().all_moe_layers = None

# Create a plain FusedMoE layer with the same config but no zero
# experts. Use a separate prefix to avoid collision.
plain_layer = FusedMoE(
num_experts=num_experts,
top_k=layer.top_k,
hidden_size=layer.hidden_size,
intermediate_size=layer.intermediate_size_per_partition,
params_dtype=torch.bfloat16,
prefix="test_zero_expert_moe_plain",
renormalize=False,
scoring_func="softmax",
e_score_correction_bias=layer.e_score_correction_bias,
).cuda()

# Share weights from the zero expert layer.
plain_layer.w13_weight.data.copy_(layer.w13_weight.data)
plain_layer.w2_weight.data.copy_(layer.w2_weight.data)
plain_layer.quant_method.process_weights_after_loading(plain_layer)

# Compute routing via the ZeroExpertRouter. This produces masked
# topk_weights/topk_ids (zero expert entries have weight=0, id=0)
# and stores zero_expert_output as a side effect.
topk_weights, topk_ids = layer.router.select_experts(
hidden_states, router_logits
)
zero_output = layer.router.zero_expert_output

# Compute real expert output using the plain layer with the masked
# routing from the ZeroExpertRouter.
real_output = plain_layer.quant_method.apply(
layer=plain_layer,
x=hidden_states,
topk_weights=topk_weights,
topk_ids=topk_ids,
shared_experts_input=None,
)

# Get the combined output from the zero expert layer.
full_output = layer.forward(hidden_states, router_logits)

assert zero_output is not None, "Zero expert output should not be None"
assert not torch.isnan(real_output).any(), "Real expert output has NaN"
assert not torch.isnan(zero_output).any(), "Zero expert output has NaN"
assert not torch.isnan(full_output).any(), "Full output has NaN"

expected = real_output + zero_output
torch.testing.assert_close(
full_output,
expected,
atol=0,
rtol=0,
msg="FusedMoE output should equal plain FusedMoE output "
"plus zero expert contribution",
)


@pytest.mark.parametrize("num_tokens", [1, 32])
def test_zero_expert_moe_zero_expert_is_identity(zero_expert_moe, num_tokens):
"""Validate zero expert identity behavior.

When routing strongly favors the zero expert, its contribution should
be a scaled version of hidden_states (identity operation). We verify
this by manually computing the expected zero expert output from the
routing weights and comparing against what the router produces.
"""
layer, vllm_config = zero_expert_moe
num_experts = 4
zero_expert_num = 1
total_experts = num_experts + zero_expert_num

hidden_states = torch.randn(
num_tokens, layer.hidden_size, dtype=torch.bfloat16, device="cuda"
)
# Strongly bias toward the zero expert (index 4).
router_logits = torch.full(
(num_tokens, total_experts), -10.0, dtype=torch.float32, device="cuda"
)
router_logits[:, num_experts] = 10.0 # zero expert gets high logit

with torch.no_grad():
for param in layer.parameters():
if param.dtype.is_floating_point:
param.normal_(0, 0.01)

with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config):
get_forward_context().all_moe_layers = None

# Run routing to get topk_weights/topk_ids before masking.
from vllm.model_executor.layers.fused_moe.router.fused_topk_bias_router import (
fused_topk_bias,
)

topk_weights, topk_ids = fused_topk_bias(
hidden_states=hidden_states,
gating_output=router_logits,
e_score_correction_bias=layer.router.e_score_correction_bias.data,
topk=layer.top_k,
renormalize=layer.router.renormalize,
scoring_func=layer.router.scoring_func,
)

# Manually compute expected zero expert identity output:
# For each token, sum routing weights assigned to zero expert slots,
# then multiply by hidden_states.
zero_mask = topk_ids >= num_experts
zero_weight_per_token = (topk_weights * zero_mask.float()).sum(
dim=-1, keepdim=True
)
expected_zero_output = (hidden_states.float() * zero_weight_per_token).to(
hidden_states.dtype
)

# Run routing directly to trigger zero expert computation
# without going through the runner (which consumes the output).
layer.router.select_experts(hidden_states, router_logits)
actual_zero_output = layer.router.zero_expert_output

assert actual_zero_output is not None
assert zero_mask.any(), (
"With high zero expert logit, at least some slots should route "
"to the zero expert"
)

torch.testing.assert_close(
actual_zero_output,
expected_zero_output,
atol=1e-3,
rtol=1e-3,
msg="Zero expert identity output should equal "
"hidden_states * sum(zero_expert_weights)",
)
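For reference, the identity zero-expert contribution checked by the last two tests can be written as a small standalone helper. This is a sketch that mirrors the test logic above, not code from this PR; the function name is illustrative.

import torch


def zero_expert_identity_contribution(
    hidden_states: torch.Tensor,  # [num_tokens, hidden_size]
    topk_weights: torch.Tensor,  # [num_tokens, top_k] routing weights
    topk_ids: torch.Tensor,  # [num_tokens, top_k] selected expert ids
    num_real_experts: int,
) -> torch.Tensor:
    # Slots whose expert id falls past the real experts belong to zero experts.
    zero_mask = topk_ids >= num_real_experts
    # Per-token total routing weight assigned to zero-expert slots.
    zero_weight = (topk_weights * zero_mask.float()).sum(dim=-1, keepdim=True)
    # An "identity" zero expert passes hidden_states through, scaled by that weight.
    return (hidden_states.float() * zero_weight).to(hidden_states.dtype)

The full layer output is then expected to equal the real-expert output plus this term, which is exactly the decomposition asserted in test_zero_expert_moe_output_decomposition.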
1 change: 1 addition & 0 deletions tools/ep_kernels/install_python_libraries.sh
@@ -113,6 +113,7 @@ pushd "$WORKSPACE"
echo "Downloading NVSHMEM ${NVSHMEM_VER} for ${NVSHMEM_SUBDIR} ..."
curl -fSL "${NVSHMEM_URL}" -o "${NVSHMEM_FILE}"
tar -xf "${NVSHMEM_FILE}"
rm -rf nvshmem
mv "${NVSHMEM_FILE%.tar.xz}" nvshmem
rm -f "${NVSHMEM_FILE}"
rm -rf nvshmem/lib/bin nvshmem/lib/share
8 changes: 4 additions & 4 deletions vllm/lora/layers/fused_moe.py
@@ -144,6 +144,8 @@ def _inject_lora_into_fused_moe(self):
self.base_layer.shared_experts,
)

# TODO: could be incorrect due to monolithic kernel? or add assert it
# is modular?
if quant_config.use_mxfp4_w4a16:
assert isinstance(
m_fused_moe_fn.fused_experts, (MarlinExperts, UnfusedOAITritonExperts)
@@ -331,6 +333,8 @@ def wrapper(*args, **kwargs):

fused_experts = m_fused_moe_fn.fused_experts

# TODO: seems like this could be done with modular kernel subclasses?

m_fused_moe_fn.forward = fwd_decorator(self.base_layer, m_fused_moe_fn.forward)
fused_experts.activation = act_decorator(
self.base_layer, fused_experts.activation
@@ -585,10 +589,6 @@ def forward(self, *args, **kwargs):
def maybe_all_reduce_tensor_model_parallel(self, *args, **kwargs):
return self.base_layer.maybe_all_reduce_tensor_model_parallel(*args, **kwargs)

@property
def _shared_experts(self):
return self.base_layer._shared_experts

@property
def quant_method(self):
return self.base_layer.quant_method
4 changes: 0 additions & 4 deletions vllm/model_executor/layers/fused_moe/__init__.py
@@ -32,9 +32,6 @@
from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import (
UnquantizedFusedMoEMethod,
)
from vllm.model_executor.layers.fused_moe.zero_expert_fused_moe import (
ZeroExpertFusedMoE,
)
from vllm.triton_utils import HAS_TRITON

_config: dict[str, Any] | None = None
@@ -66,7 +63,6 @@ def get_config() -> dict[str, Any] | None:
"FusedMoEPrepareAndFinalize",
"RoutingMethodType",
"SharedFusedMoE",
"ZeroExpertFusedMoE",
"activation_without_mul",
"apply_moe_activation",
"override_config",
9 changes: 9 additions & 0 deletions vllm/model_executor/layers/fused_moe/config.py
@@ -927,6 +927,15 @@ class FusedMoEParallelConfig:
all2all_backend: str # all2all backend for MoE communication
enable_eplb: bool # whether to enable expert load balancing

@property
def use_dp_chunking(self) -> bool:
return (
self.use_pplx_kernels
or self.use_deepep_ll_kernels
or self.use_mori_kernels
or self.use_fi_all2allv_kernels
) and envs.VLLM_ENABLE_MOE_DP_CHUNK

@property
def is_sequence_parallel(self) -> bool:
return self.sp_size > 1
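Rough illustration of what the new use_dp_chunking property gates: when it is true, the runner splits the data-parallel batch into fixed-size token chunks rather than dispatching all tokens at once. The helper below is a sketch under that assumption; iter_dp_chunks and max_tokens_per_chunk are hypothetical names, not part of this PR.

from collections.abc import Iterator

import torch


def iter_dp_chunks(
    hidden_states: torch.Tensor, max_tokens_per_chunk: int
) -> Iterator[torch.Tensor]:
    # Yield contiguous views over the token dimension, each at most
    # max_tokens_per_chunk rows, so per-chunk all-to-all buffers stay bounded.
    num_tokens = hidden_states.shape[0]
    for start in range(0, num_tokens, max_tokens_per_chunk):
        yield hidden_states[start : start + max_tokens_per_chunk]

Note that the property only reports true for the all2all backends listed above (pplx, DeepEP low-latency, MoRI, FlashInfer all2allv) and only when VLLM_ENABLE_MOE_DP_CHUNK is enabled.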
1 change: 1 addition & 0 deletions vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -1186,6 +1186,7 @@ def cutlass_moe_w4a8_fp8(
quant_config=quant_config,
group_size=group_size,
),
shared_experts=None,
inplace=False,
)
