Merged

126 commits
4aeabf2
initial MoERunner refactor
bnellnm Jan 13, 2026
a4d3acb
fix lint
bnellnm Feb 12, 2026
5b7f133
rebase
bnellnm Feb 24, 2026
fad7f33
rebase + remove dead code
bnellnm Mar 5, 2026
83c1863
wip
bnellnm Feb 4, 2026
7c7953e
fix
bnellnm Feb 9, 2026
d5b5805
WIP DOUBLE CHECK THIS
bnellnm Feb 11, 2026
42de827
wip more refactoring
bnellnm Feb 19, 2026
2e4ce00
wip
bnellnm Feb 19, 2026
0144b8b
SharedExperts wip
bnellnm Feb 23, 2026
27ab769
cleanups
bnellnm Feb 23, 2026
bf8b3f3
fix circular import
bnellnm Feb 23, 2026
bb541ae
fixes
bnellnm Feb 24, 2026
0eb70c6
renames
bnellnm Feb 24, 2026
8351d89
add comment
bnellnm Feb 24, 2026
943e667
more renames
bnellnm Feb 24, 2026
0ae2b4f
cleanup
bnellnm Feb 25, 2026
657a4ef
remove memoizing router, not needed yet
bnellnm Feb 26, 2026
6b4ef3b
fix UBD bug
bnellnm Feb 27, 2026
125b34b
cleanup merge
bnellnm Mar 5, 2026
78d5ed8
fix merge
bnellnm Mar 5, 2026
8a5445a
fix merge
bnellnm Mar 5, 2026
5ee510f
fix typos
bnellnm Mar 5, 2026
d675882
fix merge
bnellnm Mar 18, 2026
1dc047a
fix format
bnellnm Mar 18, 2026
111c628
Split of DefaultMoERunner class
bnellnm Jan 13, 2026
ced8fec
fix merge
bnellnm Mar 18, 2026
5c8d40d
fix merge
bnellnm Mar 18, 2026
74eeba9
attempt to fix zero experts
bnellnm Feb 26, 2026
45a48b0
simplify ZeroExpertFusedMoE and add ZeroExpertRouter
bnellnm Feb 27, 2026
65be7f7
add value test
bnellnm Feb 27, 2026
3b88ab2
move ZeroExpertRouter construction into router factory
bnellnm Feb 27, 2026
6cd80fc
move zero expert handling into MoERunnerBase
bnellnm Feb 27, 2026
992672b
slightly improved test
bnellnm Feb 27, 2026
900fc40
simplifications
bnellnm Feb 27, 2026
71f8fb8
better test
bnellnm Feb 27, 2026
c0e12b5
remove ZeroExpertFusedMoE
bnellnm Feb 27, 2026
47205a3
Add comment
bnellnm Mar 2, 2026
a7af12c
move shared expert all gather to SharedExperts
bnellnm Mar 3, 2026
51da43b
remove must_reduce_shared_expert_outputs external method
bnellnm Mar 3, 2026
7834021
wip moving experts epilog into MoERunnerBase
bnellnm Mar 3, 2026
3ad7dc5
cleanups
bnellnm Mar 3, 2026
eb38e88
apply_scale_to_output flag
bnellnm Mar 4, 2026
1c17922
fix fp16 scaling factor stuff
bnellnm Mar 4, 2026
e811095
cleanups
bnellnm Mar 4, 2026
2df1794
add claude generated comments
bnellnm Mar 4, 2026
08ac61c
move stuff out of custom op
bnellnm Mar 4, 2026
9d39ea8
fix transformers/moe.py
bnellnm Mar 4, 2026
65b0f32
tweak op registration
bnellnm Mar 18, 2026
9eb54fd
fix rebase
bnellnm Mar 18, 2026
8e53180
fix merge
bnellnm Mar 18, 2026
3b463bc
fix lint
bnellnm Mar 18, 2026
ec88db3
fix gate overlap
bnellnm Mar 19, 2026
76aff0a
wip
bnellnm Feb 4, 2026
4fab915
fix
bnellnm Feb 9, 2026
d8a7f91
WIP DOUBLE CHECK THIS
bnellnm Feb 11, 2026
3dec78f
wip more refactoring
bnellnm Feb 19, 2026
e94b863
wip
bnellnm Feb 19, 2026
6cc5074
SharedExperts wip
bnellnm Feb 23, 2026
e8865e6
cleanups
bnellnm Feb 23, 2026
f83e0f5
fix circular import
bnellnm Feb 23, 2026
88e80b9
fixes
bnellnm Feb 24, 2026
781d4ea
renames
bnellnm Feb 24, 2026
3695016
add comment
bnellnm Feb 24, 2026
053f66f
more renames
bnellnm Feb 24, 2026
708dd2b
cleanup
bnellnm Feb 25, 2026
5748f7c
remove memoizing router, not needed yet
bnellnm Feb 26, 2026
9123f15
fix UBD bug
bnellnm Feb 27, 2026
04b430f
cleanup merge
bnellnm Mar 5, 2026
526db38
fix merge
bnellnm Mar 5, 2026
67bdab2
fix merge
bnellnm Mar 5, 2026
e9afbe6
fix typos
bnellnm Mar 5, 2026
453ab3d
fix merge
bnellnm Mar 18, 2026
48acc59
fix format
bnellnm Mar 18, 2026
c067844
fix gate overlap
bnellnm Mar 19, 2026
9f0e8d7
merge with main
bnellnm Mar 19, 2026
bc82978
renames, revert lora changes
bnellnm Mar 19, 2026
24da9fb
fix lint
bnellnm Mar 19, 2026
3dc9d4f
review comments + cleanup
bnellnm Mar 20, 2026
12bda3d
remove _must_reduce_shared_expert_outputs
bnellnm Mar 20, 2026
8aaddea
undo some changes + add Rob's changes
bnellnm Mar 23, 2026
bbaaca7
Merge remote-tracking branch 'origin/main' into moe-runner-2
bnellnm Mar 23, 2026
392f311
hacky fix for unquantized method
bnellnm Mar 23, 2026
7d5adbe
fix lint
bnellnm Mar 23, 2026
f345165
fix lint
bnellnm Feb 12, 2026
bdefdf5
fix merge
bnellnm Mar 25, 2026
377acc8
fix merge
bnellnm Mar 25, 2026
14e58dc
don't pass shared_experts to MK in lora code
bnellnm Mar 25, 2026
392fb60
Merge remote-tracking branch 'origin/main' into moe-runner-3
bnellnm Apr 1, 2026
7b86f43
remove cruft
bnellnm Apr 1, 2026
fd7a324
review comments
bnellnm Apr 1, 2026
ea79ff7
fix lint
bnellnm Apr 1, 2026
b6ed07b
remove EXTERNAL SharedExperts order
bnellnm Apr 1, 2026
dd1f23a
make sure some methods are handled properly on ChunkingMoERunner
bnellnm Apr 1, 2026
acebc42
fixes
bnellnm Apr 2, 2026
b08bf02
Merge branch 'main' into moe-runner-3
bnellnm Apr 2, 2026
73a0356
Merge remote-tracking branch 'origin/main' into moe-runner-3
bnellnm Apr 2, 2026
9934e37
Merge branch 'main' into moe-runner-3
bnellnm Apr 3, 2026
ed0ff6e
remove assert
bnellnm Apr 3, 2026
0d2b4dc
Merge remote-tracking branch 'origin/main' into moe-runner-3
bnellnm Apr 3, 2026
bae2080
Merge remote-tracking branch 'nm-vllm/moe-runner-3' into moe-runner-3
bnellnm Apr 3, 2026
e015682
merge with moe-runner-3
bnellnm Apr 3, 2026
30da43c
remove memoizing router
bnellnm Apr 3, 2026
3b12500
merge with moe-runner-4
bnellnm Apr 3, 2026
574e5c1
fix merge + remove new reduce_results
bnellnm Apr 3, 2026
5f86ac4
fix merge
bnellnm Apr 3, 2026
5785a4e
merge
bnellnm Apr 6, 2026
0405c75
revert bogus changes
bnellnm Apr 6, 2026
e48b663
Merge branch 'moe-runner-4' into moe-runner-5
bnellnm Apr 6, 2026
6069933
Merge remote-tracking branch 'origin/main' into moe-runner-5
bnellnm Apr 14, 2026
dd4a254
fix up additional models
bnellnm Apr 14, 2026
574e61f
tweaks
bnellnm Apr 14, 2026
fe9ec8b
remove tests
bnellnm Apr 14, 2026
31c0491
fix layer test
bnellnm Apr 14, 2026
66dfe9f
Merge branch 'main' into moe-runner-5
robertgshaw2-redhat Apr 16, 2026
2a25066
review comments
bnellnm Apr 16, 2026
7723e8f
Merge remote-tracking branch 'nm-vllm/moe-runner-5' into moe-runner-5
bnellnm Apr 16, 2026
643065b
Merge branch 'main' into moe-runner-5
bnellnm Apr 16, 2026
73ecb6b
fix SharedFusedMoE return type
bnellnm Apr 16, 2026
dbe1291
Merge remote-tracking branch 'nm-vllm/moe-runner-5' into moe-runner-5
bnellnm Apr 16, 2026
f0fa198
try again
bnellnm Apr 16, 2026
d071dd7
Merge remote-tracking branch 'origin/main' into moe-runner-5
bnellnm Apr 17, 2026
b83a93b
renames, add comments, simplify scaling factor handling
bnellnm Apr 17, 2026
0c1f88c
fix bug in routed scale initialization. move output transform to pro…
bnellnm Apr 17, 2026
4d91791
move contiguous back to AR call
bnellnm Apr 17, 2026
2ed7088
move trunc after reduce
bnellnm Apr 19, 2026
8 changes: 3 additions & 5 deletions tests/compile/passes/test_vllm_fusion_pattern_matcher_pass.py
@@ -6,14 +6,13 @@

import vllm.config
from tests.compile.backend import TestBackend
from vllm.platforms import current_platform
from vllm.compilation.passes.vllm_inductor_pass import (
VllmFusionPatternMatcherPass,
VllmPatternMatcherPass,
VllmPatternReplacement,
)
from vllm.config import CompilationConfig, CompilationMode, VllmConfig

from vllm.platforms import current_platform


class ReluToAbsPattern(VllmPatternReplacement):
@@ -58,7 +57,6 @@ def get_inputs(self) -> list[torch.Tensor]:
return [self.empty_fp32(4)]



class ReluFusionPass(VllmFusionPatternMatcherPass):
def __init__(self, config: VllmConfig) -> None:
super().__init__(config, "test_relu_fusion")
@@ -72,13 +70,13 @@ def __init__(self, config: VllmConfig) -> None:
self.register(ExpToSqrtPattern())



@pytest.fixture
def vllm_config():
return VllmConfig(
compilation_config=CompilationConfig(mode=CompilationMode.VLLM_COMPILE),
)


@pytest.mark.skipif(not current_platform.is_cuda_alike(), reason="Requires CUDA")
def test_register_tracks_patterns(vllm_config):
"""register() appends each VllmPatternReplacement to _pattern_replacements."""
@@ -96,7 +94,7 @@ def test_uuid_stable(vllm_config):
with vllm.config.set_current_vllm_config(vllm_config):
p1 = ReluFusionPass(vllm_config)
p2 = ReluFusionPass(vllm_config)
p3= TwoPatternFusionPass(vllm_config)
p3 = TwoPatternFusionPass(vllm_config)

assert p1.uuid() == p2.uuid()
assert p1.uuid() != p3.uuid()
93 changes: 17 additions & 76 deletions tests/kernels/moe/test_moe_layer.py
@@ -236,7 +236,6 @@ class MoETestConfig:
use_gate: bool
use_routed_input_transform: bool
enable_eplb: bool = False
reduce_results: bool = False
backend: str | None = None
ep_size: int = 1
dp_size: int = 1
@@ -295,7 +294,6 @@ def generate_valid_test_configs(
use_shared_experts,
use_gate,
use_routed_input_transform,
reduce_results,
) in product(
SHAPE_COMBOS,
NUM_EXPERTS,
@@ -304,7 +302,6 @@
[False, True], # shared
[False, True], # gate
[False, True], # routed input exform
[False, True], # reduce results
):
config = MoETestConfig(
shape[0], # m
@@ -318,7 +315,6 @@
use_gate,
use_routed_input_transform,
enable_eplb,
reduce_results,
backend,
ep_size,
dp_size,
@@ -395,18 +391,7 @@ def is_valid_config(config: MoETestConfig) -> tuple[bool, str | None]:
and config.backend.startswith("flashinfer_nvlink")
and not current_platform.has_device_capability(90)
):
return False, "flashinfer_nvlink needs an H100+ GPUs"

# reduce_results incompatibilities
if config.reduce_results and config.use_shared_experts:
return False, "reduce_results=True is not compatible with shared_experts=True"

if config.reduce_results and config.quantization is not None:
return (
False,
"reduce_results=True only tested with unquantized data types in "
"order to limit number of tests run",
)
return False, "flashinfer_nvlink needs H100+ GPUs"

# Backend-specific checks
if config.backend is not None:
@@ -448,10 +433,6 @@ def is_valid_config(config: MoETestConfig) -> tuple[bool, str | None]:
if config.enable_eplb and config.backend not in EPLB_SUPPORTED_BACKENDS:
return False, f"EPLB not supported with {config.backend}."

world_size = config.tp_size * config.dp_size
if config.reduce_results and world_size == 1:
return False, "reduce_results=True only makes sense for multi-GPU tests"

if (
config.backend is not None
and config.backend.startswith("flashinfer_nvlink")
@@ -846,7 +827,6 @@ def make_fused_moe_layer(
tp_size: int,
ep_size: int,
dp_size: int,
reduce_results: bool,
w1: torch.Tensor,
w2: torch.Tensor,
top_k: int,
@@ -874,7 +854,7 @@
routed_input_transform: torch.nn.Module | None = None,
routed_output_transform: torch.nn.Module | None = None,
pcp_size: int | None = 1,
) -> tuple[Callable, FusedMoE]:
) -> FusedMoE:
quant_config, qw = make_quant_config(quantization, w1, w2, global_num_experts)

kwargs = dict()
@@ -887,16 +867,17 @@
# Add gate and routed_input_transform if provided
if gate is not None:
kwargs["gate"] = gate

if routed_input_transform is not None:
kwargs["routed_input_transform"] = routed_input_transform
kwargs["routed_output_transform"] = routed_output_transform

layer = builder(
num_experts=global_num_experts,
top_k=top_k,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
params_dtype=in_dtype,
reduce_results=reduce_results,
renormalize=renormalize,
use_grouped_topk=use_grouped_topk,
num_expert_group=num_expert_group,
@@ -936,36 +917,7 @@

layer.quant_method.process_weights_after_loading(layer)

def _moe(
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
) -> torch.Tensor:
if shared_experts is None:
final_shared_states = None
final_hidden_states = layer(hidden_states, router_logits)
else:
final_shared_states, final_hidden_states = layer(
hidden_states, router_logits
)

# Apply routed output transform if provided
# (e.g., latent space -> original space)
if routed_output_transform is not None:
final_hidden_states = routed_output_transform(final_hidden_states)

if shared_experts is not None:
assert not reduce_results
assert final_shared_states is not None
final_hidden_states += final_shared_states

if not reduce_results and layer.tp_size > 1:
final_hidden_states = layer.maybe_all_reduce_tensor_model_parallel(
final_hidden_states
)

return final_hidden_states

return _moe, layer
return layer


def make_fake_moe_layer(
@@ -999,7 +951,6 @@ def make_fake_moe_layer(
tp_size: int = 1,
dp_size: int = 1,
ep_size: int = 1,
reduce_results: bool = False,
) -> Callable:
activation = MoEActivation.from_str(activation)

@@ -1101,7 +1052,7 @@ def _moe(


def _test_body_regular(
moe_fn: Callable,
moe_layer: Callable,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
vllm_config: VllmConfig,
@@ -1118,13 +1069,12 @@
num_tokens=num_tokens,
num_tokens_across_dp=num_tokens_across_dp,
):
output = moe_fn(hidden_states, router_logits)
output = moe_layer(hidden_states, router_logits)

return baseline_output, output


def _test_body_eplb(
moe_fn: Callable,
moe_layer: FusedMoE,
hidden_states: torch.Tensor,
router_logits: torch.Tensor,
@@ -1145,7 +1095,6 @@
n: int,
top_k: int,
shared_experts,
reduce_results: bool,
gate: torch.nn.Module | None,
routed_input_transform: torch.nn.Module | None,
routed_output_transform: torch.nn.Module | None,
@@ -1161,7 +1110,7 @@
num_tokens=num_tokens,
num_tokens_across_dp=num_tokens_across_dp,
):
output_before = moe_fn(hidden_states, router_logits)
output_before = moe_layer(hidden_states, router_logits)

# Create a fresh FusedMoE layer with enable_eplb=True
# Delete the original layer's registration so the constructor can
@@ -1174,7 +1123,7 @@
# When using routed_input_transform, experts operate in latent space
hidden_size_for_layer = k // 2 if routed_input_transform is not None else k

moe_fn, moe_layer = make_fused_moe_layer(
eplb_moe_layer = make_fused_moe_layer(
quantization=quantization,
use_ep=use_ep,
hidden_size=hidden_size_for_layer,
@@ -1183,7 +1132,6 @@
tp_size=tp_size,
ep_size=ep_size,
dp_size=dp_size,
reduce_results=reduce_results,
w1=w1,
w2=w2,
top_k=top_k,
@@ -1196,14 +1144,14 @@
)

# Necessary?
if moe_layer._expert_map is not None:
moe_layer._expert_map = moe_layer._expert_map.to(device)
if eplb_moe_layer._expert_map is not None:
eplb_moe_layer._expert_map = eplb_moe_layer._expert_map.to(device)

# All ranks must generate the same permutation
initial_indices = torch.arange(num_experts, dtype=torch.long)
shuffled_indices = initial_indices[torch.randperm(num_experts)]

expert_weights = [list(moe_layer.get_expert_weights())]
expert_weights = [list(eplb_moe_layer.get_expert_weights())]

communicator = create_eplb_communicator(
group_coordinator=get_eplb_group(),
@@ -1227,7 +1175,7 @@
num_experts, dtype=torch.int32, device=device
)

moe_layer.set_eplb_state(
eplb_moe_layer.set_eplb_state(
moe_layer_idx=0,
expert_load_view=torch.zeros(
(1, num_experts),
@@ -1244,7 +1192,7 @@
),
)

moe_layer.eplb_state.should_record_tensor = torch.ones(
eplb_moe_layer.eplb_state.should_record_tensor = torch.ones(
(), dtype=torch.bool, device=device
)

@@ -1255,7 +1203,7 @@
num_tokens=num_tokens,
num_tokens_across_dp=num_tokens_across_dp,
):
output_after = moe_fn(hidden_states, router_logits)
output_after = eplb_moe_layer(hidden_states, router_logits)

return output_before, output_after

@@ -1274,7 +1222,6 @@ def _run_one_config(
num_experts: int,
top_k: int,
quantization: str | None,
reduce_results: bool,
backend: str | None,
test_body_fn: Callable,
use_shared_experts: bool,
@@ -1341,7 +1288,6 @@ def _run_one_config(
tp_size=tp_size,
ep_size=ep_size,
dp_size=dp_size,
reduce_results=reduce_results,
)

baseline_output = baseline_layer(hidden_states, router_logits)
@@ -1369,7 +1315,7 @@ def _run_one_config(
hidden_size_for_layer = k // 2 if routed_input_transform is not None else k

# Create initial MoE layer
moe_fn, moe_layer = make_fused_moe_layer(
moe_layer = make_fused_moe_layer(
quantization=quantization,
use_ep=use_ep,
hidden_size=hidden_size_for_layer,
@@ -1378,7 +1324,6 @@
tp_size=tp_size,
ep_size=ep_size,
dp_size=dp_size,
reduce_results=reduce_results,
w1=w1,
w2=w2,
top_k=top_k,
@@ -1402,7 +1347,6 @@

# Call the test body function with all necessary context
expected, actual = test_body_fn(
moe_fn=moe_fn,
moe_layer=moe_layer,
hidden_states=hidden_states,
router_logits=router_logits,
@@ -1423,7 +1367,6 @@
m=m,
top_k=top_k,
shared_experts=shared_experts,
reduce_results=reduce_results,
gate=gate,
routed_input_transform=routed_input_transform,
routed_output_transform=routed_output_transform,
@@ -1520,7 +1463,6 @@ def test_moe_layer_no_parallel(
test_config.num_experts,
test_config.top_k,
test_config.quantization,
test_config.reduce_results,
test_config.backend,
_test_body_regular,
use_shared_experts=test_config.use_shared_experts,
@@ -1578,7 +1520,6 @@ def _parallel_worker(
test_config.num_experts,
test_config.top_k,
test_config.quantization,
test_config.reduce_results,
test_config.backend,
functools.partial(
_test_body_config, test_config=test_config, cpu_group=cpu_group
@@ -1597,7 +1538,7 @@ def _parallel_worker(
failed = failed + 1
if verbosity > 0:
traceback.print_exc()
print(f"\n{str(ex)}\nFAILED {ex.__class__}")
print(f"\n{str(ex)}\nFAILED")
else:
print("F", end="")
finally: