Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
74 commits
Select commit Hold shift + click to select a range
be73f32
initial MoERunner refactor
bnellnm Jan 13, 2026
267d118
wip
bnellnm Jan 14, 2026
b96042c
wip
bnellnm Jan 14, 2026
77deb59
add missing file
bnellnm Jan 14, 2026
cb21052
add some methods
bnellnm Jan 14, 2026
3d32744
isolate more state
bnellnm Jan 14, 2026
17be1d5
make gate into a property
bnellnm Jan 14, 2026
9a238fd
move default runner to separate file
bnellnm Jan 14, 2026
b416bdd
fix merge
bnellnm Jan 15, 2026
7c51aaa
hacky fix for initialization issues
bnellnm Jan 16, 2026
147e7a9
fix merge
bnellnm Jan 16, 2026
8403a2f
fix
bnellnm Jan 16, 2026
18895ba
move files to runner directory
bnellnm Jan 16, 2026
781d2df
simplify runner
bnellnm Jan 16, 2026
5305bf1
simplify runner forward
bnellnm Jan 16, 2026
b472fc4
simplify runner forward
bnellnm Jan 16, 2026
60f15d4
move some code
bnellnm Jan 22, 2026
1f7e762
fix merge
bnellnm Jan 23, 2026
fb05dfa
cleanups
bnellnm Jan 23, 2026
9a288e4
reduce op registration scope
bnellnm Jan 23, 2026
40cd1c2
fixes
bnellnm Jan 23, 2026
2c063b8
fix
bnellnm Jan 26, 2026
9e98441
fix merge
bnellnm Jan 30, 2026
d434453
fix merge
bnellnm Jan 30, 2026
6f31111
fix lint
bnellnm Feb 4, 2026
f6eaa36
fixes
bnellnm Feb 4, 2026
a83e79c
add comments
bnellnm Feb 4, 2026
554e954
fix merge
bnellnm Feb 4, 2026
52230d7
reimplement routed input transform feature
bnellnm Feb 5, 2026
a2fcee8
remove cruft
bnellnm Feb 5, 2026
de9a318
fix removal of padding when there's an input transform
bnellnm Feb 5, 2026
949634c
fix shared_experts
bnellnm Feb 6, 2026
f1f3f1e
fixes + revert to old op registration
bnellnm Feb 7, 2026
6bb0768
fix merge
bnellnm Feb 7, 2026
17bb403
initial MoERunner refactor
bnellnm Jan 13, 2026
ab3fe1d
wip
bnellnm Jan 14, 2026
e706f02
wip
bnellnm Jan 14, 2026
5af18c6
add missing file
bnellnm Jan 14, 2026
a1687b8
add some methods
bnellnm Jan 14, 2026
79a69e4
isolate more state
bnellnm Jan 14, 2026
1af2390
make gate into a property
bnellnm Jan 14, 2026
c1a1389
move default runner to separate file
bnellnm Jan 14, 2026
76b1dac
fix merge
bnellnm Jan 15, 2026
b36032c
hacky fix for initialization issues
bnellnm Jan 16, 2026
05c154f
fix merge
bnellnm Jan 16, 2026
10f3ab1
fix
bnellnm Jan 16, 2026
fa0f9cf
move files to runner directory
bnellnm Jan 16, 2026
cbbdd7b
simplify runner
bnellnm Jan 16, 2026
bcaae58
simplify runner forward
bnellnm Jan 16, 2026
d5eef04
simplify runner forward
bnellnm Jan 16, 2026
93f4e63
move some code
bnellnm Jan 22, 2026
c88a2bf
fix merge
bnellnm Jan 23, 2026
1b60c89
cleanups
bnellnm Jan 23, 2026
bd0bb28
reduce op registration scope
bnellnm Jan 23, 2026
bdd87d1
fixes
bnellnm Jan 23, 2026
2d05721
fix
bnellnm Jan 26, 2026
4cfd11b
fix merge
bnellnm Jan 30, 2026
8926336
fix merge
bnellnm Jan 30, 2026
88fceda
fix lint
bnellnm Feb 4, 2026
8fb283d
fixes
bnellnm Feb 4, 2026
4bf4156
add comments
bnellnm Feb 4, 2026
e375e1b
fix merge
bnellnm Feb 4, 2026
03974a3
reimplement routed input transform feature
bnellnm Feb 5, 2026
69801a2
remove cruft
bnellnm Feb 5, 2026
bd10b8b
fix removal of padding when there's an input transform
bnellnm Feb 5, 2026
3afd14d
fix shared_experts
bnellnm Feb 6, 2026
1b5f9bd
fixes + revert to old op registration
bnellnm Feb 7, 2026
5502c79
fix merge
bnellnm Feb 7, 2026
2ab2383
fix is_sequence_parallel
bnellnm Feb 9, 2026
3194f6b
Merge remote-tracking branch 'nm-vllm/moe-runner-0' into moe-runner-0
bnellnm Feb 9, 2026
7d78524
fix shared_experts_input merge issues
bnellnm Feb 9, 2026
f7e48ce
fix mxfp4 marlin padding
bnellnm Feb 10, 2026
34eef09
Merge remote-tracking branch 'origin/main' into moe-runner-0
bnellnm Feb 10, 2026
1e92a8d
disable assert for now
bnellnm Feb 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/design/moe_kernel_features.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ th {

| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Subclass |
|---------|--------------------|--------------|---------------|-------|-----------------------|-----------|
| naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] |
| naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE] |
| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] |
| deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
| deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
Expand Down
2 changes: 2 additions & 0 deletions tests/kernels/moe/modular_kernel_tools/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,7 @@ def next_power_of_2(x):
tp_size_=get_tensor_model_parallel_world_size(),
pcp_size_=get_pcp_group().world_size,
dp_size_=get_dp_group().world_size,
sp_size_=1,
vllm_parallel_config=vllm_config.parallel_config,
)

Expand All @@ -594,6 +595,7 @@ def next_power_of_2(x):
hidden_dim=config.K,
intermediate_size_per_partition=config.N,
num_local_experts=config.num_local_experts,
num_logical_experts=config.E,
moe_parallel_config=moe_parallel_config,
in_dtype=config.dtype,
max_num_tokens=next_power_of_2(config.M),
Expand Down
1 change: 1 addition & 0 deletions tests/kernels/moe/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def make_dummy_moe_config(
hidden_dim=hidden_dim,
intermediate_size_per_partition=intermediate_size_per_partition,
num_local_experts=num_experts,
num_logical_experts=num_experts,
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
activation="silu",
in_dtype=in_dtype,
Expand Down
22 changes: 18 additions & 4 deletions vllm/model_executor/layers/fused_moe/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -913,12 +913,16 @@ class FusedMoEParallelConfig:
pcp_rank: int
dp_rank: int
ep_rank: int
sp_size: int

use_ep: bool # whether to use EP or not
all2all_backend: str # all2all backend for MoE communication
is_sequence_parallel: bool # whether sequence parallelism is used
enable_eplb: bool # whether to enable expert load balancing

@property
def is_sequence_parallel(self) -> bool:
return self.sp_size > 1

@property
def use_all2all_kernels(self):
return self.dp_size > 1 and self.use_ep
Expand Down Expand Up @@ -974,6 +978,7 @@ def make(
tp_size_: int,
pcp_size_: int,
dp_size_: int,
sp_size_: int,
vllm_parallel_config: ParallelConfig,
) -> "FusedMoEParallelConfig":
"""
Expand Down Expand Up @@ -1073,9 +1078,9 @@ def make(
dp_rank=dp_rank,
ep_size=1,
ep_rank=0,
sp_size=sp_size_,
use_ep=False,
all2all_backend=vllm_parallel_config.all2all_backend,
is_sequence_parallel=vllm_parallel_config.use_sequence_parallel_moe,
enable_eplb=vllm_parallel_config.enable_eplb,
)
# DP + EP / TP + EP / DP + TP + EP
Expand All @@ -1093,9 +1098,9 @@ def make(
dp_rank=dp_rank,
ep_size=ep_size,
ep_rank=ep_rank,
sp_size=sp_size_,
use_ep=True,
all2all_backend=vllm_parallel_config.all2all_backend,
is_sequence_parallel=vllm_parallel_config.use_sequence_parallel_moe,
enable_eplb=vllm_parallel_config.enable_eplb,
)

Expand All @@ -1111,10 +1116,10 @@ def make_no_parallel(cls) -> "FusedMoEParallelConfig":
dp_rank=0,
ep_size=1,
ep_rank=0,
sp_size=1,
use_ep=False,
all2all_backend="naive",
enable_eplb=False,
is_sequence_parallel=False,
)


Expand All @@ -1126,6 +1131,7 @@ class FusedMoEConfig:
hidden_dim: int
intermediate_size_per_partition: int
num_local_experts: int
num_logical_experts: int
activation: str
device: torch.device | str
routing_method: RoutingMethodType
Expand Down Expand Up @@ -1175,6 +1181,14 @@ def pcp_size(self):
def ep_size(self):
return self.moe_parallel_config.ep_size

@property
def sp_size(self):
return self.moe_parallel_config.sp_size

@property
def is_sequence_parallel(self):
return self.moe_parallel_config.is_sequence_parallel

@property
def tp_rank(self):
return self.moe_parallel_config.tp_rank
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,17 +121,16 @@ def method_name(self) -> str:
def is_monolithic(self) -> bool:
return False

# @abstractmethod
def apply(
self,
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
shared_experts_input: torch.Tensor | None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
raise NotImplementedError

# @abstractmethod
def apply_monolithic(
self,
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def apply(
x: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
shared_experts_input: torch.Tensor | None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
assert self.moe_mk is not None
return self.moe_mk(
Expand All @@ -101,5 +102,5 @@ def apply(
global_num_experts=layer.global_num_experts,
apply_router_weight_on_input=layer.apply_router_weight_on_input,
expert_map=None if self.disable_expert_map else layer.expert_map,
shared_experts_input=layer._get_shared_experts_input(x),
shared_experts_input=shared_experts_input,
)
Loading