Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
74 commits
Select commit Hold shift + click to select a range
be73f32
initial MoERunner refactor
bnellnm Jan 13, 2026
267d118
wip
bnellnm Jan 14, 2026
b96042c
wip
bnellnm Jan 14, 2026
77deb59
add missing file
bnellnm Jan 14, 2026
cb21052
add some methods
bnellnm Jan 14, 2026
3d32744
isolate more state
bnellnm Jan 14, 2026
17be1d5
make gate into a property
bnellnm Jan 14, 2026
9a238fd
move default runner to separate file
bnellnm Jan 14, 2026
b416bdd
fix merge
bnellnm Jan 15, 2026
7c51aaa
hacky fix for initialization issues
bnellnm Jan 16, 2026
147e7a9
fix merge
bnellnm Jan 16, 2026
8403a2f
fix
bnellnm Jan 16, 2026
18895ba
move files to runner directory
bnellnm Jan 16, 2026
781d2df
simplify runner
bnellnm Jan 16, 2026
5305bf1
simplify runner forward
bnellnm Jan 16, 2026
b472fc4
simplify runner forward
bnellnm Jan 16, 2026
60f15d4
move some code
bnellnm Jan 22, 2026
1f7e762
fix merge
bnellnm Jan 23, 2026
fb05dfa
cleanups
bnellnm Jan 23, 2026
9a288e4
reduce op registration scope
bnellnm Jan 23, 2026
40cd1c2
fixes
bnellnm Jan 23, 2026
2c063b8
fix
bnellnm Jan 26, 2026
9e98441
fix merge
bnellnm Jan 30, 2026
d434453
fix merge
bnellnm Jan 30, 2026
6f31111
fix lint
bnellnm Feb 4, 2026
f6eaa36
fixes
bnellnm Feb 4, 2026
a83e79c
add comments
bnellnm Feb 4, 2026
554e954
fix merge
bnellnm Feb 4, 2026
52230d7
reimplement routed input transform feature
bnellnm Feb 5, 2026
a2fcee8
remove cruft
bnellnm Feb 5, 2026
de9a318
fix removal of padding when there's an input transform
bnellnm Feb 5, 2026
949634c
fix shared_experts
bnellnm Feb 6, 2026
f1f3f1e
fixes + revert to old op registration
bnellnm Feb 7, 2026
6bb0768
fix merge
bnellnm Feb 7, 2026
17bb403
initial MoERunner refactor
bnellnm Jan 13, 2026
ab3fe1d
wip
bnellnm Jan 14, 2026
e706f02
wip
bnellnm Jan 14, 2026
5af18c6
add missing file
bnellnm Jan 14, 2026
a1687b8
add some methods
bnellnm Jan 14, 2026
79a69e4
isolate more state
bnellnm Jan 14, 2026
1af2390
make gate into a property
bnellnm Jan 14, 2026
c1a1389
move default runner to separate file
bnellnm Jan 14, 2026
76b1dac
fix merge
bnellnm Jan 15, 2026
b36032c
hacky fix for initialization issues
bnellnm Jan 16, 2026
05c154f
fix merge
bnellnm Jan 16, 2026
10f3ab1
fix
bnellnm Jan 16, 2026
fa0f9cf
move files to runner directory
bnellnm Jan 16, 2026
cbbdd7b
simplify runner
bnellnm Jan 16, 2026
bcaae58
simplify runner forward
bnellnm Jan 16, 2026
d5eef04
simplify runner forward
bnellnm Jan 16, 2026
93f4e63
move some code
bnellnm Jan 22, 2026
c88a2bf
fix merge
bnellnm Jan 23, 2026
1b60c89
cleanups
bnellnm Jan 23, 2026
bd0bb28
reduce op registration scope
bnellnm Jan 23, 2026
bdd87d1
fixes
bnellnm Jan 23, 2026
2d05721
fix
bnellnm Jan 26, 2026
4cfd11b
fix merge
bnellnm Jan 30, 2026
8926336
fix merge
bnellnm Jan 30, 2026
88fceda
fix lint
bnellnm Feb 4, 2026
8fb283d
fixes
bnellnm Feb 4, 2026
4bf4156
add comments
bnellnm Feb 4, 2026
e375e1b
fix merge
bnellnm Feb 4, 2026
03974a3
reimplement routed input transform feature
bnellnm Feb 5, 2026
69801a2
remove cruft
bnellnm Feb 5, 2026
bd10b8b
fix removal of padding when there's an input transform
bnellnm Feb 5, 2026
3afd14d
fix shared_experts
bnellnm Feb 6, 2026
1b5f9bd
fixes + revert to old op registration
bnellnm Feb 7, 2026
5502c79
fix merge
bnellnm Feb 7, 2026
2ab2383
fix is_sequence_parallel
bnellnm Feb 9, 2026
3194f6b
Merge remote-tracking branch 'nm-vllm/moe-runner-0' into moe-runner-0
bnellnm Feb 9, 2026
7d78524
fix shared_experts_input merge issues
bnellnm Feb 9, 2026
f7e48ce
fix mxfp4 marlin padding
bnellnm Feb 10, 2026
34eef09
Merge remote-tracking branch 'origin/main' into moe-runner-0
bnellnm Feb 10, 2026
1e92a8d
disable assert for now
bnellnm Feb 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/design/moe_kernel_features.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ th {

| Backend | Output act. format | Quant. types | Quant. format | Async | Apply Weight On Input | Subclass |
|---------|--------------------|--------------|---------------|-------|-----------------------|-----------|
| naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE.forward_impl] |
| naive | standard | all<sup>1</sup> | G,A,T | N | <sup>6</sup> | [layer.py][vllm.model_executor.layers.fused_moe.layer.FusedMoE] |
| pplx | batched | fp8,int8 | G,A,T | Y | Y | [`PplxPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.pplx_prepare_finalize.PplxPrepareAndFinalize] |
| deepep_high_throughput | standard | fp8 | G(128),A,T<sup>2</sup> | Y | Y | [`DeepEPHTPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize.DeepEPHTPrepareAndFinalize] |
| deepep_low_latency | batched | fp8 | G(128),A,T<sup>3</sup> | Y | Y | [`DeepEPLLPrepareAndFinalize`][vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize.DeepEPLLPrepareAndFinalize] |
Expand Down
2 changes: 2 additions & 0 deletions tests/kernels/moe/modular_kernel_tools/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,7 @@ def next_power_of_2(x):
tp_size_=get_tensor_model_parallel_world_size(),
pcp_size_=get_pcp_group().world_size,
dp_size_=get_dp_group().world_size,
sp_size_=1,
vllm_parallel_config=vllm_config.parallel_config,
)

Expand All @@ -594,6 +595,7 @@ def next_power_of_2(x):
hidden_dim=config.K,
intermediate_size_per_partition=config.N,
num_local_experts=config.num_local_experts,
num_logical_experts=config.E,
moe_parallel_config=moe_parallel_config,
in_dtype=config.dtype,
max_num_tokens=next_power_of_2(config.M),
Expand Down
1 change: 1 addition & 0 deletions tests/kernels/moe/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def make_dummy_moe_config(
hidden_dim=hidden_dim,
intermediate_size_per_partition=intermediate_size_per_partition,
num_local_experts=num_experts,
num_logical_experts=num_experts,
moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
activation="silu",
in_dtype=in_dtype,
Expand Down
22 changes: 18 additions & 4 deletions vllm/model_executor/layers/fused_moe/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -913,12 +913,16 @@ class FusedMoEParallelConfig:
pcp_rank: int
dp_rank: int
ep_rank: int
sp_size: int

use_ep: bool # whether to use EP or not
all2all_backend: str # all2all backend for MoE communication
is_sequence_parallel: bool # whether sequence parallelism is used
enable_eplb: bool # whether to enable expert load balancing

@property
def is_sequence_parallel(self) -> bool:
return self.sp_size > 1

@property
def use_all2all_kernels(self):
return self.dp_size > 1 and self.use_ep
Expand Down Expand Up @@ -974,6 +978,7 @@ def make(
tp_size_: int,
pcp_size_: int,
dp_size_: int,
sp_size_: int,
vllm_parallel_config: ParallelConfig,
) -> "FusedMoEParallelConfig":
"""
Expand Down Expand Up @@ -1073,9 +1078,9 @@ def make(
dp_rank=dp_rank,
ep_size=1,
ep_rank=0,
sp_size=sp_size_,
use_ep=False,
all2all_backend=vllm_parallel_config.all2all_backend,
is_sequence_parallel=vllm_parallel_config.use_sequence_parallel_moe,
enable_eplb=vllm_parallel_config.enable_eplb,
)
# DP + EP / TP + EP / DP + TP + EP
Expand All @@ -1093,9 +1098,9 @@ def make(
dp_rank=dp_rank,
ep_size=ep_size,
ep_rank=ep_rank,
sp_size=sp_size_,
use_ep=True,
all2all_backend=vllm_parallel_config.all2all_backend,
is_sequence_parallel=vllm_parallel_config.use_sequence_parallel_moe,
enable_eplb=vllm_parallel_config.enable_eplb,
)

Expand All @@ -1111,10 +1116,10 @@ def make_no_parallel(cls) -> "FusedMoEParallelConfig":
dp_rank=0,
ep_size=1,
ep_rank=0,
sp_size=1,
use_ep=False,
all2all_backend="naive",
enable_eplb=False,
is_sequence_parallel=False,
)


Expand All @@ -1126,6 +1131,7 @@ class FusedMoEConfig:
hidden_dim: int
intermediate_size_per_partition: int
num_local_experts: int
num_logical_experts: int
activation: str
device: torch.device | str
routing_method: RoutingMethodType
Expand Down Expand Up @@ -1175,6 +1181,14 @@ def pcp_size(self):
def ep_size(self):
return self.moe_parallel_config.ep_size

@property
def sp_size(self):
return self.moe_parallel_config.sp_size

@property
def is_sequence_parallel(self):
return self.moe_parallel_config.is_sequence_parallel

@property
def tp_rank(self):
return self.moe_parallel_config.tp_rank
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,17 +121,16 @@ def method_name(self) -> str:
def is_monolithic(self) -> bool:
return False

# @abstractmethod
def apply(
self,
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
x: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
shared_experts_input: torch.Tensor | None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
raise NotImplementedError

# @abstractmethod
def apply_monolithic(
self,
layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def apply(
x: torch.Tensor,
topk_weights: torch.Tensor,
topk_ids: torch.Tensor,
shared_experts_input: torch.Tensor | None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
assert self.moe_mk is not None
return self.moe_mk(
Expand All @@ -101,5 +102,5 @@ def apply(
global_num_experts=layer.global_num_experts,
apply_router_weight_on_input=layer.apply_router_weight_on_input,
expert_map=None if self.disable_expert_map else layer.expert_map,
shared_experts_input=layer._get_shared_experts_input(x),
shared_experts_input=shared_experts_input,
)
Loading