vllm-project · bnellnm · Jan 13, 2026 · Feb 12, 2026 · Feb 24, 2026 · Mar 5, 2026
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
@@ -250,7 +250,7 @@ def run():
                     num_experts=num_experts,
                     experts_per_token=topk,
                     hidden_dim=hidden_size,
-                    intermediate_size_per_partition=shard_intermediate_size,
+                    intermediate_size=shard_intermediate_size,
                     num_local_experts=num_experts,
                     num_logical_experts=num_experts,
                     activation=MoEActivation.SILU,

diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
@@ -60,7 +60,7 @@ Modular kernels are supported by the following `FusedMoEMethodBase` classes.
 - [`CompressedTensorsW4A4Nvfp4MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.compressed_tensors_moe_w4a4_nvfp4.CompressedTensorsW4A4Nvfp4MoEMethod]
 - [`CompressedTensorsW8A8Fp8MoEMethod`][vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe.compressed_tensors_moe_w8a8_fp8.CompressedTensorsW8A8Fp8MoEMethod]
 - [`GptOssMxfp4MoEMethod`][vllm.model_executor.layers.quantization.mxfp4.GptOssMxfp4MoEMethod]
-- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.layer.UnquantizedFusedMoEMethod]
+- [`UnquantizedFusedMoEMethod`][vllm.model_executor.layers.fused_moe.UnquantizedFusedMoEMethod]
 
 ## Fused Experts Kernels
 

diff --git a/tests/distributed/test_eplb_fused_moe_layer.py b/tests/distributed/test_eplb_fused_moe_layer.py
@@ -8,10 +8,12 @@
 import pytest
 import torch
 
-from vllm.config import VllmConfig, set_current_vllm_config
+from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.distributed.eplb.eplb_communicator import create_eplb_communicator
 from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
 from vllm.distributed.parallel_state import (
     ensure_model_parallel_initialized,
+    get_eplb_group,
     get_tp_group,
 )
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
@@ -75,9 +77,9 @@ def make_fused_moe_layer(
         intermediate_size=test_config.intermediate_size,
         prefix=f"dummy_layer_{layer_idx}",
         activation="silu",
-        is_act_and_mul=True,
         params_dtype=test_config.weight_dtype,
     )
+    re = fml.routed_experts
 
     device = torch.device(f"cuda:{rank}")
 
@@ -90,12 +92,12 @@ def make_fused_moe_layer(
         tensor_device=device,
     )
 
-    assert isinstance(fml.w13_weight.data, torch.Tensor)
-    assert isinstance(fml.w2_weight.data, torch.Tensor)
-    fml.w13_weight.data = fml.w13_weight.data.to(device=device)
-    fml.w2_weight.data = fml.w2_weight.data.to(device=device)
-    w13_weight = fml.w13_weight.data
-    w2_weight = fml.w2_weight.data
+    assert isinstance(re.w13_weight.data, torch.Tensor)
+    assert isinstance(re.w2_weight.data, torch.Tensor)
+    re.w13_weight.data = re.w13_weight.data.to(device=device)
+    re.w2_weight.data = re.w2_weight.data.to(device=device)
+    w13_weight = re.w13_weight.data
+    w2_weight = re.w2_weight.data
     assert w13_weight.size(0) == test_config.num_local_experts
     for i in range(test_config.num_local_experts):
         g_i = rank * test_config.num_local_experts + i
@@ -170,10 +172,10 @@ def block_quant_scales_shape(
         assert not w2_weight_scale_inv.is_contiguous()
 
     # Add scales to the parameter list
-    fml.w13_weight_scale_inv = torch.nn.Parameter(
+    re.w13_weight_scale_inv = torch.nn.Parameter(
         w13_weight_scale_inv, requires_grad=False
     )
-    fml.w2_weight_scale_inv = torch.nn.Parameter(
+    re.w2_weight_scale_inv = torch.nn.Parameter(
         w2_weight_scale_inv, requires_grad=False
     )
 
@@ -185,9 +187,12 @@ def _test_eplb_fml(env, world_size: int, test_config: TestConfig):
     # to expert parallel)
     set_env_vars_and_device(env)
 
-    vllm_config = VllmConfig()
-    vllm_config.parallel_config.tensor_parallel_size = world_size
-    vllm_config.parallel_config.enable_expert_parallel = True
+    parallel_config = ParallelConfig(
+        tensor_parallel_size=world_size,
+        enable_expert_parallel=True,
+        enable_eplb=True,
+    )
+    vllm_config = VllmConfig(parallel_config=parallel_config)
 
     with set_current_vllm_config(vllm_config):
         ensure_model_parallel_initialized(
@@ -213,12 +218,19 @@ def _test_eplb_fml(env, world_size: int, test_config: TestConfig):
         for lidx in range(test_config.num_layers):
             shuffled_indices[lidx] = torch.randperm(test_config.num_experts)
 
+        communicator = create_eplb_communicator(
+            group_coordinator=get_eplb_group(),
+            backend=vllm_config.parallel_config.eplb_config.communicator,
+            expert_weights=rank_expert_weights[0],
+        )
+
         rearrange_expert_weights_inplace(
             indices,
             shuffled_indices,
             rank_expert_weights,
             ep_group,
             is_profile=False,
+            communicator=communicator,
         )
 
         num_local_experts = test_config.num_local_experts

diff --git a/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py b/tests/distributed/test_eplb_fused_moe_layer_dep_nvfp4.py
@@ -9,12 +9,14 @@
 import torch
 
 from tests.kernels.moe.utils import make_test_quant_config
-from vllm.config import VllmConfig, set_current_vllm_config
+from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.distributed.eplb.eplb_communicator import create_eplb_communicator
 from vllm.distributed.eplb.eplb_state import EplbLayerState
 from vllm.distributed.eplb.rebalance_execute import rearrange_expert_weights_inplace
 from vllm.distributed.parallel_state import (
     ensure_model_parallel_initialized,
     get_dp_group,
+    get_eplb_group,
 )
 from vllm.forward_context import set_forward_context
 from vllm.model_executor.layers.fused_moe.layer import FusedMoE
@@ -59,7 +61,6 @@ def make_fused_moe_layer(
         intermediate_size=test_config.intermediate_size,
         prefix=f"dummy_layer_{layer_idx}",
         activation="silu",
-        is_act_and_mul=True,
         params_dtype=torch.bfloat16,
         quant_config=quant_config,
     )
@@ -85,21 +86,22 @@ def make_fused_moe_layer(
         per_act_token_quant=False,
     )
 
-    fml.w13_weight.data = w1_q
-    fml.w2_weight.data = w2_q
+    re = fml.routed_experts
+    re.w13_weight.data = w1_q
+    re.w2_weight.data = w2_q
 
-    fml.w2_input_scale.data = torch.randn_like(fml.w2_input_scale.data) / 5
-    fml.w13_input_scale.data = torch.randn_like(fml.w13_input_scale.data) / 5
-    fml.w2_weight_scale_2.data = torch.randn_like(fml.w2_weight_scale_2.data) / 5
-    fml.w13_weight_scale_2.data = torch.randn_like(fml.w13_weight_scale_2.data) / 5
-    fml.w2_weight_scale.data = (
-        torch.randn(fml.w2_weight_scale.data.shape, device=device) / 5
-    ).to(fml.w2_weight_scale.data.dtype)
-    fml.w13_weight_scale.data = (
-        torch.randn(fml.w13_weight_scale.data.shape, device=device) / 5
-    ).to(fml.w13_weight_scale.data.dtype)
+    re.w2_input_scale.data = torch.randn_like(re.w2_input_scale.data) / 5
+    re.w13_input_scale.data = torch.randn_like(re.w13_input_scale.data) / 5
+    re.w2_weight_scale_2.data = torch.randn_like(re.w2_weight_scale_2.data) / 5
+    re.w13_weight_scale_2.data = torch.randn_like(re.w13_weight_scale_2.data) / 5
+    re.w2_weight_scale.data = (
+        torch.randn(re.w2_weight_scale.data.shape, device=device) / 5
+    ).to(re.w2_weight_scale.data.dtype)
+    re.w13_weight_scale.data = (
+        torch.randn(re.w13_weight_scale.data.shape, device=device) / 5
+    ).to(re.w13_weight_scale.data.dtype)
 
-    nvfp4_fused_moe.process_weights_after_loading(fml)
+    nvfp4_fused_moe.process_weights_after_loading(fml.routed_experts)
 
     fml.maybe_init_modular_kernel()
 
@@ -109,9 +111,12 @@ def make_fused_moe_layer(
 def _test_eplb_fml(env, world_size: int, test_config: TestConfig):
     set_env_vars_and_device(env)
 
-    vllm_config = VllmConfig()
-    vllm_config.parallel_config.data_parallel_size = world_size
-    vllm_config.parallel_config.enable_expert_parallel = True
+    parallel_config = ParallelConfig(
+        data_parallel_size=world_size,
+        enable_expert_parallel=True,
+        enable_eplb=True,
+    )
+    vllm_config = VllmConfig(parallel_config=parallel_config)
 
     with set_current_vllm_config(vllm_config):
         ensure_model_parallel_initialized(
@@ -171,12 +176,19 @@ def _test_eplb_fml(env, world_size: int, test_config: TestConfig):
         for lidx in range(test_config.num_layers):
             shuffled_indices[lidx] = torch.randperm(test_config.num_experts)
 
+        communicator = create_eplb_communicator(
+            group_coordinator=get_eplb_group(),
+            backend=vllm_config.parallel_config.eplb_config.communicator,
+            expert_weights=rank_expert_weights[0],
+        )
+
         rearrange_expert_weights_inplace(
             indices,
             shuffled_indices,
             rank_expert_weights,
             ep_group,
             is_profile=False,
+            communicator=communicator,
         )
 
         num_global_experts = test_config.num_experts

@@ -618,7 +618,7 @@ def make_modular_kernel(
         num_experts=config.E,
         experts_per_token=config.topk,
         hidden_dim=config.K,
-        intermediate_size_per_partition=config.N,
+        intermediate_size=config.N,
         num_local_experts=config.num_local_experts,
         num_logical_experts=config.E,
         moe_parallel_config=moe_parallel_config,

@@ -79,6 +79,7 @@ def _worker_parallel_launch(
     rank = node_rank * world_local_size + local_rank
     device = torch.device("cuda", local_rank)
     torch.accelerator.set_device_index(device)
+    torch.set_default_device(device)
     torch.distributed.init_process_group(
         backend="cpu:gloo,cuda:nccl",
         init_method=init_method,
@@ -116,6 +117,7 @@ def _worker_parallel_launch(
         traceback.print_exc()
         raise
     finally:
+        torch.accelerator.synchronize()
         if vllm_config is not None:
             cleanup_dist_env_and_memory()
         else:

@@ -200,7 +200,7 @@ def slice_experts():
         moe_config = make_dummy_moe_config(
             num_experts=w2.shape[0],
             hidden_dim=w2.shape[1],
-            intermediate_size_per_partition=w2.shape[2],
+            intermediate_size=w2.shape[2],
             in_dtype=a.dtype,
         )
         kernel = mk.FusedMoEKernel(
@@ -269,7 +269,7 @@ def run_8_bit(
         moe_config = make_dummy_moe_config(
             num_experts=moe_tensors.w2_q.shape[0],  # type: ignore[union-attr]
             hidden_dim=moe_tensors.w2_q.shape[1],  # type: ignore[union-attr]
-            intermediate_size_per_partition=moe_tensors.w2_q.shape[2],  # type: ignore[union-attr]
+            intermediate_size=moe_tensors.w2_q.shape[2],  # type: ignore[union-attr]
             in_dtype=moe_tensors.a.dtype,
         )
         kernel = mk.FusedMoEKernel(

@@ -166,12 +166,11 @@ def make_moe_tensors_8bit(
             num_experts=e,
             experts_per_token=topk,
             hidden_dim=k,
-            intermediate_size_per_partition=n,
+            intermediate_size=n,
             num_local_experts=e,
             num_logical_experts=e,
             moe_parallel_config=layer.moe_parallel_config,
             in_dtype=hidden_states.dtype,
-            is_act_and_mul=is_gated,
             routing_method=layer.routing_method_type,
             activation=activation,
             device=w13_quantized.device,
@@ -339,14 +338,13 @@ def get_fused_moe_quant_config(n: torch.nn.Module) -> FusedMoEQuantConfig:
             num_experts=e,
             experts_per_token=topk,
             hidden_dim=k,
-            intermediate_size_per_partition=n,
+            intermediate_size=n,
             num_local_experts=e,
             num_logical_experts=e,
             activation=activation,
             device="cuda",
             moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
             in_dtype=torch.bfloat16,
-            is_act_and_mul=activation.is_gated,
             routing_method=RoutingMethodType.TopK,
             max_num_tokens=next_power_of_2(m),
         )

@@ -166,7 +166,7 @@ def test_flashinfer_b12x_moe(
             num_experts=e,
             experts_per_token=topk,
             hidden_dim=k,
-            intermediate_size_per_partition=n,
+            intermediate_size=n,
             in_dtype=dtype,
         )
 

@@ -97,14 +97,13 @@ def test_flashinfer_fp4_moe_no_graph(
             num_experts=e,
             experts_per_token=topk,
             hidden_dim=k,
-            intermediate_size_per_partition=n,
+            intermediate_size=n,
             num_local_experts=e,
             num_logical_experts=e,
             activation=activation,
             device="cuda",
             moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
             in_dtype=dtype,
-            is_act_and_mul=is_gated_act,
             routing_method=RoutingMethodType.TopK,
             max_num_tokens=next_power_of_2(m),
         )

@@ -1617,14 +1617,13 @@ def test_unquantized_bf16_flashinfer_trtllm_backend(
         num_experts=e,
         experts_per_token=topk,
         hidden_dim=k,
-        intermediate_size_per_partition=n,
+        intermediate_size=n,
         num_local_experts=e,
         num_logical_experts=e,
         activation=MoEActivation.SILU,
         device="cuda",
         moe_parallel_config=FusedMoEParallelConfig.make_no_parallel(),
         in_dtype=dtype,
-        is_act_and_mul=True,
         routing_method=RoutingMethodType.Renormalize,
         max_num_tokens=next_power_of_2(m),
     )