Changes from all commits (33 commits)
120384c  initial MoERunner refactor (bnellnm, Jan 13, 2026)
8deab88  fix lint (bnellnm, Feb 12, 2026)
42aea01  rebase (bnellnm, Feb 24, 2026)
82c8ae2  fix broken test (bnellnm, Feb 24, 2026)
09f13a4  wip (bnellnm, Feb 4, 2026)
a137855  fix (bnellnm, Feb 9, 2026)
fdd0e43  WIP DOUBLE CHECK THIS (bnellnm, Feb 11, 2026)
d113427  wip more refactoring (bnellnm, Feb 19, 2026)
d9a92c4  wip (bnellnm, Feb 19, 2026)
be300b0  SharedExperts wip (bnellnm, Feb 23, 2026)
22ed6e8  cleanups (bnellnm, Feb 23, 2026)
603b818  fix circular import (bnellnm, Feb 23, 2026)
8ce8615  add back default arg (bnellnm, Feb 24, 2026)
cec1dba  fixes (bnellnm, Feb 24, 2026)
2409a93  renames (bnellnm, Feb 24, 2026)
a2510e2  add comment (bnellnm, Feb 24, 2026)
a1d758f  more renames (bnellnm, Feb 24, 2026)
26facf5  cleanup (bnellnm, Feb 25, 2026)
5d5debd  use workspace for DP chunk slices (bnellnm, Feb 25, 2026)
93002ef  split up default runner (bnellnm, Feb 25, 2026)
0399170  update comments (bnellnm, Feb 25, 2026)
8f710b8  attempt to fix zero experts (bnellnm, Feb 26, 2026)
f219666  simplify ZeroExpertFusedMoE and add ZeroExpertRouter (bnellnm, Feb 27, 2026)
cc207df  add value test (bnellnm, Feb 27, 2026)
6b88bb0  move ZeroExpertRouter construction into router factory (bnellnm, Feb 27, 2026)
4a0e70f  move zero expert handling into MoERunnerBase (bnellnm, Feb 27, 2026)
c65b099  slightly improved test (bnellnm, Feb 27, 2026)
b36c8c4  simplifications (bnellnm, Feb 27, 2026)
786206e  better test (bnellnm, Feb 27, 2026)
28e9058  remove ZeroExpertFusedMoE (bnellnm, Feb 27, 2026)
6061dec  turn ChunkingMoERunner into a wrapper around MoERunner (bnellnm, Feb 27, 2026)
8cc5bd3  add some asserts (bnellnm, Feb 27, 2026)
25e27f7  remove assert (bnellnm, Feb 27, 2026)
@@ -0,0 +1,10 @@
model_name: "meituan-longcat/LongCat-Flash-Chat-FP8"
accuracy_threshold: 0.70
num_questions: 1319
num_fewshot: 5
startup_max_wait_seconds: 1200
server_args: >-
--enforce-eager
--max-model-len 4096
--tensor-parallel-size 8
--enable-expert-parallel
3 changes: 2 additions & 1 deletion tests/evals/gsm8k/configs/moe-refactor/config-h100.txt
@@ -12,4 +12,5 @@ Llama-4-Scout-Fp8-ModelOpt-fi-cutlass.yaml
Llama-4-Scout-Fp8-ModelOpt-marlin.yaml
Llama-4-Scout-Fp8-ModelOpt-triton.yaml
Qwen3-30B-A3B-BF16-fi-cutlass.yaml
Qwen3-30B-A3B-BF16-triton.yaml
Qwen3-30B-A3B-BF16-triton.yaml
LongCat-Flash-Chat-FP8.yaml
4 changes: 2 additions & 2 deletions tests/kernels/moe/modular_kernel_tools/mk_objects.py
@@ -253,14 +253,14 @@ def expert_info(kind) -> ExpertInfo:

if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.flashinfer_a2a_prepare_finalize import ( # noqa: E501
FlashInferCutlassMoEPrepareAndFinalize,
FlashInferA2APrepareAndFinalize,
)
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts,
)

register_prepare_and_finalize(
FlashInferCutlassMoEPrepareAndFinalize,
FlashInferA2APrepareAndFinalize,
standard_format,
nvfp4_types + fp8_types,
blocked_quantization_support=True,
280 changes: 280 additions & 0 deletions tests/kernels/moe/test_zero_expert_moe.py
@@ -0,0 +1,280 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for FusedMoE with zero experts.

Verifies that:
- The ZeroExpertRouter is properly created and used as the layer router.
- A forward pass through FusedMoE with zero experts produces correct output.
- The output decomposes correctly into real expert + zero expert contributions.
"""

import pytest
import torch

from vllm.config import VllmConfig, set_current_vllm_config
from vllm.forward_context import get_forward_context, set_forward_context
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
from vllm.model_executor.layers.fused_moe.router.zero_expert_router import (
ZeroExpertRouter,
)
from vllm.v1.worker.workspace import init_workspace_manager


@pytest.fixture
def zero_expert_moe(dist_init, default_vllm_config):
"""Create a FusedMoE layer with zero experts."""
num_experts = 4
top_k = 2
# hidden_size must be >= 256 for the zero expert identity kernel to
# produce output (its BLOCK_SIZE=256 causes grid=0 when hidden_dim<256).
hidden_size = 256
intermediate_size = 512
zero_expert_num = 1

e_score_correction_bias = torch.zeros(
num_experts + zero_expert_num,
dtype=torch.float32,
device="cuda",
)

vllm_config = VllmConfig()
vllm_config.compilation_config.static_forward_context = dict()

with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config):
init_workspace_manager(torch.cuda.current_device())

layer = FusedMoE(
zero_expert_type="identity",
e_score_correction_bias=e_score_correction_bias,
num_experts=num_experts,
top_k=top_k,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
params_dtype=torch.bfloat16,
prefix="test_zero_expert_moe",
renormalize=False,
routed_scaling_factor=1.0,
scoring_func="softmax",
).cuda()

layer.quant_method.process_weights_after_loading(layer)

yield layer, vllm_config


@pytest.mark.parametrize("num_tokens", [1, 32])
def test_zero_expert_moe_router_is_zero_expert_router(zero_expert_moe, num_tokens):
"""Verify that FusedMoE with zero_expert_type creates a ZeroExpertRouter."""
layer, _ = zero_expert_moe
assert isinstance(layer.router, ZeroExpertRouter), (
f"Expected ZeroExpertRouter but got {type(layer.router).__name__}."
)


@pytest.mark.parametrize("num_tokens", [1, 32])
def test_zero_expert_moe_no_custom_routing_fn(zero_expert_moe, num_tokens):
"""Verify that custom_routing_function is not set (routing is handled
by ZeroExpertRouter, not a memoizing closure)."""
layer, _ = zero_expert_moe
assert layer.custom_routing_function is None


@pytest.mark.parametrize("num_tokens", [1, 32])
def test_zero_expert_moe_forward(zero_expert_moe, num_tokens):
"""Run a forward pass through FusedMoE with zero experts and verify output shape."""
layer, vllm_config = zero_expert_moe

hidden_size = layer.hidden_size
num_experts = 4
zero_expert_num = 1
total_experts = num_experts + zero_expert_num

hidden_states = torch.randn(
num_tokens, hidden_size, dtype=torch.bfloat16, device="cuda"
)
router_logits = torch.randn(
num_tokens, total_experts, dtype=torch.float32, device="cuda"
)

# Initialize weights to small random values to avoid NaN from
# uninitialized memory.
with torch.no_grad():
for param in layer.parameters():
if param.dtype.is_floating_point:
param.normal_(0, 0.01)

with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config):
get_forward_context().all_moe_layers = None
output = layer.forward(hidden_states, router_logits)

assert output.shape == hidden_states.shape, (
f"Expected output shape {hidden_states.shape}, got {output.shape}"
)
assert output.dtype == hidden_states.dtype
assert not torch.isnan(output).any(), "Output contains NaN values"


@pytest.mark.parametrize("num_tokens", [1, 32])
def test_zero_expert_moe_output_decomposition(zero_expert_moe, num_tokens):
"""Validate that the FusedMoE output equals a plain FusedMoE
output (real experts only) plus the zero expert contribution.

The key invariant is:
zero_layer.forward(h, r_full) == plain_layer.forward(h, r_real)
+ zero_expert_output

We create a plain FusedMoE layer with the same weights and real-expert-only
router logits, compute the zero expert output via the ZeroExpertRouter, and
verify the sum matches the FusedMoE output.
"""
layer, vllm_config = zero_expert_moe
num_experts = 4
zero_expert_num = 1
total_experts = num_experts + zero_expert_num

hidden_states = torch.randn(
num_tokens, layer.hidden_size, dtype=torch.bfloat16, device="cuda"
)
router_logits = torch.randn(
num_tokens, total_experts, dtype=torch.float32, device="cuda"
)

with torch.no_grad():
for param in layer.parameters():
if param.dtype.is_floating_point:
param.normal_(0, 0.01)

with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config):
get_forward_context().all_moe_layers = None

# Create a plain FusedMoE layer with the same config but no zero
# experts. Use a separate prefix to avoid collision.
plain_layer = FusedMoE(
num_experts=num_experts,
top_k=layer.top_k,
hidden_size=layer.hidden_size,
intermediate_size=layer.intermediate_size_per_partition,
params_dtype=torch.bfloat16,
prefix="test_zero_expert_moe_plain",
renormalize=False,
scoring_func="softmax",
e_score_correction_bias=layer.e_score_correction_bias,
).cuda()

# Share weights from the zero expert layer.
plain_layer.w13_weight.data.copy_(layer.w13_weight.data)
plain_layer.w2_weight.data.copy_(layer.w2_weight.data)
plain_layer.quant_method.process_weights_after_loading(plain_layer)

# Compute routing via the ZeroExpertRouter. This produces masked
# topk_weights/topk_ids (zero expert entries have weight=0, id=0)
# and stores zero_expert_output as a side effect.
topk_weights, topk_ids = layer.router.select_experts(
hidden_states, router_logits
)
zero_output = layer.router.zero_expert_output

# Compute real expert output using the plain layer with the masked
# routing from the ZeroExpertRouter.
real_output = plain_layer.quant_method.apply(
layer=plain_layer,
x=hidden_states,
topk_weights=topk_weights,
topk_ids=topk_ids,
shared_experts_input=None,
)

# Get the combined output from the zero expert layer.
full_output = layer.forward(hidden_states, router_logits)

assert zero_output is not None, "Zero expert output should not be None"
assert not torch.isnan(real_output).any(), "Real expert output has NaN"
assert not torch.isnan(zero_output).any(), "Zero expert output has NaN"
assert not torch.isnan(full_output).any(), "Full output has NaN"

expected = real_output + zero_output
torch.testing.assert_close(
full_output,
expected,
atol=0,
rtol=0,
msg="FusedMoE output should equal plain FusedMoE output "
"plus zero expert contribution",
)


@pytest.mark.parametrize("num_tokens", [1, 32])
def test_zero_expert_moe_zero_expert_is_identity(zero_expert_moe, num_tokens):
"""Validate zero expert identity behavior.

When routing strongly favors the zero expert, its contribution should
be a scaled version of hidden_states (identity operation). We verify
this by manually computing the expected zero expert output from the
routing weights and comparing against what the router produces.
"""
layer, vllm_config = zero_expert_moe
num_experts = 4
zero_expert_num = 1
total_experts = num_experts + zero_expert_num

hidden_states = torch.randn(
num_tokens, layer.hidden_size, dtype=torch.bfloat16, device="cuda"
)
# Strongly bias toward the zero expert (index 4).
router_logits = torch.full(
(num_tokens, total_experts), -10.0, dtype=torch.float32, device="cuda"
)
router_logits[:, num_experts] = 10.0 # zero expert gets high logit

with torch.no_grad():
for param in layer.parameters():
if param.dtype.is_floating_point:
param.normal_(0, 0.01)

with set_current_vllm_config(vllm_config), set_forward_context(None, vllm_config):
get_forward_context().all_moe_layers = None

# Run routing to get topk_weights/topk_ids before masking.
from vllm.model_executor.layers.fused_moe.router.fused_topk_bias_router import (
fused_topk_bias,
)

topk_weights, topk_ids = fused_topk_bias(
hidden_states=hidden_states,
gating_output=router_logits,
e_score_correction_bias=layer.router.e_score_correction_bias.data,
topk=layer.top_k,
renormalize=layer.router.renormalize,
scoring_func=layer.router.scoring_func,
)

# Manually compute expected zero expert identity output:
# For each token, sum routing weights assigned to zero expert slots,
# then multiply by hidden_states.
zero_mask = topk_ids >= num_experts
zero_weight_per_token = (topk_weights * zero_mask.float()).sum(
dim=-1, keepdim=True
)
expected_zero_output = (hidden_states.float() * zero_weight_per_token).to(
hidden_states.dtype
)

# Run routing directly to trigger zero expert computation
# without going through the runner (which consumes the output).
layer.router.select_experts(hidden_states, router_logits)
actual_zero_output = layer.router.zero_expert_output

assert actual_zero_output is not None
assert zero_mask.any(), (
"With high zero expert logit, at least some slots should route "
"to the zero expert"
)

torch.testing.assert_close(
actual_zero_output,
expected_zero_output,
atol=1e-3,
rtol=1e-3,
msg="Zero expert identity output should equal "
"hidden_states * sum(zero_expert_weights)",
)
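For reference, the identity zero-expert contribution checked by the last two tests can be written as a small standalone helper. This is a sketch that mirrors the test logic above, not code from this PR; the function name is illustrative.

import torch


def zero_expert_identity_contribution(
    hidden_states: torch.Tensor,  # [num_tokens, hidden_size]
    topk_weights: torch.Tensor,  # [num_tokens, top_k] routing weights
    topk_ids: torch.Tensor,  # [num_tokens, top_k] selected expert ids
    num_real_experts: int,
) -> torch.Tensor:
    # Slots whose expert id falls past the real experts belong to zero experts.
    zero_mask = topk_ids >= num_real_experts
    # Per-token total routing weight assigned to zero-expert slots.
    zero_weight = (topk_weights * zero_mask.float()).sum(dim=-1, keepdim=True)
    # An "identity" zero expert passes hidden_states through, scaled by that weight.
    return (hidden_states.float() * zero_weight).to(hidden_states.dtype)

The full layer output is then expected to equal the real-expert output plus this term, which is exactly the decomposition asserted in test_zero_expert_moe_output_decomposition.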
1 change: 1 addition & 0 deletions tools/ep_kernels/install_python_libraries.sh
@@ -113,6 +113,7 @@ pushd "$WORKSPACE"
echo "Downloading NVSHMEM ${NVSHMEM_VER} for ${NVSHMEM_SUBDIR} ..."
curl -fSL "${NVSHMEM_URL}" -o "${NVSHMEM_FILE}"
tar -xf "${NVSHMEM_FILE}"
rm -rf nvshmem
mv "${NVSHMEM_FILE%.tar.xz}" nvshmem
rm -f "${NVSHMEM_FILE}"
rm -rf nvshmem/lib/bin nvshmem/lib/share
8 changes: 4 additions & 4 deletions vllm/lora/layers/fused_moe.py
@@ -144,6 +144,8 @@ def _inject_lora_into_fused_moe(self):
self.base_layer.shared_experts,
)

# TODO: could be incorrect due to monolithic kernel? or add assert it
# is modular?
if quant_config.use_mxfp4_w4a16:
assert isinstance(
m_fused_moe_fn.fused_experts, (MarlinExperts, UnfusedOAITritonExperts)
@@ -331,6 +333,8 @@ def wrapper(*args, **kwargs):

fused_experts = m_fused_moe_fn.fused_experts

# TODO: seems like this could be done with modular kernel subclasses?

m_fused_moe_fn.forward = fwd_decorator(self.base_layer, m_fused_moe_fn.forward)
fused_experts.activation = act_decorator(
self.base_layer, fused_experts.activation
@@ -585,10 +589,6 @@ def forward(self, *args, **kwargs):
def maybe_all_reduce_tensor_model_parallel(self, *args, **kwargs):
return self.base_layer.maybe_all_reduce_tensor_model_parallel(*args, **kwargs)

@property
def _shared_experts(self):
return self.base_layer._shared_experts

@property
def quant_method(self):
return self.base_layer.quant_method
4 changes: 0 additions & 4 deletions vllm/model_executor/layers/fused_moe/__init__.py
@@ -32,9 +32,6 @@
from vllm.model_executor.layers.fused_moe.unquantized_fused_moe_method import (
UnquantizedFusedMoEMethod,
)
from vllm.model_executor.layers.fused_moe.zero_expert_fused_moe import (
ZeroExpertFusedMoE,
)
from vllm.triton_utils import HAS_TRITON

_config: dict[str, Any] | None = None
@@ -66,7 +63,6 @@ def get_config() -> dict[str, Any] | None:
"FusedMoEPrepareAndFinalize",
"RoutingMethodType",
"SharedFusedMoE",
"ZeroExpertFusedMoE",
"activation_without_mul",
"apply_moe_activation",
"override_config",
9 changes: 9 additions & 0 deletions vllm/model_executor/layers/fused_moe/config.py
@@ -927,6 +927,15 @@ class FusedMoEParallelConfig:
all2all_backend: str # all2all backend for MoE communication
enable_eplb: bool # whether to enable expert load balancing

@property
def use_dp_chunking(self) -> bool:
return (
self.use_pplx_kernels
or self.use_deepep_ll_kernels
or self.use_mori_kernels
or self.use_fi_all2allv_kernels
) and envs.VLLM_ENABLE_MOE_DP_CHUNK

@property
def is_sequence_parallel(self) -> bool:
return self.sp_size > 1
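Rough illustration of what the new use_dp_chunking property gates: when it is true, the runner splits the data-parallel batch into fixed-size token chunks rather than dispatching all tokens at once. The helper below is a sketch under that assumption; iter_dp_chunks and max_tokens_per_chunk are hypothetical names, not part of this PR.

from collections.abc import Iterator

import torch


def iter_dp_chunks(
    hidden_states: torch.Tensor, max_tokens_per_chunk: int
) -> Iterator[torch.Tensor]:
    # Yield contiguous views over the token dimension, each at most
    # max_tokens_per_chunk rows, so per-chunk all-to-all buffers stay bounded.
    num_tokens = hidden_states.shape[0]
    for start in range(0, num_tokens, max_tokens_per_chunk):
        yield hidden_states[start : start + max_tokens_per_chunk]

Note that the property only reports true for the all2all backends listed above (pplx, DeepEP low-latency, MoRI, FlashInfer all2allv) and only when VLLM_ENABLE_MOE_DP_CHUNK is enabled.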
1 change: 1 addition & 0 deletions vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -1186,6 +1186,7 @@ def cutlass_moe_w4a8_fp8(
quant_config=quant_config,
group_size=group_size,
),
shared_experts=None,
inplace=False,
)
