Commit c8c6268

add triton moe fall back by env var (vllm-project#20)
Signed-off-by: Kunshang Ji <[email protected]>
1 parent a3e529c commit c8c6268

2 files changed (+35 lines, -1 line)

vllm/envs.py

Lines changed: 5 additions & 0 deletions
@@ -225,6 +225,7 @@
     VLLM_FLATTEN_LOGPROBS: bool = False
     VLLM_XPU_USE_W8A8_GEMM: bool = False
     VLLM_XPU_ATTN_HEAD_SIZE_PAD: bool = False
+    VLLM_XPU_MOE_USE_TRITON: bool = False
 
 
 def get_default_cache_root():
@@ -1490,6 +1491,9 @@ def get_vllm_port() -> int | None:
     "VLLM_XPU_ATTN_HEAD_SIZE_PAD": lambda: bool(
         int(os.getenv("VLLM_XPU_ATTN_HEAD_SIZE_PAD", "0"))
     ),
+    "VLLM_XPU_MOE_USE_TRITON": lambda: bool(
+        int(os.getenv("VLLM_XPU_MOE_USE_TRITON", "0"))
+    ),
 }
 
 # --8<-- [end:env-vars-definition]
@@ -1618,6 +1622,7 @@ def compute_hash() -> str:
         "VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL",
         "VLLM_XPU_USE_W8A8_GEMM",
         "VLLM_XPU_ATTN_HEAD_SIZE_PAD",
+        "VLLM_XPU_MOE_USE_TRITON",
     ]
     for key in environment_variables_to_hash:
         # if this goes out of sync with environment_variables,
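
A minimal usage sketch (not part of the commit), assuming an XPU build of vLLM that includes this change: the lambda registered above parses the flag from os.environ on access, so setting it before vLLM evaluates its environment variables is enough to opt in.

# Illustrative only: enable the Triton MoE fallback before vLLM reads the flag.
import os

os.environ["VLLM_XPU_MOE_USE_TRITON"] = "1"

import vllm.envs as envs

# The lambda in environment_variables parses the value on access.
assert envs.VLLM_XPU_MOE_USE_TRITON is True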

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 30 additions & 1 deletion
@@ -78,8 +78,15 @@
     )
 else:
     fused_experts = None  # type: ignore
+
     FusedMoEPermuteExpertsUnpermute = object  # type: ignore
     FusedMoEPrepareAndFinalize = object  # type: ignore
+if envs.VLLM_XPU_MOE_USE_TRITON:
+    from .fused_moe import (
+        TritonExperts,
+        eplb_map_to_physical_and_record,
+        fused_experts,
+    )
 from vllm_xpu_kernels.fused_moe_interface import xpu_fused_moe
 
 def _eplb_map_to_physical_and_record(
@@ -908,6 +915,26 @@ def forward_xpu(
             or logical_replica_count is not None
         ):
             raise NotImplementedError("Expert load balancing is not supported for XPU.")
+        if envs.VLLM_XPU_MOE_USE_TRITON:
+            return self.forward_cuda(
+                layer,
+                x,
+                use_grouped_topk,
+                top_k,
+                router_logits,
+                renormalize,
+                topk_group,
+                num_expert_group,
+                global_num_experts,
+                expert_map,
+                custom_routing_function,
+                scoring_func,
+                routed_scaling_factor,
+                e_score_correction_bias,
+                apply_router_weight_on_input,
+                activation,
+            )
+
         M, _ = x.size()
         routing_weights = torch.empty(M, top_k, dtype=torch.float32, device=x.device)
         selected_experts = torch.empty(M, top_k, dtype=torch.int32, device=x.device)
@@ -1009,7 +1036,9 @@ def forward_tpu(
     elif current_platform.is_cpu():
         forward_native = forward_cpu
     elif current_platform.is_xpu():
-        forward_native = forward_xpu
+        forward_native = (
+            forward_xpu if not envs.VLLM_XPU_MOE_USE_TRITON else forward_cuda
+        )
     else:
         forward_native = forward_cuda
 
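
The dispatch at the bottom of the layer.py diff selects the MoE forward implementation once, when the class body is evaluated. A minimal sketch of the same env-gated pattern, using a stand-in class and stand-in methods rather than vLLM's real forward_cuda/forward_xpu:

# Sketch of the env-gated dispatch; the class and method bodies are stand-ins.
import os

_USE_TRITON = bool(int(os.getenv("VLLM_XPU_MOE_USE_TRITON", "0")))


class FusedMoEMethodSketch:
    def forward_cuda(self, x):
        # Stand-in for the Triton-based fused-MoE path.
        return ("triton", x)

    def forward_xpu(self, x):
        # Stand-in for the native XPU fused-MoE path.
        return ("xpu", x)

    # Same shape as the diff: pick the backend once, at class-definition time.
    forward_native = forward_xpu if not _USE_TRITON else forward_cuda


moe = FusedMoEMethodSketch()
print(moe.forward_native("hidden_states"))  # ("xpu", ...) unless the env var is "1"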