diff --git a/tests/model_executor/test_routed_experts_capture.py b/tests/model_executor/test_routed_experts_capture.py index 45bf4bcac6a8..bc28db80407f 100644 --- a/tests/model_executor/test_routed_experts_capture.py +++ b/tests/model_executor/test_routed_experts_capture.py @@ -15,7 +15,7 @@ class DummyRouter(BaseRouter): @property def routing_method_type(self) -> RoutingMethodType: - return RoutingMethodType.FUSED_TOPK + return RoutingMethodType.TopK def _compute_routing(self, hidden_states, router_logits, indices_type): topk_ids = torch.tensor([[1, 2], [3, 4]], dtype=torch.int64) @@ -158,3 +158,26 @@ def capture(self, layer_id, topk_ids): assert callable(dummy_module.router.capture_fn) dummy_module.router.capture_fn(torch.tensor([[9, 10]])) assert len(capturer.calls) == 1 + + +@pytest.mark.parametrize( + "scoring_func,top_k,renormalize,expected", + [ + ("sigmoid", 1, False, RoutingMethodType.Llama4), + ("sigmoid", 2, False, RoutingMethodType.DeepSeekV3), + ("softmax", 2, False, RoutingMethodType.Default), + ("softmax", 2, True, RoutingMethodType.Renormalize), + ], +) +def test_routing_method_type_from_topk_mapping( + scoring_func, + top_k, + renormalize, + expected, +): + assert RoutingMethodType.from_topk(scoring_func, top_k, renormalize) == expected + + +def test_routing_method_type_from_topk_invalid_scoring_func(): + with pytest.raises(ValueError, match="Unsupported scoring function"): + RoutingMethodType.from_topk("none", 1, False) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 6650367da035..5bd06e10a421 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -123,6 +123,24 @@ class RoutingMethodType(IntEnum): # Unspecified Unspecified = 8.0 + @staticmethod + def from_topk( + scoring_func: str, + top_k: int, + renormalize: bool, + ) -> "RoutingMethodType": + if scoring_func == "sigmoid": + return ( + RoutingMethodType.Llama4 if top_k 
== 1 else RoutingMethodType.DeepSeekV3 + ) + if scoring_func == "softmax": + return ( + RoutingMethodType.Renormalize + if renormalize + else RoutingMethodType.Default + ) + raise ValueError(f"Unsupported scoring function: {scoring_func}") + @dataclass class FusedMoEQuantDesc: diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py index 0182cfc195f9..0d59cee0ae5e 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py @@ -63,6 +63,7 @@ def _supports_routing_method( """Monolithic kernels need to express router support.""" if (weight_key, activation_key) == (kFp8Static128BlockSym, kFp8Dynamic128Sym): # NOTE(rob): potentially allow others here. This is a conservative list. + # Default routing is not implemented in FlashInfer TRTLLM. return routing_method in [ RoutingMethodType.DeepSeekV3, RoutingMethodType.Renormalize, @@ -85,7 +86,6 @@ def _supports_routing_method_bf16( routing_method: RoutingMethodType, ) -> bool: return routing_method in [ - RoutingMethodType.Default, RoutingMethodType.Renormalize, RoutingMethodType.DeepSeekV3, RoutingMethodType.Llama4, diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py index 7c230686fa6e..860318ffbd65 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py @@ -158,10 +158,8 @@ def __init__( @property def routing_method_type(self) -> RoutingMethodType: - return ( - RoutingMethodType.Renormalize - if not self.renormalize - else RoutingMethodType.RenormalizeNaive + return RoutingMethodType.from_topk( + self.scoring_func, self.top_k, self.renormalize ) def _compute_routing( diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py 
b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py index cec9240efecd..ba47736f5038 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py @@ -135,10 +135,8 @@ def __init__( @property def routing_method_type(self) -> RoutingMethodType: - return ( - RoutingMethodType.Renormalize - if not self.renormalize - else RoutingMethodType.RenormalizeNaive + return RoutingMethodType.from_topk( + self.scoring_func, self.top_k, self.renormalize ) def _compute_routing( diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 4783ca5e0e05..adaa6360bce7 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -351,7 +351,7 @@ def flashinfer_trtllm_fp4_moe( local_expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, routed_scaling_factor=None, - routing_method_type=routing_method_type, + routing_method_type=routing_method_type, do_finalize=True, )[0]