From d1a026080b1179ab42f39bb2f5949a3e330e2886 Mon Sep 17 00:00:00 2001 From: baonudesifeizhai Date: Wed, 4 Feb 2026 14:22:17 -0500 Subject: [PATCH 1/7] fix Signed-off-by: baonudesifeizhai --- .../layers/fused_moe/flashinfer_trtllm_moe.py | 1 + .../fused_moe/router/fused_topk_bias_router.py | 18 +++++++++++++----- .../fused_moe/router/fused_topk_router.py | 18 +++++++++++++----- .../layers/quantization/mxfp4.py | 2 +- .../quantization/utils/flashinfer_fp4_moe.py | 2 +- 5 files changed, 29 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py index 0182cfc195f9..01d44f6171a2 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py @@ -64,6 +64,7 @@ def _supports_routing_method( if (weight_key, activation_key) == (kFp8Static128BlockSym, kFp8Dynamic128Sym): # NOTE(rob): potentially allow others here. This is a conservative list. return routing_method in [ + RoutingMethodType.Default, RoutingMethodType.DeepSeekV3, RoutingMethodType.Renormalize, RoutingMethodType.RenormalizeNaive, diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py index 7c230686fa6e..4014f8cc17a8 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py @@ -129,6 +129,16 @@ def fused_topk_bias( ) +def _resolve_sigmoid_routing_method(top_k: int) -> RoutingMethodType: + return RoutingMethodType.Llama4 if top_k == 1 else RoutingMethodType.DeepSeekV3 + + +def _resolve_softmax_routing_method(renormalize: bool) -> RoutingMethodType: + return ( + RoutingMethodType.RenormalizeNaive if renormalize else RoutingMethodType.Default + ) + + class FusedTopKBiasRouter(BaseRouter): """Router using fused top-k with e_score_correction_bias.""" @@ -158,11 +168,9 @@ def __init__( @property def routing_method_type(self) -> RoutingMethodType: - return ( - RoutingMethodType.Renormalize - if not self.renormalize - else RoutingMethodType.RenormalizeNaive - ) + if self.scoring_func == "sigmoid": + return _resolve_sigmoid_routing_method(self.top_k) + return _resolve_softmax_routing_method(self.renormalize) def _compute_routing( self, diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py index cec9240efecd..71680234c841 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py @@ -110,6 +110,16 @@ def fused_topk( raise ValueError(f"Unsupported scoring function: {scoring_func}") +def _resolve_sigmoid_routing_method(top_k: int) -> RoutingMethodType: + return RoutingMethodType.Llama4 if top_k == 1 else RoutingMethodType.DeepSeekV3 + + +def _resolve_softmax_routing_method(renormalize: bool) -> RoutingMethodType: + return ( + RoutingMethodType.RenormalizeNaive if renormalize else RoutingMethodType.Default + ) + + class FusedTopKRouter(BaseRouter): """Default router using standard fused top-k routing.""" @@ -135,11 +145,9 @@ def __init__( @property def routing_method_type(self) -> RoutingMethodType: - return ( - RoutingMethodType.Renormalize - if not self.renormalize - else RoutingMethodType.RenormalizeNaive - ) + if self.scoring_func == "sigmoid": + return _resolve_sigmoid_routing_method(self.top_k) + return _resolve_softmax_routing_method(self.renormalize) def _compute_routing( self, diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index a50fa4beea34..7aac8e88258d 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -1073,7 +1073,7 @@ def apply_monolithic( local_expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=self.num_experts, routed_scaling_factor=None, - routing_method_type=1 if layer.renormalize else 0, + routing_method_type=layer.routing_method_type, do_finalize=True, tune_max_num_tokens=max(self.max_capture_size, 1), )[0] diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 4783ca5e0e05..448185c6a8a4 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -434,7 +434,7 @@ def flashinfer_trtllm_fp4_routed_moe( local_expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, routed_scaling_factor=None, - routing_method_type=1, + routing_method_type=layer.routing_method_type, do_finalize=True, )[0] From b9eddaacd9226d38a770bf5382f73aa64280aa7a Mon Sep 17 00:00:00 2001 From: baonudesifeizhai Date: Wed, 4 Feb 2026 14:46:23 -0500 Subject: [PATCH 2/7] change Signed-off-by: baonudesifeizhai --- .../router/fused_topk_bias_router.py | 19 ++++++------------- .../fused_moe/router/fused_topk_router.py | 19 ++++++------------- .../layers/fused_moe/router/routing_utils.py | 19 +++++++++++++++++++ 3 files changed, 31 insertions(+), 26 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/router/routing_utils.py diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py index 4014f8cc17a8..6b054669efa4 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py @@ -12,6 +12,9 @@ ) from vllm.model_executor.layers.fused_moe.config import RoutingMethodType from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter +from vllm.model_executor.layers.fused_moe.router.routing_utils import ( + resolve_fused_topk_routing_method, +) def vllm_topk_softmax( @@ -129,16 +132,6 @@ def fused_topk_bias( ) -def _resolve_sigmoid_routing_method(top_k: int) -> RoutingMethodType: - return RoutingMethodType.Llama4 if top_k == 1 else RoutingMethodType.DeepSeekV3 - - -def _resolve_softmax_routing_method(renormalize: bool) -> RoutingMethodType: - return ( - RoutingMethodType.RenormalizeNaive if renormalize else RoutingMethodType.Default - ) - - class FusedTopKBiasRouter(BaseRouter): """Router using fused top-k with e_score_correction_bias.""" @@ -168,9 +161,9 @@ def __init__( @property def routing_method_type(self) -> RoutingMethodType: - if self.scoring_func == "sigmoid": - return _resolve_sigmoid_routing_method(self.top_k) - return _resolve_softmax_routing_method(self.renormalize) + return resolve_fused_topk_routing_method( + self.scoring_func, self.top_k, self.renormalize + ) def _compute_routing( self, diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py index 71680234c841..db12bc5f3899 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py @@ -9,6 +9,9 @@ from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config import RoutingMethodType from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter +from vllm.model_executor.layers.fused_moe.router.routing_utils import ( + resolve_fused_topk_routing_method, +) def vllm_topk_softmax( @@ -110,16 +113,6 @@ def fused_topk( raise ValueError(f"Unsupported scoring function: {scoring_func}") -def _resolve_sigmoid_routing_method(top_k: int) -> RoutingMethodType: - return RoutingMethodType.Llama4 if top_k == 1 else RoutingMethodType.DeepSeekV3 - - -def _resolve_softmax_routing_method(renormalize: bool) -> RoutingMethodType: - return ( - RoutingMethodType.RenormalizeNaive if renormalize else RoutingMethodType.Default - ) - - class FusedTopKRouter(BaseRouter): """Default router using standard fused top-k routing.""" @@ -145,9 +138,9 @@ def __init__( @property def routing_method_type(self) -> RoutingMethodType: - if self.scoring_func == "sigmoid": - return _resolve_sigmoid_routing_method(self.top_k) - return _resolve_softmax_routing_method(self.renormalize) + return resolve_fused_topk_routing_method( + self.scoring_func, self.top_k, self.renormalize + ) def _compute_routing( self, diff --git a/vllm/model_executor/layers/fused_moe/router/routing_utils.py b/vllm/model_executor/layers/fused_moe/router/routing_utils.py new file mode 100644 index 000000000000..2e167c567230 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/router/routing_utils.py @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from vllm.model_executor.layers.fused_moe.config import RoutingMethodType + + +def resolve_fused_topk_routing_method( + scoring_func: str, + top_k: int, + renormalize: bool, +) -> RoutingMethodType: + if scoring_func == "sigmoid": + return RoutingMethodType.Llama4 if top_k == 1 else RoutingMethodType.DeepSeekV3 + if scoring_func == "softmax": + return ( + RoutingMethodType.RenormalizeNaive + if renormalize + else RoutingMethodType.Default + ) + raise ValueError(f"Unsupported scoring function: {scoring_func}") From 32ba6eff9d06cc71028abf4d67e0b086911d3c02 Mon Sep 17 00:00:00 2001 From: baonudesifeizhai Date: Wed, 4 Feb 2026 14:50:18 -0500 Subject: [PATCH 3/7] fix Signed-off-by: baonudesifeizhai --- .../model_executor/layers/fused_moe/config.py | 18 ++++++++++++++++++ .../router/fused_topk_bias_router.py | 5 +---- .../fused_moe/router/fused_topk_router.py | 5 +---- .../layers/fused_moe/router/routing_utils.py | 19 ------------------- 4 files changed, 20 insertions(+), 27 deletions(-) delete mode 100644 vllm/model_executor/layers/fused_moe/router/routing_utils.py diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 6650367da035..e5f3c6cc8416 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -123,6 +123,24 @@ class RoutingMethodType(IntEnum): # Unspecified Unspecified = 8.0 + @staticmethod + def from_topk( + scoring_func: str, + top_k: int, + renormalize: bool, + ) -> "RoutingMethodType": + if scoring_func == "sigmoid": + return ( + RoutingMethodType.Llama4 if top_k == 1 else RoutingMethodType.DeepSeekV3 + ) + if scoring_func == "softmax": + return ( + RoutingMethodType.RenormalizeNaive + if renormalize + else RoutingMethodType.Default + ) + raise ValueError(f"Unsupported scoring function: {scoring_func}") + @dataclass class FusedMoEQuantDesc: diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py index 6b054669efa4..860318ffbd65 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_bias_router.py @@ -12,9 +12,6 @@ ) from vllm.model_executor.layers.fused_moe.config import RoutingMethodType from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter -from vllm.model_executor.layers.fused_moe.router.routing_utils import ( - resolve_fused_topk_routing_method, -) def vllm_topk_softmax( @@ -161,7 +158,7 @@ def __init__( @property def routing_method_type(self) -> RoutingMethodType: - return resolve_fused_topk_routing_method( + return RoutingMethodType.from_topk( self.scoring_func, self.top_k, self.renormalize ) diff --git a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py index db12bc5f3899..ba47736f5038 100644 --- a/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py +++ b/vllm/model_executor/layers/fused_moe/router/fused_topk_router.py @@ -9,9 +9,6 @@ from vllm.distributed.eplb.eplb_state import EplbLayerState from vllm.model_executor.layers.fused_moe.config import RoutingMethodType from vllm.model_executor.layers.fused_moe.router.base_router import BaseRouter -from vllm.model_executor.layers.fused_moe.router.routing_utils import ( - resolve_fused_topk_routing_method, -) def vllm_topk_softmax( @@ -138,7 +135,7 @@ def __init__( @property def routing_method_type(self) -> RoutingMethodType: - return resolve_fused_topk_routing_method( + return RoutingMethodType.from_topk( self.scoring_func, self.top_k, self.renormalize ) diff --git a/vllm/model_executor/layers/fused_moe/router/routing_utils.py b/vllm/model_executor/layers/fused_moe/router/routing_utils.py deleted file mode 100644 index 2e167c567230..000000000000 --- a/vllm/model_executor/layers/fused_moe/router/routing_utils.py +++ /dev/null @@ -1,19 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.model_executor.layers.fused_moe.config import RoutingMethodType - - -def resolve_fused_topk_routing_method( - scoring_func: str, - top_k: int, - renormalize: bool, -) -> RoutingMethodType: - if scoring_func == "sigmoid": - return RoutingMethodType.Llama4 if top_k == 1 else RoutingMethodType.DeepSeekV3 - if scoring_func == "softmax": - return ( - RoutingMethodType.RenormalizeNaive - if renormalize - else RoutingMethodType.Default - ) - raise ValueError(f"Unsupported scoring function: {scoring_func}") From 63489066c9e7d7da164c01d499b7861341e428d6 Mon Sep 17 00:00:00 2001 From: baonudesifeizhai Date: Wed, 4 Feb 2026 15:14:51 -0500 Subject: [PATCH 4/7] fix Signed-off-by: baonudesifeizhai --- tests/model_executor/test_routed_experts_capture.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/model_executor/test_routed_experts_capture.py b/tests/model_executor/test_routed_experts_capture.py index 45bf4bcac6a8..66c3af7433b0 100644 --- a/tests/model_executor/test_routed_experts_capture.py +++ b/tests/model_executor/test_routed_experts_capture.py @@ -15,7 +15,7 @@ class DummyRouter(BaseRouter): @property def routing_method_type(self) -> RoutingMethodType: - return RoutingMethodType.FUSED_TOPK + return RoutingMethodType.TopK def _compute_routing(self, hidden_states, router_logits, indices_type): topk_ids = torch.tensor([[1, 2], [3, 4]], dtype=torch.int64) From 3b778a041af8b539bc05c819164f612a4890c9ed Mon Sep 17 00:00:00 2001 From: baonudesifeizhai Date: Wed, 4 Feb 2026 16:23:38 -0500 Subject: [PATCH 5/7] fix Signed-off-by: baonudesifeizhai --- .../test_routed_experts_capture.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/model_executor/test_routed_experts_capture.py b/tests/model_executor/test_routed_experts_capture.py index 66c3af7433b0..bc28db80407f 100644 --- a/tests/model_executor/test_routed_experts_capture.py +++ b/tests/model_executor/test_routed_experts_capture.py @@ -158,3 +158,26 @@ def capture(self, layer_id, topk_ids): assert callable(dummy_module.router.capture_fn) dummy_module.router.capture_fn(torch.tensor([[9, 10]])) assert len(capturer.calls) == 1 + + +@pytest.mark.parametrize( + "scoring_func,top_k,renormalize,expected", + [ + ("sigmoid", 1, False, RoutingMethodType.Llama4), + ("sigmoid", 2, False, RoutingMethodType.DeepSeekV3), + ("softmax", 2, False, RoutingMethodType.Default), + ("softmax", 2, True, RoutingMethodType.Renormalize), + ], +) +def test_routing_method_type_from_topk_mapping( + scoring_func, + top_k, + renormalize, + expected, +): + assert RoutingMethodType.from_topk(scoring_func, top_k, renormalize) == expected + + +def test_routing_method_type_from_topk_invalid_scoring_func(): + with pytest.raises(ValueError, match="Unsupported scoring function"): + RoutingMethodType.from_topk("none", 1, False) From 524d29564d14c238387a4ed332c3c97a5e18cef5 Mon Sep 17 00:00:00 2001 From: baonudesifeizhai Date: Wed, 4 Feb 2026 16:45:48 -0500 Subject: [PATCH 6/7] fix Signed-off-by: baonudesifeizhai --- vllm/model_executor/layers/fused_moe/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index e5f3c6cc8416..5bd06e10a421 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -135,7 +135,7 @@ def from_topk( ) if scoring_func == "softmax": return ( - RoutingMethodType.RenormalizeNaive + RoutingMethodType.Renormalize if renormalize else RoutingMethodType.Default ) From 589de3f486f95a9541fe3320e5577b3bc8115298 Mon Sep 17 00:00:00 2001 From: baonudesifeizhai Date: Fri, 6 Feb 2026 14:34:27 -0500 Subject: [PATCH 7/7] fix Signed-off-by: baonudesifeizhai --- vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py | 3 +-- vllm/model_executor/layers/quantization/mxfp4.py | 2 +- .../layers/quantization/utils/flashinfer_fp4_moe.py | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py index 01d44f6171a2..0d59cee0ae5e 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_trtllm_moe.py @@ -63,8 +63,8 @@ def _supports_routing_method( """Monolithic kernels need to express router support.""" if (weight_key, activation_key) == (kFp8Static128BlockSym, kFp8Dynamic128Sym): # NOTE(rob): potentially allow others here. This is a conservative list. + # Default routing is not implemented in FlashInfer TRTLLM. return routing_method in [ - RoutingMethodType.Default, RoutingMethodType.DeepSeekV3, RoutingMethodType.Renormalize, RoutingMethodType.RenormalizeNaive, @@ -86,7 +86,6 @@ def _supports_routing_method_bf16( routing_method: RoutingMethodType, ) -> bool: return routing_method in [ - RoutingMethodType.Default, RoutingMethodType.Renormalize, RoutingMethodType.DeepSeekV3, RoutingMethodType.Llama4, diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 7aac8e88258d..a50fa4beea34 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -1073,7 +1073,7 @@ def apply_monolithic( local_expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=self.num_experts, routed_scaling_factor=None, - routing_method_type=layer.routing_method_type, + routing_method_type=1 if layer.renormalize else 0, do_finalize=True, tune_max_num_tokens=max(self.max_capture_size, 1), )[0] diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py index 448185c6a8a4..adaa6360bce7 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_fp4_moe.py @@ -351,7 +351,7 @@ def flashinfer_trtllm_fp4_moe( local_expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, routed_scaling_factor=None, - routing_method_type=routing_method_type, + routing_method_type=1, do_finalize=True, )[0] @@ -434,7 +434,7 @@ def flashinfer_trtllm_fp4_routed_moe( local_expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, routed_scaling_factor=None, - routing_method_type=layer.routing_method_type, + routing_method_type=1, do_finalize=True, )[0]