
Commit fbdb439

Enable renormalize(naive) routing for fp8 per-tensor (#2030)
## πŸ“Œ Description

Disable expert-weight scaling in FC1 except for Llama4 routing, which enables Renormalize (naive) routing for the FP8 per-tensor path.

## πŸš€ Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### βœ… Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [ ] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## πŸ§ͺ Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Summary by CodeRabbit

* **Bug Fixes**
  * Re-enabled Renormalize routing that was previously blocked.
  * Made token_scales available for Llama4 routing.
  * Corrected the GEMM1 input so the proper data source is used during MoE processing.
* **Tests**
  * Added FP8PerTensorMoe to the test parameterization.
  * Expanded Renormalize and DeepSeekV3 test coverage and removed related skips.

Signed-off-by: siyuanf <[email protected]>
1 parent d42fb90 commit fbdb439

File tree: 4 files changed (+27 βˆ’4 lines)
csrc/trtllm_fused_moe_kernel_launcher.cu

Lines changed: 3 additions & 0 deletions
```diff
@@ -584,6 +584,9 @@ class Fp8PerTensorLauncher : public FusedMoeLauncher {
         alloc_tensor({args->num_tokens, args->top_k}, dl_bfloat16, hidden_states.device());

     workspace.expert_weights = expert_weights.data_ptr();
+    if (static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::Llama4) {
+      workspace.token_scales = expert_weights.data_ptr();  // Consumed by permuteGemm1 kernel
+    }
   }

   void check_moe() const override {
```
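
The routing-dependent aliasing above is the core of the change: `token_scales` points at the same buffer as `expert_weights` only for Llama4 routing, so FC1 skips per-token expert-weight scaling under Renormalize and other routing methods. The following standalone sketch uses simplified, hypothetical types (not the real FlashInfer structures) to illustrate that rule:

```cpp
// Standalone sketch of the gating rule above. Types and names here are
// illustrative stand-ins, not the actual FlashInfer/TRT-LLM structures.
#include <cstdio>

enum class RoutingMethodType { Renormalize, Llama4, DeepSeekV3 };

struct Workspace {
  void* expert_weights = nullptr;  // always consumed by the finalize kernel
  void* token_scales = nullptr;    // consumed by the permuteGemm1 (FC1) kernel
};

// Mirror of the launcher change: expert weights reach GEMM1 only for Llama4.
void bind_workspace(Workspace& ws, void* expert_weights_ptr,
                    RoutingMethodType routing) {
  ws.expert_weights = expert_weights_ptr;
  ws.token_scales =
      (routing == RoutingMethodType::Llama4) ? expert_weights_ptr : nullptr;
}

int main() {
  float dummy[4] = {};
  Workspace ws;

  bind_workspace(ws, dummy, RoutingMethodType::Renormalize);
  std::printf("Renormalize: token_scales is %s\n",
              ws.token_scales ? "set" : "null (FC1 runs unscaled)");

  bind_workspace(ws, dummy, RoutingMethodType::Llama4);
  std::printf("Llama4:      token_scales is %s\n",
              ws.token_scales ? "set (FC1 applies expert weights)" : "null");
  return 0;
}
```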

csrc/trtllm_fused_moe_runner.cu

Lines changed: 1 addition & 1 deletion
```diff
@@ -518,7 +518,7 @@ void Runner::run(MoERunnerArgs const& args, MoEWorkspace const& workspace, int d
   auto const& config = mPassingConfigs[configIndex];

   mPermuteGemm1.run(args.hidden_states, hidden_states_scale_linear, args.gemm1_weights,
-                    args.gemm1_weights_scale, workspace.expert_weights, args.output1_scales_scalar,
+                    args.gemm1_weights_scale, workspace.token_scales, args.output1_scales_scalar,
                     args.output1_scales_gate_scalar, args.gemm1_bias, args.gemm1_alpha,
                     args.gemm1_beta, args.gemm1_clamp_limit, workspace.gemm1_output,
                     workspace.gemm1_output_scale, args.top_k, args.hidden_size,
```
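
On the consumer side, GEMM1 now reads `workspace.token_scales` instead of `workspace.expert_weights`, while the finalize kernel continues to consume `expert_weights`. The minimal mock below (placeholder kernels and types, not the actual TRT-LLM runner) shows why a null `token_scales` leaves FC1 unscaled:

```cpp
// Mock of how the two workspace pointers are consumed downstream.
// Kernel names are placeholders for illustration only.
#include <cassert>
#include <cstdio>

struct MoEWorkspaceSketch {
  void* expert_weights = nullptr;  // consumed by the finalize kernel
  void* token_scales = nullptr;    // consumed by the permuteGemm1 kernel
};

// FC1: applies per-token scales only when token_scales is non-null.
void permute_gemm1_sketch(const MoEWorkspaceSketch& ws) {
  std::printf("permuteGemm1: %s\n",
              ws.token_scales ? "scaling activations by expert weights (Llama4 path)"
                              : "running unscaled (Renormalize path)");
}

// Finalize: always weights the expert outputs by expert_weights.
void finalize_sketch(const MoEWorkspaceSketch& ws) {
  assert(ws.expert_weights != nullptr);
  std::printf("finalize: applying expert_weights\n");
}

int main() {
  float weights[8] = {};
  MoEWorkspaceSketch ws;
  ws.expert_weights = weights;  // Renormalize: token_scales stays null
  permute_gemm1_sketch(ws);
  finalize_sketch(ws);
  return 0;
}
```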

include/flashinfer/trtllm/fused_moe/runner.h

Lines changed: 4 additions & 0 deletions
```diff
@@ -305,7 +305,11 @@ struct MoEWorkspace {
   int32_t* expanded_idx_to_permuted_idx = nullptr;
   int32_t* permuted_idx_to_expanded_idx = nullptr;
   int32_t* permuted_idx_to_token_idx = nullptr;
+
+  // consumed by finalize kernel
   void* expert_weights = nullptr;  // [num_tokens, top_k] in bfloat16 = mDtypeExpW
+  // consumed by permuteGemm1 kernel
+  void* token_scales = nullptr;

   int32_t* cta_idx_xy_to_batch_idx = nullptr;
   int32_t* cta_idx_xy_to_mn_limit = nullptr;
```

tests/moe/test_trtllm_gen_fused_moe.py

Lines changed: 19 additions & 3 deletions
```diff
@@ -2275,6 +2275,7 @@ def run_moe_test(
     [
         pytest.param(BF16Moe(), id="BF16xBF16"),
         pytest.param(FP8BlockScaleMoe(), id="FP8_Block"),
+        pytest.param(FP8PerTensorMoe(), id="FP8_Tensor"),
         pytest.param(FP4Moe(quant_mode=QuantMode.FP4_NVFP4_NVFP4), id="NvFP4xNvFP4"),
         pytest.param(FP4Moe(quant_mode=QuantMode.FP4_MXFP4_MXFP8), id="MxFP4xMxFP8"),
         pytest.param(FP4Moe(quant_mode=QuantMode.FP4_MXFP4_Bf16), id="MxFP4xBf16"),
@@ -2293,7 +2294,12 @@ def run_moe_test(
             "routed_scaling": None,
             "has_routing_bias": False,
             "routing_method_type": RoutingMethodType.Renormalize,
-            "compatible_moe_impls": [FP8BlockScaleMoe, FP4Moe, BF16Moe],
+            "compatible_moe_impls": [
+                FP8PerTensorMoe,
+                FP8BlockScaleMoe,
+                FP4Moe,
+                BF16Moe,
+            ],
             "compatible_intermediate_size": [384, 768, 1024],
         },
         id="Qwen3",
@@ -2308,7 +2314,12 @@ def run_moe_test(
             "routed_scaling": None,
             "has_routing_bias": False,
             "routing_method_type": RoutingMethodType.Renormalize,
-            "compatible_moe_impls": [FP8BlockScaleMoe, FP4Moe, BF16Moe],
+            "compatible_moe_impls": [
+                FP8PerTensorMoe,
+                FP8BlockScaleMoe,
+                FP4Moe,
+                BF16Moe,
+            ],
             "compatible_intermediate_size": [384, 1024],
         },
         id="Renorm",
@@ -2323,7 +2334,12 @@ def run_moe_test(
             "routed_scaling": None,
             "has_routing_bias": False,
             "routing_method_type": RoutingMethodType.Renormalize,
-            "compatible_moe_impls": [FP8BlockScaleMoe, FP4Moe, BF16Moe],
+            "compatible_moe_impls": [
+                FP8PerTensorMoe,
+                FP8BlockScaleMoe,
+                FP4Moe,
+                BF16Moe,
+            ],
             "compatible_intermediate_size": [512],
         },
         id="Qwen3_next",
```
