Commit 1b12e88

update code

Signed-off-by: jiahanc <[email protected]>
Parent: c829520

5 files changed (+96, -60 lines)

csrc/trtllm_batched_gemm_runner.cu

Lines changed: 1 addition & 1 deletion
@@ -169,7 +169,7 @@ void TrtllmGenBatchedGemmRunner::run(
   auto const configs = bmm.getBatchedGemmConfigs();
 
   auto const& config = configs[configIndex];
-
+  std::cout << "config.mFunctionName: " << config.mFunctionName << std::endl;
   FLASHINFER_CHECK(numBatches > 0, "Batched GEMM requires numBatches > 0");
   if (!mOptions.staticBatch) {
     FLASHINFER_CHECK(totalNumPaddedTokens,

csrc/trtllm_fused_moe_kernel_launcher.cu

Lines changed: 75 additions & 38 deletions
@@ -41,9 +41,9 @@ Tensor trtllm_fp8_per_tensor_scale_moe_launcher(
     Tensor routing_logits, Optional<Tensor> routing_bias, Tensor hidden_states,
     Tensor gemm1_weights, Tensor output1_scales_scalar, Tensor output1_scales_gate_scalar,
     Tensor gemm2_weights, Tensor output2_scales_scalar, int64_t const num_experts,
-    int64_t const top_k, int64_t const n_group, int64_t const topk_group,
+    int64_t const top_k, Optional<int64_t> const n_group, Optional<int64_t> const topk_group,
     int64_t const intermediate_size, int64_t const local_expert_offset,
-    int64_t const local_num_experts, double const routed_scaling_factor,
+    int64_t const local_num_experts, Optional<double> const routed_scaling_factor,
     bool const use_routing_scales_on_input, int64_t const tile_tokens_dim,
     int64_t const routing_method_type, bool enable_pdl) {
   static const std::tuple<int, int> device_props = [hidden_states] {
@@ -61,8 +61,11 @@ Tensor trtllm_fp8_per_tensor_scale_moe_launcher(
 
   if (use_routing_scales_on_input) {
     TVM_FFI_ICHECK_EQ(routing_logits->dtype, dl_bfloat16) << "routing_logits must be bfloat16.";
-  } else {
+  } else if (static_cast<RoutingMethodType>(routing_method_type) ==
+             RoutingMethodType::DeepSeekV3) {
     TVM_FFI_ICHECK_EQ(routing_logits->dtype, dl_float32) << "routing_logits must be float.";
+  } else {
+    // TVM_FFI_ICHECK_EQ(routing_logits->dtype, dl_bfloat16) << "routing_logits must be bfloat16.";
   }
   TVM_FFI_ICHECK_EQ(routing_logits->ndim, 2) << "routing_logits must be 2D.";
   TVM_FFI_ICHECK_EQ(routing_logits->shape[1], num_experts) << "routing_logits has incorrect shape.";
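
The dtype requirement on routing_logits now depends on the routing method. A minimal Python sketch of the resulting rule (the import path and helper name are assumptions, not part of the commit; note that the final else branch's check is commented out above, so bfloat16 is expected there but not yet enforced):

    import torch
    from flashinfer import RoutingMethodType  # import path assumed

    def expected_routing_logits_dtype(routing_method_type, use_routing_scales_on_input):
        # Mirrors the C++ checks in the hunk above.
        if use_routing_scales_on_input:
            return torch.bfloat16  # enforced
        if routing_method_type == RoutingMethodType.DeepSeekV3:
            return torch.float32   # enforced
        return torch.bfloat16      # intended; the check is commented out for now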
@@ -73,17 +76,32 @@ Tensor trtllm_fp8_per_tensor_scale_moe_launcher(
         << "routing_bias has incorrect shape.";
   }
 
-  if (n_group <= 0 || topk_group <= 0) {
-    TVM_FFI_ICHECK_EQ(top_k, 1) << "Current routing kernel (no groups) only supports top_k=1.";
-  } else {
-    TVM_FFI_ICHECK_LE(top_k, 8) << "Current routing kernel (with groups) only supports top_k<=8.";
-    TVM_FFI_ICHECK_LE(topk_group, 4)
-        << "Current routing kernel (with groups) only supports topk_group<=4.";
-    TVM_FFI_ICHECK_LE(topk_group, n_group) << "n_group must not be smaller than topk_group.";
-    TVM_FFI_ICHECK_EQ(num_experts % n_group, 0) << "num_experts must be divisible by n_group";
+  if (n_group.has_value() && n_group.value() != 0) {
+    TVM_FFI_ICHECK(static_cast<RoutingMethodType>(routing_method_type) ==
+                   RoutingMethodType::DeepSeekV3)
+        << "Routing kernel with groups implies DeepSeekV3 routing method.";
+    TVM_FFI_ICHECK(topk_group.has_value()) << "if n_group is given, topk_group must be given";
+    TVM_FFI_ICHECK_EQ(num_experts % n_group.value(), 0)
+        << "num_experts must be divisible by n_group";
+    TVM_FFI_ICHECK(top_k <= 8 && top_k > 0)
+        << "Current routing kernel (with groups) only supports top_k<=8 && top_k>0.";
+    TVM_FFI_ICHECK(topk_group.value() <= 4 && topk_group.value() > 0)
+        << "Current routing kernel (with groups) only supports topk_group<=4 && topk_group>0.";
+    TVM_FFI_ICHECK_LE(topk_group.value(), n_group.value())
+        << "n_group must not be smaller than topk_group.";
     // This check ensures we have enough experts in the selected groups to handle the top_k routing
-    TVM_FFI_ICHECK_LT(top_k, (topk_group * num_experts / n_group))
+    TVM_FFI_ICHECK_LT(top_k, (topk_group.value() * num_experts / n_group.value()))
         << "top_k must be less than total number of experts in selected groups";
+  } else if (static_cast<RoutingMethodType>(routing_method_type) ==
+                 RoutingMethodType::Renormalize ||
+             static_cast<RoutingMethodType>(routing_method_type) ==
+                 RoutingMethodType::RenormalizeNaive) {
+    TVM_FFI_LOG_AND_THROW(NotImplementedError)
+        << "Routing method type Renormalize(Naive) is not supported.";
+  } else if (static_cast<RoutingMethodType>(routing_method_type) ==
+             RoutingMethodType::Llama4) {
+    TVM_FFI_ICHECK_EQ(top_k, 1)
+        << "Current routing kernel (no groups, Llama4) only supports top_k=1.";
   }
   TVM_FFI_ICHECK_EQ(num_experts % 4, 0)
       << "Routing kernel expects that num_experts must be divisible by 4";
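
Taken together, the new checks admit grouped routing only for DeepSeekV3 and constrain the ungrouped paths per method. A sketch of the accepted parameter space (Python pseudocode mirroring the C++ above; the helper name and enum import are assumptions, not part of the commit):

    def validate_per_tensor_routing(routing_method_type, top_k, n_group,
                                    topk_group, num_experts):
        if n_group:  # groups given and non-zero: DeepSeekV3 only
            assert routing_method_type == RoutingMethodType.DeepSeekV3
            assert topk_group is not None
            assert num_experts % n_group == 0
            assert 0 < top_k <= 8
            assert 0 < topk_group <= min(4, n_group)
            # enough experts in the selected groups to satisfy top_k
            assert top_k < topk_group * num_experts // n_group
        elif routing_method_type in (RoutingMethodType.Renormalize,
                                     RoutingMethodType.RenormalizeNaive):
            raise NotImplementedError("Renormalize(Naive) not supported here")
        elif routing_method_type == RoutingMethodType.Llama4:
            assert top_k == 1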
@@ -121,11 +139,11 @@ Tensor trtllm_fp8_per_tensor_scale_moe_launcher(
   args.hidden_size = hidden_states->shape[1];
   args.hidden_size_output = args.hidden_size;
   args.top_k = top_k;
-  args.n_group = n_group;
-  args.topk_group = topk_group;
+  args.n_group = n_group.has_value() ? n_group.value() : 0;
+  args.topk_group = topk_group.has_value() ? topk_group.value() : 0;
   args.local_expert_offset = local_expert_offset;
   args.local_num_experts = local_num_experts;
-  args.routed_scaling_factor = routed_scaling_factor;
+  args.routed_scaling_factor = routed_scaling_factor.has_value() ? routed_scaling_factor.value() : 1.0;
   args.intermediate_size = intermediate_size;
   args.mUseRoutingScalesOnInput = use_routing_scales_on_input;
 
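When the optional arguments are omitted, the launcher substitutes neutral defaults before filling args: n_group and topk_group become 0, routed_scaling_factor becomes 1.0. In Python terms the fallback amounts to (a sketch, not the binding code itself):

    n_group = n_group if n_group is not None else 0
    topk_group = topk_group if topk_group is not None else 0
    routed_scaling_factor = routed_scaling_factor if routed_scaling_factor is not None else 1.0

The C++ ternaries are equivalent to std::optional's value_or.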
@@ -279,8 +297,8 @@ Tensor trtllm_fp8_per_tensor_scale_moe(
     Tensor routing_logits, Optional<Tensor> routing_bias, Tensor hidden_states,
     Tensor gemm1_weights, Tensor output1_scales_scalar, Tensor output1_scales_gate_scalar,
     Tensor gemm2_weights, Tensor output2_scales_scalar, int64_t num_experts, int64_t top_k,
-    int64_t n_group, int64_t topk_group, int64_t intermediate_size, int64_t local_expert_offset,
-    int64_t local_num_experts, double routed_scaling_factor, bool use_routing_scales_on_input,
+    Optional<int64_t> n_group, Optional<int64_t> topk_group, int64_t intermediate_size, int64_t local_expert_offset,
+    int64_t local_num_experts, Optional<double> routed_scaling_factor, bool use_routing_scales_on_input,
     int64_t tile_tokens_dim, int64_t routing_method_type, bool enable_pdl) {
   auto dtype = hidden_states->dtype;
   if (dtype == dl_float16 || dtype == dl_bfloat16 || dtype == dl_float8_e4m3fn) {
@@ -299,9 +317,9 @@ void trtllm_fp8_block_scale_moe_launcher(
     Tensor routing_logits, Optional<Tensor> routing_bias, Tensor hidden_states,
     Tensor hidden_states_scale, Tensor gemm1_weights, Tensor gemm1_weights_scale,
     Tensor gemm2_weights, Tensor gemm2_weights_scale, Tensor output, int64_t const num_experts,
-    int64_t const top_k, int64_t const n_group, int64_t const topk_group,
+    int64_t const top_k, Optional<int64_t> const n_group, Optional<int64_t> const topk_group,
     int64_t const intermediate_size, int64_t const local_expert_offset,
-    int64_t const local_num_experts, double const routed_scaling_factor,
+    int64_t const local_num_experts, Optional<double> const routed_scaling_factor,
     int64_t const tile_tokens_dim, int64_t const routing_method_type,
     tensorrt_llm::kernels::trtllmgen_moe::MoE::Runner& moe_runner, int64_t moeConfigIndex,
     bool enable_pdl) {
@@ -318,7 +336,11 @@ void trtllm_fp8_block_scale_moe_launcher(
       << "This kernel requires 10.x architecture. Current device has SM "
       << std::get<0>(device_props) << std::get<1>(device_props);
 
-  TVM_FFI_ICHECK_EQ(routing_logits->dtype, dl_float32) << "routing_logits must be float.";
+  if (static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::DeepSeekV3) {
+    TVM_FFI_ICHECK_EQ(routing_logits->dtype, dl_float32) << "routing_logits must be float.";
+  } else {
+    TVM_FFI_ICHECK_EQ(routing_logits->dtype, dl_bfloat16) << "routing_logits must be bfloat16.";
+  }
   TVM_FFI_ICHECK_EQ(routing_logits->ndim, 2) << "routing_logits must be 2D.";
   TVM_FFI_ICHECK_EQ(routing_logits->shape[0], hidden_states->shape[0])
       << "routing_logits and hidden_states must have the same number of tokens.";
@@ -333,18 +355,33 @@ void trtllm_fp8_block_scale_moe_launcher(
         << "routing_bias has incorrect shape.";
   }
 
-  // if (n_group <= 0 || topk_group <= 0) {
-  //   TVM_FFI_ICHECK_EQ(top_k, 1) << "Current routing kernel (no groups) only supports top_k=1.";
-  // } else {
-  //   TVM_FFI_ICHECK_LE(top_k, 8) << "Current routing kernel (with groups) only supports
-  //   top_k<=8."; TVM_FFI_ICHECK_LE(topk_group, 4)
-  //       << "Current routing kernel (with groups) only supports topk_group<=4.";
-  //   TVM_FFI_ICHECK_LE(topk_group, n_group) << "n_group must not be smaller than topk_group.";
-  //   TVM_FFI_ICHECK_EQ(num_experts % n_group, 0) << "num_experts must be divisible by n_group";
-  //   // This check ensures we have enough experts in the selected groups to handle the top_k
-  //   routing TVM_FFI_ICHECK_LT(top_k, (topk_group * num_experts / n_group))
-  //       << "top_k must be less than total number of experts in selected groups";
-  // }
+  if (n_group.has_value() && n_group.value() != 0) {
+    TVM_FFI_ICHECK(static_cast<RoutingMethodType>(routing_method_type) ==
+                   RoutingMethodType::DeepSeekV3)
+        << "Routing kernel with groups implies DeepSeekV3 routing method.";
+    TVM_FFI_ICHECK(topk_group.has_value()) << "if n_group is given, topk_group must be given";
+    TVM_FFI_ICHECK_EQ(num_experts % n_group.value(), 0)
+        << "num_experts must be divisible by n_group";
+    TVM_FFI_ICHECK(top_k <= 8 && top_k > 0)
+        << "Current routing kernel (with groups) only supports top_k<=8 && top_k>0.";
+    TVM_FFI_ICHECK(topk_group.value() <= 4 && topk_group.value() > 0)
+        << "Current routing kernel (with groups) only supports topk_group<=4 && topk_group>0.";
+    TVM_FFI_ICHECK_LE(topk_group.value(), n_group.value())
+        << "n_group must not be smaller than topk_group.";
+    // This check ensures we have enough experts in the selected groups to handle the top_k routing
+    TVM_FFI_ICHECK_LT(top_k, (topk_group.value() * num_experts / n_group.value()))
+        << "top_k must be less than total number of experts in selected groups";
+  } else if (static_cast<RoutingMethodType>(routing_method_type) ==
+                 RoutingMethodType::Renormalize ||
+             static_cast<RoutingMethodType>(routing_method_type) ==
+                 RoutingMethodType::RenormalizeNaive) {
+    TVM_FFI_ICHECK(top_k <= 10 && top_k > 0)
+        << "Current routing kernel (no groups, renormalize) only supports top_k<=10 && top_k>0.";
+  } else if (static_cast<RoutingMethodType>(routing_method_type) ==
+             RoutingMethodType::Llama4) {
+    TVM_FFI_ICHECK_EQ(top_k, 1)
+        << "Current routing kernel (no groups, Llama4) only supports top_k=1.";
+  }
   TVM_FFI_ICHECK_EQ(num_experts % 4, 0)
       << "Routing kernel expects that num_experts must be divisible by 4";
   TVM_FFI_ICHECK_GT(num_experts, top_k) << "num_experts must be greater than top_k";
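
The block-scale launcher gains the same grouped-routing checks as the per-tensor path, with one difference: Renormalize(Naive) is accepted here with top_k up to 10 rather than rejected. Only the differing branch, sketched in Python (hypothetical helper mirroring the C++ above):

    def validate_block_scale_routing(routing_method_type, top_k, n_group,
                                     topk_group, num_experts):
        if n_group:
            ...  # same DeepSeekV3 group checks as the per-tensor launcher
        elif routing_method_type in (RoutingMethodType.Renormalize,
                                     RoutingMethodType.RenormalizeNaive):
            assert 0 < top_k <= 10  # allowed here, rejected in the per-tensor path
        elif routing_method_type == RoutingMethodType.Llama4:
            assert top_k == 1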
@@ -380,11 +417,11 @@ void trtllm_fp8_block_scale_moe_launcher(
   args.hidden_size = hidden_states->shape[1];
   args.hidden_size_output = args.hidden_size;
   args.top_k = top_k;
-  args.n_group = n_group;
-  args.topk_group = topk_group;
+  args.n_group = n_group.has_value() ? n_group.value() : 0;
+  args.topk_group = topk_group.has_value() ? topk_group.value() : 0;
   args.local_expert_offset = local_expert_offset;
   args.local_num_experts = local_num_experts;
-  args.routed_scaling_factor = routed_scaling_factor;
+  args.routed_scaling_factor = routed_scaling_factor.has_value() ? routed_scaling_factor.value() : 1.0;
   args.intermediate_size = intermediate_size;
   args.mUseDeepSeekFp8 = true;
 
@@ -569,10 +606,10 @@ void trtllm_fp8_block_scale_moe(Tensor routing_logits, Optional<Tensor> routing_bias,
                                 Tensor hidden_states, Tensor hidden_states_scale,
                                 Tensor gemm1_weights, Tensor gemm1_weights_scale,
                                 Tensor gemm2_weights, Tensor gemm2_weights_scale, Tensor output,
-                                int64_t num_experts, int64_t top_k, int64_t n_group,
-                                int64_t topk_group, int64_t intermediate_size,
+                                int64_t num_experts, int64_t top_k, Optional<int64_t> n_group,
+                                Optional<int64_t> topk_group, int64_t intermediate_size,
                                 int64_t local_expert_offset, int64_t local_num_experts,
-                                double routed_scaling_factor, int64_t tile_tokens_dim,
+                                Optional<double> routed_scaling_factor, int64_t tile_tokens_dim,
                                 int64_t routing_method_type, bool use_shuffled_weight,
                                 int64_t weight_layout, bool enable_pdl) {
   auto dtype = hidden_states->dtype;

csrc/trtllm_fused_moe_routing_renormalize.cu

Lines changed: 0 additions & 1 deletion
@@ -464,7 +464,6 @@ void run(Data const& data, void* stream) {
   }
   cudaDeviceSynchronize();
   cudaError_t result = cudaGetLastError();
-  std::cout << "cudaGetLastError: " << cudaGetErrorString(result) << std::endl;
 }
 
 // void run(Data const& data, void* stream) {

flashinfer/fused_moe/core.py

Lines changed: 14 additions & 14 deletions
@@ -1069,12 +1069,12 @@ def trtllm_fp8_per_tensor_scale_moe_op(
     output2_scales_scalar: torch.Tensor,
     num_experts: int,
     top_k: int,
-    n_group: int,
-    topk_group: int,
+    n_group: Optional[int],
+    topk_group: Optional[int],
     intermediate_size: int,
     local_expert_offset: int,
     local_num_experts: int,
-    routed_scaling_factor: float,
+    routed_scaling_factor: Optional[float],
     use_routing_scales_on_input: bool,
     tile_tokens_dim: int = 8,
     routing_method_type: int = 0,

@@ -1119,8 +1119,8 @@ def _fake_trtllm_fp8_per_tensor_scale_moe(
     output2_scales_scalar: torch.Tensor,
     num_experts: int,
     top_k: int,
-    n_group: int,
-    topk_group: int,
+    n_group: Optional[int],
+    topk_group: Optional[int],
     intermediate_size: int,
     local_expert_offset: int,
     local_num_experts: int,

@@ -1151,8 +1151,8 @@ def trtllm_fp8_block_scale_moe_op(
     output: torch.Tensor,
     num_experts: int,
     top_k: int,
-    n_group: int,
-    topk_group: int,
+    n_group: Optional[int],
+    topk_group: Optional[int],
     intermediate_size: int,
     local_expert_offset: int,
     local_num_experts: int,

@@ -1207,8 +1207,8 @@ def _fake_trtllm_fp8_block_scale_moe(
     output: torch.Tensor,
     num_experts: int,
     top_k: int,
-    n_group: int,
-    topk_group: int,
+    n_group: Optional[int],
+    topk_group: Optional[int],
     intermediate_size: int,
     local_expert_offset: int,
     local_num_experts: int,

@@ -1469,12 +1469,12 @@ def trtllm_fp8_per_tensor_scale_moe(
     output2_scales_scalar: torch.Tensor,
     num_experts: int,
     top_k: int,
-    n_group: int,
-    topk_group: int,
+    n_group: Optional[int],
+    topk_group: Optional[int],
     intermediate_size: int,
     local_expert_offset: int,
     local_num_experts: int,
-    routed_scaling_factor: float,
+    routed_scaling_factor: Optional[float],
     use_routing_scales_on_input: bool,
     tile_tokens_dim: int = 8,
     routing_method_type: int = 0,

@@ -1542,8 +1542,8 @@ def trtllm_fp8_block_scale_moe(
     gemm2_weights_scale: torch.Tensor,
     num_experts: int,
     top_k: int,
-    n_group: int,
-    topk_group: int,
+    n_group: Optional[int],
+    topk_group: Optional[int],
     intermediate_size: int,
     local_expert_offset: int,
     local_num_experts: int,
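
With the relaxed type hints, callers that don't use grouped routing can pass None instead of sentinel values. A hypothetical fragment showing only the keyword arguments affected by this commit (the surrounding tensor arguments are elided):

    moe_kwargs = dict(
        num_experts=256,
        top_k=8,
        n_group=None,                # Optional[int]: None for ungrouped routing
        topk_group=None,             # Optional[int]
        routed_scaling_factor=None,  # Optional[float]: launcher falls back to 1.0
    )
    # ...passed alongside the tensor arguments to trtllm_fp8_block_scale_moe(...)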

tests/moe/test_trtllm_gen_fused_moe.py

Lines changed: 6 additions & 6 deletions
@@ -105,7 +105,7 @@ def capture(self, hidden_states_sample, **runtime_args):
         self.input_tensor = hidden_states_sample.clone()
 
         # Warmup
-        with torch.cuda.stream(torch_stream), autotune(True):
+        with torch.cuda.stream(torch_stream), autotune(False):
             for _ in range(1):
                 self._run_moe_computation(runtime_args)
 

@@ -1836,9 +1836,9 @@ def cache_permute_indices():
     return _cache_permute_indices
 
 
-@pytest.mark.parametrize("num_tokens", [1, 8, 1024])
+@pytest.mark.parametrize("num_tokens", [1, 8, 1024, 512])
 @pytest.mark.parametrize("hidden_size", [1024, 2048, 8192])
-@pytest.mark.parametrize("intermediate_size", [2048, 1024, 5120, 768, 384])
+@pytest.mark.parametrize("intermediate_size", [2048, 1024, 768, 384, 512])
 @pytest.mark.parametrize(
     "moe_impl",
     [

@@ -1905,15 +1905,15 @@ def cache_permute_indices():
         ),
         pytest.param(
             {
-                "num_experts": 512,
-                "top_k": 10,
+                "num_experts": 256,
+                "top_k": 8,
                 "padding": 8,
                 "n_groups": None,
                 "top_k_groups": None,
                 "routed_scaling": None,
                 "has_routing_bias": False,
                 "routing_method_type": RoutingMethodType.Renormalize,
-                "compatible_moe_impls": [FP4Moe, FP8PerTensorMoe, FP8BlockScaleMoe],
+                "compatible_moe_impls": [FP8PerTensorMoe, FP8BlockScaleMoe, FP4Moe],
             },
             id="Renorm",
             # marks=pytest.mark.skip(
