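Address review comments: move griddepcontrol.launch_dependents to the end of computeStridesTmaWarpSpecializedKernel, bound the gemm1 profile id by mGemm1TacticCount, and mention SM120 in the MXFP8xMXFP4 test skip reason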

nv-yunzheq · nv-yunzheq · commit a30f033ff3eb · 2025-11-05T14:53:38.000-08:00
diff --git a/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh b/csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh
@@ -1298,9 +1298,6 @@ __global__ void computeStridesTmaWarpSpecializedKernel(
   setupIfSelected(TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaledConfig{},
                   quant_params.mxfp8_mxfp4);
 
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-  asm volatile("griddepcontrol.launch_dependents;");
-#endif
   assert(gemm_m <= INT32_MAX);
   assert(gemm1_n > 0 && gemm1_n <= INT32_MAX);
   assert(gemm1_k > 0 && gemm1_k <= INT32_MAX);
@@ -1319,6 +1316,9 @@ __global__ void computeStridesTmaWarpSpecializedKernel(
       reinterpret_cast<TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::SFA const*>(
           quant_params.groupwise.fc2.weight_scales),
       bias2, gemm2_output, router_scales, permuted_row_to_unpermuted_row, expert);
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.launch_dependents;");
+#endif
 }
 
 // ========================== Permutation things =======================================
diff --git a/csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu b/csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu
@@ -808,8 +808,7 @@ class FusedMoeRunner : public tvm::ffi::ModuleObj {
       // default
       auto id1 = profile_ids.value()[0];
       if (id1 != -1) {
-        TVM_FFI_ICHECK(id1 >= 0 && id1 < static_cast<int64_t>(mAllProfiles.size()))
-            << "Invalid gemm1 profile id: " << id1;
+        TVM_FFI_ICHECK(id1 >= 0 && id1 < mGemm1TacticCount) << "Invalid gemm1 profile id: " << id1;
         best_gemm1_profile = mAllProfiles.at(id1);
       }
 
diff --git a/tests/moe/test_trtllm_cutlass_fused_moe.py b/tests/moe/test_trtllm_cutlass_fused_moe.py
@@ -1087,7 +1087,7 @@ def dequant_mxfp4_batches(
 )
 @pytest.mark.skipif(
     torch.cuda.get_device_capability()[0] not in [10, 11, 12],
-    reason="MXFP8xMXFP4 is only supported on SM100 and SM110",
+    reason="MXFP8xMXFP4 is only supported on SM100, SM110 and SM120",
 )
 def test_moe_mxfp8_mxfp4(
     batch_size,

Original file line number	Diff line number	Diff line change
`@@ -808,8 +808,7 @@ class FusedMoeRunner : public tvm::ffi::ModuleObj {`
`808`	`808`	`// default`
`809`	`809`	`auto id1 = profile_ids.value()[0];`
`810`	`810`	`if (id1 != -1) {`
`811`		`- TVM_FFI_ICHECK(id1 >= 0 && id1 < static_cast<int64_t>(mAllProfiles.size()))`
`812`		`- << "Invalid gemm1 profile id: " << id1;`
	`811`	`+ TVM_FFI_ICHECK(id1 >= 0 && id1 < mGemm1TacticCount) << "Invalid gemm1 profile id: " << id1;`
`813`	`812`	`best_gemm1_profile = mAllProfiles.at(id1);`
`814`	`813`	`}`
`815`	`814`
Original file line number	Diff line number	Diff line change
`@@ -1087,7 +1087,7 @@ def dequant_mxfp4_batches(`
`1087`	`1087`	`)`
`1088`	`1088`	`@pytest.mark.skipif(`
`1089`	`1089`	`torch.cuda.get_device_capability()[0] not in [10, 11, 12],`
`1090`		`- reason="MXFP8xMXFP4 is only supported on SM100 and SM110",`
	`1090`	`+ reason="MXFP8xMXFP4 is only supported on SM100, SM110 and SM120",`
`1091`	`1091`	`)`
`1092`	`1092`	`def test_moe_mxfp8_mxfp4(`
`1093`	`1093`	`batch_size,`