Skip to content

Commit a30f033

Browse files
committed
address comments
1 parent 22b97b0 commit a30f033

File tree

3 files changed

+5
-6
lines changed

3 files changed

+5
-6
lines changed

csrc/fused_moe/cutlass_backend/cutlass_fused_moe_kernels.cuh

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1298,9 +1298,6 @@ __global__ void computeStridesTmaWarpSpecializedKernel(
12981298
setupIfSelected(TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaledConfig{},
12991299
quant_params.mxfp8_mxfp4);
13001300

1301-
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
1302-
asm volatile("griddepcontrol.launch_dependents;");
1303-
#endif
13041301
assert(gemm_m <= INT32_MAX);
13051302
assert(gemm1_n > 0 && gemm1_n <= INT32_MAX);
13061303
assert(gemm1_k > 0 && gemm1_k <= INT32_MAX);
@@ -1319,6 +1316,9 @@ __global__ void computeStridesTmaWarpSpecializedKernel(
13191316
reinterpret_cast<TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::SFA const*>(
13201317
quant_params.groupwise.fc2.weight_scales),
13211318
bias2, gemm2_output, router_scales, permuted_row_to_unpermuted_row, expert);
1319+
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
1320+
asm volatile("griddepcontrol.launch_dependents;");
1321+
#endif
13221322
}
13231323

13241324
// ========================== Permutation things =======================================

csrc/fused_moe/cutlass_backend/flashinfer_cutlass_fused_moe_sm100_binding.cu

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -808,8 +808,7 @@ class FusedMoeRunner : public tvm::ffi::ModuleObj {
808808
// default
809809
auto id1 = profile_ids.value()[0];
810810
if (id1 != -1) {
811-
TVM_FFI_ICHECK(id1 >= 0 && id1 < static_cast<int64_t>(mAllProfiles.size()))
812-
<< "Invalid gemm1 profile id: " << id1;
811+
TVM_FFI_ICHECK(id1 >= 0 && id1 < mGemm1TacticCount) << "Invalid gemm1 profile id: " << id1;
813812
best_gemm1_profile = mAllProfiles.at(id1);
814813
}
815814

tests/moe/test_trtllm_cutlass_fused_moe.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1087,7 +1087,7 @@ def dequant_mxfp4_batches(
10871087
)
10881088
@pytest.mark.skipif(
10891089
torch.cuda.get_device_capability()[0] not in [10, 11, 12],
1090-
reason="MXFP8xMXFP4 is only supported on SM100 and SM110",
1090+
reason="MXFP8xMXFP4 is only supported on SM100, SM110 and SM120",
10911091
)
10921092
def test_moe_mxfp8_mxfp4(
10931093
batch_size,

0 commit comments

Comments
 (0)