From d56a5b859f26f59d2a5c0bb9e189e6efc104916d Mon Sep 17 00:00:00 2001
From: zhyncs
Date: Sat, 3 May 2025 22:23:50 -0700
Subject: [PATCH 1/2] chore: upgrade cutlass 3.9.2

---
 sgl-kernel/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sgl-kernel/CMakeLists.txt b/sgl-kernel/CMakeLists.txt
index 3587607a6b4..f7733388a48 100755
--- a/sgl-kernel/CMakeLists.txt
+++ b/sgl-kernel/CMakeLists.txt
@@ -45,7 +45,7 @@ include(FetchContent)
 FetchContent_Declare(
   repo-cutlass
   GIT_REPOSITORY https://github.com/NVIDIA/cutlass
-  GIT_TAG e94e888df3551224738bfa505787b515eae8352f
+  GIT_TAG ad7b2f5e84fcfa124cb02b91d5bd26d238c0459e
   GIT_SHALLOW OFF
 )
 FetchContent_Populate(repo-cutlass)

From 23999a6a2a4b8bcdb71cd805822a0015e25dee69 Mon Sep 17 00:00:00 2001
From: yizhang2077 <1109276519@qq.com>
Date: Tue, 6 May 2025 18:33:38 +0000
Subject: [PATCH 2/2] fix fp8_blockwise_gemm kernel

---
 .../csrc/gemm/fp8_blockwise_gemm_kernel.cu | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/sgl-kernel/csrc/gemm/fp8_blockwise_gemm_kernel.cu b/sgl-kernel/csrc/gemm/fp8_blockwise_gemm_kernel.cu
index 3ed96d067d9..609134730f0 100644
--- a/sgl-kernel/csrc/gemm/fp8_blockwise_gemm_kernel.cu
+++ b/sgl-kernel/csrc/gemm/fp8_blockwise_gemm_kernel.cu
@@ -384,16 +384,23 @@ torch::Tensor fp8_blockwise_scaled_mm(
 
   auto sm_version = getSMVersion();
 
+  int64_t original_rows = mat_a.size(0);
+  torch::Tensor mat_a_padded = pad_tensor(mat_a, /*alignment=*/4);
+  torch::Tensor scales_a_padded = pad_tensor(scales_a, /*alignment=*/4, /*col_major=*/true);
+  torch::Tensor out_padded = torch::empty({mat_a_padded.size(0), mat_b.size(1)}, out.options());
+
 #if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
 #if defined CUDA_VERSION && CUDA_VERSION >= 12000
   if (sm_version == 90) {
     torch::Tensor scales_b_contiguous = scales_b.contiguous();
     if (out_dtype == torch::kBFloat16) {
-      sm90_fp8_blockwise_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b_contiguous);
+      sm90_fp8_blockwise_dispatch_shape(
+          out_padded, mat_a_padded, mat_b, scales_a_padded, scales_b_contiguous);
     } else {
-      sm90_fp8_blockwise_dispatch_shape(out, mat_a, mat_b, scales_a, scales_b_contiguous);
+      sm90_fp8_blockwise_dispatch_shape(
+          out_padded, mat_a_padded, mat_b, scales_a_padded, scales_b_contiguous);
     }
-    return out;
+    return out_padded.slice(0, 0, original_rows);
   }
 #endif
 #endif
@@ -401,12 +408,6 @@ torch::Tensor fp8_blockwise_scaled_mm(
 #if defined(CUTLASS_ARCH_MMA_SM100A_SUPPORTED) || defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED)
 #if defined CUDA_VERSION && CUDA_VERSION >= 12080
   if (sm_version == 100) {
-    int64_t original_rows = mat_a.size(0);
-
-    torch::Tensor mat_a_padded = pad_tensor(mat_a, /*alignment=*/4);
-    torch::Tensor scales_a_padded = pad_tensor(scales_a, /*alignment=*/4, /*col_major=*/true);
-    torch::Tensor out_padded = torch::empty({mat_a_padded.size(0), mat_b.size(1)}, out.options());
-
     if (out_dtype == torch::kBFloat16) {
       sm100_fp8_blockwise_dispatch_shape(
           out_padded, mat_a_padded, mat_b, scales_a_padded, scales_b);