[TRTLLM-6286] [feat] Update CUTLASS to 4.2 and enable SM103 group gemm (NVIDIA#7832)

VALLIS-NERIA · dominicshanshan · commit 718827f2f9a0 · 2025-09-19T06:19:49.000-07:00
Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
diff --git a/3rdparty/cutlass b/3rdparty/cutlass
@@ -1 +1 @@
-Subproject commit a49a78ffefc86a87160dfe0ccc3a3a2d1622c918
+Subproject commit 57e3cfb47a2d9e0d46eb6335c3dc411498efa198
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h
@@ -138,22 +138,23 @@ void dispatchMoeGemmFinalDispatchTmaWarpSpecialized(TmaWarpSpecializedGroupedGem
         TLLM_THROW("Please recompile with support for hopper by passing 90-real as an arch to build_wheel.py.");
     }
 #endif
-    // #ifndef COMPILE_BLACKWELL_SM103_TMA_GROUPED_GEMMS
+#ifndef COMPILE_BLACKWELL_SM103_TMA_GROUPED_GEMMS
     else if constexpr (Arch::kMinComputeCapability == 103)
     {
         static std::once_flag flag;
         std::call_once(flag,
             []()
             {
                 TLLM_LOG_WARNING(
-                "Falling back to sm100f version due to a bug in cutlass." /*"For best performance please recompile with support for blackwell by "
-                "passing 103-real as an arch to build_wheel.py."*/);
+                    "For best performance please recompile with support for blackwell by "
+                    "passing 103-real as an arch to build_wheel.py.");
             });
-        return dispatchMoeGemmFinalDispatchTmaWarpSpecialized<cutlass::arch::Sm100, T, WeightType, OutputType,
-            EpilogueTag, FUSION, TileShape, ClusterShape>(
+        dispatchMoeGemmFinalDispatchTmaWarpSpecialized<cutlass::arch::Sm100, T, WeightType, OutputType, EpilogueTag,
+            FUSION, TileShape, ClusterShape>(
             hopper_input, num_experts, gemm_config, multi_processor_count, stream, occupancy, workspace_size);
+        return;
     }
-// #endif
+#endif
 #ifndef COMPILE_BLACKWELL_TMA_GROUPED_GEMMS
     else if constexpr (Arch::kMinComputeCapability >= 100 && Arch::kMinComputeCapability < 120)
     {
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py b/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py
@@ -3,19 +3,6 @@
 import os
 from itertools import chain, product
 
-file_to_patch = os.path.abspath(
-    os.path.join(
-        os.path.dirname(__file__),
-        "../../../../../3rdparty/cutlass/python/cutlass_library/heuristics_provider.py"
-    ))
-# replace "from library import" to "from cutlass_library.library import"
-with open(file_to_patch, "r") as f:
-    file_contents = f.read()
-with open(file_to_patch, "w") as f:
-    f.write(
-        file_contents.replace("from library import",
-                              "from cutlass_library.library import"))
-
 from cutlass_library import *