Commit d867739

[Feature] Enhance GEMM operations with MMA and WGMMA support
- Added a mapping for GEMM instruction prefixes in `gemm.h`.
- Renamed GEMM functions to carry an instruction prefix (`mma_`, `wmma_`, `wgmma_`) for clarity in `gemm_mma.h`, `gemm_sm70.h`, `gemm_sm90.h`, and `gemm_sp_sm80.h`.
- Updated function signatures to improve consistency and readability.
- Introduced separate functions for the MMA and WGMMA paths in `gemm_sm90.h` and `gemm_sp_sm90.h`.
- Parsed the lowered op name in `inject_fence_proxy.cc` to detect async WGMMA calls.
1 parent 4efd2d2 commit d867739

11 files changed: +98 −84 lines

examples/sparse_tensorcore/tilelang_example_sparse_tensorcore.py renamed to examples/sparse_tensorcore/example_sparse_tensorcore.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -2,7 +2,7 @@
 import tilelang
 from tilelang.utils.sparse import compress_sm90
 from tilelang.layout import make_metadata_layout
-import tilelang.testing
+import tilelang.language as T


 @tilelang.jit(out_idx=[-1])
@@ -24,7 +24,6 @@ def matmul_sp(
     A_shared_shape = (block_M, block_K // 2)
     B_shared_shape = (block_K, block_N)

-    import tilelang.language as T

     @T.prim_func
     def main(
```

src/op/gemm.cc

Lines changed: 5 additions & 5 deletions
```diff
@@ -578,14 +578,16 @@ Stmt GemmNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {

   if (A.scope() == "local.fragment") {
     ICHECK(B.scope() != "local.fragment");
-    op_name = "tl::gemm_rs";
+    op_name = "gemm_rs";
   } else if (B.scope() == "local.fragment") {
-    op_name = "tl::gemm_sr";
+    op_name = "gemm_sr";
   } else {
-    op_name = "tl::gemm_ss";
+    op_name = "gemm_ss";
   }
   ICHECK(C.scope() == "local.fragment");

+  op_name = "tl::" + GemmInstPrefixMap.at(gemm_inst) + "_" + op_name;
+
   ss << op_name << "<" << M << ", " << N << ", " << K << ", ";
   ss << warp_m << ", " << warp_n << ", ";
   ss << trans_A << ", " << trans_B;
@@ -600,8 +602,6 @@ Stmt GemmNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {
   if (TargetIsCDNA(T.target)) {
     // for cdna gemm, we need to specify kPack
     ss << ", " << kPack;
-  } else if (TargetIsHopper(T.target)) {
-    ss << ", " << (gemm_inst == GemmInst::kWGMMA ? "true" : "false");
   }

   // Emit wg_wait if necessary
```
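After this change the final template name is assembled from the instruction prefix plus the operand-scope suffix, so the lowering emits names like `tl::mma_gemm_ss` or `tl::wgmma_gemm_rs`. A minimal host-side sketch of the composition (the enum and map are copied from the `gemm.h` hunk below; the `gemm_inst` and `op_name` values are illustrative):

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>

// Copied from src/op/gemm.h in this commit.
enum class GemmInst : uint8_t { kMMA, kWGMMA, kTCGEN5MMA, kMFMA };
const std::unordered_map<GemmInst, std::string> GemmInstPrefixMap = {
    {GemmInst::kMMA, "mma"},
    {GemmInst::kWGMMA, "wgmma"},
    {GemmInst::kTCGEN5MMA, "tcgen5"},
    {GemmInst::kMFMA, "mfma"}};

int main() {
  // Illustrative inputs; in GemmNode::Lower these come from the buffer
  // scopes and the instruction selected for the target.
  std::string op_name = "gemm_ss";
  GemmInst gemm_inst = GemmInst::kWGMMA;

  // The composition step added by this commit.
  op_name = "tl::" + GemmInstPrefixMap.at(gemm_inst) + "_" + op_name;
  std::cout << op_name << "\n";  // prints: tl::wgmma_gemm_ss
}
```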

src/op/gemm.h

Lines changed: 8 additions & 0 deletions
```diff
@@ -24,6 +24,14 @@ enum class GemmWarpPolicyType : uint8_t {

 // Target GEMM instruction
 enum class GemmInst : uint8_t { kMMA, kWGMMA, kTCGEN5MMA, kMFMA };
+const std::unordered_map<GemmInst, std::string> GemmInstPrefixMap = {
+    {GemmInst::kMMA, "mma"},
+    {GemmInst::kWGMMA, "wgmma"},
+    {GemmInst::kTCGEN5MMA, "tcgen5"},
+    {GemmInst::kMFMA, "mfma"}
+};
+
+
 class GemmWarpPolicyNode : public Object {
 public:
   mutable int m_warp{0};
```

src/op/gemm_sp.cc

Lines changed: 3 additions & 4 deletions
```diff
@@ -146,12 +146,13 @@ Stmt GemmSPNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {
   auto block_size = *as_const_int(T.thread_bounds->extent);
   bool maybe_wgmma = TargetIsHopper(T.target) && (this->M >= 64) &&
                      (block_size / warp_size % 4 == 0);
+  GemmInst gemm_inst = maybe_wgmma ? GemmInst::kWGMMA : GemmInst::kMMA;

   auto [warp_m, warp_n] = policy->ComputeWarpPartition(
       M, N, block_size, T.target, maybe_wgmma, A->dtype.bits());

   std::stringstream ss;
-  std::string op_name = "tl::gemm_sp_ss";
+  std::string op_name = "tl::" + GemmInstPrefixMap.at(gemm_inst) + "_gemm_sp_ss";
   ICHECK((A.scope() == "shared" || A.scope() == "shared.dyn") &&
          (B.scope() == "shared" || B.scope() == "shared.dyn"))
       << "Only support shared.dyn scope for A and B, but received " << A.scope()
@@ -160,13 +161,11 @@ Stmt GemmSPNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {
       << "Only support shared.dyn scope for E as copy from smem to rmem are "
          "delegated to cute implementation, found "
       << E.scope();
+
   ss << op_name << "<" << M << ", " << N << ", " << K << ", ";
   ss << warp_m << ", " << warp_n << ", ";
   ss << trans_A << ", " << trans_B;
   ss << ", " << clear_accum;
-  if (TargetIsHopper(T.target)) {
-    ss << ", " << (maybe_wgmma ? "true" : "false");
-  }
   if (wg_wait != 0) {
     ss << ", " << wg_wait;
   }
```
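The sparse path still derives the instruction from the local `maybe_wgmma` predicate before consulting the prefix map. A compile-time sketch of that predicate under assumed launch parameters (the values below are hypothetical; `warp_size` is 32 on NVIDIA targets, and the real predicate additionally requires a Hopper target):

```cpp
// Hypothetical tile/launch configuration for illustration.
constexpr int M = 128;           // this->M in GemmSPNode::Lower
constexpr int block_size = 128;  // threads per block
constexpr int warp_size = 32;

// 128 / 32 = 4 warps and 4 % 4 == 0, so with M >= 64 on Hopper the
// predicate holds and op_name becomes "tl::wgmma_gemm_sp_ss";
// otherwise it falls back to "tl::mma_gemm_sp_ss".
constexpr bool maybe_wgmma = (M >= 64) && (block_size / warp_size % 4 == 0);
static_assert(maybe_wgmma, "four warps form a full warpgroup");

int main() { return 0; }
```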

src/tl_templates/cuda/gemm_mma.h

Lines changed: 3 additions & 3 deletions
```diff
@@ -449,7 +449,7 @@ namespace tl::tl_mma {
 template <int M, int N, int K, int num_warp_m, int num_warp_n, bool trans_A,
           bool trans_B, bool clear_accum, int lda, int ldb, int offset_a,
           int offset_b, typename A_type, typename B_type, typename C_type>
-CUTLASS_DEVICE void gemm_ss(A_type *pA, B_type *pB, C_type *accum) {
+CUTLASS_DEVICE void mma_gemm_ss(A_type *pA, B_type *pB, C_type *accum) {
   using MMA =
       cute::tl_mma::GemmTensorOp<M, N, K, num_warp_m, num_warp_n, trans_A,
                                  trans_B, clear_accum, lda, ldb, offset_a,
@@ -460,7 +460,7 @@ CUTLASS_DEVICE void gemm_ss(A_type *pA, B_type *pB, C_type *accum) {
 template <int M, int N, int K, int num_warp_m, int num_warp_n, bool trans_A,
           bool trans_B, bool clear_accum, int lda, int ldb, int offset_a,
           int offset_b, typename A_type, typename B_type, typename C_type>
-CUTLASS_DEVICE void gemm_rs(A_type *pA, B_type *pB, C_type *accum) {
+CUTLASS_DEVICE void mma_gemm_rs(A_type *pA, B_type *pB, C_type *accum) {
   using MMA =
       cute::tl_mma::GemmTensorOp<M, N, K, num_warp_m, num_warp_n, trans_A,
                                  trans_B, clear_accum, lda, ldb, offset_a,
@@ -471,7 +471,7 @@ CUTLASS_DEVICE void gemm_rs(A_type *pA, B_type *pB, C_type *accum) {
 template <int M, int N, int K, int num_warp_m, int num_warp_n, bool trans_A,
           bool trans_B, bool clear_accum, int lda, int ldb, int offset_a,
           int offset_b, typename A_type, typename B_type, typename C_type>
-CUTLASS_DEVICE void gemm_sr(A_type *pA, B_type *pB, C_type *accum) {
+CUTLASS_DEVICE void mma_gemm_sr(A_type *pA, B_type *pB, C_type *accum) {
   using MMA =
       cute::tl_mma::GemmTensorOp<M, N, K, num_warp_m, num_warp_n, trans_A,
                                  trans_B, clear_accum, lda, ldb, offset_a,
```

src/tl_templates/cuda/gemm_sm70.h

Lines changed: 2 additions & 2 deletions
```diff
@@ -161,7 +161,7 @@ namespace tl {
 template <int M, int N, int K, int num_warp_m, int num_warp_n, bool trans_A,
           bool trans_B, bool clear_accum, typename A_type, typename B_type,
           typename C_type>
-CUTLASS_DEVICE void gemm_ss(A_type *pA, B_type *pB, C_type *accum) {
+CUTLASS_DEVICE void wmma_gemm_ss(A_type *pA, B_type *pB, C_type *accum) {
   using MMA = GemmTensorOp<GemmShape<M, N, K>, num_warp_m, num_warp_n, trans_A,
                            trans_B, clear_accum, A_type, B_type, C_type>;
   using FragmentC = typename MMA::FragmentC;
@@ -174,7 +174,7 @@ CUTLASS_DEVICE void gemm_ss(A_type *pA, B_type *pB, C_type *accum) {
 template <int M, int N, int K, int num_warp_m, int num_warp_n, bool trans_A,
           bool trans_B, bool clear_accum, typename A_type, typename B_type,
           typename C_type>
-CUTLASS_DEVICE void gemm_rs(A_type *pA, B_type *pB, C_type *accum) {
+CUTLASS_DEVICE void wmma_gemm_rs(A_type *pA, B_type *pB, C_type *accum) {
   using MMA = GemmTensorOp<GemmShape<M, N, K>, num_warp_m, num_warp_n, trans_A,
                            trans_B, clear_accum, A_type, B_type, C_type>;
   using FragmentA = typename MMA::FragmentA;
```

src/tl_templates/cuda/gemm_sm90.h

Lines changed: 60 additions & 43 deletions
```diff
@@ -232,43 +232,45 @@ namespace tl {

 template <int M, int N, int K, int num_warp_m, int num_warp_n, bool trans_A,
           bool trans_B, bool clear_accum = false, int lda = 0, int ldb = 0,
-          int offset_a = 0, int offset_b = 0, bool use_wgmma = true,
+          int offset_a = 0, int offset_b = 0,
           int wg_wait = 0, typename A_type, typename B_type, typename C_type>
-TL_DEVICE void gemm_ss(A_type *pA, B_type *pB, C_type *accum) {
-  if constexpr (use_wgmma) {
-    static_assert((trans_A && lda == M) || (!trans_A && lda == K),
-                  "Hopper wgmma doesn't support custom stride for A");
-    static_assert((trans_B && ldb == K) || (!trans_B && ldb == N),
-                  "Hopper wgmma doesn't support custom stride for B");
-    static_assert(offset_a == 0 && offset_b == 0,
-                  "offset_a and offset_b must be zero for wgmma");
-    using MMA = cute::tl_wgmma::GemmTensorOp<M, N, K, num_warp_m, num_warp_n,
-                                             trans_A, trans_B, clear_accum,
-                                             A_type, B_type, C_type>;
-    MMA::body<wg_wait>(pA, pB, accum);
-  } else {
-    using MMA =
-        cute::tl_mma::GemmTensorOp<M, N, K, num_warp_m, num_warp_n, trans_A,
+TL_DEVICE void wgmma_gemm_ss(A_type *pA, B_type *pB, C_type *accum) {
+  static_assert((trans_A && lda == M) || (!trans_A && lda == K),
+                "Hopper wgmma doesn't support custom stride for A");
+  static_assert((trans_B && ldb == K) || (!trans_B && ldb == N),
+                "Hopper wgmma doesn't support custom stride for B");
+  static_assert(offset_a == 0 && offset_b == 0,
+                "offset_a and offset_b must be zero for wgmma");
+  using MMA = cute::tl_wgmma::GemmTensorOp<M, N, K, num_warp_m, num_warp_n,
+                                           trans_A, trans_B, clear_accum,
+                                           A_type, B_type, C_type>;
+  MMA::body<wg_wait>(pA, pB, accum);
+}
+
+template <int M, int N, int K, int num_warp_m, int num_warp_n, bool trans_A,
+          bool trans_B, bool clear_accum = false, int lda = 0, int ldb = 0,
+          int offset_a = 0, int offset_b = 0,
+          int wg_wait = 0, typename A_type, typename B_type, typename C_type>
+TL_DEVICE void mma_gemm_ss(A_type *pA, B_type *pB, C_type *accum) {
+  using MMA =
+      cute::tl_mma::GemmTensorOp<M, N, K, num_warp_m, num_warp_n, trans_A,
                                  trans_B, clear_accum, lda, ldb, offset_a,
                                  offset_b, A_type, B_type, C_type>;
-    MMA::body(pA, pB, accum);
-  }
+  MMA::body(pA, pB, accum);
 }

 template <int M, int N, int K, int num_warp_m, int num_warp_n, bool trans_A,
           bool trans_B, bool clear_accum = false, int lda = 0, int ldb = 0,
-          int offset_a = 0, int offset_b = 0, bool use_wgmma = true,
+          int offset_a = 0, int offset_b = 0,
           int wg_wait = 0, typename A_type, typename B_type, typename C_type>
 TL_DEVICE /**
           * Perform a read-share (B in shared memory, A in global) tiled GEMM
           * and accumulate into `accum`.
           *
-          * Dispatches at compile time to either the Hopper wgmma
-          * implementation or the fallback MMA implementation depending on
-          * `use_wgmma`. The selected GemmTensorOp::body_rs performs the
+          * Dispatches at compile time to the Hopper wgmma
+          * implementation. The selected GemmTensorOp::body_rs performs the
           * region-tiled GEMM loop and updates the accumulator in-place.
           *
-          * When `use_wgmma == true`, this function enforces wgmma constraints
+          * This function enforces wgmma constraints
           * at compile time:
           * - A's leading dimension must equal (trans_A ? M : K)
           * - B's leading dimension must equal (trans_B ? K : N)
@@ -281,40 +283,57 @@ TL_DEVICE /**
           * @param accum Pointer to the accumulator/output C buffer updated
           * in-place.
           */
-    void
-    gemm_rs(A_type *pA, B_type *pB, C_type *accum) {
-  if constexpr (use_wgmma) {
+    void wgmma_gemm_rs(A_type *pA, B_type *pB, C_type *accum) {
   static_assert((trans_A && lda == M) || (!trans_A && lda == K),
                 "Hopper wgmma doesn't support custom stride for A");
   static_assert((trans_B && ldb == K) || (!trans_B && ldb == N),
                 "Hopper wgmma doesn't support custom stride for B");
   static_assert(offset_a == 0 && offset_b == 0,
                 "offset_a and offset_b must be zero for wgmma");
   using MMA = cute::tl_wgmma::GemmTensorOp<M, N, K, num_warp_m, num_warp_n,
-                                             trans_A, trans_B, clear_accum,
-                                             A_type, B_type, C_type>;
+                                           trans_A, trans_B, clear_accum,
+                                           A_type, B_type, C_type>;
   MMA::body_rs<wg_wait>(pA, pB, accum);
-  } else {
+}
+
+template <int M, int N, int K, int num_warp_m, int num_warp_n, bool trans_A,
+          bool trans_B, bool clear_accum = false, int lda = 0, int ldb = 0,
+          int offset_a = 0, int offset_b = 0,
+          int wg_wait = 0, typename A_type, typename B_type, typename C_type>
+TL_DEVICE /**
+          * Perform a read-share (B in shared memory, A in global) tiled GEMM
+          * and accumulate into `accum`.
+          *
+          * Dispatches at compile time to the fallback mma
+          * implementation. The selected GemmTensorOp::body_rs performs the
+          * region-tiled GEMM loop and updates the accumulator in-place.
+          *
+          * @param pA Pointer to operand A (global memory). Layout/stride
+          * expectations depend on template parameters.
+          * @param pB Pointer to operand B (base for shared-memory staging).
+          * Layout/stride expectations depend on template parameters.
+          * @param accum Pointer to the accumulator/output C buffer updated
+          * in-place.
+          */
+void mma_gemm_rs(A_type *pA, B_type *pB, C_type *accum) {
   using MMA =
       cute::tl_mma::GemmTensorOp<M, N, K, num_warp_m, num_warp_n, trans_A,
-                                   trans_B, clear_accum, lda, ldb, offset_a,
-                                   offset_b, A_type, B_type, C_type>;
+                                 trans_B, clear_accum, lda, ldb, offset_a,
+                                 offset_b, A_type, B_type, C_type>;
   MMA::body_rs(pA, pB, accum);
-  }
 }

 template <int M, int N, int K, int num_warp_m, int num_warp_n, bool trans_A,
           bool trans_B, bool clear_accum = false, int lda = 0, int ldb = 0,
-          int offset_a = 0, int offset_b = 0, bool use_wgmma = true,
+          int offset_a = 0, int offset_b = 0,
           int wg_wait = 0, typename A_type, typename B_type, typename C_type>
 TL_DEVICE /**
           * Perform a non-wgmma tiled GEMM where A regions are staged into
           * shared memory and B is read directly from global memory,
           * accumulating into `accum`.
           *
           * This overload dispatches to the tl_mma::GemmTensorOp::body_sr
-          * implementation. Must be instantiated with `use_wgmma = false`
-          * (enforced via static_assert).
+          * implementation.
           *
           * @param pA Pointer to the A operand in global memory (source that
           * will be staged to shared memory).
@@ -323,14 +342,12 @@ TL_DEVICE /**
           * @param accum Pointer to the output accumulator matrix in global
           * memory.
           */
-    void
-    gemm_sr(A_type *pA, B_type *pB, C_type *accum) {
-  static_assert(!use_wgmma, "wgmma doesn't support gemm_sr");
-  using MMA =
-      cute::tl_mma::GemmTensorOp<M, N, K, num_warp_m, num_warp_n, trans_A,
-                                 trans_B, clear_accum, lda, ldb, offset_a,
-                                 offset_b, A_type, B_type, C_type>;
-  MMA::body_sr(pA, pB, accum);
+void mma_gemm_sr(A_type *pA, B_type *pB, C_type *accum) {
+  using MMA =
+      cute::tl_mma::GemmTensorOp<M, N, K, num_warp_m, num_warp_n, trans_A,
+                                 trans_B, clear_accum, lda, ldb, offset_a,
+                                 offset_b, A_type, B_type, C_type>;
+  MMA::body_sr(pA, pB, accum);
 }

 template <int num_mma>
```
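Splitting the `use_wgmma` branch into separate `wgmma_*` and `mma_*` entry points means the wgmma layout rules are now enforced unconditionally by the wgmma variants' static_asserts. A host-compilable sketch of the same checks (the helper name is hypothetical, not part of the header):

```cpp
// Hypothetical mirror of the checks in wgmma_gemm_ss / wgmma_gemm_rs:
// a transposed A needs lda == M (otherwise lda == K), a transposed B
// needs ldb == K (otherwise ldb == N), and both offsets must be zero.
template <int M, int N, int K, bool trans_A, bool trans_B, int lda, int ldb,
          int offset_a = 0, int offset_b = 0>
constexpr bool check_wgmma_layout() {
  static_assert((trans_A && lda == M) || (!trans_A && lda == K),
                "Hopper wgmma doesn't support custom stride for A");
  static_assert((trans_B && ldb == K) || (!trans_B && ldb == N),
                "Hopper wgmma doesn't support custom stride for B");
  static_assert(offset_a == 0 && offset_b == 0,
                "offset_a and offset_b must be zero for wgmma");
  return true;
}

// OK: non-transposed A with lda == K, non-transposed B with ldb == N.
static_assert(check_wgmma_layout<64, 128, 32, false, false, 32, 128>(),
              "layout accepted by wgmma");
// check_wgmma_layout<64, 128, 32, false, false, 64, 128>() would fail:
// lda != K for a non-transposed A.

int main() { return 0; }
```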

src/tl_templates/cuda/gemm_sp_sm80.h

Lines changed: 1 addition & 1 deletion
```diff
@@ -255,7 +255,7 @@ class GemmTensorOp {
 template <int M, int N, int K, int num_warp_m, int num_warp_n, bool trans_A,
           bool trans_B, bool clear_accum = false, typename A_type,
           typename B_type, typename C_type, typename E_type>
-TL_DEVICE void gemm_sp_ss(A_type *pA, B_type *pB, C_type *accum, E_type *pE) {
+TL_DEVICE void mma_gemm_sp_ss(A_type *pA, B_type *pB, C_type *accum, E_type *pE) {
   using MMA =
       GemmTensorOp<cutlass::gemm::GemmShape<M, N, K>, num_warp_m, num_warp_n,
                    trans_A, trans_B, clear_accum, A_type, B_type, C_type>;
```

src/tl_templates/cuda/gemm_sp_sm90.h

Lines changed: 3 additions & 8 deletions
```diff
@@ -215,18 +215,13 @@ class GemmTensorOp {

 namespace tl {
 template <int M, int N, int K, int num_warp_m, int num_warp_n, bool trans_A,
-          bool trans_B, bool clear_accum = false, bool use_wgmma = true,
+          bool trans_B, bool clear_accum = false,
           int wg_wait = 0, typename A_type, typename B_type, typename C_type,
           typename GMMA = cute::tl_wgmma_sp::GemmTensorOp<
               M, N, K, num_warp_m, num_warp_n, trans_A, trans_B, clear_accum,
               A_type, B_type, C_type>,
           typename E_type = typename GMMA::ElementEMma::raw_type>
-TL_DEVICE void gemm_sp_ss(A_type *pA, B_type *pB, C_type *accum, E_type *pE) {
-  static_assert(use_wgmma, "only wgmma is supported for now");
-  if constexpr (use_wgmma) {
-    GMMA::body<wg_wait>(pA, pB, accum, pE);
-  } else {
-    CUTE_GCC_UNREACHABLE;
-  }
+TL_DEVICE void wgmma_gemm_sp_ss(A_type *pA, B_type *pB, C_type *accum, E_type *pE) {
+  GMMA::body<wg_wait>(pA, pB, accum, pE);
 }
 } // namespace tl
```

src/transform/inject_fence_proxy.cc

Lines changed: 10 additions & 14 deletions
```diff
@@ -79,24 +79,18 @@ bool IsAsyncIntrinsic(const CallNode *call) {
   }

   // TileLang async intrinsics
-  if (call->op.same_as(tma_load()) || call->op.same_as(tma_load_im2col()) ||
-      call->op.same_as(tma_store()) || call->op.same_as(tma_store_arrive()) ||
-      call->op.same_as(tma_store_wait()) ||
-      call->op.same_as(ptx_cp_async_barrier_noinc()) ||
+  // NOTE(wt): We only need to inject fences before tma_store and WGMMA,
+  // since tma_load and WGMMA contain implicit proxy fence after them
+  if (call->op.same_as(tma_store()) ||
       call->op.same_as(ptx_wgmma_ss()) || call->op.same_as(ptx_wgmma_rs())) {
     return true;
   }

-  // PTX async copy intrinsics
-  if (call->op.same_as(builtin::ptx_cp_async()) ||
-      call->op.same_as(builtin::ptx_cp_async_barrier()) ||
-      call->op.same_as(builtin::ptx_cp_async_bulk())) {
-    return true;
-  }
-
-  // wgmma async intrinsics
   if (call->op.same_as(tl_gemm()) || call->op.same_as(tl_gemm_sp())) {
-    return true;
+    // determine whether async wgmma is utilized
+    std::ostringstream oss;
+    oss << call->args[0].as<StringImmNode>()->value;
+    return oss.str().find("wgmma") != std::string::npos;
   }

   return false;
@@ -174,6 +168,7 @@ class ProxyFenceInjector : public StmtMutator {

 private:
   Stmt VisitStmt_(const SeqStmtNode *op) final {
+    // FIXME: 1st stmt cannot know the previous proxy kind
     Array<Stmt> seq;
     seq.reserve(op->seq.size());

@@ -213,7 +208,8 @@ class ProxyFenceInjector : public StmtMutator {
     } else if (IsKnownGeneric(call)) {
       kind = ProxyKind::kGeneric;
     } else {
-      // We can now treat extern as Generic, since gemm and gemm_sp are never
+      // Remaining intrinsic and extern are marked as Generic.
+      // We can now treat all extern as Generic, since gemm and gemm_sp are never
       // represented as call_extern nodes. They are call_intrin nodes and will
       // be handled by IsAsyncIntrinsic above.
       kind = ProxyKind::kGeneric;
```
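With the rename in place, `IsAsyncIntrinsic` no longer needs a separate wgmma whitelist for `tl_gemm`/`tl_gemm_sp`: it inspects the first call argument, which carries the op name emitted by the lowering above, and tests for the `wgmma` substring. A standalone sketch of that test against the names this commit produces (template arguments are illustrative):

```cpp
#include <iostream>
#include <string>

// Mirrors the substring test in IsAsyncIntrinsic: a lowered GEMM call is
// treated as async exactly when it dispatches to a wgmma template.
bool UsesWgmma(const std::string &op_name) {
  return op_name.find("wgmma") != std::string::npos;
}

int main() {
  std::cout << UsesWgmma("tl::wgmma_gemm_ss<64, 64, 32, 1, 1, 0, 0>") << "\n";    // 1
  std::cout << UsesWgmma("tl::mma_gemm_rs<64, 64, 32, 1, 1, 0, 0>") << "\n";      // 0
  std::cout << UsesWgmma("tl::wgmma_gemm_sp_ss<64, 64, 64, 1, 1, 0, 0>") << "\n"; // 1
}
```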
