@@ -139,6 +139,7 @@ def __init__(
         accum_dtype: str = "float16",
         a_transposed: bool = False,
         b_transposed: bool = False,
+        e_transposed: bool = False,
         block_row_warps: int = 2,
         block_col_warps: int = 2,
         warp_row_tiles: int = 8,
@@ -155,6 +156,7 @@ def __init__(
         self.accum_dtype = accum_dtype
         self.a_transposed = a_transposed
         self.b_transposed = b_transposed
+        self.e_transposed = e_transposed
         # Hint Information
         self.block_row_warps = block_row_warps
         self.block_col_warps = block_col_warps
@@ -362,6 +364,7 @@ def ldmatrix_e(self, E_local_buf: Buffer, E_shared_buf: Buffer, ki: PrimExpr, rk
         local_size_e = self.local_size_e
         a_dtype = self.a_dtype
         e_dtype = self.e_dtype
+        trans = self.e_transposed
         # ldmatrix cannot be used for int8 + trans case.
         # include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h
         ldmatrix_available = False  # TODO: use ldmatrix when possible
@@ -413,7 +416,7 @@ def _warp_ldmatrix_e(
                     rk * warp_k + ki * micro_size_k) // self.e_factor
                 for j in T.serial(local_size_e):
                     mi, mk = mma_load_layout(tx, j)
-                    E_local_buf[i * local_size_e + j] = E_shared_buf[wi + mi, wk + mk]
+                    E_local_buf[i * local_size_e + j] = E_shared_buf[wk + mk, wi + mi] if trans else E_shared_buf[wi + mi, wk + mk]

        return _warp_ldmatrix_e(E_local_buf, E_shared_buf, ki, thread_binding, rk)
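A quick note on what the new flag does: `e_transposed` only changes the index order used when copying the sparse-metadata (E) tile from shared memory into the per-thread local buffer. The sketch below is a minimal, stand-alone illustration in plain NumPy with hypothetical names (`load_e_element` is not part of the emitter's API); it shows the same row/column swap that the changed assignment performs.

```python
import numpy as np

def load_e_element(E_shared, wi, mi, wk, mk, e_transposed=False):
    # Illustrative only: mirrors the indexing change in the diff above.
    # With e_transposed=True the metadata tile is assumed to be stored
    # with K as the leading dimension, so row/column indices swap.
    if e_transposed:
        return E_shared[wk + mk, wi + mi]
    return E_shared[wi + mi, wk + mk]

# Tiny check with a dummy 4x4 metadata tile.
E = np.arange(16).reshape(4, 4)
assert load_e_element(E, 1, 0, 2, 1) == E[1, 3]
assert load_e_element(E, 1, 0, 2, 1, e_transposed=True) == E[3, 1]
```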