tile-ai · LeiWang1999 · Oct 9, 2025 · Sep 14, 2025 · Sep 14, 2025 · Sep 15, 2025
diff --git a/src/layout/gemm_layouts.cc b/src/layout/gemm_layouts.cc
@@ -177,8 +177,8 @@ Fragment makeGemmFragmentCHopper(const int block_m, const int block_n,
                                  const int warp_m, const int warp_n,
                                  const int element_size) {
   ICHECK(block_m % warp_m == 0);
-  // ICHECK(block_n == warp_n);
   ICHECK(warp_m % 16 == 0) << "warp_m=" << warp_m;
+
   auto warp_layout = makeGemmFragment8x8()->Repeat({2, warp_n / 8}, false,
                                                    false); // 16 x N (1 warp)
   auto block_layout = warp_layout->Repeat({block_m / warp_m, block_n / warp_n},
@@ -385,6 +385,7 @@ Layout makeQuarterBankSwizzleLayout(int stride, int continuous,
   Var i = InputPlaceholder(0);
   Var j = InputPlaceholder(1);
   int vector_size = 128 / element_size;
+  DLOG(INFO) << "makeQuarterBankSwizzleLayout: " << stride << ", " << continuous << ", " << element_size;
   ICHECK(stride % 8 == 0) << "stride=" << stride;
   ICHECK(continuous % (vector_size * 2) == 0)
       << "continuous=" << continuous << ", vector_size=" << vector_size;
@@ -576,8 +577,8 @@ Layout MakeGemmVoltaBLayoutCongruous(int stride, int continuous) {
 }
 
 Layout makeGemmVoltaABLayout(int stride, int continuous, bool is_a,
-                             int kfactor) {
-  if (kfactor == 2)
+                             bool k_inner) {
+  if (k_inner)
     return MakeGemmVoltaABLayoutCrosswise(stride, continuous);
   if (is_a && continuous % 64 == 0)
     return MakeGemmVoltaALayoutCongruous(stride, continuous);
@@ -705,29 +706,29 @@ Layout makeGemmSparseAmpereABLayout(int mat_stride, int mat_continuous,
  * select specific swizzling strategies. It might be the same as mat_continuous
  *                   or different based on tiling or hardware details.
  * \param element_size The size of each element in the matrix, in bits (e.g., 8,
- * 16, 32, 64). \param kfactor An integer factor that influences layout
+ * 16, 32, 64). \param k_inner Whether K is the inner loop; affects layout
  * selection, particularly for fp64 and int8 types. It often relates to how the
  * K dimension of the GEMM (M x K * K x N) is handled or tiled.
  *                - For fp64 (element_size == 64):
- *                  - kfactor == 1 often implies K is in the "outer" loop (e.g.,
- * KxN matrix).
- *                  - kfactor == 2 often implies K is in the "inner" loop (e.g.,
- * NxK matrix).
+ *                  - k_inner == false often implies K is in the "outer" loop
+ * (e.g., KxN matrix).
+ *                  - k_inner == true often implies K is in the "inner" loop
+ * (e.g., NxK matrix).
  *                - For int8 (element_size == 8):
- *                  - kfactor == 1 uses a padded layout.
+ *                  - k_inner == false uses a padded layout.
  * \return A Layout object representing the chosen memory layout.
  */
 Layout makeGemmABLayout(int mat_stride, int mat_continuous, int continuity,
-                        int element_size, int kfactor) {
+                        int element_size, bool k_inner) {
   if (element_size == 64) {
-    if (kfactor == 1 && continuity % 16 == 0) // float64 KxN
+    if (!k_inner && continuity % 16 == 0) // float64 KxN
       return makeGemmABLayoutF64_Kouter(mat_stride, mat_continuous);
-    if (kfactor == 2 && continuity % 16 == 0) // float64 NxK
+    if (k_inner && continuity % 16 == 0) // float64 NxK
       return makeGemmABLayoutF64_Kinner(mat_stride, mat_continuous);
     return makeGemmABLayoutPadded(mat_stride, mat_continuous, element_size);
   }
   int vector_size = 128 / element_size;
-  if (kfactor == 1 && element_size == 8) // int8 KxN
+  if (!k_inner && element_size == 8) // int8 KxN
     return makeGemmABLayoutPadded(mat_stride, mat_continuous, element_size);
   else if (mat_continuous % (vector_size * 8) == 0)
     return makeFullBankSwizzleLayout(mat_stride, mat_continuous, element_size);
@@ -739,16 +740,23 @@ Layout makeGemmABLayout(int mat_stride, int mat_continuous, int continuity,
 }
 
 Layout makeGemmABLayoutHopper(int mat_stride, int mat_continuous,
-                              int continuity, int element_size, int kfactor) {
+                              int continuity, int element_size, bool k_inner) {
+  DLOG(INFO) << "makeGemmABLayoutHopper: " << mat_stride << ", " << mat_continuous << ", " << continuity << ", " << element_size << ", " << k_inner;
   if (element_size == 64) {
-    if (kfactor == 1 && continuity % 16 == 0) // float64 KxN
+    if (!k_inner && continuity % 16 == 0) // float64 KxN
       return makeGemmABLayoutF64_Kouter(mat_stride, mat_continuous);
-    if (kfactor == 2 && continuity % 16 == 0) // float64 NxK
+    if (k_inner && continuity % 16 == 0) // float64 NxK
       return makeGemmABLayoutF64_Kinner(mat_stride, mat_continuous);
     return makeQuarterBankSwizzleLayout(mat_stride, mat_continuous,
                                         element_size);
   }
   int vector_size = 128 / element_size;
+  DLOG(INFO) << "makeGemmABLayoutHopper: mat_continuous: " << mat_continuous << ", mat_stride: " << mat_stride << ", element_size: " << element_size;
+  DLOG(INFO) << "vector_size: " << vector_size;
+  DLOG(INFO) << "mat_continuous % (vector_size * 8): " << mat_continuous % (vector_size * 8);
+  DLOG(INFO) << "mat_continuous % (vector_size * 4): " << mat_continuous % (vector_size * 4);
+  DLOG(INFO) << "mat_continuous % (vector_size * 2): " << mat_continuous % (vector_size * 2);
+  DLOG(INFO) << "mat_continuous % vector_size: " << mat_continuous % vector_size;
   if (mat_continuous % (vector_size * 8) == 0)
     return makeFullBankSwizzleLayout(mat_stride, mat_continuous, element_size);
   else if (mat_continuous % (vector_size * 4) == 0)
@@ -761,11 +769,11 @@ Layout makeGemmABLayoutHopper(int mat_stride, int mat_continuous,
   else
     ICHECK(0) << "Unsupported layout for Hopper with stride=" << mat_stride
               << ", continuous=" << mat_continuous
-              << ", element_size=" << element_size << ", kfactor=" << kfactor;
+              << ", element_size=" << element_size << ", k_inner=" << k_inner;
 }
 
 Layout makeGemmABLayoutSm100(int mat_stride, int mat_continuous, int continuity,
-                             int element_size, int kfactor) {
+                             int element_size, bool k_inner) {
   if (element_size == 64) {
     ICHECK(0) << "float64 on sm100 is not supported now";
   }
@@ -782,7 +790,7 @@ Layout makeGemmABLayoutSm100(int mat_stride, int mat_continuous, int continuity,
   else
     ICHECK(0) << "Unsupported layout for sm100 with stride=" << mat_stride
               << ", continuous=" << mat_continuous
-              << ", element_size=" << element_size << ", kfactor=" << kfactor;
+              << ", element_size=" << element_size << ", k_inner=" << k_inner;
   __builtin_unreachable(); // to prevent compiler warning
 }
 

diff --git a/src/layout/layout.cc b/src/layout/layout.cc
@@ -484,6 +484,11 @@ TVM_FFI_STATIC_INIT_BLOCK({
            [](Layout layout) { return layout->GetForwardIndex(); })
       .def("tl.Layout_forward_vars",
            [](Layout layout) { return layout->GetForwardVars(); })
+      .def("tl.Layout_is_equal",
+           [](Layout layout, Layout other) {
+             const LayoutNode *other_node = other.as<LayoutNode>();
+             return layout->IsEqual(other_node);
+           })
       .def_packed("tl.Fragment",
                   [](PackedArgs args, Any *rv) {
                     *rv = Fragment(
@@ -492,6 +497,11 @@ TVM_FFI_STATIC_INIT_BLOCK({
                         /*forward_thread=*/args[2].cast<PrimExpr>(),
                         /*thread_replicate=*/args[3].cast<IterVar>());
                   })
+      .def("tl.Fragment_is_equal",
+           [](Fragment fragment, Fragment other) {
+             const FragmentNode *other_node = other.as<FragmentNode>();
+             return fragment->IsEqual(other_node);
+           })
       .def("tl.Fragment_thread_size",
            [](Fragment fragment) { return fragment->ThreadExtent(); })
       .def("tl.Fragment_thread",
@@ -509,9 +519,38 @@ TVM_FFI_STATIC_INIT_BLOCK({
       .def("tl.Fragment_condense_rep_var",
            [](Fragment fragment) { return fragment->CondenseReplicateVar(); })
       .def("tl.make_swizzled_layout",
+           [](int stride, int continuous, int element_size, bool k_inner,
+              bool allow_pad = true) {
+             if (allow_pad) {
+               return makeGemmABLayout(stride, continuous, continuous,
+                                       element_size, k_inner);
+             } else {
+               return makeGemmABLayoutHopper(stride, continuous, continuous,
+                                             element_size, k_inner);
+             }
+           })
+      .def("tl.make_wgmma_swizzled_layout",
+           [](int stride, int mat_continuous, int continuity, int element_size,
+              bool k_inner) {
+             return makeGemmABLayoutHopper(stride, mat_continuous, continuity,
+                                           element_size, k_inner);
+           })
+      .def("tl.make_full_bank_swizzled_layout",
            [](int stride, int continuous, int element_size) {
-             return makeGemmABLayout(stride, continuous, continuous,
-                                     element_size, 0);
+             return makeFullBankSwizzleLayout(stride, continuous, element_size);
+           })
+      .def("tl.make_half_bank_swizzled_layout",
+           [](int stride, int continuous, int element_size) {
+             return makeHalfBankSwizzleLayout(stride, continuous, element_size);
+           })
+      .def("tl.make_quarter_bank_swizzled_layout",
+           [](int stride, int continuous, int element_size) {
+             return makeQuarterBankSwizzleLayout(stride, continuous,
+                                                 element_size);
+           })
+      .def("tl.make_linear_layout",
+           [](int stride, int continuous) {
+             return makeGemmLayoutLinear(stride, continuous);
            });
 });
 

diff --git a/src/layout/layout.h b/src/layout/layout.h
@@ -166,13 +166,13 @@ Fragment makeGemmFragmentACDNA(const int block_m, const int block_n,
 Layout makeGemmLayoutLinear(int stride, int continuous);
 Layout makeGemmABLayoutPadded(int stride, int continuous, int element_size);
 Layout makeGemmABLayout(int mat_stride, int mat_continuous, int continuity,
-                        int element_size, int kfactor);
+                        int element_size, bool k_inner = true);
 Layout makeGemmABLayoutHopper(int mat_stride, int mat_continuous,
-                              int continuity, int element_size, int kfactor);
+                              int continuity, int element_size, bool k_inner = true);
 Layout makeGemmABLayoutSm100(int mat_stride, int mat_continuous, int continuity,
-                             int element_size, int kfactor);
+                             int element_size, bool k_inner = true);
 Layout makeGemmABLayoutCDNA(int stride, int continuous, int element_size,
-                            int kfactor);
+                            int kPack);
 
 Fragment makeGemmVoltaFragmentC(const int block_m, const int block_n,
                                 const int warp_m, const int warp_n,
@@ -181,7 +181,7 @@ Fragment makeGemmVoltaFragmentA(const int block_m, const int block_n,
                                 const int block_k, const int warp_m,
                                 const int warp_n);
 Layout makeGemmVoltaABLayout(int stride, int continuous, bool is_a,
-                             int kfactor);
+                             bool k_inner = true);
 
 Layout makeTensorOpMultiplicand(int mat_stride, int mat_continuous,
                                 int elementsize, int crosswise);

diff --git a/src/op/builtin.cc b/src/op/builtin.cc
@@ -143,6 +143,16 @@ TIR_DEFINE_TL_BUILTIN(mbarrier_expect_tx)
     .set_attr<TCallEffectKind>("TCallEffectKind",
                                Integer(CallEffectKind::kOpaque));
 
+TIR_DEFINE_TL_BUILTIN(ptx_wgmma_ss)
+    .set_num_inputs(15)
+    .set_attr<TCallEffectKind>("TCallEffectKind",
+                               Integer(CallEffectKind::kOpaque));
+
+TIR_DEFINE_TL_BUILTIN(ptx_wgmma_rs)
+    .set_num_inputs(15)
+    .set_attr<TCallEffectKind>("TCallEffectKind",
+                               Integer(CallEffectKind::kOpaque));
+
 TIR_DEFINE_TL_BUILTIN(ptx_init_tensor_memory)
     .set_num_inputs(2)
     .set_attr<TCallEffectKind>("TCallEffectKind",
@@ -239,5 +249,15 @@ TIR_DEFINE_TL_BUILTIN(tl_shuffle_elect)
     .set_attr<TCallEffectKind>("TCallEffectKind",
                                Integer(CallEffectKind::kPure));
 
+TIR_DEFINE_TL_BUILTIN(initialize_descriptor)
+    .set_num_inputs(5)
+    .set_attr<TCallEffectKind>("TCallEffectKind",
+                               Integer(CallEffectKind::kOpaque));
+
+TIR_DEFINE_TL_BUILTIN(increase_descriptor_offset)
+    .set_num_inputs(2)
+    .set_attr<TCallEffectKind>("TCallEffectKind",
+                               Integer(CallEffectKind::kOpaque));
+
 } // namespace tl
 } // namespace tvm
diff --git a/src/op/builtin.h b/src/op/builtin.h
@@ -216,21 +216,43 @@ TVM_DLL const Op &mbarrier_wait_parity();
  */
 TVM_DLL const Op &mbarrier_expect_tx();
 
+/*!
+ * \brief tvm intrinsic for ptx tensor core wgmma instructions.
+ *
+ *  void ptx_wgmma_ss(StringImm accum_dtype, StringImm wgmma_prefix, bool
+ * a_is_k_major, bool b_is_k_major, StringImm a_dtype_abbrv, StringImm b_dtype_abbrv,
+ * StringImm accum_dtype_abbrv, Var A_descriptor, PrimExpr A_offset, Var
+ * B_descriptor, Var B_offset, Var C_data, Var C_offset, bool scale_out, bool
+ * scale_in_a, bool scale_in_b);
+ */
+TVM_DLL const Op &ptx_wgmma_ss();
+
+/*!
+ * \brief tvm intrinsics for ptx tensor core wgmma instructions.
+ *
+ *  void ptx_wgmma_rs(StringImm accum_dtype, StringImm wgmma_prefix, bool
+ * a_is_k_major, bool b_is_k_major, StringImm a_dtype_abbrv, StringImm b_dtype_abbrv,
+ * StringImm accum_dtype_abbrv, Var A_descriptor, PrimExpr A_offset, Var
+ * B_descriptor, Var B_offset, Var C_data, Var C_offset, bool scale_out, bool
+ * scale_in_a, bool scale_in_b);
+ */
+TVM_DLL const Op &ptx_wgmma_rs();
+
 /*!
  * \brief tvm intrinsics for initializing tensor memory
  *
  * ptx_init_tensor_memory(tmem_buffer, num_cols)
  *
  */
-const Op &ptx_init_tensor_memory();
+TVM_DLL const Op &ptx_init_tensor_memory();
 
 /*!
  * \brief tvm intrinsics for deallocating tensor memory
  *
  * tmem_deallocate(tmem_buffer)
  *
  */
-const Op &ptx_deallocate_tensor_memory();
+TVM_DLL const Op &ptx_deallocate_tensor_memory();
 
 /*!
  * \brief tvm intrinsics for ldmatrix
@@ -398,6 +420,24 @@ TVM_DLL const Op &tl_gemm_sp();
  */
 TVM_DLL const Op &tl_shuffle_elect();
 
+/*!
+ * \brief tilelang intrinsic for initializing a descriptor buffer for
+ * wgmma/utcmma.
+ *
+ *  This op is used to represent a descriptor initialization operation in
+ * tilelang.
+ */
+TVM_DLL const Op &initialize_descriptor();
+
+/*!
+ * \brief tilelang intrinsic for increasing the offset of a descriptor
+ * buffer for wgmma/utcmma.
+ *
+ *  This op is used to represent a descriptor offset increase operation in
+ * tilelang.
+ */
+TVM_DLL const Op &increase_descriptor_offset();
+
 } // namespace tl
 } // namespace tvm
 

diff --git a/src/op/gemm.cc b/src/op/gemm.cc
@@ -109,7 +109,7 @@ GetTCGEN5MMAMeta(int M, int N, int K, DataType ab_dtype, DataType c_dtype) {
  * @param vmap Mapping from access pointer vars to Buffer objects used to
  *   resolve the Buffer corresponding to each pointer argument.
  *
- * @note If `kPack` is provided it must be 1 or 2; otherwise the constructor
+ * @note If `kPack` is provided it must be 1; otherwise the constructor
  *       fails with an ICHECK (runtime assertion). No other validation is
  *       performed here.
  */
@@ -670,7 +670,7 @@ LayoutMap GemmNode::InferLayout(const LayoutInferArgs &T,
       int dim_A = A->shape.size();
       results.Set(A, makeGemmVoltaABLayout(*as_const_int(A->shape[dim_A - 2]),
                                            *as_const_int(A->shape[dim_A - 1]),
-                                           true, trans_A ? 1 : 2));
+                                           true, !trans_A));
     } else if (A.scope() == "local.fragment") {
       ICHECK(trans_A == false);
       auto fragment = makeGemmVoltaFragmentA(M, N, K, M / warp_m, N / warp_n);
@@ -683,7 +683,7 @@ LayoutMap GemmNode::InferLayout(const LayoutInferArgs &T,
     int dim_B = B->shape.size();
     results.Set(B, makeGemmVoltaABLayout(*as_const_int(B->shape[dim_B - 2]),
                                          *as_const_int(B->shape[dim_B - 1]),
-                                         false, trans_B ? 2 : 1));
+                                         false, trans_B));
   } else if (TargetIsAmpere(T.target) || TargetIsTuring(T.target) ||
              TargetIsSM120(T.target) ||
              (TargetIsSm100(T.target) && gemm_inst == GemmInst::kMMA)) {
@@ -700,7 +700,7 @@ LayoutMap GemmNode::InferLayout(const LayoutInferArgs &T,
       const int64_t mat_continuous = *as_const_int(A->shape[dim_A - 1]);
       results.Set(A,
                   makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
-                                   A->dtype.bits(), trans_A ? 1 : 2));
+                                   A->dtype.bits(), !trans_A));
     } else if (A.scope() == "local.fragment") {
       auto fragment = makeGemmFragmentA(M, N, K, M / warp_m, N / warp_n,
                                         A->dtype.bits(), trans_A);
@@ -714,7 +714,7 @@ LayoutMap GemmNode::InferLayout(const LayoutInferArgs &T,
       const int64_t mat_continuous = *as_const_int(B->shape[dim_B - 1]);
       results.Set(B,
                   makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
-                                   B->dtype.bits(), trans_B ? 2 : 1));
+                                   B->dtype.bits(), trans_B));
     } else if (B.scope() == "local.fragment") {
       auto fragment =
           makeGemmFragmentB(M, N, K, M / warp_m, N / warp_n, trans_B);
@@ -741,9 +741,9 @@ LayoutMap GemmNode::InferLayout(const LayoutInferArgs &T,
       auto ABLayout =
           gemm_inst == GemmInst::kWGMMA
               ? makeGemmABLayoutHopper(mat_stride, mat_continuous, continuity,
-                                       A->dtype.bits(), trans_A ? 1 : 2)
+                                       A->dtype.bits(), !trans_A)
               : makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
-                                 A->dtype.bits(), trans_A ? 1 : 2);
+                                 A->dtype.bits(), !trans_A);
       results.Set(A, ABLayout);
     } else {
       auto fragment = makeGemmFragmentA(M, N, K, M / warp_m, N / warp_n,
@@ -756,12 +756,14 @@ LayoutMap GemmNode::InferLayout(const LayoutInferArgs &T,
       const int64_t mat_continuous = *as_const_int(B->shape[dim_B - 1]);
       const int64_t continuity =
           trans_B ? mat_continuous : mat_continuous / warp_n;
+
+      DLOG(INFO) << "gemm_inst: " << static_cast<int>(gemm_inst) << ", trans_B: " << trans_B;
       auto ABLayout =
           gemm_inst == GemmInst::kWGMMA
               ? makeGemmABLayoutHopper(mat_stride, mat_continuous, continuity,
-                                       B->dtype.bits(), trans_B ? 2 : 1)
+                                       B->dtype.bits(), trans_B)
               : makeGemmABLayout(mat_stride, mat_continuous, mat_continuous,
-                                 B->dtype.bits(), trans_B ? 2 : 1);
+                                 B->dtype.bits(), trans_B);
       results.Set(B, ABLayout);
     } else {
       auto fragment =