91 changes: 91 additions & 0 deletions examples/gemm_sm100/gemm_tcgen5mma_ws.py
@@ -0,0 +1,91 @@
# Non-persistent, 1-SM GEMM

import torch
import tilelang
import tilelang.language as T
from tilelang.profiler import do_bench


@tilelang.jit
def gemm(A, B, block_M, block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages, use_tma_store=True):
    M, N, K = T.const("M, N, K")

    k_iters = T.ceildiv(K, block_K)

    A: T.Tensor[[M, K], in_dtype]
    B: T.Tensor[[K, N], in_dtype]
    C = T.empty((M, N), out_dtype)

    with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
        A_shared = T.alloc_shared((num_stages, block_M, block_K), in_dtype)
        B_shared = T.alloc_shared((num_stages, block_K, block_N), in_dtype)
        C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype)
        C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
        C_shared = T.alloc_shared((block_M, block_N), out_dtype)
        C_local_cast = T.alloc_fragment((block_M, block_N), out_dtype)
        loaded = T.alloc_barrier([32] * num_stages)
        consumed = T.alloc_barrier([1] * num_stages)
        tmem_full = T.alloc_barrier([1])

        tx = T.get_thread_binding()

        T.use_swizzle(8)

        if tx < 32:  # warp 0: issue tma
            for k in T.serial(k_iters):
                T.mbarrier_wait_parity(consumed[k % num_stages], ((k // num_stages) & 1) ^ 1)
                T.copy(A[by * block_M : (by + 1) * block_M, k * block_K : (k + 1) * block_K], A_shared[k % num_stages, :, :])
                T.copy(B[k * block_K : (k + 1) * block_K, bx * block_N : (bx + 1) * block_N], B_shared[k % num_stages, :, :])
Comment on lines 13 to 38
Contributor
⚠️ Potential issue | 🟠 Major

Add explicit precondition to guard against non‑divisible M/N/K dimensions.
The kernel uses T.ceildiv for grid dimensions, and line 68 copies C_shared (shape [block_M, block_N]) to C[by * block_M, bx * block_N] without bounds checking. If M, N, or K is not an exact multiple of its block size, the last thread blocks will write out of bounds. The example passes only because M = N = K = 8192 are exact multiples of the block sizes. Add an assertion to enforce this precondition, as is done in other GEMM kernels in the codebase.

🛡️ Suggested guard (adjust to TileLang-supported assertion APIs)
 def gemm(
     A,
     B,
@@
 ):
     M, N, K = T.const("M, N, K")
+    # Kernel requires exact tile coverage to avoid OOB access at line 68.
+    assert (
+        M % block_M == 0
+        and N % block_N == 0
+        and K % block_K == 0
+    ), "M/N/K must be divisible by block_M/block_N/block_K"
🤖 Prompt for AI Agents
In `@examples/gemm_sm100/gemm_tcgen5mma_ws.py` around lines 23-47, add an explicit precondition that M, N, and K are exact multiples of block_M, block_N, and block_K respectively to prevent out-of-bounds writes when the kernel (which computes k_iters via T.ceildiv and writes C_shared -> C at by*block_M, bx*block_N) runs. Insert an assertion near the top of the function, before the T.Kernel block (where k_iters, A/B/C and the shared buffers are set up), that checks M % block_M == 0, N % block_N == 0, and K % block_K == 0 and fails early if not, mirroring the guards used in other GEMM kernels.
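
To make the reviewed failure mode concrete, here is a small worked check with hypothetical sizes (not the example's defaults):

```python
# Hypothetical sizes illustrating the OOB risk flagged above.
M, block_M = 100, 64

m_tiles = -(-M // block_M)  # ceildiv(100, 64) == 2 tiles along M
last_tile_end = m_tiles * block_M  # second tile covers rows [64, 128)

assert last_tile_end > M  # rows 100..127 fall outside C
# The suggested guard rejects such shapes up front:
# assert M % block_M == 0, "M must be divisible by block_M"
```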

                T.mbarrier_arrive(loaded[k % num_stages])
        elif tx < 64:  # warp 1: issue tcgen5
            for k in T.serial(k_iters):
                T.mbarrier_wait_parity(loaded[k % num_stages], (k // num_stages) & 1)
                T.gemm(
                    A_shared[k % num_stages, :, :],
                    B_shared[k % num_stages, :, :],
                    C_tmem,
                    mbar=consumed[k % num_stages],
                    wg_wait=-1,
                    clear_accum=k == 0,
                )
            T.tcgen05_mma_arrive(tmem_full)

        # Wait for all tcgen5 to finish
        T.mbarrier_wait_parity(tmem_full, 0)

        T.sync_threads()  # TileLang won't generate this if not annotated
        T.copy(C_tmem, C_local)
        if use_tma_store:
            T.copy(C_local, C_shared)
            T.copy(C_shared, C[by * block_M, bx * block_N])
        else:
            T.copy(C_local, C_local_cast)
            T.copy(C_local_cast, C[by * block_M, bx * block_N])
    return C


def main():
    M, N, K = 8192, 8192, 8192
    block_M, block_N, block_K = 128, 256, 64
    in_dtype, out_dtype, accum_dtype = T.bfloat16, T.bfloat16, T.float
    num_stages = 4

    a = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
    b = torch.randn(K, N, device="cuda", dtype=torch.bfloat16)
    c = gemm(a, b, block_M, block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages)
    print(gemm.get_kernel_source(a, b, block_M, block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages))

    ref_c = (a.to(torch.float) @ b.to(torch.float)).to(torch.bfloat16)
    torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2)
    print("All checks passed. ✅")

    tl_latency = do_bench(lambda: gemm(a, b, block_M, block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages), backend="cupti")
    torch_latency = do_bench(lambda: a @ b, backend="cupti")
    print(f"Tilelang latency: {tl_latency} ms")
    print(f"Flops: {2 * M * N * K / (tl_latency / 1e3) / 1e12} TFLOPS")
    print(f"Torch latency: {torch_latency} ms")
    print(f"Flops: {2 * M * N * K / (torch_latency / 1e3) / 1e12} TFLOPS")


if __name__ == "__main__":
    main()
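
A note on the barrier protocol in this example: warp 0 (the TMA producer) waits on `consumed` before overwriting a pipeline stage, warp 1 (the MMA consumer) waits on `loaded` before reading it, and the parity each side waits on flips every time the stage index wraps past num_stages. A plain-Python model of the phase arithmetic (illustrative only, not TileLang API):

```python
# Model of the producer/consumer parity scheme used by the kernel above.
num_stages, k_iters = 4, 10

for k in range(k_iters):
    stage = k % num_stages         # shared-memory slot for this iteration
    phase = (k // num_stages) & 1  # flips each time the slots wrap around

    producer_wait = phase ^ 1  # warp 0: wait until the slot was drained
    consumer_wait = phase      # warp 1: wait until the slot was filled

    print(f"k={k}: stage={stage} producer_parity={producer_wait} consumer_parity={consumer_wait}")
```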
154 changes: 154 additions & 0 deletions examples/gemm_sm100/gemm_tcgen5mma_ws_persistent.py
@@ -0,0 +1,154 @@
# Persistent, 1-SM, num_epi_stages = 2

import torch
import tilelang
import tilelang.language as T
from tilelang.carver.arch import driver
from tilelang.profiler import do_bench


@tilelang.jit(pass_configs={tilelang.PassConfigKey.TL_DISABLE_LOOP_UNSWITCHING: True})
def gemm(
    A,
    B,
    block_M,
    block_N,
    store_block_N,  # block_N for C_shared
    block_K,
    in_dtype,
    out_dtype,
    accum_dtype,
    num_stages,
    use_tma_store=True,
):
    M, N, K = T.const("M, N, K")

    A: T.Tensor[[M, K], in_dtype]
    B: T.Tensor[[K, N], in_dtype]
    C = T.empty((M, N), out_dtype)

    sm_num = driver.get_num_sms()
    m_blocks = T.ceildiv(M, block_M)
    n_blocks = T.ceildiv(N, block_N)
    assert K % (2 * block_K) == 0  # for simplicity
Comment on lines +31 to +33
Contributor
⚠️ Potential issue | 🟠 Major

Add tiling precondition checks to prevent OOB loads/stores.

The kernel loads/stores full tiles without boundary masking. If M or N aren't multiples of block_M or block_N, the slices on lines 63 and 65 will read past array bounds. Similarly, on line 119, if store_block_N doesn't evenly divide block_N, the loop will access out-of-bounds memory in C_local. While the test case uses compatible parameters (8192 with block size 128/256, store_block_N 128), the kernel interface accepts these as runtime parameters, allowing unsafe calls.

💡 Suggested guardrails
-    assert K % (2 * block_K) == 0  # for simplicity
+    assert K % (2 * block_K) == 0, "K must be divisible by 2 * block_K"
+    assert M % block_M == 0 and N % block_N == 0, "M/N must be multiples of block_M/block_N"
+    assert store_block_N <= block_N and block_N % store_block_N == 0, "store_block_N must evenly divide block_N"
🤖 Prompt for AI Agents
In `@examples/gemm_sm100/gemm_tcgen5mma_ws_persistent.py` around lines 31-33, add explicit tiling precondition checks to prevent out-of-bounds tile loads/stores: validate that M % block_M == 0 and N % block_N == 0 so the full-tile slices used when loading and storing (referencing m_blocks and n_blocks) stay in bounds, and also assert that block_N % store_block_N == 0 to prevent OOB access into C_local when iterating store_block_N across a block_N tile. Implement these as early assertions/argument checks (or raise clear exceptions) before any tensor slicing or the compute loop, so callers supplying M, N, block_M, block_N, and store_block_N cannot trigger OOB accesses.
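
A quick script (hypothetical parameter sets) shows which calls the suggested guardrails would reject:

```python
# Checks mirroring the suggested guardrails, applied to hypothetical shapes.
def tiling_ok(M, N, K, block_M, block_N, block_K, store_block_N):
    return (K % (2 * block_K) == 0
            and M % block_M == 0
            and N % block_N == 0
            and store_block_N <= block_N
            and block_N % store_block_N == 0)

# The example's defaults pass:
assert tiling_ok(8192, 8192, 8192, 128, 256, 64, 128)
# A store_block_N that does not divide block_N would read past C_local:
assert not tiling_ok(8192, 8192, 8192, 128, 256, 64, 96)
```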

    k_blocks = T.ceildiv(K, block_K)
    waves = T.ceildiv(m_blocks * n_blocks, sm_num)
    group_size = 8

    with T.Kernel(sm_num, threads=256) as (block_id):
        A_shared = T.alloc_shared((num_stages, block_M, block_K), in_dtype)
        B_shared = T.alloc_shared((num_stages, block_K, block_N), in_dtype)
        C_tmem_0 = T.alloc_tmem([block_M, block_N], accum_dtype)
        C_tmem_1 = T.alloc_tmem([block_M, block_N], accum_dtype)
        C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
        C_local_cast = T.alloc_fragment((block_M, block_N), out_dtype)
        C_shared = T.alloc_shared((block_M, store_block_N), out_dtype)
        loaded = T.alloc_barrier([32] * num_stages)
        consumed = T.alloc_barrier([1] * num_stages)
        tmem_full = T.alloc_barrier([1] * 2)
        tmem_empty = T.alloc_barrier([128] * 2)

        tx = T.get_thread_binding()

        if tx < 32:  # warp 0: issue tma
            for w in T.unroll(waves):
                tile_id = sm_num * w + block_id
                bx = (tile_id // group_size) % m_blocks
                by = (tile_id % group_size) + (tile_id // group_size) // m_blocks * group_size

                if bx * block_M < M and by * block_N < N:
                    for k in T.serial(k_blocks):
                        T.mbarrier_wait_parity(consumed[k % num_stages], ((k // num_stages) & 1) ^ 1)
                        T.copy(
                            A[bx * block_M : (bx + 1) * block_M, k * block_K : (k + 1) * block_K], A_shared[k % num_stages, :, :]
                        )  # cannot use BufferLoad here
                        T.copy(B[k * block_K : (k + 1) * block_K, by * block_N : (by + 1) * block_N], B_shared[k % num_stages, :, :])
                        T.mbarrier_arrive(loaded[k % num_stages])

        elif tx < 64:  # warp 1: issue tcgen5
            for w in T.unroll(waves):
                tile_id = sm_num * w + block_id
                bx = (tile_id // group_size) % m_blocks
                by = (tile_id % group_size) + (tile_id // group_size) // m_blocks * group_size

                if bx * block_M < M and by * block_N < N:
                    T.mbarrier_wait_parity(tmem_empty[w & 1], ((w // 2) & 1) ^ 1)
                    for k in T.serial(k_blocks):
                        T.mbarrier_wait_parity(loaded[k % num_stages], (k // num_stages) & 1)
                        if w & 1 == 0:
                            T.gemm(
                                A_shared[k % num_stages, :, :],
                                B_shared[k % num_stages, :, :],
                                C_tmem_0,
                                False,
                                False,
                                mbar=consumed[k % num_stages],
                                wg_wait=-1,
                                clear_accum=k == 0,
                            )
                        else:
                            T.gemm(
                                A_shared[k % num_stages, :, :],
                                B_shared[k % num_stages, :, :],
                                C_tmem_1,
                                False,
                                False,
                                mbar=consumed[k % num_stages],
                                wg_wait=-1,
                                clear_accum=k == 0,
                            )
                    T.tcgen05_mma_arrive(tmem_full[w & 1])

        elif 128 <= tx < 256:  # warp 4~7: epilogue
            for w in T.unroll(waves):
                tile_id = sm_num * w + block_id
                bx = (tile_id // group_size) % m_blocks
                by = (tile_id % group_size) + (tile_id // group_size) // m_blocks * group_size

                if bx * block_M < M and by * block_N < N:
                    T.mbarrier_wait_parity(tmem_full[w & 1], (w // 2) & 1)
                    T.sync_threads(1, 128)
                    if (w & 1) == 0:
                        T.copy(C_tmem_0, C_local)
                    else:
                        T.copy(C_tmem_1, C_local)
                    T.mbarrier_arrive(tmem_empty[w & 1])

                    if use_tma_store:
                        for i in T.unroll(T.ceildiv(block_N, store_block_N)):
                            T.copy(C_local[:, i * store_block_N : (i + 1) * store_block_N], C_shared)
                            T.copy(C_shared, C[bx * block_M, by * block_N + i * store_block_N])
                    else:
                        T.copy(C_local, C_local_cast)
                        T.copy(C_local_cast, C[bx * block_M, by * block_N])
    return C


def main():
    M, N, K = 8192, 8192, 8192
    block_M, block_N, block_K = 128, 256, 64
    store_block_N = 128
    in_dtype, out_dtype, accum_dtype = T.bfloat16, T.bfloat16, T.float
    num_stages = 4

    a = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
    b = torch.randn(K, N, device="cuda", dtype=torch.bfloat16)
    print(gemm.get_kernel_source(a, b, block_M, block_N, store_block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages))
    c = gemm(a, b, block_M, block_N, store_block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages)

    ref_c = (a.to(torch.float) @ b.to(torch.float)).to(torch.bfloat16)
    torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2)
    print("All checks passed. ✅")

    tl_latency = do_bench(
        lambda: gemm(a, b, block_M, block_N, store_block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages), backend="cupti"
    )
    torch_latency = do_bench(lambda: a @ b, backend="cupti")
    print(f"Tilelang latency: {tl_latency} ms")
    print(f"Flops: {2 * M * N * K / (tl_latency / 1e3) / 1e12} TFLOPS")
    print(f"Torch latency: {torch_latency} ms")
    print(f"Flops: {2 * M * N * K / (torch_latency / 1e3) / 1e12} TFLOPS")


if __name__ == "__main__":
    main()
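
The grouped rasterization above (`group_size = 8`) hands out output tiles so that consecutive `tile_id`s touch neighboring columns, which helps L2 reuse. A plain-Python model of the same mapping on a hypothetical small grid confirms every tile is produced exactly once, with out-of-range candidates filtered by the `bx`/`by` bounds guard:

```python
# Plain-Python model of the persistent tile schedule used above.
m_blocks, n_blocks, group_size = 2, 8, 8   # hypothetical small grid
sm_num, waves = 4, 4                       # sm_num * waves >= m_blocks * n_blocks

seen = []
for block_id in range(sm_num):
    for w in range(waves):
        tile_id = sm_num * w + block_id
        bx = (tile_id // group_size) % m_blocks
        by = (tile_id % group_size) + (tile_id // group_size) // m_blocks * group_size
        if bx < m_blocks and by < n_blocks:  # mirrors the bounds guard
            seen.append((bx, by))

assert sorted(seen) == [(bx, by) for bx in range(m_blocks) for by in range(n_blocks)]
```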
11 changes: 3 additions & 8 deletions src/op/gemm.cc
@@ -82,11 +82,7 @@ Gemm::Gemm(Array<PrimExpr> args, Map<String, ObjectRef> annotations) {
   }
   if (args.size() > 16) {
     if (const auto *load = args[16].as<BufferLoadNode>()) {
-      node->mbarRegion_ =
-          NormalizeToBufferRegion(Downcast<BufferLoad>(args[16]));
-      node->mbar_ = node->mbarRegion_->buffer;
-    } else {
-      node->mbar_ = std::nullopt;
+      node->mbar_ = Downcast<BufferLoad>(args[16]);
     }
Comment on lines 83 to 86
Contributor
⚠️ Potential issue | 🟡 Minor

Validate mbar arg type instead of silently dropping it.

If args[16] is non-null but not a BufferLoad, this currently becomes nullopt and can hide frontend regressions (e.g., still passing a region). Consider an explicit check to fail fast.

🔧 Suggested guard
   if (args.size() > 16) {
-    if (const auto *load = args[16].as<BufferLoadNode>()) {
-      node->mbar_ = Downcast<BufferLoad>(args[16]);
-    } else {
-      node->mbar_ = std::nullopt;
-    }
+    const auto *load = args[16].as<BufferLoadNode>();
+    ICHECK(load || !args[16].defined())
+        << "mbar must be a BufferLoad or null when provided";
+    node->mbar_ =
+        load ? std::optional<BufferLoad>(Downcast<BufferLoad>(args[16]))
+             : std::nullopt;
   }
🤖 Prompt for AI Agents
In `@src/op/gemm.cc` around lines 83-88, the code silently leaves node->mbar_ unset when args[16] exists but is not a BufferLoadNode; instead, validate the type and fail fast: when args.size() > 16 and args[16] is non-null, check that args[16].as<BufferLoadNode>() is non-null and, if it isn't, raise an explicit error (e.g., TVM_PANIC/LOG(FATAL)/ICHECK/throw) describing the unexpected arg type for mbar_; otherwise keep the existing Downcast<BufferLoad>(args[16]) assignment to node->mbar_.
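
On the frontend side, the same fail-fast idea could be applied before the argument reaches the C++ op. A hedged sketch (this helper is hypothetical, not existing TileLang code; `tir.BufferLoad` is a real TVM class):

```python
from tvm import tir

def check_mbar_arg(mbar):
    """Hypothetical frontend guard mirroring the suggested ICHECK."""
    if mbar is None:
        return None  # mbar is optional; absent is fine
    if isinstance(mbar, tir.BufferLoad):
        return mbar  # e.g. consumed[k % num_stages] in the examples above
    raise TypeError(f"mbar must be a BufferLoad or None, got {type(mbar)!r}")
```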

   }
   node->cCoords_ = Array<PrimExpr>(
@@ -461,7 +457,7 @@ Stmt GemmNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {
   ICHECK(can_use_tcgen5mma);
   ICHECK(b_.scope() == "shared.dyn" || b_.scope() == "shared");
   ICHECK(c_.scope() == "shared.tmem");
-  ICHECK(mbar_.has_value()) << "mbar must be provided for TCGEN5MMA";
+  ICHECK(mbar_.defined()) << "mbar must be provided for TCGEN5MMA";
   if (a_.scope() == "shared.tmem") {
     op_name = "tl::tcgen5mma_gemm_ts";
   } else if (a_.scope() == "shared.dyn" || a_.scope() == "shared") {
@@ -492,8 +488,7 @@ Stmt GemmNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {

   auto C_buffer = T.buffer_remap.count(c_) ? T.buffer_remap[c_] : c_;
   Array<PrimExpr> new_args;
-  auto mbarPtr =
-      MakeAccessPtrFromRegion(mbarRegion_, /*rw*/ 3, /*require_2d*/ true);
+  auto mbarPtr = MakeAccessPtrFromBufferLoad(mbar_, /*rw*/ 3);
   new_args.push_back(StringImm(ss.str()));
   new_args.push_back(Aptr);
   new_args.push_back(Bptr);
5 changes: 3 additions & 2 deletions src/op/gemm.h
@@ -130,8 +130,7 @@ class GemmNode : public TileOperatorNode {
   // only will be enabled under cdna mfma instructions
   int kPack_ = 1;
   int wgWait_ = 0;
-  BufferRegion mbarRegion_;
-  std::optional<tir::Buffer> mbar_; // mbar is optional, only used for TCGEN5MMA
+  tir::BufferLoad mbar_; // mbar is optional, only used for TCGEN5MMA
   Array<PrimExpr> cCoords_;
   mutable GemmWarpPolicy policy_;
   TVM_FFI_DECLARE_OBJECT_INFO_FINAL("tl.Gemm", GemmNode, TileOperatorNode);
@@ -157,6 +156,8 @@ class GemmNode : public TileOperatorNode {
         .def_ro("clearAccum", &GemmNode::clearAccum_)
         .def_ro("kPack", &GemmNode::kPack_)
         .def_ro("wgWait", &GemmNode::wgWait_)
+        .def_ro("mbar", &GemmNode::mbar_)
+        .def_ro("cCoords", &GemmNode::cCoords_)
         .def_ro("policy", &GemmNode::policy_);
   }
4 changes: 1 addition & 3 deletions src/op/gemm_py.cc
@@ -82,9 +82,7 @@ GemmPy::GemmPy(Array<PrimExpr> args, Map<String, ObjectRef> annotations) {
   }
   if (args.size() > 16) {
     if (const auto *load = args[16].as<BufferLoadNode>()) {
-      node->mbarRegion_ =
-          NormalizeToBufferRegion(Downcast<BufferLoad>(args[16]));
-      node->mbar_ = node->mbarRegion_->buffer;
+      node->mbar_ = Downcast<BufferLoad>(args[16]);
     }
Comment on lines 83 to 86
Contributor
⚠️ Potential issue | 🟡 Minor

Fail fast when mbar is present but not a BufferLoad.

Silently ignoring non-BufferLoad inputs can mask call-site errors and lead to a missing barrier later; an explicit check makes failures clearer.

💡 Suggested fix
-  if (args.size() > 16) {
-    if (const auto *load = args[16].as<BufferLoadNode>()) {
-      node->mbar_ = Downcast<BufferLoad>(args[16]);
-    }
-  }
+  if (args.size() > 16) {
+    ICHECK(args[16].as<BufferLoadNode>())
+        << "mbar must be provided as a BufferLoad when present";
+    node->mbar_ = Downcast<BufferLoad>(args[16]);
+  }
🤖 Prompt for AI Agents
In `@src/op/gemm_py.cc` around lines 83-86, when handling the optional mbar argument in the arg-parsing block, don't silently ignore non-BufferLoad values: if args.size() > 16, attempt the BufferLoadNode cast as currently done (check for BufferLoadNode and set node->mbar_ via Downcast<BufferLoad>), but add an else branch that fails fast (throw or LOG(FATAL)/CHECK), reporting that arg 16 was expected to be a BufferLoad and including the actual argument's type/name (use whatever runtime type introspection is available on the Expr/Node), so callers get a clear error instead of silently skipping mbar_.

   }
   node->cCoords_ = Array<PrimExpr>(
4 changes: 1 addition & 3 deletions src/op/gemm_py.h
@@ -29,8 +29,7 @@ class GemmPyNode : public TileOperatorNode {
   int strideA_, strideB_;
   int offsetA_, offsetB_;
   PrimExpr clearAccum_ = const_false();
-  BufferRegion mbarRegion_;
-  tir::Buffer mbar_; // mbar is optional, only used for TCGEN5MMA
+  tir::BufferLoad mbar_; // mbar is optional, only used for TCGEN5MMA
Contributor
⚠️ Potential issue | 🟠 Major

Resolve "optional" mismatch for mbar_ (type-contract inconsistency).

mbar_ is declared as non-optional tir::BufferLoad, but the comment marks it as optional and initialization is conditional (line 85: only assigned if args.size() > 16 and args[16] is a BufferLoadNode). This creates a type-contract mismatch: the field can remain uninitialized while the type signature suggests it's always present.

Suggestion: Make this std::optional<tir::BufferLoad> to match the semantic intent, or guarantee initialization and remove the optional comment. Verify Python-side code handles undefined mbar_ safely.

🤖 Prompt for AI Agents
In `@src/op/gemm_py.h` at line 32, mbar_ is declared as a non-optional
tir::BufferLoad but is only conditionally assigned (when args.size() > 16 and
args[16] is a BufferLoadNode), causing a type-contract mismatch; change the
field declaration from tir::BufferLoad mbar_ to std::optional<tir::BufferLoad>
mbar_, update the parser/initializer (where args is inspected) to emplace/assign
mbar_ only in the conditional branch, and adjust any uses of mbar_ (check
has_value() or use value_or) so code and the Python bindings safely handle the
absent case; alternatively, if you prefer non-optional, ensure mbar_ is
unconditionally initialized in the same constructor code path and remove the
"optional" comment.

   Array<PrimExpr> cCoords_;
   // k_pack please ref to bitblas/tl/mfma_macro_generator.py::k_pack
   // only will be enabled under cdna mfma instructions
@@ -59,7 +58,6 @@ class GemmPyNode : public TileOperatorNode {
.def_ro("offsetA", &GemmPyNode::offsetA_)
.def_ro("offsetB", &GemmPyNode::offsetB_)
.def_ro("clearAccum", &GemmPyNode::clearAccum_)
.def_ro("mbarRegion", &GemmPyNode::mbarRegion_)
.def_ro("mbar", &GemmPyNode::mbar_)
.def_ro("cCoords", &GemmPyNode::cCoords_)
.def_ro("kPack", &GemmPyNode::kPack_)
Expand Down
30 changes: 30 additions & 0 deletions src/op/utils.cc
@@ -103,6 +103,36 @@ PrimExpr MakeAccessPtrFromRegion(const BufferRegion &region, int rw_mask,
   return Call(DataType::Handle(), builtin::tvm_access_ptr(), acc_args);
 }
 
+PrimExpr MakeAccessPtrFromBufferLoad(const BufferLoad &load, int rw_mask) {
+  Buffer buf = load->buffer;
+  int ndim = static_cast<int>(buf->shape.size());
+
+  // Compute offset using row-major layout (iterate in reverse)
+  PrimExpr offset = 0;
+  PrimExpr stride = 1;
+
+  for (int i = ndim - 1; i >= 0; --i) {
+    const PrimExpr &index = load->indices[i];
+    if (const auto *ramp = index.as<RampNode>()) {
+      // For Ramp, use the base
+      offset = offset + ramp->base * stride;
+    } else {
+      // For scalar index (IntImm or other PrimExpr)
+      offset = offset + index * stride;
+    }
+    stride = stride * buf->shape[i];
+  }
+
+  // Extent is 1 element for a single BufferLoad access
+  PrimExpr extent = make_const(DataType::Int(32), 1);
+
+  // Build access_ptr
+  PrimExpr ptype = tir::TypeAnnotation(buf->dtype);
+  Array<PrimExpr> acc_args{ptype, buf->data, offset, extent,
+                           IntImm(DataType::Int(32), rw_mask)};
+  return Call(DataType::Handle(), builtin::tvm_access_ptr(), acc_args);
Comment on lines 106 to 133
Contributor
⚠️ Potential issue | 🟠 Major

Avoid int32 offset/stride to prevent overflow on large buffers.

DataType::Int(32) can overflow for large shapes and diverges from MakeAccessPtrFromRegion. Use the buffer index dtype for offset/stride/extent.

💡 Suggested fix
-  // Compute offset using row-major layout (iterate in reverse)
-  PrimExpr offset = 0;
-  PrimExpr stride = 1;
+  // Compute offset using row-major layout (iterate in reverse)
+  DataType idx_dtype = buf->shape[0].dtype();
+  PrimExpr offset = make_const(idx_dtype, 0);
+  PrimExpr stride = make_const(idx_dtype, 1);
@@
-  // Extent is 1 element for a single BufferLoad access
-  PrimExpr extent = make_const(DataType::Int(32), 1);
+  // Extent is 1 element for a single BufferLoad access
+  PrimExpr extent = make_const(idx_dtype, 1);
🤖 Prompt for AI Agents
In `@src/op/utils.cc` around lines 95-122, the function MakeAccessPtrFromBufferLoad uses hard-coded int32 constants for offset, stride and extent, which can overflow for large buffers; change these (and the IntImm for rw_mask) to use the buffer's index dtype: initialize offset and stride with make_const(idx_dtype, 0/1), compute the offset/stride arithmetic in that dtype, set extent using make_const(idx_dtype, 1), and construct the rw_mask as an IntImm of the same dtype when building acc_args; update the references inside MakeAccessPtrFromBufferLoad (offset, stride, extent, acc_args) accordingly.
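
The overflow concern is easy to see with concrete numbers; here is a plain-Python model of the same row-major linearization on a hypothetical large buffer:

```python
# Row-major linearization as computed in MakeAccessPtrFromBufferLoad.
def flat_offset(indices, shape):
    offset, stride = 0, 1
    for i in reversed(range(len(shape))):  # innermost dimension first
        offset += indices[i] * stride
        stride *= shape[i]
    return offset

shape = (65536, 65536)  # hypothetical large 2-D buffer
off = flat_offset((65535, 65535), shape)
assert off == 2**32 - 1  # exceeds 2**31 - 1, i.e. overflows a signed int32
```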

+}
+
 // Maps TVM DataType to CUDA's CUtensorMapDataType enum value.
 int to_CUtensorMapDataType(DataType dtype) {
   CUtensorMapDataType tp;
4 changes: 4 additions & 0 deletions src/op/utils.h
@@ -43,6 +43,10 @@ TVM_DLL BufferRegion NormalizeToBufferRegion(const PrimExpr &arg);
 TVM_DLL PrimExpr MakeAccessPtrFromRegion(const BufferRegion &region,
                                          int rw_mask, bool require_2d = false);
 
+// Build a tvm_access_ptr(handle) from a BufferLoad.
+TVM_DLL PrimExpr MakeAccessPtrFromBufferLoad(const BufferLoad &load,
+                                             int rw_mask);
+
 // Check if a buffer is a fragment buffer (scope == "local.fragment")
 inline bool IsFragmentBuffer(const Buffer &buffer) {
   return buffer.defined() && buffer.scope() == "local.fragment";
10 changes: 6 additions & 4 deletions tilelang/language/builtin.py
@@ -805,15 +805,17 @@ def cp_async_barrier_noinc(barrier: BarrierType):
     return tir.call_intrin("handle", tir.op.Op.get("tl.ptx_cp_async_barrier_noinc"), barrier)
 
 
-def tcgen05_mma_arrive(mbar_ptr):
+def tcgen05_mma_arrive(mbar: tir.Buffer | BufferLoad | PrimExpr):
     """Signal UMMA (TCGEN05) barrier arrival for a shared-memory mbarrier pointer.
 
     Parameters
     ----------
-    mbar_ptr : PrimExpr
-        Pointer to the mbarrier object in shared memory (e.g., Barrier*).
+    mbar: tir.Buffer | BufferLoad | PrimExpr
+        The mbarrier object in shared memory (e.g., Barrier*) or its address.
     """
-    return tir.call_intrin("void", tir.op.Op.get("tl.tcgen05_mma_arrive"), mbar_ptr)
+    if isinstance(mbar, (tir.Buffer, BufferLoad)):
+        mbar = retrieve_ptr(mbar, access_type="rw")
+    return tir.call_intrin("void", tir.op.Op.get("tl.tcgen05_mma_arrive"), mbar)


 def ptx_mma_sm70(
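
Both accepted object forms appear in the examples above: the non-persistent kernel passes the whole barrier buffer (`T.tcgen05_mma_arrive(tmem_full)`), while the persistent kernel passes a single element (`T.tcgen05_mma_arrive(tmem_full[w & 1])`). An illustrative fragment of the two call forms (to be placed inside a `T.Kernel` body; not a standalone program):

```python
bar = T.alloc_barrier([1] * 2)
T.tcgen05_mma_arrive(bar[0])  # BufferLoad: one barrier element

single = T.alloc_barrier([1])
T.tcgen05_mma_arrive(single)  # tir.Buffer: whole single-element barrier
# A raw PrimExpr pointer is still passed through to the intrinsic unchanged.
```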