triton-lang · antiagainst · May 29, 2026 · May 26, 2026 · jungpark-mlir · May 28, 2026
@@ -1,4 +1,6 @@
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx1250 --convert-builtin-func-to-llvm | FileCheck %s --check-prefix=GFX1250
+// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx1250 --convert-builtin-func-to-llvm | FileCheck %s --enable-var-scope --check-prefix=GFX1250
+
+// GFX1250: [[$MMRA_TAG:#[A-Za-z0-9_]+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">
 
 #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
 #smem = #ttg.shared_memory
@@ -12,8 +14,9 @@ module attributes {"ttg.target" = "hip:gfx1250", "ttg.num-ctas" = 1 : i32, "ttg.
     // GFX1250-NEXT:   llvm.store %[[INIT_VAL1]], %[[ALLOC_PTR]] : i64, !llvm.ptr<3>
     // GFX1250-NEXT:   llvm.br ^[[BB1]]
     // GFX1250-NEXT: ^[[BB1]]:
-    // GFX1250-NEXT:   rocdl.s.wait.dscnt 0
+    // GFX1250-NEXT:   llvm.fence syncscope("workgroup") release {llvm.mmra = [[$MMRA_TAG]]}
     // GFX1250-NEXT:   rocdl.s.barrier
+    // GFX1250-NEXT:   llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$MMRA_TAG]]}
     // GFX1250-NEXT:   llvm.return
     amdg.init_barrier %alloc, 2 : !ttg.memdesc<1xi64, #shared, #smem, mutable>
     tt.return

@@ -1,7 +1,39 @@
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx942 --convert-builtin-func-to-llvm | FileCheck %s --check-prefixes=CHECK,COMMON
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx950 | FileCheck %s --check-prefixes=GFX950,COMMON
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx1250 | FileCheck %s --check-prefixes=GFX1250,COMMON
-// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx906 | FileCheck %s --check-prefixes=GFX906,COMMON
+// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx942 --convert-builtin-func-to-llvm | FileCheck %s --enable-var-scope --check-prefixes=CHECK,COMMON
+// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx950 | FileCheck %s --enable-var-scope --check-prefixes=GFX950,COMMON
+// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx1250 | FileCheck %s --enable-var-scope --check-prefixes=GFX1250,COMMON
+// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx906 | FileCheck %s --enable-var-scope --check-prefixes=GFX906,COMMON
+
+// COMMON-DAG: [[$LOCAL_MMRA_TAG:#[A-Za-z0-9_]+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">
+// COMMON-DAG: [[$GLOBAL_MMRA_TAG:#[A-Za-z0-9_]+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"global">
+
+// Verifies the fence / s.barrier / fence sequence expected for each
+// ttg.barrier address-space combination exercised below.
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+  // COMMON-LABEL: lower_barrier
+  tt.func @lower_barrier() {
+    // Local-only barrier; both fences are expected to carry the "local" tag.
+    // COMMON: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$LOCAL_MMRA_TAG]]}
+    // COMMON-NEXT: rocdl.s.barrier
+    // COMMON-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$LOCAL_MMRA_TAG]]}
+    ttg.barrier local
+
+    // Global-read barrier; both fences are expected to carry the "global" tag.
+    // COMMON: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$GLOBAL_MMRA_TAG]]}
+    // COMMON-NEXT: rocdl.s.barrier
+    // COMMON-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$GLOBAL_MMRA_TAG]]}
+    ttg.barrier global_read
+
+    // Global-write barrier; expected to use the same "global" tag as above.
+    // COMMON: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$GLOBAL_MMRA_TAG]]}
+    // COMMON-NEXT: rocdl.s.barrier
+    // COMMON-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$GLOBAL_MMRA_TAG]]}
+    ttg.barrier global_write
+
+    // Mixed local+global barrier; the end-of-line anchors require the fences
+    // to be printed without any trailing attribute dictionary (no MMRA tag).
+    // COMMON: llvm.fence syncscope("workgroup") release{{$}}
+    // COMMON-NEXT: rocdl.s.barrier
+    // COMMON-NEXT: llvm.fence syncscope("workgroup") acquire{{$}}
+    ttg.barrier local|global_read|global_write
+
+    tt.return
+  }
+}
+
+// -----
 
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: atomic_add_f32_scalar
@@ -14,8 +46,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
     // CHECK: llvm.atomicrmw
     // CHECK: llvm.store
     // CHECK: llvm.br
-    // CHECK: rocdl.s.waitcnt 49279
-    // CHECK: rocdl.s.barrier
+    // COMMON: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$LOCAL_MMRA_TAG]]}
+    // COMMON-NEXT: rocdl.s.barrier
+    // COMMON-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$LOCAL_MMRA_TAG]]}
     // CHECK: llvm.load
     // CHECK: llvm.store
     %0 = tt.atomic_rmw fadd, relaxed, gpu, %arg0, %arg2, %arg1 : (!tt.ptr<f32>, f32, i1) -> f32
@@ -609,8 +642,9 @@ module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.n
     // CHECK-NOT: llvm.store
     %0 = ttg.local_alloc %arg0 : (tensor<32x32xf16, #blocked>) -> !ttg.memdesc<32x32xf16, #shared, #smem, mutable>
     %1 = ttg.memdesc_subslice %0 [16, 0]  : !ttg.memdesc<32x32xf16, #shared, #smem, mutable> -> !ttg.memdesc<16x32xf16, #shared, #smem, mutable, 32x32>
-    // CHECK: rocdl.s.waitcnt
-    // CHECK-NEXT: rocdl.s.barrier
+    // COMMON: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$LOCAL_MMRA_TAG]]}
+    // COMMON-NEXT: rocdl.s.barrier
+    // COMMON-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$LOCAL_MMRA_TAG]]}
     // CHECK: %[[AFF_I8:.+]] = llvm.mul %{{.+}}, %[[SUBSLICE_CST2]] : i32
     // CHECK-NEXT: %[[AFF_SHR:.+]] = llvm.lshr %[[AFF_I8]], %[[SUBSLICE_CST6]] : i32
     // CHECK-NEXT: %[[AFF_SHL:.+]] = llvm.shl %[[AFF_SHR]], %[[SUBSLICE_CST3]] : i32

@@ -1,4 +1,6 @@
-// RUN: triton-opt %s -split-input-file --tritonamdgpu-block-pingpong="num-stages=4" | FileCheck %s
+// RUN: triton-opt %s -split-input-file --tritonamdgpu-block-pingpong="num-stages=4" | FileCheck %s --enable-var-scope
+
+// CHECK: [[$MMRA_TAG:#[A-Za-z0-9_]+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local">
 
 #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
 #mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}>
@@ -23,8 +25,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
   // CHECK: ttg.async_commit_group
   // CHECK: rocdl.sched.barrier 0
   // CHECK-NEXT: rocdl.s.setprio 0
-  // CHECK-NEXT: amdg.memory_counter_wait ds(0)
+  // CHECK-NEXT: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$MMRA_TAG]]}
   // CHECK-NEXT: rocdl.s.barrier
+  // CHECK-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$MMRA_TAG]]}
   // CHECK-NEXT: rocdl.sched.barrier 0
   // Compute Cluster2
   // CHECK: tt.dot
@@ -38,7 +41,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
   // CHECK: ttg.async_commit_group
   // CHECK: rocdl.sched.barrier 0
   // CHECK-NEXT: rocdl.s.setprio 0
-  // CHECK-NEXT: amdg.memory_counter_wait ds(0)
+  // CHECK-NEXT: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$MMRA_TAG]]}
   // CHECK-NEXT: scf.yield
 
   tt.func @chained_dots_async_loads(%arg0: tensor<64x16x!tt.ptr<f16>, #blocked>, %arg1: i32, %arg2: i32, %arg3: !ttg.async.token, %arg4: tensor<128x16xf32, #mma>, %arg5: tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, %arg6: i32, %arg7: tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>, %arg8: tensor<128x16xf32, #mma>, %arg9: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg10: tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, %arg11: i32, %arg12: i32, %arg13: tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>) -> tensor<128x16xf32, #mma> {
@@ -95,8 +98,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
   // CHECK: tt.load
   // CHECK-NEXT: rocdl.sched.barrier 0
   // CHECK-NEXT: rocdl.s.setprio 0
-  // CHECK-NEXT: amdg.memory_counter_wait ds(0)
+  // CHECK-NEXT: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$MMRA_TAG]]}
   // CHECK-NEXT: rocdl.s.barrier
+  // CHECK-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$MMRA_TAG]]}
   // CHECK-NEXT: rocdl.sched.barrier 0
   // Compute Cluster2
   // CHECK: tt.dot
@@ -109,7 +113,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
   // CHECK: tt.load
   // CHECK-NEXT: rocdl.sched.barrier 0
   // CHECK-NEXT: rocdl.s.setprio 0
-  // CHECK-NEXT: amdg.memory_counter_wait ds(0)
+  // CHECK-NEXT: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$MMRA_TAG]]}
   // CHECK-NEXT: scf.yield
 
   tt.func @chained_dots_tt_loads(%arg0: tensor<64x16xf16, #blocked>, %arg1: tensor<64x16x!tt.ptr<f16>, #blocked>, %arg2: i32, %arg3: i32, %arg4: tensor<128x16xf32, #mma>, %arg5: tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, %arg6: i32, %arg7: tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>, %arg8: tensor<128x16xf32, #mma>, %arg9: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg10: tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, %arg11: i32, %arg12: i32, %arg13: tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>) -> tensor<128x16xf32, #mma> {

@@ -1,6 +1,7 @@
 #include "AsyncUtility.h"
 #include "Dialect/TritonAMDGPU/IR/Dialect.h"
 #include "PatternTritonGPUOpToLLVM.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
@@ -13,6 +14,20 @@ using mlir::triton::amdgpu::ISAFamily;
 using ::mlir::triton::gpu::MemDescType;
 
 namespace {
+
+/// Creates a workgroup-scope `llvm.fence` with the requested `ordering` using
+/// `builder` at `loc`.
+///
+/// When `synchronizeAddrSpace` is non-empty, the fence is also tagged with an
+/// `amdgpu-synchronize-as` MMRA attribute naming that address space; an empty
+/// string leaves the fence untagged. Returns the created fence op.
+static LLVM::FenceOp createAMDGPUMemoryFence(OpBuilder &builder, Location loc,
+                                             LLVM::AtomicOrdering ordering,
+                                             StringRef synchronizeAddrSpace) {
+  LLVM::FenceOp fenceOp =
+      LLVM::FenceOp::create(builder, loc, ordering, /*syncscope=*/"workgroup");
+
+  // No address space requested: hand back the plain, untagged fence.
+  if (synchronizeAddrSpace.empty())
+    return fenceOp;
+
+  Attribute addrSpaceTag = builder.getAttr<LLVM::MMRATagAttr>(
+      "amdgpu-synchronize-as", synchronizeAddrSpace);
+  fenceOp->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(),
+                              addrSpaceTag);
+  return fenceOp;
+}
+
 class TransLocalLoadOpConversion
     : public ConvertOpToLLVMPattern<triton::gpu::LocalLoadOp> {
 public:
@@ -549,21 +564,27 @@ class BarrierOpConversion
                 triton::gpu::AddrSpace::TensorWrite;
     if ((op.getAddrSpace() & ~mask) != triton::gpu::AddrSpace::None)
       return failure();
-    // We can lower barrier to MemoryCounterWaitOp + s_barrier
-    // - MemoryCounterWaitOp specifies how many operations to
-    //   VMEM(Read)/VMEM(Write)/LDS can be outstanding when
-    //   the instruction completes.
-    // - s_barrier synchronizes the execution for the CTA
-    IntegerAttr zero = rewriter.getI32IntegerAttr(0);
     bool localBarrier = op.hasLocal();
     bool globalBarrier = op.hasGlobalRead() || op.hasGlobalWrite();
     if (localBarrier || globalBarrier) {
-      amdgpu::MemoryCounterWaitOp::create(
-          rewriter, op->getLoc(),
-          /* load= */ op.hasGlobalRead() ? zero : nullptr,
-          /* store= */ op.hasGlobalWrite() ? zero : nullptr,
-          /* ds= */ localBarrier ? zero : nullptr);
+      StringRef mmraAddrSpace = "";
+      if (localBarrier && !globalBarrier)
+        mmraAddrSpace = "local";
+      else if (!localBarrier && globalBarrier)
+        mmraAddrSpace = "global";
+
+      // Local/global barriers use LLVM fences so the AMDGPU memory legalizer
+      // selects target-specific waits. Mixed local+global barriers are left
+      // untagged so LLVM conservatively synchronizes every relevant space.
+      createAMDGPUMemoryFence(rewriter, op->getLoc(),
+                              LLVM::AtomicOrdering::release, mmraAddrSpace);
+      ROCDL::SBarrierOp::create(rewriter, op->getLoc());
+      createAMDGPUMemoryFence(rewriter, op->getLoc(),
+                              LLVM::AtomicOrdering::acquire, mmraAddrSpace);
+      rewriter.eraseOp(op);
+      return success();
     }
+
     rewriter.replaceOpWithNewOp<ROCDL::SBarrierOp>(op);
 
     return success();

@@ -1,6 +1,7 @@
 #include "TritonAMDGPUTransforms/Passes.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/BuiltinAttributes.h"
@@ -27,6 +28,16 @@ namespace mlir {
 
 namespace {
 
+/// Creates a workgroup-scope `llvm.fence` with the given `ordering` and tags
+/// it with the `amdgpu-synchronize-as` = `local` MMRA attribute. Returns the
+/// created fence op.
+static LLVM::FenceOp createLocalMMRAFence(OpBuilder &builder, Location loc,
+                                          LLVM::AtomicOrdering ordering) {
+  LLVM::FenceOp fenceOp =
+      LLVM::FenceOp::create(builder, loc, ordering, /*syncscope=*/"workgroup");
+
+  // Mark the fence as synchronizing the "local" address space.
+  Attribute localTag =
+      builder.getAttr<LLVM::MMRATagAttr>("amdgpu-synchronize-as", "local");
+  fenceOp->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), localTag);
+  return fenceOp;
+}
+
 // This pass transforms a for-loop calculating a GEMM. Main purpose of the
 // transform is improve the efficiency of the GPU dot instruction (mfma)
 // by interleaving the execution of two warps on each SIMD. Especially it groups
@@ -721,28 +732,30 @@ LogicalResult Pingponger::transformTwoClusterWithAsyncAndAll(OpBuilder &builder,
 // Typical `s_xxx` instructions include:
 //   - Control flow: `s_cbranch`
 //   - Priority control: `s_setprio`
-//   - Synchronization and dependency: `s_waitcnt`
+//   - Synchronization and dependency: wait instructions such as `s_waitcnt`,
+//     which LLVM emits when lowering the MMRA-tagged local fences.
 //
 // These are usually inserted near `s_barrier` boundaries, and the current
 // implementation carefully places them to ensure they belong to the memory
 // cluster, improving overall overlap and utilization.
 //
 //
-// 3. Placement of `s_waitcnt lgkmcnt(0)`
-// --------------------------------------
-// We place `s_waitcnt lgkmcnt(0)` at the *end* of the memory cluster to ensure
-// that all shared-memory load (`ds_read`) instructions have completed before
-// entering the compute cluster.
+// 3. Placement of local MMRA fences
+// ---------------------------------
+// We place local MMRA release fences at the *end* of the memory cluster to
+// ensure that all shared-memory load (`ds_read`) instructions have completed
+// before entering the compute cluster. LLVM lowers these fences to the
+// appropriate target-specific wait instructions.
 //
 // This placement prevents the LLVM backend from inserting additional
-// `s_waitcnt lgkmcnt()` instructions inside the compute cluster based on
+// wait instructions inside the compute cluster based on
 // inferred dependencies between `mfma` and `ds_read` operations.
 //
 // This approach is consistent with the previous design goal: to eliminate all
 // `s_xxx` instructions from the compute cluster so it can run uninterrupted
-// MFMA and VALU operations. Keeping `s_waitcnt lgkmcnt(0)` at the cluster
-// boundary enforces data dependency correctness while preserving the clean
-// separation between memory and compute phases.
+// MFMA and VALU operations. Keeping the local fence at the cluster boundary
+// enforces data dependency correctness while preserving the clean separation
+// between memory and compute phases.
 LogicalResult Pingponger::transformChainedDotSchedule(OpBuilder &builder,
                                                       Location loc) {
   assert(dotOps.size() == 2);
@@ -785,30 +798,29 @@ LogicalResult Pingponger::transformChainedDotSchedule(OpBuilder &builder,
   // Ideally we want the memory cluster to start with
   //
   // s_barrier
-  // s_waitcnt vmcnt(x) lgkmcnt(0)
+  // local wait
   // s_setprio 1
   //
-  // However, the membar pass will put s_waitcnt before s_barrier.
+  // However, the membar path will put the local MMRA fence before s_barrier.
   // But we can at least put s_setprio in the memory cluster.
   prependOp(ROCDL::SetPrioOp::create(builder, loc, highPriority), false);
 
   // ComputeCluster 2
   // We want the 2nd compute cluster to start with
   //
   // s_setprio 0
-  // s_waitcnt lgkmcnt(0)
+  // local MMRA release fence
   // s_barrier
   //
   // Check note 2 and 3 for details.
   updateOpInsertion(dotOps[1]);
   prependOp(ROCDL::SchedBarrier::create(builder, loc, 0), false);
   prependOp(ROCDL::SetPrioOp::create(builder, loc, lowPriority), false);
-  auto dsAttr = builder.getI32IntegerAttr(0);
-  prependOp(tt::amdgpu::MemoryCounterWaitOp::create(
-                builder, loc, /* load= */ nullptr, /* store= */ nullptr,
-                /* ds= */ dsAttr),
+  prependOp(createLocalMMRAFence(builder, loc, LLVM::AtomicOrdering::release),
             false);
   prependOp(ROCDL::SBarrierOp::create(builder, loc), false);
+  prependOp(createLocalMMRAFence(builder, loc, LLVM::AtomicOrdering::acquire),
+            false);
   prependOp(ROCDL::SchedBarrier::create(builder, loc, 0), false);
 
   // MemoryCluster2
@@ -828,7 +840,7 @@ LogicalResult Pingponger::transformChainedDotSchedule(OpBuilder &builder,
   // stays in the memory cluster.
   //
   // s_setprio 0
-  // s_waitcnt lgkmcnt(0)
+  // local MMRA release fence
   // s_cbranch
   // s_barrier
   //
@@ -840,9 +852,7 @@ LogicalResult Pingponger::transformChainedDotSchedule(OpBuilder &builder,
   updateOpInsertion(lastInsertedOp->getBlock()->getTerminator());
   prependOp(ROCDL::SchedBarrier::create(builder, loc, 0), false);
   prependOp(ROCDL::SetPrioOp::create(builder, loc, lowPriority), false);
-  prependOp(tt::amdgpu::MemoryCounterWaitOp::create(
-                builder, loc, /* load= */ nullptr, /* store= */ nullptr,
-                /* ds= */ dsAttr),
+  prependOp(createLocalMMRAFence(builder, loc, LLVM::AtomicOrdering::release),
             false);
 
   return success();