diff --git a/test/Conversion/amd/mbarrier_ops_to_llvm_gfx1250.mlir b/test/Conversion/amd/mbarrier_ops_to_llvm_gfx1250.mlir index 64c22ff9bcf8..b08a8edde976 100644 --- a/test/Conversion/amd/mbarrier_ops_to_llvm_gfx1250.mlir +++ b/test/Conversion/amd/mbarrier_ops_to_llvm_gfx1250.mlir @@ -1,4 +1,6 @@ -// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx1250 --convert-builtin-func-to-llvm | FileCheck %s --check-prefix=GFX1250 +// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx1250 --convert-builtin-func-to-llvm | FileCheck %s --enable-var-scope --check-prefix=GFX1250 + +// GFX1250: [[$MMRA_TAG:#[A-Za-z0-9_]+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local"> #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}> #smem = #ttg.shared_memory @@ -12,8 +14,9 @@ module attributes {"ttg.target" = "hip:gfx1250", "ttg.num-ctas" = 1 : i32, "ttg. // GFX1250-NEXT: llvm.store %[[INIT_VAL1]], %[[ALLOC_PTR]] : i64, !llvm.ptr<3> // GFX1250-NEXT: llvm.br ^[[BB1]] // GFX1250-NEXT: ^[[BB1]]: - // GFX1250-NEXT: rocdl.s.wait.dscnt 0 + // GFX1250-NEXT: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$MMRA_TAG]]} // GFX1250-NEXT: rocdl.s.barrier + // GFX1250-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$MMRA_TAG]]} // GFX1250-NEXT: llvm.return amdg.init_barrier %alloc, 2 : !ttg.memdesc<1xi64, #shared, #smem, mutable> tt.return diff --git a/test/Conversion/amd/tritongpu_to_llvm.mlir b/test/Conversion/amd/tritongpu_to_llvm.mlir index bd918c99376f..9113bfc97735 100644 --- a/test/Conversion/amd/tritongpu_to_llvm.mlir +++ b/test/Conversion/amd/tritongpu_to_llvm.mlir @@ -1,7 +1,39 @@ -// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx942 --convert-builtin-func-to-llvm | FileCheck %s --check-prefixes=CHECK,COMMON -// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx950 | FileCheck %s --check-prefixes=GFX950,COMMON -// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx1250 | FileCheck %s --check-prefixes=GFX1250,COMMON -// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx906 | FileCheck %s --check-prefixes=GFX906,COMMON +// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx942 --convert-builtin-func-to-llvm | FileCheck %s --enable-var-scope --check-prefixes=CHECK,COMMON +// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx950 | FileCheck %s --enable-var-scope --check-prefixes=GFX950,COMMON +// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx1250 | FileCheck %s --enable-var-scope --check-prefixes=GFX1250,COMMON +// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=gfx-arch=gfx906 | FileCheck %s --enable-var-scope --check-prefixes=GFX906,COMMON + +// COMMON-DAG: [[$LOCAL_MMRA_TAG:#[A-Za-z0-9_]+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local"> +// COMMON-DAG: [[$GLOBAL_MMRA_TAG:#[A-Za-z0-9_]+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"global"> + +module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { + // COMMON-LABEL: lower_barrier + tt.func @lower_barrier() { + // COMMON: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$LOCAL_MMRA_TAG]]} + // COMMON-NEXT: rocdl.s.barrier + // COMMON-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$LOCAL_MMRA_TAG]]} + ttg.barrier local + + // COMMON: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$GLOBAL_MMRA_TAG]]} + // COMMON-NEXT: rocdl.s.barrier + // COMMON-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$GLOBAL_MMRA_TAG]]} + ttg.barrier global_read + + // COMMON: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$GLOBAL_MMRA_TAG]]} + // COMMON-NEXT: rocdl.s.barrier + // COMMON-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$GLOBAL_MMRA_TAG]]} + ttg.barrier global_write + + // COMMON: llvm.fence syncscope("workgroup") release{{$}} + // COMMON-NEXT: rocdl.s.barrier + // COMMON-NEXT: llvm.fence syncscope("workgroup") acquire{{$}} + ttg.barrier local|global_read|global_write + + tt.return + } +} + +// ----- module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { // CHECK-LABEL: atomic_add_f32_scalar @@ -14,8 +46,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} { // CHECK: llvm.atomicrmw // CHECK: llvm.store // CHECK: llvm.br - // CHECK: rocdl.s.waitcnt 49279 - // CHECK: rocdl.s.barrier + // COMMON: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$LOCAL_MMRA_TAG]]} + // COMMON-NEXT: rocdl.s.barrier + // COMMON-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$LOCAL_MMRA_TAG]]} // CHECK: llvm.load // CHECK: llvm.store %0 = tt.atomic_rmw fadd, relaxed, gpu, %arg0, %arg2, %arg1 : (!tt.ptr, f32, i1) -> f32 @@ -609,8 +642,9 @@ module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.n // CHECK-NOT: llvm.store %0 = ttg.local_alloc %arg0 : (tensor<32x32xf16, #blocked>) -> !ttg.memdesc<32x32xf16, #shared, #smem, mutable> %1 = ttg.memdesc_subslice %0 [16, 0] : !ttg.memdesc<32x32xf16, #shared, #smem, mutable> -> !ttg.memdesc<16x32xf16, #shared, #smem, mutable, 32x32> - // CHECK: rocdl.s.waitcnt - // CHECK-NEXT: rocdl.s.barrier + // COMMON: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$LOCAL_MMRA_TAG]]} + // COMMON-NEXT: rocdl.s.barrier + // COMMON-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$LOCAL_MMRA_TAG]]} // CHECK: %[[AFF_I8:.+]] = llvm.mul %{{.+}}, %[[SUBSLICE_CST2]] : i32 // CHECK-NEXT: %[[AFF_SHR:.+]] = llvm.lshr %[[AFF_I8]], %[[SUBSLICE_CST6]] : i32 // CHECK-NEXT: %[[AFF_SHL:.+]] = llvm.shl %[[AFF_SHR]], %[[SUBSLICE_CST3]] : i32 diff --git a/test/TritonGPU/amd/amd-block-pingpong-chained-dots.mlir b/test/TritonGPU/amd/amd-block-pingpong-chained-dots.mlir index 7cac545e040c..c98e95c24d79 100644 --- a/test/TritonGPU/amd/amd-block-pingpong-chained-dots.mlir +++ b/test/TritonGPU/amd/amd-block-pingpong-chained-dots.mlir @@ -1,4 +1,6 @@ -// RUN: triton-opt %s -split-input-file --tritonamdgpu-block-pingpong="num-stages=4" | FileCheck %s +// RUN: triton-opt %s -split-input-file --tritonamdgpu-block-pingpong="num-stages=4" | FileCheck %s --enable-var-scope + +// CHECK: [[$MMRA_TAG:#[A-Za-z0-9_]+]] = #llvm.mmra_tag<"amdgpu-synchronize-as":"local"> #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}> #mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32, 16], isTransposed = true}> @@ -23,8 +25,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ // CHECK: ttg.async_commit_group // CHECK: rocdl.sched.barrier 0 // CHECK-NEXT: rocdl.s.setprio 0 - // CHECK-NEXT: amdg.memory_counter_wait ds(0) + // CHECK-NEXT: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$MMRA_TAG]]} // CHECK-NEXT: rocdl.s.barrier + // CHECK-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$MMRA_TAG]]} // CHECK-NEXT: rocdl.sched.barrier 0 // Compute Cluster2 // CHECK: tt.dot @@ -38,7 +41,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ // CHECK: ttg.async_commit_group // CHECK: rocdl.sched.barrier 0 // CHECK-NEXT: rocdl.s.setprio 0 - // CHECK-NEXT: amdg.memory_counter_wait ds(0) + // CHECK-NEXT: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$MMRA_TAG]]} // CHECK-NEXT: scf.yield tt.func @chained_dots_async_loads(%arg0: tensor<64x16x!tt.ptr, #blocked>, %arg1: i32, %arg2: i32, %arg3: !ttg.async.token, %arg4: tensor<128x16xf32, #mma>, %arg5: tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, %arg6: i32, %arg7: tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>, %arg8: tensor<128x16xf32, #mma>, %arg9: !tt.ptr {tt.divisibility = 16 : i32}, %arg10: tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, %arg11: i32, %arg12: i32, %arg13: tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>) -> tensor<128x16xf32, #mma> { @@ -95,8 +98,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ // CHECK: tt.load // CHECK-NEXT: rocdl.sched.barrier 0 // CHECK-NEXT: rocdl.s.setprio 0 - // CHECK-NEXT: amdg.memory_counter_wait ds(0) + // CHECK-NEXT: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$MMRA_TAG]]} // CHECK-NEXT: rocdl.s.barrier + // CHECK-NEXT: llvm.fence syncscope("workgroup") acquire {llvm.mmra = [[$MMRA_TAG]]} // CHECK-NEXT: rocdl.sched.barrier 0 // Compute Cluster2 // CHECK: tt.dot @@ -109,7 +113,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ // CHECK: tt.load // CHECK-NEXT: rocdl.sched.barrier 0 // CHECK-NEXT: rocdl.s.setprio 0 - // CHECK-NEXT: amdg.memory_counter_wait ds(0) + // CHECK-NEXT: llvm.fence syncscope("workgroup") release {llvm.mmra = [[$MMRA_TAG]]} // CHECK-NEXT: scf.yield tt.func @chained_dots_tt_loads(%arg0: tensor<64x16xf16, #blocked>, %arg1: tensor<64x16x!tt.ptr, #blocked>, %arg2: i32, %arg3: i32, %arg4: tensor<128x16xf32, #mma>, %arg5: tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>, %arg6: i32, %arg7: tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>, %arg8: tensor<128x16xf32, #mma>, %arg9: !tt.ptr {tt.divisibility = 16 : i32}, %arg10: tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>, %arg11: i32, %arg12: i32, %arg13: tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>) -> tensor<128x16xf32, #mma> { diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp index de562c0e4805..df3bf708a9ed 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp @@ -1,6 +1,7 @@ #include "AsyncUtility.h" #include "Dialect/TritonAMDGPU/IR/Dialect.h" #include "PatternTritonGPUOpToLLVM.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" #include "triton/Conversion/TritonGPUToLLVM/Utility.h" #include "triton/Dialect/TritonGPU/IR/Attributes.h" @@ -13,6 +14,20 @@ using mlir::triton::amdgpu::ISAFamily; using ::mlir::triton::gpu::MemDescType; namespace { + +static LLVM::FenceOp createAMDGPUMemoryFence(OpBuilder &builder, Location loc, + LLVM::AtomicOrdering ordering, + StringRef synchronizeAddrSpace) { + auto fence = + LLVM::FenceOp::create(builder, loc, ordering, /*syncscope=*/"workgroup"); + if (!synchronizeAddrSpace.empty()) { + Attribute mmra = builder.getAttr("amdgpu-synchronize-as", + synchronizeAddrSpace); + fence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra); + } + return fence; +} + class TransLocalLoadOpConversion : public ConvertOpToLLVMPattern { public: @@ -549,21 +564,27 @@ class BarrierOpConversion triton::gpu::AddrSpace::TensorWrite; if ((op.getAddrSpace() & ~mask) != triton::gpu::AddrSpace::None) return failure(); - // We can lower barrier to MemoryCounterWaitOp + s_barrier - // - MemoryCounterWaitOp specifies how many operations to - // VMEM(Read)/VMEM(Write)/LDS can be outstanding when - // the instruction completes. - // - s_barrier synchronizes the execution for the CTA - IntegerAttr zero = rewriter.getI32IntegerAttr(0); bool localBarrier = op.hasLocal(); bool globalBarrier = op.hasGlobalRead() || op.hasGlobalWrite(); if (localBarrier || globalBarrier) { - amdgpu::MemoryCounterWaitOp::create( - rewriter, op->getLoc(), - /* load= */ op.hasGlobalRead() ? zero : nullptr, - /* store= */ op.hasGlobalWrite() ? zero : nullptr, - /* ds= */ localBarrier ? zero : nullptr); + StringRef mmraAddrSpace = ""; + if (localBarrier && !globalBarrier) + mmraAddrSpace = "local"; + else if (!localBarrier && globalBarrier) + mmraAddrSpace = "global"; + + // Local/global barriers use LLVM fences so the AMDGPU memory legalizer + // selects target-specific waits. Mixed local+global barriers are left + // untagged so LLVM conservatively synchronizes every relevant space. + createAMDGPUMemoryFence(rewriter, op->getLoc(), + LLVM::AtomicOrdering::release, mmraAddrSpace); + ROCDL::SBarrierOp::create(rewriter, op->getLoc()); + createAMDGPUMemoryFence(rewriter, op->getLoc(), + LLVM::AtomicOrdering::acquire, mmraAddrSpace); + rewriter.eraseOp(op); + return success(); } + rewriter.replaceOpWithNewOp(op); return success(); diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp index be5b0871f9dc..cf75c270326e 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp @@ -1,6 +1,7 @@ #include "TritonAMDGPUTransforms/Passes.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/BuiltinAttributes.h" @@ -27,6 +28,16 @@ namespace mlir { namespace { +static LLVM::FenceOp createLocalMMRAFence(OpBuilder &builder, Location loc, + LLVM::AtomicOrdering ordering) { + Attribute mmra = + builder.getAttr("amdgpu-synchronize-as", "local"); + auto fence = + LLVM::FenceOp::create(builder, loc, ordering, /*syncscope=*/"workgroup"); + fence->setDiscardableAttr(LLVM::LLVMDialect::getMmraAttrName(), mmra); + return fence; +} + // This pass transforms a for-loop calculating a GEMM. Main purpose of the // transform is improve the efficiency of the GPU dot instruction (mfma) // by interleaving the execution of two warps on each SIMD. Especially it groups @@ -721,28 +732,30 @@ LogicalResult Pingponger::transformTwoClusterWithAsyncAndAll(OpBuilder &builder, // Typical `s_xxx` instructions include: // - Control flow: `s_cbranch` // - Priority control: `s_setprio` -// - Synchronization and dependency: `s_waitcnt` +// - Synchronization and dependency: MMRA-tagged local fences, lowered by LLVM +// to target-specific wait instructions. // // These are usually inserted near `s_barrier` boundaries, and the current // implementation carefully places them to ensure they belong to the memory // cluster, improving overall overlap and utilization. // // -// 3. Placement of `s_waitcnt lgkmcnt(0)` -// -------------------------------------- -// We place `s_waitcnt lgkmcnt(0)` at the *end* of the memory cluster to ensure -// that all shared-memory load (`ds_read`) instructions have completed before -// entering the compute cluster. +// 3. Placement of local MMRA fences +// --------------------------------- +// We place local MMRA release fences at the *end* of the memory cluster to +// ensure that all shared-memory load (`ds_read`) instructions have completed +// before entering the compute cluster. LLVM lowers these fences to the +// appropriate target-specific wait instructions. // // This placement prevents the LLVM backend from inserting additional -// `s_waitcnt lgkmcnt()` instructions inside the compute cluster based on +// wait instructions inside the compute cluster based on // inferred dependencies between `mfma` and `ds_read` operations. // // This approach is consistent with the previous design goal: to eliminate all // `s_xxx` instructions from the compute cluster so it can run uninterrupted -// MFMA and VALU operations. Keeping `s_waitcnt lgkmcnt(0)` at the cluster -// boundary enforces data dependency correctness while preserving the clean -// separation between memory and compute phases. +// MFMA and VALU operations. Keeping the local fence at the cluster boundary +// enforces data dependency correctness while preserving the clean separation +// between memory and compute phases. LogicalResult Pingponger::transformChainedDotSchedule(OpBuilder &builder, Location loc) { assert(dotOps.size() == 2); @@ -785,10 +798,10 @@ LogicalResult Pingponger::transformChainedDotSchedule(OpBuilder &builder, // Ideally we want the memory cluster to start with // // s_barrier - // s_waitcnt vmcnt(x) lgkmcnt(0) + // local wait // s_setprio 1 // - // However, the membar pass will put s_waitcnt before s_barrier. + // However, the membar path will put the local MMRA fence before s_barrier. // But we can at least put s_setprio in the memory cluster. prependOp(ROCDL::SetPrioOp::create(builder, loc, highPriority), false); @@ -796,19 +809,18 @@ LogicalResult Pingponger::transformChainedDotSchedule(OpBuilder &builder, // We want the 2nd compute cluster to start with // // s_setprio 0 - // s_waitcnt lgkmcnt(0) + // local MMRA release fence // s_barrier // // Check note 2 and 3 for details. updateOpInsertion(dotOps[1]); prependOp(ROCDL::SchedBarrier::create(builder, loc, 0), false); prependOp(ROCDL::SetPrioOp::create(builder, loc, lowPriority), false); - auto dsAttr = builder.getI32IntegerAttr(0); - prependOp(tt::amdgpu::MemoryCounterWaitOp::create( - builder, loc, /* load= */ nullptr, /* store= */ nullptr, - /* ds= */ dsAttr), + prependOp(createLocalMMRAFence(builder, loc, LLVM::AtomicOrdering::release), false); prependOp(ROCDL::SBarrierOp::create(builder, loc), false); + prependOp(createLocalMMRAFence(builder, loc, LLVM::AtomicOrdering::acquire), + false); prependOp(ROCDL::SchedBarrier::create(builder, loc, 0), false); // MemoryCluster2 @@ -828,7 +840,7 @@ LogicalResult Pingponger::transformChainedDotSchedule(OpBuilder &builder, // stays in the memory cluster. // // s_setprio 0 - // s_waitcnt lgkmcnt(0) + // local MMRA release fence // s_cbranch // s_barrier // @@ -840,9 +852,7 @@ LogicalResult Pingponger::transformChainedDotSchedule(OpBuilder &builder, updateOpInsertion(lastInsertedOp->getBlock()->getTerminator()); prependOp(ROCDL::SchedBarrier::create(builder, loc, 0), false); prependOp(ROCDL::SetPrioOp::create(builder, loc, lowPriority), false); - prependOp(tt::amdgpu::MemoryCounterWaitOp::create( - builder, loc, /* load= */ nullptr, /* store= */ nullptr, - /* ds= */ dsAttr), + prependOp(createLocalMMRAFence(builder, loc, LLVM::AtomicOrdering::release), false); return success();