diff --git a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td index e6d8e2effecb..341ec8c70d51 100644 --- a/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td +++ b/include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td @@ -949,17 +949,11 @@ def TTNG_TMEMSubSliceOp : TTNG_Op<"tmem_subslice", [Pure, let hasVerifier = 1; } -def TTNG_TMEMCopyOp : TTNG_Op<"tmem_copy", [ - DeclareOpInterfaceMethods]> { +def TTNG_TMEMCopyOp : TTNG_Op<"tmem_copy"> { let summary = "Initiate an asynchronous copy operation from shared memory to the Tensor Memory."; let description = [{ 2D blocks stored contiguously in SMEM are copied into TMEM as specified by the destination address. - The completion of the copy can be observed by waiting on the optional barrier. If this op is used - together with an MMA op, one barrier can be used to wait for both copy and MMA. We do not need to wait - for the completion of the copy before MMA, since tcgen05.cp followed by tcgen05.mma is guaranteed to - execute in that order. - This op lowers to the PTX instruction tcgen05.cp. This supports writing either to scales tmem layout as well as default tmem layout. Currently the semantic is different when writing to tmem scale layout. @@ -1001,11 +995,10 @@ def TTNG_TMEMCopyOp : TTNG_Op<"tmem_copy", [ }]; let arguments = (ins Arg]>:$src, - Arg]>:$dst, - Arg, "", [MemWrite]>:$barrier + Arg]>:$dst ); - let assemblyFormat = [{$src `,` $dst (`,` $barrier^)? attr-dict `:` qualified(type(operands))}]; + let assemblyFormat = [{$src `,` $dst attr-dict `:` qualified(type(operands))}]; let hasVerifier = 1; } diff --git a/lib/Analysis/BufferRegion.cpp b/lib/Analysis/BufferRegion.cpp index 03fe8df1094f..040ef6800272 100644 --- a/lib/Analysis/BufferRegion.cpp +++ b/lib/Analysis/BufferRegion.cpp @@ -311,7 +311,7 @@ void BufferRegionAnalysis::calculateUsedBufferRegions(Operation *op) { bool BufferRegionAnalysis::isMemoryAccessOperation(Operation *op) { if (isa(op)) { return true; } diff --git a/lib/Dialect/TritonInstrument/Transforms/FpSanitizer.cpp b/lib/Dialect/TritonInstrument/Transforms/FpSanitizer.cpp index b4ff772b3562..c9068da03e12 100644 --- a/lib/Dialect/TritonInstrument/Transforms/FpSanitizer.cpp +++ b/lib/Dialect/TritonInstrument/Transforms/FpSanitizer.cpp @@ -1722,9 +1722,6 @@ struct TMEMCopyPattern : public OpRewritePattern { if (!createStoreScratchMemory(rewriter, loc, info->ptr, srcReg, srcRegTy)) return failure(); - if (Value barrier = op.getBarrier()) { - ttng::ArriveBarrierOp::create(rewriter, loc, barrier, 1, Value()); - } rewriter.eraseOp(op); return success(); } diff --git a/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp b/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp index c8777a801d88..55a31beb0ba0 100644 --- a/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp +++ b/lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp @@ -1287,10 +1287,6 @@ LogicalResult TMEMCopyOp::verify() { << srcTy.getShape() << " must match destination shape " << dstTy.getShape(); - if (getBarrier() && !isa( - getBarrier().getType().getMemorySpace())) { - return emitOpError("The optional barrier should be a shared memory buffer"); - } if (!getDst().getType().getMutableMemory()) { return emitOpError("Cannot copy into an immutable alloc"); } diff --git a/lib/Dialect/TritonNvidiaGPU/Transforms/ClusterBarrierInsertion.cpp b/lib/Dialect/TritonNvidiaGPU/Transforms/ClusterBarrierInsertion.cpp index 2c7be4535ea0..58c5b734ac6f 100644 --- a/lib/Dialect/TritonNvidiaGPU/Transforms/ClusterBarrierInsertion.cpp +++ b/lib/Dialect/TritonNvidiaGPU/Transforms/ClusterBarrierInsertion.cpp @@ -106,9 +106,6 @@ usesTrackedBarrierInCrossCTAConsumerOp(Operation *op, if (auto commit = dyn_cast(op)) { return ttng::getModuleTwoCTAs(op) && aliasesTracked(commit.getBarrier()); } - if (auto copy = dyn_cast(op)) { - return ttng::getModuleTwoCTAs(op) && aliasesTracked(copy.getBarrier()); - } if (auto tma = dyn_cast(op)) { return tma.getMulticast() && aliasesTracked(tma.getBarrier()); } diff --git a/lib/Dialect/TritonNvidiaGPU/Transforms/ConSanNVIDIA.cpp b/lib/Dialect/TritonNvidiaGPU/Transforms/ConSanNVIDIA.cpp index eccd4a49e9c3..dcf17d63115f 100644 --- a/lib/Dialect/TritonNvidiaGPU/Transforms/ConSanNVIDIA.cpp +++ b/lib/Dialect/TritonNvidiaGPU/Transforms/ConSanNVIDIA.cpp @@ -154,8 +154,6 @@ class NVIDIAConSanHooks : public tti::ConSanTargetHooks { copyOp.getSrc(), "Src"); info->operandEffects.emplace_back(MemEffectsOpInfo::Effects::Write, copyOp.getDst(), "Dst"); - if (copyOp.getBarrier()) - info->barriers.push_back({copyOp.getBarrier(), nullptr, 1}); } if (auto mmav5Op = dyn_cast(op)) { info.emplace(); diff --git a/lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp b/lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp index ad8c471bfd6b..3d1f57a4dee4 100644 --- a/lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp +++ b/lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp @@ -75,8 +75,7 @@ struct TCGen5MMAScaleSharedToTmemConversion ttg::MemDescType::get(shape, elType, scaleEncoding, tensorMemorySpace, /*mutableMemory=*/true); auto tmemAlloc = TMEMAllocOp::create(rewriter, loc, scaleAType, Value()); - TMEMCopyOp::create(rewriter, loc, operand.get(), tmemAlloc, - /*barrier*/ Value()); + TMEMCopyOp::create(rewriter, loc, operand.get(), tmemAlloc); operand.set(tmemAlloc); return true; } diff --git a/python/src/gluon_ir.cc b/python/src/gluon_ir.cc index da66a9277f24..7bbfc0d2cae0 100644 --- a/python/src/gluon_ir.cc +++ b/python/src/gluon_ir.cc @@ -824,7 +824,7 @@ void init_gluon_ir(py::module &&m) { py::arg("propagateNan") = tt::PropagateNan::NONE) .def("create_tmem_copy", [](GluonOpBuilder &self, Value src, Value dst) { - self.create(src, dst, /*barrier=*/Value()); + self.create(src, dst); }) .def("create_tmem_subslice", [](GluonOpBuilder &self, Type resultTy, Value memDesc, diff --git a/test/Analysis/test-buffer-region.mlir b/test/Analysis/test-buffer-region.mlir index 469347912171..d7b6e72aa6dc 100644 --- a/test/Analysis/test-buffer-region.mlir +++ b/test/Analysis/test-buffer-region.mlir @@ -208,23 +208,20 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shar // ----- #shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 32}> -#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}> #smem = #ttg.shared_memory #tmem = #ttng.tensor_memory_encoding module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 0 : i32, "ttg.threads-per-warp" = 32 : i32, "ttg.total-num-warps" = 1 : i32} { - tt.func public @tmem_copy_barrier_regions() { + tt.func public @tmem_copy_regions() { %src = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<128x128xf32, #shared, #smem, mutable> %dst = ttng.tmem_alloc {tensor_memory_col_offset = 0 : i32, tensor_memory_row_offset = 0 : i32} : () -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> - %bar = ttg.local_alloc {allocation.offset = 65536 : i32} : () -> !ttg.memdesc<1xi64, #shared1, #smem, mutable> // expected-remark @below {{Buffers: [0, 65536]}} - ttng.tmem_copy %src, %dst, %bar : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.memdesc<1xi64, #shared1, #smem, mutable> + ttng.tmem_copy %src, %dst : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> tt.return } // expected-remark @below {{All Shared Regions: [0, 65536]}} // expected-remark @below {{All Tensor Regions: [0, 128]}} - // expected-remark @below {{All Barrier Regions: [65536, 8]}} tt.func private @print_all_regions() attributes {test.print_all_used_regions} { tt.return } diff --git a/test/Conversion/tritongpu_to_llvm_blackwell.mlir b/test/Conversion/tritongpu_to_llvm_blackwell.mlir index 69070e06ec50..95be9cb20c57 100644 --- a/test/Conversion/tritongpu_to_llvm_blackwell.mlir +++ b/test/Conversion/tritongpu_to_llvm_blackwell.mlir @@ -409,15 +409,14 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32, "ttg.thr // CHECK-LABEL: @tmem_copy_2d tt.func public @tmem_copy_2d(%src: !ttg.memdesc<128x32xi8, #shared, #ttg.shared_memory>, - %dst: !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable>, - %barrier: !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory>) { + %dst: !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable>) { // CHECK: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) // CHECK: [[IS_WARP_0:%.*]] = llvm.icmp "eq" {{.*}}, [[ZERO]] : i32 // CHECK: [[ELECT:%.*]] = nvvm.elect.sync // CHECK: [[WARP_PRED:%.*]] = llvm.and [[IS_WARP_0]], [[ELECT]] // CHECK-COUNT-8: tcgen05.cp.cta_group::1.warpx4.32x128b - // CHECK: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [ $0 + 0 ];", "r,b" {{.*}}, [[WARP_PRED]] - ttng.tmem_copy %src, %dst, %barrier : !ttg.memdesc<128x32xi8, #shared, #ttg.shared_memory>, !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory> + // CHECK-NOT: tcgen05.commit + ttng.tmem_copy %src, %dst : !ttg.memdesc<128x32xi8, #shared, #ttg.shared_memory>, !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable> tt.return } @@ -461,8 +460,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 2 : i32, "ttg.thr // CHECK-LABEL: @tmem_copy_2d_2cta tt.func public @tmem_copy_2d_2cta(%src: !ttg.memdesc<128x32xi8, #shared, #ttg.shared_memory>, - %dst: !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable>, - %barrier: !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory>) { + %dst: !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable>) { // CHECK: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) // CHECK: [[IS_WARP_0:%.*]] = llvm.icmp "eq" {{.*}}, [[ZERO]] : i32 // CHECK: [[ELECT:%.*]] = nvvm.elect.sync @@ -472,8 +470,8 @@ tt.func public @tmem_copy_2d_2cta(%src: !ttg.memdesc<128x32xi8, #shared, #ttg.sh // CHECK: [[IS_CLUSTER_0:%.*]] = llvm.icmp "eq" {{.*}}, [[ZERO]] // CHECK: [[LEAD_PRED:%.*]] = llvm.and [[WARP_PRED]], [[IS_CLUSTER_0]] // CHECK-COUNT-8: tcgen05.cp.cta_group::2.warpx4.32x128b - // CHECK: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [ $0 + 0 ];", "r,b" {{.*}}, [[LEAD_PRED]] - ttng.tmem_copy %src, %dst, %barrier : !ttg.memdesc<128x32xi8, #shared, #ttg.shared_memory>, !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory> + // CHECK-NOT: tcgen05.commit + ttng.tmem_copy %src, %dst : !ttg.memdesc<128x32xi8, #shared, #ttg.shared_memory>, !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable> tt.return } } diff --git a/test/TritonGPU/consan.mlir b/test/TritonGPU/consan.mlir index 9ef0928b574c..85e61e99bca5 100644 --- a/test/TritonGPU/consan.mlir +++ b/test/TritonGPU/consan.mlir @@ -807,20 +807,16 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shar // ----- #shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 32, CGALayout = [[0, 0]]}> -#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], CGALayout = [[0]]}> #smem = #ttg.shared_memory #tmem = #ttng.tensor_memory_encoding module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:90", ttg.tensor_memory_size = 0 : i32, "ttg.threads-per-warp" = 32 : i32, "ttg.total-num-warps" = 1 : i32, "ttng.two-ctas" = true} { - // CHECK-LABEL: @tmem_copy_2cta_barrier - tt.func public @tmem_copy_2cta_barrier() { + // CHECK-LABEL: @tmem_copy_2cta + tt.func public @tmem_copy_2cta() { %src = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<128x128xf32, #shared, #smem, mutable> %dst = ttng.tmem_alloc {tensor_memory_col_offset = 0 : i32, tensor_memory_row_offset = 0 : i32} : () -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> - %bar = ttg.local_alloc {allocation.offset = 65536 : i32} : () -> !ttg.memdesc<1xi64, #shared1, #smem, mutable> - ttng.init_barrier %bar, 1 : !ttg.memdesc<1xi64, #shared1, #smem, mutable> // CHECK: arith.constant 3 : i32 - // CHECK: tt.call @__triton_consan_verify_barrier_arrive - // CHECK: tt.call @__triton_consan_update_barrier_state - ttng.tmem_copy %src, %dst, %bar : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.memdesc<1xi64, #shared1, #smem, mutable> + // CHECK: ttng.tmem_copy + ttng.tmem_copy %src, %dst : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> tt.return } } diff --git a/test/TritonGPU/nvidia-fpsan.mlir b/test/TritonGPU/nvidia-fpsan.mlir index 633afff9fbbf..4f5ac4382d7d 100644 --- a/test/TritonGPU/nvidia-fpsan.mlir +++ b/test/TritonGPU/nvidia-fpsan.mlir @@ -33,14 +33,12 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar // CHECK-LABEL: @tmem_copy_subslice tt.func public @tmem_copy_subslice() { // CHECK: ttg.global_scratch_alloc - // CHECK: ttng.arrive_barrier // CHECK-NOT: ttng.tmem_copy // CHECK-NOT: ttng.tmem_subslice %src = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<128x128xf32, #shared, #smem, mutable> %dst = ttng.tmem_alloc {tensor_memory_col_offset = 0 : i32, tensor_memory_row_offset = 0 : i32} : () -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> %sub = ttng.tmem_subslice %dst {N = 0 : i32} : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> - %bar = ttg.local_alloc {allocation.offset = 4096 : i32} : () -> !ttg.memdesc<1xi64, #shared1, #smem, mutable> - ttng.tmem_copy %src, %sub, %bar : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.memdesc<1xi64, #shared1, #smem, mutable> + ttng.tmem_copy %src, %sub : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> tt.return } } diff --git a/test/TritonNvidiaGPU/membar-cluster.mlir b/test/TritonNvidiaGPU/membar-cluster.mlir index 28259ea2a430..becb904b91ee 100644 --- a/test/TritonNvidiaGPU/membar-cluster.mlir +++ b/test/TritonNvidiaGPU/membar-cluster.mlir @@ -666,18 +666,19 @@ module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, "ttng.tw #smem = #ttg.shared_memory module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, "ttng.two-ctas" = true, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} { - // CHECK-LABEL: @insert_fence_and_relaxed_cluster_barrier_before_tmem_copy + // CHECK-LABEL: @insert_fence_and_relaxed_cluster_barrier_before_wait_after_tmem_copy // CHECK: ttng.init_barrier + // CHECK: ttng.tmem_copy // CHECK-NEXT: ttng.fence_mbarrier_init_release_cluster // CHECK-NEXT: ttng.cluster_barrier {relaxed = true} - // CHECK: ttng.tmem_copy - tt.func @insert_fence_and_relaxed_cluster_barrier_before_tmem_copy() { + // CHECK-NEXT: ttng.wait_barrier + tt.func @insert_fence_and_relaxed_cluster_barrier_before_wait_after_tmem_copy() { %c0 = arith.constant 0 : i32 %src = ttg.local_alloc : () -> !ttg.memdesc<128x128xf32, #shared, #smem, mutable> %dst = ttng.tmem_alloc : () -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> %barrier = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #barrierEnc, #smem, mutable> ttng.init_barrier %barrier, 1 : !ttg.memdesc<1xi64, #barrierEnc, #smem, mutable> - ttng.tmem_copy %src, %dst, %barrier : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.memdesc<1xi64, #barrierEnc, #smem, mutable> + ttng.tmem_copy %src, %dst : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #barrierEnc, #smem, mutable> tt.return } diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TensorMemoryToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TensorMemoryToLLVM.cpp index 4581767444f5..16cfa590fd21 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TensorMemoryToLLVM.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TensorMemoryToLLVM.cpp @@ -666,18 +666,6 @@ struct TensorMemoryAllocOpConversion } }; -static void createCommit(ConversionPatternRewriter &rewriter, Location loc, - Value barrier, Value pred, bool twoCTAs) { - PTXBuilder ptxBuilder; - auto *barrierOperand = ptxBuilder.newAddrOperand(barrier, "r"); - std::string opcode = - "tcgen05.commit.cta_group::" + std::to_string(twoCTAs ? 2 : 1) + - ".mbarrier::arrive::one.shared::cluster.b64"; - auto &barrierOp = *ptxBuilder.create(opcode); - barrierOp(barrierOperand).predicate(pred); - ptxBuilder.launch(rewriter, loc, void_ty(rewriter.getContext())); -} - static void createTcgen05Cp(ConversionPatternRewriter &rewriter, Location loc, Value tmem_address, Value src_desc, Value pred, TMemCopyAtom atom, bool twoCTAs) { @@ -790,12 +778,6 @@ struct TensorMemoryCopyOpConversion adaptor.getSrc(), adaptor.getDst(), pred))) return failure(); - if (op.getBarrier()) { - auto barrier = LLVM::getSharedMemoryObjectFromStruct( - op.getLoc(), adaptor.getBarrier(), i64_ty, rewriter); - createCommit(rewriter, loc, barrier.getBase(), pred, twoCTAs); - } - rewriter.eraseOp(op); return success(); }