Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 3 additions & 10 deletions include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -949,17 +949,11 @@ def TTNG_TMEMSubSliceOp : TTNG_Op<"tmem_subslice", [Pure,
let hasVerifier = 1;
}

def TTNG_TMEMCopyOp : TTNG_Op<"tmem_copy", [
DeclareOpInterfaceMethods<MBarrierOpInterface>]> {
def TTNG_TMEMCopyOp : TTNG_Op<"tmem_copy"> {
let summary = "Initiate an asynchronous copy operation from shared memory to the Tensor Memory.";

let description = [{
2D blocks stored contiguously in SMEM are copied into TMEM as specified by the destination address.
The completion of the copy can be observed by waiting on the optional barrier. If this op is used
together with an MMA op, one barrier can be used to wait for both copy and MMA. We do not need to wait
for the completion of the copy before MMA, since tcgen05.cp followed by tcgen05.mma is guaranteed to
execute in that order.

This op lowers to the PTX instruction tcgen05.cp. This supports writing either to scales tmem layout as well as default tmem layout.
Currently the semantic is different when writing to tmem scale layout.

Expand Down Expand Up @@ -1001,11 +995,10 @@ def TTNG_TMEMCopyOp : TTNG_Op<"tmem_copy", [
}];
let arguments = (ins
Arg<TTG_MemDescType, "", [MemRead<SharedMemory>]>:$src,
Arg<TTG_MemDescType, "", [MemWrite<TensorMemory>]>:$dst,
Arg<Optional<TTG_MemDescType>, "", [MemWrite<SharedMemory>]>:$barrier
Arg<TTG_MemDescType, "", [MemWrite<TensorMemory>]>:$dst
);

let assemblyFormat = [{$src `,` $dst (`,` $barrier^)? attr-dict `:` qualified(type(operands))}];
let assemblyFormat = [{$src `,` $dst attr-dict `:` qualified(type(operands))}];
let hasVerifier = 1;
}

Expand Down
2 changes: 1 addition & 1 deletion lib/Analysis/BufferRegion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,7 +311,7 @@ void BufferRegionAnalysis::calculateUsedBufferRegions(Operation *op) {

bool BufferRegionAnalysis::isMemoryAccessOperation(Operation *op) {
if (isa<ttg::LocalLoadOp, ttg::LocalStoreOp, ttng::TMEMLoadOp,
ttng::TMEMStoreOp, ttg::AsyncCopyGlobalToLocalOp,
ttng::TMEMStoreOp, ttng::TMEMCopyOp, ttg::AsyncCopyGlobalToLocalOp,
ttng::AsyncTMACopyLocalToGlobalOp, ttng::AsyncTMAScatterOp>(op)) {
return true;
}
Expand Down
3 changes: 0 additions & 3 deletions lib/Dialect/TritonInstrument/Transforms/FpSanitizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1722,9 +1722,6 @@ struct TMEMCopyPattern : public OpRewritePattern<ttng::TMEMCopyOp> {
if (!createStoreScratchMemory(rewriter, loc, info->ptr, srcReg, srcRegTy))
return failure();

if (Value barrier = op.getBarrier()) {
ttng::ArriveBarrierOp::create(rewriter, loc, barrier, 1, Value());
}
rewriter.eraseOp(op);
return success();
}
Expand Down
4 changes: 0 additions & 4 deletions lib/Dialect/TritonNvidiaGPU/IR/Ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1287,10 +1287,6 @@ LogicalResult TMEMCopyOp::verify() {
<< srcTy.getShape() << " must match destination shape "
<< dstTy.getShape();

if (getBarrier() && !isa<triton::gpu::SharedMemorySpaceAttr>(
getBarrier().getType().getMemorySpace())) {
return emitOpError("The optional barrier should be a shared memory buffer");
}
if (!getDst().getType().getMutableMemory()) {
return emitOpError("Cannot copy into an immutable alloc");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,6 @@ usesTrackedBarrierInCrossCTAConsumerOp(Operation *op,
if (auto commit = dyn_cast<ttng::TCGen5CommitOp>(op)) {
return ttng::getModuleTwoCTAs(op) && aliasesTracked(commit.getBarrier());
}
if (auto copy = dyn_cast<ttng::TMEMCopyOp>(op)) {
return ttng::getModuleTwoCTAs(op) && aliasesTracked(copy.getBarrier());
}
if (auto tma = dyn_cast<ttng::AsyncTMACopyGlobalToLocalOp>(op)) {
return tma.getMulticast() && aliasesTracked(tma.getBarrier());
}
Expand Down
2 changes: 0 additions & 2 deletions lib/Dialect/TritonNvidiaGPU/Transforms/ConSanNVIDIA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,6 @@ class NVIDIAConSanHooks : public tti::ConSanTargetHooks {
copyOp.getSrc(), "Src");
info->operandEffects.emplace_back(MemEffectsOpInfo::Effects::Write,
copyOp.getDst(), "Dst");
if (copyOp.getBarrier())
info->barriers.push_back({copyOp.getBarrier(), nullptr, 1});
}
if (auto mmav5Op = dyn_cast<ttng::MMAv5OpInterface>(op)) {
info.emplace();
Expand Down
3 changes: 1 addition & 2 deletions lib/Dialect/TritonNvidiaGPU/Transforms/MMALowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,7 @@ struct TCGen5MMAScaleSharedToTmemConversion
ttg::MemDescType::get(shape, elType, scaleEncoding, tensorMemorySpace,
/*mutableMemory=*/true);
auto tmemAlloc = TMEMAllocOp::create(rewriter, loc, scaleAType, Value());
TMEMCopyOp::create(rewriter, loc, operand.get(), tmemAlloc,
/*barrier*/ Value());
TMEMCopyOp::create(rewriter, loc, operand.get(), tmemAlloc);
operand.set(tmemAlloc);
return true;
}
Expand Down
2 changes: 1 addition & 1 deletion python/src/gluon_ir.cc
Original file line number Diff line number Diff line change
Expand Up @@ -824,7 +824,7 @@ void init_gluon_ir(py::module &&m) {
py::arg("propagateNan") = tt::PropagateNan::NONE)
.def("create_tmem_copy",
[](GluonOpBuilder &self, Value src, Value dst) {
self.create<ttng::TMEMCopyOp>(src, dst, /*barrier=*/Value());
self.create<ttng::TMEMCopyOp>(src, dst);
})
.def("create_tmem_subslice",
[](GluonOpBuilder &self, Type resultTy, Value memDesc,
Expand Down
7 changes: 2 additions & 5 deletions test/Analysis/test-buffer-region.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -208,23 +208,20 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shar
// -----

#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 32}>
#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
#smem = #ttg.shared_memory
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1>

module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 0 : i32, "ttg.threads-per-warp" = 32 : i32, "ttg.total-num-warps" = 1 : i32} {
tt.func public @tmem_copy_barrier_regions() {
tt.func public @tmem_copy_regions() {
%src = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<128x128xf32, #shared, #smem, mutable>
%dst = ttng.tmem_alloc {tensor_memory_col_offset = 0 : i32, tensor_memory_row_offset = 0 : i32} : () -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
%bar = ttg.local_alloc {allocation.offset = 65536 : i32} : () -> !ttg.memdesc<1xi64, #shared1, #smem, mutable>
// expected-remark @below {{Buffers: [0, 65536]}}
ttng.tmem_copy %src, %dst, %bar : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.memdesc<1xi64, #shared1, #smem, mutable>
ttng.tmem_copy %src, %dst : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
tt.return
}

// expected-remark @below {{All Shared Regions: [0, 65536]}}
// expected-remark @below {{All Tensor Regions: [0, 128]}}
// expected-remark @below {{All Barrier Regions: [65536, 8]}}
tt.func private @print_all_regions() attributes {test.print_all_used_regions} {
tt.return
}
Expand Down
14 changes: 6 additions & 8 deletions test/Conversion/tritongpu_to_llvm_blackwell.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -409,15 +409,14 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32, "ttg.thr

// CHECK-LABEL: @tmem_copy_2d
tt.func public @tmem_copy_2d(%src: !ttg.memdesc<128x32xi8, #shared, #ttg.shared_memory>,
%dst: !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable>,
%barrier: !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory>) {
%dst: !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable>) {
// CHECK: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32)
// CHECK: [[IS_WARP_0:%.*]] = llvm.icmp "eq" {{.*}}, [[ZERO]] : i32
// CHECK: [[ELECT:%.*]] = nvvm.elect.sync
// CHECK: [[WARP_PRED:%.*]] = llvm.and [[IS_WARP_0]], [[ELECT]]
// CHECK-COUNT-8: tcgen05.cp.cta_group::1.warpx4.32x128b
// CHECK: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [ $0 + 0 ];", "r,b" {{.*}}, [[WARP_PRED]]
ttng.tmem_copy %src, %dst, %barrier : !ttg.memdesc<128x32xi8, #shared, #ttg.shared_memory>, !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory>
// CHECK-NOT: tcgen05.commit
ttng.tmem_copy %src, %dst : !ttg.memdesc<128x32xi8, #shared, #ttg.shared_memory>, !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable>
tt.return
}

Expand Down Expand Up @@ -461,8 +460,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 2 : i32, "ttg.thr

// CHECK-LABEL: @tmem_copy_2d_2cta
tt.func public @tmem_copy_2d_2cta(%src: !ttg.memdesc<128x32xi8, #shared, #ttg.shared_memory>,
%dst: !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable>,
%barrier: !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory>) {
%dst: !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable>) {
// CHECK: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32)
// CHECK: [[IS_WARP_0:%.*]] = llvm.icmp "eq" {{.*}}, [[ZERO]] : i32
// CHECK: [[ELECT:%.*]] = nvvm.elect.sync
Expand All @@ -472,8 +470,8 @@ tt.func public @tmem_copy_2d_2cta(%src: !ttg.memdesc<128x32xi8, #shared, #ttg.sh
// CHECK: [[IS_CLUSTER_0:%.*]] = llvm.icmp "eq" {{.*}}, [[ZERO]]
// CHECK: [[LEAD_PRED:%.*]] = llvm.and [[WARP_PRED]], [[IS_CLUSTER_0]]
// CHECK-COUNT-8: tcgen05.cp.cta_group::2.warpx4.32x128b
// CHECK: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [ $0 + 0 ];", "r,b" {{.*}}, [[LEAD_PRED]]
ttng.tmem_copy %src, %dst, %barrier : !ttg.memdesc<128x32xi8, #shared, #ttg.shared_memory>, !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable>, !ttg.memdesc<1xi64, #shared1, #ttg.shared_memory>
// CHECK-NOT: tcgen05.commit
ttng.tmem_copy %src, %dst : !ttg.memdesc<128x32xi8, #shared, #ttg.shared_memory>, !ttg.memdesc<128x32xi8, #tmem_scales, #ttng.tensor_memory, mutable>
tt.return
}
}
Expand Down
12 changes: 4 additions & 8 deletions test/TritonGPU/consan.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -807,20 +807,16 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.shar
// -----

#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 32, CGALayout = [[0, 0]]}>
#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], CGALayout = [[0]]}>
#smem = #ttg.shared_memory
#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, colStride = 1, CGALayout = [[0, 0]]>
module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 1 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:90", ttg.tensor_memory_size = 0 : i32, "ttg.threads-per-warp" = 32 : i32, "ttg.total-num-warps" = 1 : i32, "ttng.two-ctas" = true} {
// CHECK-LABEL: @tmem_copy_2cta_barrier
tt.func public @tmem_copy_2cta_barrier() {
// CHECK-LABEL: @tmem_copy_2cta
tt.func public @tmem_copy_2cta() {
%src = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<128x128xf32, #shared, #smem, mutable>
%dst = ttng.tmem_alloc {tensor_memory_col_offset = 0 : i32, tensor_memory_row_offset = 0 : i32} : () -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
%bar = ttg.local_alloc {allocation.offset = 65536 : i32} : () -> !ttg.memdesc<1xi64, #shared1, #smem, mutable>
ttng.init_barrier %bar, 1 : !ttg.memdesc<1xi64, #shared1, #smem, mutable>
// CHECK: arith.constant 3 : i32
// CHECK: tt.call @__triton_consan_verify_barrier_arrive
// CHECK: tt.call @__triton_consan_update_barrier_state
ttng.tmem_copy %src, %dst, %bar : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.memdesc<1xi64, #shared1, #smem, mutable>
// CHECK: ttng.tmem_copy
ttng.tmem_copy %src, %dst : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
tt.return
}
}
Expand Down
4 changes: 1 addition & 3 deletions test/TritonGPU/nvidia-fpsan.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,12 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shar
// CHECK-LABEL: @tmem_copy_subslice
tt.func public @tmem_copy_subslice() {
// CHECK: ttg.global_scratch_alloc
// CHECK: ttng.arrive_barrier
// CHECK-NOT: ttng.tmem_copy
// CHECK-NOT: ttng.tmem_subslice
%src = ttg.local_alloc {allocation.offset = 0 : i32} : () -> !ttg.memdesc<128x128xf32, #shared, #smem, mutable>
%dst = ttng.tmem_alloc {tensor_memory_col_offset = 0 : i32, tensor_memory_row_offset = 0 : i32} : () -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
%sub = ttng.tmem_subslice %dst {N = 0 : i32} : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable> -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
%bar = ttg.local_alloc {allocation.offset = 4096 : i32} : () -> !ttg.memdesc<1xi64, #shared1, #smem, mutable>
ttng.tmem_copy %src, %sub, %bar : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.memdesc<1xi64, #shared1, #smem, mutable>
ttng.tmem_copy %src, %sub : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
tt.return
}
}
Expand Down
9 changes: 5 additions & 4 deletions test/TritonNvidiaGPU/membar-cluster.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -666,18 +666,19 @@ module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, "ttng.tw
#smem = #ttg.shared_memory

module attributes {"ttg.num-ctas" = 2 : i32, "ttg.num-warps" = 4 : i32, "ttng.two-ctas" = true, ttg.target = "cuda:100", "ttg.threads-per-warp" = 32 : i32} {
// CHECK-LABEL: @insert_fence_and_relaxed_cluster_barrier_before_tmem_copy
// CHECK-LABEL: @insert_fence_and_relaxed_cluster_barrier_before_wait_after_tmem_copy
// CHECK: ttng.init_barrier
// CHECK: ttng.tmem_copy
// CHECK-NEXT: ttng.fence_mbarrier_init_release_cluster
// CHECK-NEXT: ttng.cluster_barrier {relaxed = true}
// CHECK: ttng.tmem_copy
tt.func @insert_fence_and_relaxed_cluster_barrier_before_tmem_copy() {
// CHECK-NEXT: ttng.wait_barrier
tt.func @insert_fence_and_relaxed_cluster_barrier_before_wait_after_tmem_copy() {
%c0 = arith.constant 0 : i32
%src = ttg.local_alloc : () -> !ttg.memdesc<128x128xf32, #shared, #smem, mutable>
%dst = ttng.tmem_alloc : () -> !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
%barrier = ttg.local_alloc : () -> !ttg.memdesc<1xi64, #barrierEnc, #smem, mutable>
ttng.init_barrier %barrier, 1 : !ttg.memdesc<1xi64, #barrierEnc, #smem, mutable>
ttng.tmem_copy %src, %dst, %barrier : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>, !ttg.memdesc<1xi64, #barrierEnc, #smem, mutable>
ttng.tmem_copy %src, %dst : !ttg.memdesc<128x128xf32, #shared, #smem, mutable>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory, mutable>
ttng.wait_barrier %barrier, %c0 : !ttg.memdesc<1xi64, #barrierEnc, #smem, mutable>
tt.return
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -666,18 +666,6 @@ struct TensorMemoryAllocOpConversion
}
};

static void createCommit(ConversionPatternRewriter &rewriter, Location loc,
Value barrier, Value pred, bool twoCTAs) {
PTXBuilder ptxBuilder;
auto *barrierOperand = ptxBuilder.newAddrOperand(barrier, "r");
std::string opcode =
"tcgen05.commit.cta_group::" + std::to_string(twoCTAs ? 2 : 1) +
".mbarrier::arrive::one.shared::cluster.b64";
auto &barrierOp = *ptxBuilder.create(opcode);
barrierOp(barrierOperand).predicate(pred);
ptxBuilder.launch(rewriter, loc, void_ty(rewriter.getContext()));
}

static void createTcgen05Cp(ConversionPatternRewriter &rewriter, Location loc,
Value tmem_address, Value src_desc, Value pred,
TMemCopyAtom atom, bool twoCTAs) {
Expand Down Expand Up @@ -790,12 +778,6 @@ struct TensorMemoryCopyOpConversion
adaptor.getSrc(), adaptor.getDst(), pred)))
return failure();

if (op.getBarrier()) {
auto barrier = LLVM::getSharedMemoryObjectFromStruct(
op.getLoc(), adaptor.getBarrier(), i64_ty, rewriter);
createCommit(rewriter, loc, barrier.getBase(), pred, twoCTAs);
}

rewriter.eraseOp(op);
return success();
}
Expand Down
Loading