From dc94f697856dd4c11afac39f3830a93ae9d65cfe Mon Sep 17 00:00:00 2001 From: Gary Geng Date: Thu, 21 Nov 2024 01:39:16 +0000 Subject: [PATCH 1/8] Add coalesce async copy pass --- .../Dialect/TritonGPU/Transforms/Passes.td | 12 ++ .../TritonGPU/Transforms/CMakeLists.txt | 1 + .../Transforms/CoalesceAsyncCopy.cpp | 115 ++++++++++++++++++ python/src/passes.cc | 2 + test/TritonGPU/coalesce-async-copy.mlir | 20 +++ third_party/nvidia/backend/compiler.py | 1 + 6 files changed, 151 insertions(+) create mode 100644 lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp create mode 100644 test/TritonGPU/coalesce-async-copy.mlir diff --git a/include/triton/Dialect/TritonGPU/Transforms/Passes.td b/include/triton/Dialect/TritonGPU/Transforms/Passes.td index 0639df714c60..dec9271f3515 100644 --- a/include/triton/Dialect/TritonGPU/Transforms/Passes.td +++ b/include/triton/Dialect/TritonGPU/Transforms/Passes.td @@ -192,4 +192,16 @@ def TritonGPULoopScheduling: Pass<"tritongpu-loop-scheduling", "mlir::ModuleOp"> "number of pipeline stages"> ]; } + +def TritonGPUCoalesceAsyncCopy: Pass<"tritongpu-coalesce-async-copy", "mlir::ModuleOp"> { + let summary = "Improve coalescing for async global to local copies"; + + let description = "For AsyncCopyGlobalToLocal ops where the shared encoding's vec is less than " + "the blocked encoding's sizePerThread, this pass improves coalescing by clipping the " + "sizePerThread value"; + + let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect", + "mlir::triton::TritonDialect"]; +} + #endif diff --git a/lib/Dialect/TritonGPU/Transforms/CMakeLists.txt b/lib/Dialect/TritonGPU/Transforms/CMakeLists.txt index ef4cec328f86..740014b77948 100644 --- a/lib/Dialect/TritonGPU/Transforms/CMakeLists.txt +++ b/lib/Dialect/TritonGPU/Transforms/CMakeLists.txt @@ -18,6 +18,7 @@ add_triton_library(TritonGPUTransforms Prefetch.cpp RemoveLayoutConversions.cpp ReorderInstructions.cpp + CoalesceAsyncCopy.cpp Utility.cpp DEPENDS diff --git a/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp b/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp new file mode 100644 index 000000000000..05d65ff78d30 --- /dev/null +++ b/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp @@ -0,0 +1,115 @@ +#include "mlir/Support/LLVM.h" +#include "mlir/Transforms/Passes.h" +#include "triton/Analysis/Utility.h" +#include "triton/Dialect/TritonGPU/IR/Dialect.h" +#include "triton/Dialect/TritonGPU/Transforms/Passes.h" +#include "triton/Dialect/TritonGPU/Transforms/Utility.h" + +#include + +namespace tt = mlir::triton; + +namespace mlir { +namespace triton { +namespace gpu { + +#define GEN_PASS_DEF_TRITONGPUCOALESCEASYNCCOPY +#include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc" + +// This pass currently only applies if the following are all true... +// 1) Operand A for WGMMA is to be loaded in registers +// 2) We upcast operand A in registers before the WGMMA +// (downcasting is not yet supported) +// 3) Pipelining is enabled for loading A +// +// ...then for the AsyncCopyGlobalToLocal op, the SharedEncoding +// vec will be less than BlockedEncoding's sizePerThread for k-dim. E.g. if +// we're upcasting from int8 to bf16, then shared vec is 8 and sizePerThread +// for k is 16. In this case, AsyncCopyGlobalToLocal will generate two +// 8-byte-cp.async's for each contiguous 16B global data owned by each +// thread. This breaks coalescing (i.e. results 2x the minimum required +// transactions). +// +// This issue occurs for cp.async because it combines load and store into one +// instruction. The fix is to clip each dim of sizePerThread by shared vec, so +// that the vectorization of load and store are equal along the contiguous +// dimension. In the above example, each thread will then only own 8B contiguous +// global data. +struct ClipAsyncCopySizePerThread + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(AsyncCopyGlobalToLocalOp copyOp, + PatternRewriter &rewriter) const override { + Value src = copyOp.getSrc(); + Value mask = copyOp.getMask(); + Value other = copyOp.getOther(); + + auto inputTy = cast(src.getType()); + auto blockEnc = cast(inputTy.getEncoding()); + auto resultTy = cast(copyOp.getResult().getType()); + auto sharedEnc = cast(resultTy.getEncoding()); + auto sharedVec = sharedEnc.getVec(); + + // clip each dim of sizePerThread by its respective dim in vec + SmallVector newSizePerThread; + llvm::transform(blockEnc.getSizePerThread(), + std::back_inserter(newSizePerThread), + [&](auto size) { return std::min(size, sharedVec); }); + + if (newSizePerThread == blockEnc.getSizePerThread()) + return rewriter.notifyMatchFailure(copyOp, + "at least one dimension of blocked sizePerThread must be greater than shared vec"); + + // obtain new blockedEnc based on clipped sizePerThread + auto mod = copyOp->getParentOfType(); + int numWarps = TritonGPUDialect::getNumWarps(mod); + int threadsPerWarp = TritonGPUDialect::getThreadsPerWarp(mod); + auto newBlockEnc = BlockedEncodingAttr::get( + copyOp.getContext(), inputTy.getShape(), newSizePerThread, + blockEnc.getOrder(), numWarps, threadsPerWarp, + blockEnc.getCTALayout()); + + // insert cvt's after src, mask, and other + auto convertBlockLayout = [&](Value src, BlockedEncodingAttr enc) { + auto ty = cast(src.getType()); + auto newTy = + RankedTensorType::get(ty.getShape(), ty.getElementType(), enc); + auto cvt = rewriter.create(copyOp->getLoc(), newTy, src); + return cvt.getResult(); + }; + src = convertBlockLayout(src, newBlockEnc); + if (mask) + mask = convertBlockLayout(mask, newBlockEnc); + if (other) + other = convertBlockLayout(other, newBlockEnc); + + // replace the asyncCopy + auto newCopyOp = rewriter.create( + copyOp.getLoc(), src, copyOp.getResult(), mask, other, + copyOp.getCache(), copyOp.getEvict(), copyOp.getIsVolatile()); + rewriter.replaceOp(copyOp, newCopyOp); + + return success(); + } +}; + +class CoalesceAsyncCopyPass + : public impl::TritonGPUCoalesceAsyncCopyBase< + CoalesceAsyncCopyPass> { +public: + void runOnOperation() override { + ModuleOp m = getOperation(); + MLIRContext *context = &getContext(); + + mlir::RewritePatternSet patterns(context); + patterns.add(context); + + if (failed(applyPatternsAndFoldGreedily(m, std::move(patterns)))) + signalPassFailure(); + } +}; + +} // namespace gpu +} // namespace triton +} // namespace mlir diff --git a/python/src/passes.cc b/python/src/passes.cc index d6612387b286..235eba4465cb 100644 --- a/python/src/passes.cc +++ b/python/src/passes.cc @@ -72,6 +72,8 @@ void init_triton_passes_ttgpuir(py::module &&m) { createTritonGPUOptimizeAccumulatorInit); ADD_PASS_OPTION_WRAPPER_1("add_loop_scheduling", createTritonGPULoopScheduling, int); + ADD_PASS_WRAPPER_0("add_coalesce_async_copy", + createTritonGPUCoalesceAsyncCopy); } void init_triton_passes_convert(py::module &&m) { diff --git a/test/TritonGPU/coalesce-async-copy.mlir b/test/TritonGPU/coalesce-async-copy.mlir new file mode 100644 index 000000000000..6da63d6e341a --- /dev/null +++ b/test/TritonGPU/coalesce-async-copy.mlir @@ -0,0 +1,20 @@ +// RUN: triton-opt %s -split-input-file -tritongpu-coalesce-async-copy | FileCheck %s + +// CHECK: #[[NEW_BLOCKED:.*]] = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}> +// CHECK: %{{.*}} = triton_gpu.convert_layout %{{.*}} : {{.*}} -> tensor<64x16x!tt.ptr, #[[NEW_BLOCKED]]> +// CHECK: %{{.*}} = triton_gpu.convert_layout %{{.*}} : {{.*}} -> tensor<64x16xi1, #[[NEW_BLOCKED]]> +// CHECK: %{{.*}} = triton_gpu.convert_layout %{{.*}} : {{.*}} -> tensor<64x16xi8, #[[NEW_BLOCKED]]> +// CHECK: %{{.*}} = triton_gpu.async_copy_global_to_local %{{.*}}: tensor<64x16x!tt.ptr, #[[NEW_BLOCKED]]> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { +tt.func @async_copy_i8(%input: tensor<64x16x!tt.ptr, #blocked>, + %view: !tt.memdesc<64x16xi8, #shared, #triton_gpu.shared_memory, mutable>, + %mask: tensor<64x16xi1, #blocked>, + %other: tensor<64x16xi8, #blocked>) { + %token = triton_gpu.async_copy_global_to_local %input, %view mask %mask other %other: tensor<64x16x!tt.ptr, #blocked> -> <64x16xi8, #shared, #triton_gpu.shared_memory, mutable> + tt.return +} +} + diff --git a/third_party/nvidia/backend/compiler.py b/third_party/nvidia/backend/compiler.py index 6d6d70fc87e3..233c11938fda 100644 --- a/third_party/nvidia/backend/compiler.py +++ b/third_party/nvidia/backend/compiler.py @@ -234,6 +234,7 @@ def make_ttgir(mod, metadata, opt, capability): passes.ttgpuir.add_pipeline(pm, opt.num_stages) passes.ttgpuir.add_prefetch(pm) passes.ttgpuir.add_optimize_dot_operands(pm, capability >= 80) + passes.ttgpuir.add_coalesce_async_copy(pm) passes.ttgpuir.add_remove_layout_conversions(pm) passes.ttgpuir.add_reduce_data_duplication(pm) passes.ttgpuir.add_reorder_instructions(pm) From 14858e4390f17ce0334222128b005af6e6a78b00 Mon Sep 17 00:00:00 2001 From: Gary Geng Date: Thu, 21 Nov 2024 21:32:38 +0000 Subject: [PATCH 2/8] Make logic more general --- .../Conversion/TritonGPUToLLVM/Utility.h | 4 ++ lib/Conversion/TritonGPUToLLVM/Utility.cpp | 64 +++++++++++-------- .../Transforms/CoalesceAsyncCopy.cpp | 36 +++++++---- 3 files changed, 65 insertions(+), 39 deletions(-) diff --git a/include/triton/Conversion/TritonGPUToLLVM/Utility.h b/include/triton/Conversion/TritonGPUToLLVM/Utility.h index ba24461a1f6d..52372dd0effa 100644 --- a/include/triton/Conversion/TritonGPUToLLVM/Utility.h +++ b/include/triton/Conversion/TritonGPUToLLVM/Utility.h @@ -1129,6 +1129,10 @@ SmallVector> emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target, Attribute layout, RankedTensorType type, bool withCTAOffset); +// TODO document +std::optional getRegToSharedLayout(MLIRContext* ctx, + ArrayRef shape, Attribute srcEnc, Attribute dstEnc, int elemBitWidth); + // Emits IR to load data from shared memory into registers, or to store data // from registers into shared memory. // diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index c681cd344ce8..fe2441713943 100644 --- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -158,36 +158,25 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target, return ret; } -bool emitTransferBetweenRegistersAndShared( - RankedTensorType registerTy, triton::gpu::MemDescType sharedTy, - Type elemLlvmTy, std::optional maxVecElems, Value shmemBase, - ArrayRef shmemStrides, Location loc, RewriterBase &rewriter, - const TargetInfoBase &target, - std::function perVectorCallback) { - MLIRContext *ctx = rewriter.getContext(); - - auto shape = registerTy.getShape(); - int rank = shape.size(); - +std::optional getRegToSharedLayout(MLIRContext* ctx, + ArrayRef shape, Attribute srcEnc, Attribute dstEnc, int elemBitWidth) { StringAttr kBlock = str_attr("block"); - StringAttr kRegister = str_attr("register"); - StringAttr kLane = str_attr("lane"); - StringAttr kWarp = str_attr("warp"); + int rank = shape.size(); std::optional regLayout = - triton::gpu::toLinearLayout(shape, registerTy.getEncoding()); + triton::gpu::toLinearLayout(shape, srcEnc); std::optional sharedLayout = triton::gpu::toLinearLayout( - shape, sharedTy.getEncoding(), elemLlvmTy.getIntOrFloatBitWidth()); + shape, dstEnc, elemBitWidth); if (!regLayout.has_value() || !sharedLayout.has_value()) { - return false; + return std::nullopt; } - auto sharedOrder = triton::gpu::getOrder(sharedTy.getEncoding()); + auto sharedOrder = triton::gpu::getOrder(dstEnc); // sharedLayout's in-dims are currently (offset, block). Reshape to // (offsetX1, offsetX2, ..., block) so that we can apply the N-dimensional // shmem strides. (The offsetX's appear in minor-to-major order.) auto sharedLegacy = - cast(sharedTy.getEncoding()); + cast(dstEnc); SmallVector> multiDimSharedSize; for (int i = 0; i < rank; i++) { int dim = sharedOrder[i]; @@ -202,13 +191,35 @@ bool emitTransferBetweenRegistersAndShared( // regToSharedLayout maps from (register, lane, warp, block) to (offsetX1, // ..., offsetXN, block), where the offsetX's are in minor-to-major order. - LinearLayout regToSharedLayout = regLayout->invertAndCompose(*sharedLayout); + return regLayout->invertAndCompose(*sharedLayout); +} + +bool emitTransferBetweenRegistersAndShared( + RankedTensorType registerTy, triton::gpu::MemDescType sharedTy, + Type elemLlvmTy, std::optional maxVecElems, Value shmemBase, + ArrayRef shmemStrides, Location loc, RewriterBase &rewriter, + const TargetInfoBase &target, + std::function perVectorCallback) { + MLIRContext *ctx = rewriter.getContext(); + + auto shape = registerTy.getShape(); + int rank = shape.size(); + + StringAttr kBlock = str_attr("block"); + StringAttr kRegister = str_attr("register"); + StringAttr kLane = str_attr("lane"); + StringAttr kWarp = str_attr("warp"); + + auto regToSharedLayout = getRegToSharedLayout(ctx, shape, registerTy.getEncoding(), + sharedTy.getEncoding(), elemLlvmTy.getIntOrFloatBitWidth()); + if (!regToSharedLayout.has_value()) + return false; // TODO(jlebar): We don't currently support loading from shared memory in a // different CTA. We'd need to emit `mapa.shared::cluster` instructions. - for (int inBlock = 1; inBlock < regToSharedLayout.getInDimSize(kBlock); + for (int inBlock = 1; inBlock < regToSharedLayout->getInDimSize(kBlock); inBlock *= 2) { - auto idx = llvm::to_vector(llvm::make_second_range(regToSharedLayout.apply( + auto idx = llvm::to_vector(llvm::make_second_range(regToSharedLayout->apply( {{kRegister, 0}, {kLane, 0}, {kWarp, 0}, {kBlock, inBlock}}))); // offsetX1, ..., offsetXN must all be 0. if (!llvm::all_of(ArrayRef(idx).drop_back(1), @@ -234,15 +245,15 @@ bool emitTransferBetweenRegistersAndShared( // which have known strides. This would allow us to vectorize across multiple // shmem out dimensions where possible. const int vecElems = - std::min(regToSharedLayout.getNumConsecutiveInOut(), + std::min(regToSharedLayout->getNumConsecutiveInOut(), maxVecElems.value_or(std::numeric_limits::max())); Value threadId = getThreadId(rewriter, loc); - Value threadsPerWarp = i32_val(regToSharedLayout.getInDimSize(kLane)); + Value threadsPerWarp = i32_val(regToSharedLayout->getInDimSize(kLane)); Value laneId = urem(threadId, threadsPerWarp); Value warpId = udiv(threadId, threadsPerWarp); - int numElems = regToSharedLayout.getInDimSize(kRegister); + int numElems = regToSharedLayout->getInDimSize(kRegister); auto vecTy = vec_ty(elemLlvmTy, vecElems); auto ptrTy = shmemBase.getType(); Value zero = i32_val(0); @@ -253,7 +264,7 @@ bool emitTransferBetweenRegistersAndShared( // we drop_end to drop block, which we know from above will be 0. auto multiDimShmemOffset = llvm::to_vector(llvm::drop_end(llvm::make_second_range( - applyLinearLayout(loc, rewriter, regToSharedLayout, + applyLinearLayout(loc, rewriter, *regToSharedLayout, {{kRegister, i32_val(i * vecElems)}, {kLane, laneId}, {kWarp, warpId}, @@ -261,6 +272,7 @@ bool emitTransferBetweenRegistersAndShared( // Reorder strides according to `order`. This way they match the // multi-dimensional offsets in regToSharedLayout. + auto sharedOrder = triton::gpu::getOrder(sharedTy.getEncoding()); Value shmemOffset = dot(rewriter, loc, multiDimShmemOffset, applyPermutation(shmemStrides, sharedOrder)); auto vecAddr = gep(ptrTy, elemLlvmTy, shmemBase, shmemOffset); diff --git a/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp b/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp index 05d65ff78d30..cf0ffca18982 100644 --- a/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp +++ b/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp @@ -4,6 +4,7 @@ #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/Transforms/Passes.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" +#include "triton/Conversion/TritonGPUToLLVM/Utility.h" #include @@ -44,29 +45,38 @@ struct ClipAsyncCopySizePerThread Value src = copyOp.getSrc(); Value mask = copyOp.getMask(); Value other = copyOp.getOther(); - - auto inputTy = cast(src.getType()); - auto blockEnc = cast(inputTy.getEncoding()); - auto resultTy = cast(copyOp.getResult().getType()); - auto sharedEnc = cast(resultTy.getEncoding()); + auto srcTy = cast(src.getType()); + auto blockEnc = cast(srcTy.getEncoding()); + auto dstTy = cast(copyOp.getResult().getType()); + auto sharedEnc = cast(dstTy.getEncoding()); auto sharedVec = sharedEnc.getVec(); - // clip each dim of sizePerThread by its respective dim in vec - SmallVector newSizePerThread; - llvm::transform(blockEnc.getSizePerThread(), - std::back_inserter(newSizePerThread), - [&](auto size) { return std::min(size, sharedVec); }); + // obtain max contiguous copy size + // Note this can be further optimized, as copyContigSize can be even + // smaller when lowering, depending on contiguity and mask alignment + // (see AsyncCopyGlobalToLocalOpConversion) + auto elemBitWidth = dstTy.getElementTypeBitWidth(); + auto regToSharedLayout = getRegToSharedLayout(rewriter.getContext(), + srcTy.getShape(), blockEnc, sharedEnc, elemBitWidth); + auto copyContigSize = regToSharedLayout->getNumConsecutiveInOut(); + + // obtain block sizePerThread along contig dim + auto sizePerThread = blockEnc.getSizePerThread(); + auto blockContigSize = sizePerThread[blockEnc.getOrder()[0]]; - if (newSizePerThread == blockEnc.getSizePerThread()) + if (blockContigSize <= copyContigSize) return rewriter.notifyMatchFailure(copyOp, - "at least one dimension of blocked sizePerThread must be greater than shared vec"); + "blocked sizePerThread along contiguous dim must be greater than the " + "max contiguous copy size "); + + sizePerThread[blockEnc.getOrder()[0]] = copyContigSize; // obtain new blockedEnc based on clipped sizePerThread auto mod = copyOp->getParentOfType(); int numWarps = TritonGPUDialect::getNumWarps(mod); int threadsPerWarp = TritonGPUDialect::getThreadsPerWarp(mod); auto newBlockEnc = BlockedEncodingAttr::get( - copyOp.getContext(), inputTy.getShape(), newSizePerThread, + copyOp.getContext(), srcTy.getShape(), sizePerThread, blockEnc.getOrder(), numWarps, threadsPerWarp, blockEnc.getCTALayout()); From f1af158789cb0c1722151ca7c922301dc97f0b88 Mon Sep 17 00:00:00 2001 From: Gary Geng Date: Thu, 21 Nov 2024 21:56:11 +0000 Subject: [PATCH 3/8] Document and format --- .../triton/Conversion/TritonGPUToLLVM/Utility.h | 7 ++++--- lib/Conversion/TritonGPUToLLVM/Utility.cpp | 17 +++++++++-------- .../TritonGPU/Transforms/CoalesceAsyncCopy.cpp | 16 ++++++++-------- test/TritonGPU/coalesce-async-copy.mlir | 1 - 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/include/triton/Conversion/TritonGPUToLLVM/Utility.h b/include/triton/Conversion/TritonGPUToLLVM/Utility.h index 52372dd0effa..c51e700c6ce6 100644 --- a/include/triton/Conversion/TritonGPUToLLVM/Utility.h +++ b/include/triton/Conversion/TritonGPUToLLVM/Utility.h @@ -1129,9 +1129,10 @@ SmallVector> emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target, Attribute layout, RankedTensorType type, bool withCTAOffset); -// TODO document -std::optional getRegToSharedLayout(MLIRContext* ctx, - ArrayRef shape, Attribute srcEnc, Attribute dstEnc, int elemBitWidth); +// Returns composed LinearLayout for register to shared copy +std::optional +getRegToSharedLayout(MLIRContext *ctx, ArrayRef shape, + Attribute srcEnc, Attribute dstEnc, int elemBitWidth); // Emits IR to load data from shared memory into registers, or to store data // from registers into shared memory. diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index fe2441713943..2b687c943af4 100644 --- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -158,15 +158,16 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target, return ret; } -std::optional getRegToSharedLayout(MLIRContext* ctx, - ArrayRef shape, Attribute srcEnc, Attribute dstEnc, int elemBitWidth) { +std::optional +getRegToSharedLayout(MLIRContext *ctx, ArrayRef shape, + Attribute srcEnc, Attribute dstEnc, int elemBitWidth) { StringAttr kBlock = str_attr("block"); int rank = shape.size(); std::optional regLayout = triton::gpu::toLinearLayout(shape, srcEnc); - std::optional sharedLayout = triton::gpu::toLinearLayout( - shape, dstEnc, elemBitWidth); + std::optional sharedLayout = + triton::gpu::toLinearLayout(shape, dstEnc, elemBitWidth); if (!regLayout.has_value() || !sharedLayout.has_value()) { return std::nullopt; } @@ -175,8 +176,7 @@ std::optional getRegToSharedLayout(MLIRContext* ctx, // sharedLayout's in-dims are currently (offset, block). Reshape to // (offsetX1, offsetX2, ..., block) so that we can apply the N-dimensional // shmem strides. (The offsetX's appear in minor-to-major order.) - auto sharedLegacy = - cast(dstEnc); + auto sharedLegacy = cast(dstEnc); SmallVector> multiDimSharedSize; for (int i = 0; i < rank; i++) { int dim = sharedOrder[i]; @@ -210,8 +210,9 @@ bool emitTransferBetweenRegistersAndShared( StringAttr kLane = str_attr("lane"); StringAttr kWarp = str_attr("warp"); - auto regToSharedLayout = getRegToSharedLayout(ctx, shape, registerTy.getEncoding(), - sharedTy.getEncoding(), elemLlvmTy.getIntOrFloatBitWidth()); + auto regToSharedLayout = getRegToSharedLayout( + ctx, shape, registerTy.getEncoding(), sharedTy.getEncoding(), + elemLlvmTy.getIntOrFloatBitWidth()); if (!regToSharedLayout.has_value()) return false; diff --git a/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp b/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp index cf0ffca18982..7fcfbabcf97e 100644 --- a/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp +++ b/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp @@ -1,10 +1,10 @@ #include "mlir/Support/LLVM.h" #include "mlir/Transforms/Passes.h" #include "triton/Analysis/Utility.h" +#include "triton/Conversion/TritonGPUToLLVM/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/Transforms/Passes.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" -#include "triton/Conversion/TritonGPUToLLVM/Utility.h" #include @@ -56,8 +56,9 @@ struct ClipAsyncCopySizePerThread // smaller when lowering, depending on contiguity and mask alignment // (see AsyncCopyGlobalToLocalOpConversion) auto elemBitWidth = dstTy.getElementTypeBitWidth(); - auto regToSharedLayout = getRegToSharedLayout(rewriter.getContext(), - srcTy.getShape(), blockEnc, sharedEnc, elemBitWidth); + auto regToSharedLayout = + getRegToSharedLayout(rewriter.getContext(), srcTy.getShape(), blockEnc, + sharedEnc, elemBitWidth); auto copyContigSize = regToSharedLayout->getNumConsecutiveInOut(); // obtain block sizePerThread along contig dim @@ -65,7 +66,8 @@ struct ClipAsyncCopySizePerThread auto blockContigSize = sizePerThread[blockEnc.getOrder()[0]]; if (blockContigSize <= copyContigSize) - return rewriter.notifyMatchFailure(copyOp, + return rewriter.notifyMatchFailure( + copyOp, "blocked sizePerThread along contiguous dim must be greater than the " "max contiguous copy size "); @@ -77,8 +79,7 @@ struct ClipAsyncCopySizePerThread int threadsPerWarp = TritonGPUDialect::getThreadsPerWarp(mod); auto newBlockEnc = BlockedEncodingAttr::get( copyOp.getContext(), srcTy.getShape(), sizePerThread, - blockEnc.getOrder(), numWarps, threadsPerWarp, - blockEnc.getCTALayout()); + blockEnc.getOrder(), numWarps, threadsPerWarp, blockEnc.getCTALayout()); // insert cvt's after src, mask, and other auto convertBlockLayout = [&](Value src, BlockedEncodingAttr enc) { @@ -105,8 +106,7 @@ struct ClipAsyncCopySizePerThread }; class CoalesceAsyncCopyPass - : public impl::TritonGPUCoalesceAsyncCopyBase< - CoalesceAsyncCopyPass> { + : public impl::TritonGPUCoalesceAsyncCopyBase { public: void runOnOperation() override { ModuleOp m = getOperation(); diff --git a/test/TritonGPU/coalesce-async-copy.mlir b/test/TritonGPU/coalesce-async-copy.mlir index 6da63d6e341a..3076e640ec01 100644 --- a/test/TritonGPU/coalesce-async-copy.mlir +++ b/test/TritonGPU/coalesce-async-copy.mlir @@ -17,4 +17,3 @@ tt.func @async_copy_i8(%input: tensor<64x16x!tt.ptr, #blocked>, tt.return } } - From 2124a0626632b7908e17c9e07fcb0c644b763ca0 Mon Sep 17 00:00:00 2001 From: Gary Geng Date: Thu, 21 Nov 2024 22:24:07 +0000 Subject: [PATCH 4/8] Fix random typo --- lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp b/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp index 7fcfbabcf97e..23e4fd353824 100644 --- a/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp +++ b/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp @@ -47,7 +47,7 @@ struct ClipAsyncCopySizePerThread Value other = copyOp.getOther(); auto srcTy = cast(src.getType()); auto blockEnc = cast(srcTy.getEncoding()); - auto dstTy = cast(copyOp.getResult().getType()); + auto dstTy = cast(copyOp.getResult().getType()); auto sharedEnc = cast(dstTy.getEncoding()); auto sharedVec = sharedEnc.getVec(); From 5b0f4adc6bfcffecef4ffc80030a8e93cf1d1882 Mon Sep 17 00:00:00 2001 From: Gary Geng Date: Thu, 21 Nov 2024 23:50:25 +0000 Subject: [PATCH 5/8] Move memdesc to ttg in lit test --- test/TritonGPU/coalesce-async-copy.mlir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/TritonGPU/coalesce-async-copy.mlir b/test/TritonGPU/coalesce-async-copy.mlir index 3076e640ec01..133ea0c5e4e1 100644 --- a/test/TritonGPU/coalesce-async-copy.mlir +++ b/test/TritonGPU/coalesce-async-copy.mlir @@ -10,7 +10,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func @async_copy_i8(%input: tensor<64x16x!tt.ptr, #blocked>, - %view: !tt.memdesc<64x16xi8, #shared, #triton_gpu.shared_memory, mutable>, + %view: !triton_gpu.memdesc<64x16xi8, #shared, #triton_gpu.shared_memory, mutable>, %mask: tensor<64x16xi1, #blocked>, %other: tensor<64x16xi8, #blocked>) { %token = triton_gpu.async_copy_global_to_local %input, %view mask %mask other %other: tensor<64x16x!tt.ptr, #blocked> -> <64x16xi8, #shared, #triton_gpu.shared_memory, mutable> From d3a50e9a3658721fe708d62f942453a88683ae25 Mon Sep 17 00:00:00 2001 From: Gary Geng Date: Fri, 22 Nov 2024 18:31:33 +0000 Subject: [PATCH 6/8] Address comments --- .../Conversion/TritonGPUToLLVM/Utility.h | 5 --- .../Dialect/TritonGPU/Transforms/Utility.h | 5 +++ lib/Conversion/TritonGPUToLLVM/Utility.cpp | 37 +------------------ .../Transforms/CoalesceAsyncCopy.cpp | 15 +++++--- lib/Dialect/TritonGPU/Transforms/Utility.cpp | 36 ++++++++++++++++++ 5 files changed, 51 insertions(+), 47 deletions(-) diff --git a/include/triton/Conversion/TritonGPUToLLVM/Utility.h b/include/triton/Conversion/TritonGPUToLLVM/Utility.h index c51e700c6ce6..ba24461a1f6d 100644 --- a/include/triton/Conversion/TritonGPUToLLVM/Utility.h +++ b/include/triton/Conversion/TritonGPUToLLVM/Utility.h @@ -1129,11 +1129,6 @@ SmallVector> emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target, Attribute layout, RankedTensorType type, bool withCTAOffset); -// Returns composed LinearLayout for register to shared copy -std::optional -getRegToSharedLayout(MLIRContext *ctx, ArrayRef shape, - Attribute srcEnc, Attribute dstEnc, int elemBitWidth); - // Emits IR to load data from shared memory into registers, or to store data // from registers into shared memory. // diff --git a/include/triton/Dialect/TritonGPU/Transforms/Utility.h b/include/triton/Dialect/TritonGPU/Transforms/Utility.h index f1e361f64d66..0f6bd57afaf1 100644 --- a/include/triton/Dialect/TritonGPU/Transforms/Utility.h +++ b/include/triton/Dialect/TritonGPU/Transforms/Utility.h @@ -202,6 +202,11 @@ enum class MMALoadType { // pipelining }; MMALoadType getMMALoadType(Operation *loadOp); + +// Returns composed LinearLayout for register to shared copy +std::optional +getRegToSharedLayout(MLIRContext *ctx, ArrayRef shape, + Attribute srcEnc, Attribute dstEnc, int elemBitWidth); } // namespace mlir #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_ diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index 2b687c943af4..49f05a758e42 100644 --- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -4,6 +4,7 @@ #include "triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h" #include "triton/Dialect/TritonGPU/IR/Attributes.h" #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" +#include "triton/Dialect/TritonGPU/Transforms/Utility.h" #include "llvm/ADT/STLExtras.h" namespace mlir { @@ -158,42 +159,6 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target, return ret; } -std::optional -getRegToSharedLayout(MLIRContext *ctx, ArrayRef shape, - Attribute srcEnc, Attribute dstEnc, int elemBitWidth) { - StringAttr kBlock = str_attr("block"); - int rank = shape.size(); - - std::optional regLayout = - triton::gpu::toLinearLayout(shape, srcEnc); - std::optional sharedLayout = - triton::gpu::toLinearLayout(shape, dstEnc, elemBitWidth); - if (!regLayout.has_value() || !sharedLayout.has_value()) { - return std::nullopt; - } - auto sharedOrder = triton::gpu::getOrder(dstEnc); - - // sharedLayout's in-dims are currently (offset, block). Reshape to - // (offsetX1, offsetX2, ..., block) so that we can apply the N-dimensional - // shmem strides. (The offsetX's appear in minor-to-major order.) - auto sharedLegacy = cast(dstEnc); - SmallVector> multiDimSharedSize; - for (int i = 0; i < rank; i++) { - int dim = sharedOrder[i]; - int64_t size = std::max( - int64_t{1}, - shape[dim] / sharedLegacy.getCTALayout().getCTASplitNum()[dim]); - multiDimSharedSize.push_back( - {str_attr("offset" + std::to_string(dim)), size}); - } - multiDimSharedSize.push_back({kBlock, sharedLayout->getInDimSize(kBlock)}); - sharedLayout = sharedLayout->reshapeIns(multiDimSharedSize); - - // regToSharedLayout maps from (register, lane, warp, block) to (offsetX1, - // ..., offsetXN, block), where the offsetX's are in minor-to-major order. - return regLayout->invertAndCompose(*sharedLayout); -} - bool emitTransferBetweenRegistersAndShared( RankedTensorType registerTy, triton::gpu::MemDescType sharedTy, Type elemLlvmTy, std::optional maxVecElems, Value shmemBase, diff --git a/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp b/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp index 23e4fd353824..cb6ffbe55aad 100644 --- a/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp +++ b/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp @@ -46,8 +46,11 @@ struct ClipAsyncCopySizePerThread Value mask = copyOp.getMask(); Value other = copyOp.getOther(); auto srcTy = cast(src.getType()); - auto blockEnc = cast(srcTy.getEncoding()); auto dstTy = cast(copyOp.getResult().getType()); + auto blockEnc = dyn_cast(srcTy.getEncoding()); + if (!blockEnc) + return rewriter.notifyMatchFailure(copyOp, + "src must be of blocked encoding"); auto sharedEnc = cast(dstTy.getEncoding()); auto sharedVec = sharedEnc.getVec(); @@ -95,11 +98,11 @@ struct ClipAsyncCopySizePerThread if (other) other = convertBlockLayout(other, newBlockEnc); - // replace the asyncCopy - auto newCopyOp = rewriter.create( - copyOp.getLoc(), src, copyOp.getResult(), mask, other, - copyOp.getCache(), copyOp.getEvict(), copyOp.getIsVolatile()); - rewriter.replaceOp(copyOp, newCopyOp); + rewriter.modifyOpInPlace(copyOp, [&]() { + copyOp.getSrcMutable().assign(src); + copyOp.getMaskMutable().assign(mask); + copyOp.getOtherMutable().assign(other); + }); return success(); } diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp index b8f3abfcaca8..7effc18825aa 100644 --- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp @@ -1153,4 +1153,40 @@ void populateForOpDeadArgumentElimination(RewritePatternSet &patterns) { patterns.add(patterns.getContext()); } +std::optional +getRegToSharedLayout(MLIRContext *ctx, ArrayRef shape, + Attribute srcEnc, Attribute dstEnc, int elemBitWidth) { + StringAttr kBlock = StringAttr::get(ctx, ("block")); + int rank = shape.size(); + + std::optional regLayout = + triton::gpu::toLinearLayout(shape, srcEnc); + std::optional sharedLayout = + triton::gpu::toLinearLayout(shape, dstEnc, elemBitWidth); + if (!regLayout.has_value() || !sharedLayout.has_value()) { + return std::nullopt; + } + auto sharedOrder = triton::gpu::getOrder(dstEnc); + + // sharedLayout's in-dims are currently (offset, block). Reshape to + // (offsetX1, offsetX2, ..., block) so that we can apply the N-dimensional + // shmem strides. (The offsetX's appear in minor-to-major order.) + auto sharedLegacy = cast(dstEnc); + SmallVector> multiDimSharedSize; + for (int i = 0; i < rank; i++) { + int dim = sharedOrder[i]; + int64_t size = std::max( + int64_t{1}, + shape[dim] / sharedLegacy.getCTALayout().getCTASplitNum()[dim]); + multiDimSharedSize.push_back( + {StringAttr::get(ctx, ("offset" + std::to_string(dim))), size}); + } + multiDimSharedSize.push_back({kBlock, sharedLayout->getInDimSize(kBlock)}); + sharedLayout = sharedLayout->reshapeIns(multiDimSharedSize); + + // regToSharedLayout maps from (register, lane, warp, block) to (offsetX1, + // ..., offsetXN, block), where the offsetX's are in minor-to-major order. + return regLayout->invertAndCompose(*sharedLayout); +} + } // namespace mlir From 82e9f63246e52fe14751b44a88bd80d1171d10e7 Mon Sep 17 00:00:00 2001 From: Gary Geng Date: Sat, 23 Nov 2024 02:14:29 +0000 Subject: [PATCH 7/8] Fix bug and add test --- .../TritonGPU/Transforms/CoalesceAsyncCopy.cpp | 6 ++++-- test/TritonGPU/coalesce-async-copy.mlir | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp b/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp index cb6ffbe55aad..8b3a1ef5cb2f 100644 --- a/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp +++ b/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp @@ -100,8 +100,10 @@ struct ClipAsyncCopySizePerThread rewriter.modifyOpInPlace(copyOp, [&]() { copyOp.getSrcMutable().assign(src); - copyOp.getMaskMutable().assign(mask); - copyOp.getOtherMutable().assign(other); + if (mask) + copyOp.getMaskMutable().assign(mask); + if (other) + copyOp.getOtherMutable().assign(other); }); return success(); diff --git a/test/TritonGPU/coalesce-async-copy.mlir b/test/TritonGPU/coalesce-async-copy.mlir index 133ea0c5e4e1..4707ddaca9cb 100644 --- a/test/TritonGPU/coalesce-async-copy.mlir +++ b/test/TritonGPU/coalesce-async-copy.mlir @@ -17,3 +17,19 @@ tt.func @async_copy_i8(%input: tensor<64x16x!tt.ptr, #blocked>, tt.return } } + +// ----- + +// CHECK: #[[NEW_BLOCKED:.*]] = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [16, 2], warpsPerCTA = [4, 1], order = [1, 0]}> +// CHECK: %{{.*}} = triton_gpu.convert_layout %{{.*}} : {{.*}} -> tensor<64x16x!tt.ptr, #[[NEW_BLOCKED]]> +// CHECK: %{{.*}} = triton_gpu.async_copy_global_to_local %{{.*}}: tensor<64x16x!tt.ptr, #[[NEW_BLOCKED]]> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { +tt.func @async_copy_i8_no_mask_or_other(%input: tensor<64x16x!tt.ptr, #blocked>, + %view: !triton_gpu.memdesc<64x16xi8, #shared, #triton_gpu.shared_memory, mutable>) { + %token = triton_gpu.async_copy_global_to_local %input, %view : tensor<64x16x!tt.ptr, #blocked> -> <64x16xi8, #shared, #triton_gpu.shared_memory, mutable> + tt.return +} +} From 3d9be5a5e8b92a7ae9166a0aaf0019afa07fd773 Mon Sep 17 00:00:00 2001 From: Gary Geng Date: Mon, 25 Nov 2024 19:31:01 +0000 Subject: [PATCH 8/8] Remove unused includes --- lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp b/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp index 8b3a1ef5cb2f..2d634fc6fa7b 100644 --- a/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp +++ b/lib/Dialect/TritonGPU/Transforms/CoalesceAsyncCopy.cpp @@ -1,15 +1,9 @@ #include "mlir/Support/LLVM.h" #include "mlir/Transforms/Passes.h" #include "triton/Analysis/Utility.h" -#include "triton/Conversion/TritonGPUToLLVM/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" -#include "triton/Dialect/TritonGPU/Transforms/Passes.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" -#include - -namespace tt = mlir::triton; - namespace mlir { namespace triton { namespace gpu {