From 4173051347387cf14dcbcb6078886494a460ef0a Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Sat, 14 Jun 2025 22:21:07 -0700 Subject: [PATCH 01/23] [Backend] Add a PaddedSharedEncodingAttr definition --- .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 101 ++++++++++- lib/Dialect/TritonGPU/IR/Dialect.cpp | 166 +++++++++++++++--- test/TritonGPU/invalid-attributes.mlir | 30 ++++ 3 files changed, 263 insertions(+), 34 deletions(-) diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td index d97f18785543..a8d3143312d4 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td @@ -167,13 +167,14 @@ def SharedEncodingTrait : AttrInterface<"SharedEncodingTrait"> { ]; } -def SwizzledSharedEncodingAttr : - TritonGPU_Attr<"SwizzledSharedEncoding", "swizzled_shared_encoding", [SharedEncodingTrait, LayoutEncodingTrait]> { +def SwizzledSharedEncodingAttr + : TritonGPU_Attr<"SwizzledSharedEncoding", "swizzled_shared_encoding", + [SharedEncodingTrait, LayoutEncodingTrait]> { let mnemonic = "swizzled_shared"; let description = [{ An encoding for tensors whose elements may be simultaneously accessed by -different cuda threads in the programs, via shared memory. In other words, +different GPU threads in the programs, via shared memory. In other words, for all indices i \in Z^d, \mathcal{L}(i) = {0, 1, ..., 32*num_warps - 1}. In order to avoid shared memory bank conflicts, elements may be swizzled. @@ -181,7 +182,7 @@ Here are some examples. In all cases, the input tensor is [0, 1, ..., n-1]. 1. Basic swizzling - #shared<{vec=1, perPhase=1, maxPhase=4, order=[1,0]}> + #ttg.swizzled_shared<{vec=1, perPhase=1, maxPhase=4, order=[1,0]}> [ 0, 1, 2, 3], // xor with 0 [ 5, 4, 7, 6], // xor with 1 [10, 11, 8, 9], // xor with 2 @@ -192,7 +193,7 @@ out[r][c^r]). 2. 
Multiple rows per phase - #shared<{vec=1, perPhase=2, maxPhase=4, order=[1,0]}> + #ttg.swizzled_shared<{vec=1, perPhase=2, maxPhase=4, order=[1,0]}> [ 0, 1, 2, 3], // phase 0 (xor with 0) [ 4, 5, 6, 7], [ 9, 8, 11, 10], // phase 1 (xor with 1) @@ -203,7 +204,7 @@ means that pairs of 2 rows get the same swizzling. 3. Max-phase applied - $shared<{vec=1, perPhase=1, maxPhase=2, order=[1,0]}> + #ttg.swizzled_shared<{vec=1, perPhase=1, maxPhase=2, order=[1,0]}> [ 0, 1, 2, 3], // phase 0 (xor with 0) [ 5, 4, 7, 6], // phase 1 (xor with 1) [ 8, 9, 10, 11], // phase 0 @@ -218,7 +219,7 @@ effect of limiting the maximum value of the xor to m-1. 4. Max-phase and per-phase - #shared<{vec=1, perPhase=2, maxPhase=2, order=[1,0]}> + #ttg.swizzled_shared<{vec=1, perPhase=2, maxPhase=2, order=[1,0]}> [ 0, 1, 2, 3], // phase 0 (xor with 0) [ 4, 5, 6, 7], // phase 0 [ 9, 8, 11, 10], // phase 1 (xor with 1) @@ -234,7 +235,7 @@ maximum value of maxPhase-1. In other words, elements of row r are xor'ed with 5. Adding vec - #shared<{vec=2, perPhase=1, maxPhase=4, order=[1,0]}> + #ttg.swizzled_shared<{vec=2, perPhase=1, maxPhase=4, order=[1,0]}> [ 0, 1, 2, 3, 4, 5, 6, 7], [10, 11, 8, 9, 14, 15, 12, 13], [20, 21, 22, 23, 16, 17, 18, 19], @@ -372,7 +373,7 @@ When vec=2, elements are swizzled in pairs of 2. In other words, the element at }]>, ]; - let extraClassDeclaration = extraBaseClassDeclaration # [{ + let extraClassDeclaration = extraBaseClassDeclaration#[{ unsigned getRank() const { return getCTAOrder().size(); } int32_t getAlignment() const; SmallVector getCTAsPerCGA() const; @@ -383,6 +384,88 @@ When vec=2, elements are swizzled in pairs of 2. 
In other words, the element at let genVerifyDecl = 1; } +def PaddeddSharedEncodingAttr + : TritonGPU_Attr<"PaddedSharedEncoding", "padded_shared_encoding", + [SharedEncodingTrait, LayoutEncodingTrait]> { + let mnemonic = "padded_shared"; + + let description = [{ +An encoding for tensors whose elements may be simultaneously accessed by +different GPU threads in the programs, via shared memory. In other words, +for all indices i \in Z^d, \mathcal{L}(i) = {0, 1, ..., 32*num_warps - 1}. +Compared to SwizzledSharedEncodingAttr, this encoding uses padding to avoid +shared memory bank conflicts. + +Formally, given a layout: + padded_shared<[<interval_0>:+<pad_0>, <interval_1>:+<pad_1>, ...]> +We insert a padding of `<pad_i>` elements after every `<interval_i>` elements. +Multi interval-pad pairs are supported for flexibility of multi tiered padding +schemes; they compose in an additive manner. So for a 1-D tensor element at +index i, the corresponding shared memory location index is + i + \sum_{k} \lfloor i / interval_k \rfloor * pad_k +`<interval_i>` and `<pad_i>` all need to be powers of two. + +Some concrete examples, using `eM` to mean tensor elements and `pN` to mean +padding: + +1. Single interval-pad pair: + + #ttg.padded_shared<[2:+2]> + [e0, e1, p0, p1, + e2, e3, p2, p3, + ...] + +2. Double interval-pad pairs: + + #ttg.padded_shared<[2:+1, 4:+2]> + [e0, e1, p0, + e2, e3, p1, p2, p3, + e4, e5, p4, + e6, e7, p5, p6, p7, + ...] + +In addition to interval-pad pairs, this encoding requires an `order` to +specify the logical tensor dimensions from the fastest- to slowest-varying. +It may optionally support CGA level organization like other encoding +attributes too, for example, + #ttg.padded_shared<[2:+1, 4:+2] { + order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], + CTAOrder = [0, 1]}> + }]; + + let parameters = (ins ArrayRefParameter<"unsigned">:$intervals, + ArrayRefParameter<"unsigned">:$paddings, + // Order of logical tensor dimensions; fastest-varying first.
+ ArrayRefParameter<"unsigned">:$order, "CTALayoutAttr":$CTALayout); + + let builders = + [AttrBuilder<(ins "ArrayRef>":$intervalPads, + "ArrayRef":$order, "CTALayoutAttr":$ctaLayout), + [{ + SmallVector intervals, paddings; + intervals.reserve(intervalPads.size()); + paddings.reserve(intervalPads.size()); + for (auto [interval, padding] : intervalPads) { + intervals.push_back(interval); + paddings.push_back(padding); + } + return get(context, intervals, paddings, order, ctaLayout); + }]>, + ]; + + let extraClassDeclaration = extraBaseClassDeclaration#[{ + int32_t getAlignment() const { return 16; } + + unsigned getRank() const { return getOrder().size(); } + + SmallVector getCTAsPerCGA() const; + SmallVector getCTAOrder() const; + SmallVector getCTASplitNum() const; + }]; + let hasCustomAssemblyFormat = 1; + let genVerifyDecl = 1; +} + def NVMMASharedEncodingAttr : TritonGPU_Attr<"NVMMASharedEncoding", "nvmma_shared_encoding", [SharedEncodingTrait, LayoutEncodingTrait]> { let mnemonic = "nvmma_shared"; diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index 90f0f502e1ae..c85d1783fcd4 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -5,6 +5,7 @@ #include "mlir/IR/DialectImplementation.h" #include "mlir/IR/OpImplementation.h" +#include "mlir/IR/OperationSupport.h" #include "mlir/Support/LLVM.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/Triton/IR/Interfaces.h" @@ -20,7 +21,9 @@ #include "triton/Tools/LinearLayout.h" #include "triton/Tools/StrUtil.h" #include "triton/Tools/Sys/GetEnv.hpp" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/MathExtras.h" // Include TableGen'erated code #include "triton/Dialect/TritonGPU/IR/Dialect.cpp.inc" @@ -641,6 +644,16 @@ SmallVector SwizzledSharedEncodingAttr::getCTASplitNum() const { return SmallVector(getCTALayout().getCTASplitNum()); } +SmallVector 
PaddedSharedEncodingAttr::getCTAsPerCGA() const { + return llvm::to_vector(getCTALayout().getCTAsPerCGA()); +} +SmallVector PaddedSharedEncodingAttr::getCTAOrder() const { + return llvm::to_vector(getCTALayout().getCTAOrder()); +} +SmallVector PaddedSharedEncodingAttr::getCTASplitNum() const { + return llvm::to_vector(getCTALayout().getCTASplitNum()); +} + int32_t AMDRotatingSharedEncodingAttr::getAlignment() const { return 16; } SmallVector AMDRotatingSharedEncodingAttr::getCTAsPerCGA() const { @@ -1492,6 +1505,35 @@ void SliceEncodingAttr::print(mlir::AsmPrinter &printer) const { // Helper shared encoding functions //===----------------------------------------------------------------------===// +std::optional +parseCTAAttrs(AsmParser &parser, NamedAttrList attrList, unsigned rank) { + std::optional> CTAsPerCGA; + std::optional> CTASplitNum; + std::optional> CTAOrder; + + for (const NamedAttribute &attr : attrList) { + if (attr.getName() == "CTAsPerCGA") { + if (parseIntArrayAttr(parser, attr, CTAsPerCGA.emplace(), "CTAsPerCGA") + .failed()) + return {}; + } else if (attr.getName() == "CTASplitNum") { + if (parseIntArrayAttr(parser, attr, CTASplitNum.emplace(), "CTASplitNum") + .failed()) + return {}; + } else if (attr.getName() == "CTAOrder") { + if (parseIntArrayAttr(parser, attr, CTAOrder.emplace(), "CTAOrder") + .failed()) + return {}; + } else { + parser.emitError(parser.getNameLoc(), "unexpected key: ") + << attr.getName().strref(); + return {}; + } + } + + return getCTALayoutOrError(parser, CTAsPerCGA, CTASplitNum, CTAOrder, rank); +} + template Attribute parseSwizzledEncoding(AsmParser &parser, Type type) { if (parser.parseLess().failed()) @@ -1507,9 +1549,7 @@ Attribute parseSwizzledEncoding(AsmParser &parser, Type type) { unsigned perPhase = 0; unsigned maxPhase = 0; SmallVector order; - std::optional> CTAsPerCGA; - std::optional> CTASplitNum; - std::optional> CTAOrder; + NamedAttrList remainingAttrs; for (const NamedAttribute &attr : dict) { if 
(attr.getName() == "vec") { if (parseUInt(parser, attr, vec, "vec").failed()) @@ -1523,32 +1563,15 @@ Attribute parseSwizzledEncoding(AsmParser &parser, Type type) { } else if (attr.getName() == "order") { if (parseIntArrayAttr(parser, attr, order, "order").failed()) return {}; - } else if (attr.getName() == "CTAsPerCGA") { - if (parseIntArrayAttr(parser, attr, CTAsPerCGA.emplace(), "CTAsPerCGA") - .failed()) - return {}; - } else if (attr.getName() == "CTASplitNum") { - if (parseIntArrayAttr(parser, attr, CTASplitNum.emplace(), "CTASplitNum") - .failed()) - return {}; - } else if (attr.getName() == "CTAOrder") { - if (parseIntArrayAttr(parser, attr, CTAOrder.emplace(), "CTAOrder") - .failed()) - return {}; } else { - parser.emitError(parser.getNameLoc(), "unexpected key: ") - << attr.getName().strref(); - return {}; + remainingAttrs.push_back(attr); } } - std::optional CTALayout = getCTALayoutOrError( - parser, CTAsPerCGA, CTASplitNum, CTAOrder, /*rank=*/order.size()); - if (!CTALayout.has_value()) - return {}; - - return parser.getChecked(parser.getContext(), vec, perPhase, - maxPhase, order, *CTALayout); + if (auto CTALayout = parseCTAAttrs(parser, remainingAttrs, order.size())) + return parser.getChecked( + parser.getContext(), vec, perPhase, maxPhase, order, *CTALayout); + return {}; } //===----------------------------------------------------------------------===// @@ -1583,6 +1606,99 @@ void SwizzledSharedEncodingAttr::print(AsmPrinter &printer) const { printer << "}>"; } +//===----------------------------------------------------------------------===// +// PaddedShared encoding +//===----------------------------------------------------------------------===// + +Attribute PaddedSharedEncodingAttr::parse(AsmParser &parser, Type type) { + // <[ + if (failed(parser.parseLess()) || failed(parser.parseLSquare())) + return {}; + + // :+ + SmallVector intervals, paddings; + auto parseIntervalPaddingPair = [&]() { + unsigned interval = 0, padding = 0; + if 
(failed(parser.parseInteger(interval)) || failed(parser.parseColon()) || + failed(parser.parsePlus()) || failed(parser.parseInteger(padding))) + return failure(); + intervals.push_back(interval); + paddings.push_back(padding); + return success(); + }; + // ] + if (failed(parser.parseCommaSeparatedList(parseIntervalPaddingPair)) || + failed(parser.parseRSquare())) + return {}; + + // {}> + NamedAttrList attrList; + if (failed(parser.parseOptionalAttrDict(attrList)) || + failed(parser.parseGreater())) + return {}; + + // Decode order and CTA attributes + SmallVector order; + NamedAttrList remainingAttrs; + for (const NamedAttribute &attr : attrList) { + if (attr.getName() == "order") { + if (parseIntArrayAttr(parser, attr, order, "order").failed()) + return {}; + } else { + remainingAttrs.push_back(attr); + } + } + if (auto ctaLayout = parseCTAAttrs(parser, remainingAttrs, order.size())) + return parser.getChecked( + parser.getContext(), intervals, paddings, order, *ctaLayout); + return {}; +} + +void PaddedSharedEncodingAttr::print(AsmPrinter &printer) const { + printer << "<["; + llvm::interleaveComma(llvm::zip(getIntervals(), getPaddings()), printer, + [&](std::tuple intervalPad) { + printer << std::get<0>(intervalPad) << ":+" + << std::get<1>(intervalPad); + }); + printer << "] {order = [" << getOrder() << "]"; + maybePrintCTALayout(getContext(), printer, getCTALayout(), + /*rank=*/getOrder().size()); + printer << "}>"; +} + +LogicalResult PaddedSharedEncodingAttr::verify( + function_ref emitError, ArrayRef intervals, + ArrayRef paddings, ArrayRef order, + CTALayoutAttr ctaLayout) { + if (intervals.size() != paddings.size()) + return emitError() << "intervals size (" << intervals.size() + << ") must match paddings size (" << paddings.size() + << ")"; + + if (intervals.empty()) + return emitError() << "must have at least one interval-padding pair"; + + if (!llvm::all_of(intervals, llvm::isPowerOf2_32)) + return emitError() << "interval values must all be power of 
two"; + if (!llvm::all_of(paddings, llvm::isPowerOf2_32)) + return emitError() << "padding values must all be power of two"; + + llvm::SmallSet intervalValues(intervals.begin(), + intervals.end()); + if (intervalValues.size() != intervals.size()) + return emitError() << "interval values cannot have duplicates"; + + if (order.empty()) + return emitError() << "order cannot be empty"; + + if (order.size() != ctaLayout.getRank()) + return emitError() << "order size (" << order.size() + << ") must match CTALayout rank (" << ctaLayout.getRank() + << ")"; + return verifyLayoutOrder(emitError, order); +} + //===----------------------------------------------------------------------===// // NVMMAShared encoding //===----------------------------------------------------------------------===// diff --git a/test/TritonGPU/invalid-attributes.mlir b/test/TritonGPU/invalid-attributes.mlir index df693a6ea81c..3a2aac907096 100644 --- a/test/TritonGPU/invalid-attributes.mlir +++ b/test/TritonGPU/invalid-attributes.mlir @@ -76,3 +76,33 @@ // expected-error@+1 {{(M, N) cases other than (32, 32) or (16, 16) unimplemented}} #mfma = #ttg.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 1, 1], instrShape = [16, 8], isTransposed = false}> + +// ----- + +// expected-error@+1 {{interval values must all be power of two}} +#shared = #ttg.padded_shared<[3:+2]> + +// ----- + +// expected-error@+1 {{padding values must all be power of two}} +#shared = #ttg.padded_shared<[2:+3]> + +// ----- + +// expected-error@+1 {{interval values cannot have duplicates}} +#shared = #ttg.padded_shared<[2:+1, 2:+4]> + +// ----- + +// expected-error@+1 {{order cannot be empty}} +#shared = #ttg.padded_shared<[2:+1, 4:+2]> + +// ----- + +// expected-error@+1 {{unexpected key: unknown}} +#shared = #ttg.padded_shared<[2:+1, 4:+2] {order = [1, 0], unknown = 5}> + +// ----- + +// expected-error@+1 {{order size (3) must match CTALayout rank (2)}} +#shared = #ttg.padded_shared<[2:+1, 4:+2] {order = [2, 1, 0], 
CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}> From 8e8bb8817e0737bf21b1e3509d7044bb1b42dc06 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Sun, 15 Jun 2025 11:31:50 -0700 Subject: [PATCH 02/23] Support PaddedSharedEncodingAttr in LLVM lowering --- .../Conversion/TritonGPUToLLVM/Utility.h | 6 --- .../TritonGPU/IR/LinearLayoutConversions.h | 2 + .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 3 +- include/triton/Tools/LinearLayout.h | 14 ++++++ lib/Conversion/TritonGPUToLLVM/Utility.cpp | 45 +++++++++++-------- .../TritonGPU/IR/LinearLayoutConversions.cpp | 24 +++++++--- lib/Tools/LinearLayout.cpp | 16 ++++++- .../SharedToDotOperandWMMA.cpp | 5 ++- .../lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp | 3 +- 9 files changed, 84 insertions(+), 34 deletions(-) diff --git a/include/triton/Conversion/TritonGPUToLLVM/Utility.h b/include/triton/Conversion/TritonGPUToLLVM/Utility.h index 04ca702fc932..fc8ec9d3efd1 100644 --- a/include/triton/Conversion/TritonGPUToLLVM/Utility.h +++ b/include/triton/Conversion/TritonGPUToLLVM/Utility.h @@ -537,12 +537,6 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target, const TargetInfoBase &target, std::function perVectorCallback); -[[nodiscard]] bool emitTransferBetweenRegistersAndShared( - LinearLayout ®Layout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy, - std::optional maxVecElems, const SharedMemoryObject &smemObj, - Location loc, RewriterBase &rewriter, const TargetInfoBase &target, - std::function perVectorCallback); - [[nodiscard]] bool emitTransferBetweenRegistersAndShared( LinearLayout ®Layout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy, std::optional maxVecElems, const SharedMemoryObject &smemObj, diff --git a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h index e458d425be39..8bec87dd57be 100644 --- a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h +++ 
b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h @@ -49,6 +49,8 @@ class AMDMfmaEncodingAttr; // Returns std::nullopt if the given layout can't be converted to an LL. LinearLayout toLinearLayout(ArrayRef shape, Attribute layout); +PaddedLayout toPaddedLayout(ArrayRef shape, Attribute layout); + // Convert the shared encoding of a tensor with `nvmma_shared` layout to a // LinearLayout that maps from a linear shared memory offset to tensor index. // diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td index a8d3143312d4..a436cb4a0e61 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td @@ -454,9 +454,10 @@ attributes too, for example, ]; let extraClassDeclaration = extraBaseClassDeclaration#[{ + unsigned getRank() const { return getOrder().size(); } int32_t getAlignment() const { return 16; } - unsigned getRank() const { return getOrder().size(); } + PaddedLayout toPaddedLayout(ArrayRef shape) const; SmallVector getCTAsPerCGA() const; SmallVector getCTAOrder() const; diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h index 30735db0a308..6359975c2f03 100644 --- a/include/triton/Tools/LinearLayout.h +++ b/include/triton/Tools/LinearLayout.h @@ -840,6 +840,20 @@ class ColumnAction { std::string toString() const; }; +class PaddedLayout { +public: + PaddedLayout(LinearLayout linearMapping, ArrayRef intervals, + ArrayRef paddings); + + const LinearLayout &getLinearMapping() const { return linearMapping; } + + std::optional getMinInterval() const; + +private: + LinearLayout linearMapping; + SmallVector> intervalPads; +}; + } // namespace mlir::triton #endif // TRITON_TOOLS_LINEARLAYOUT_H diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index 6323bbff6047..24db4e9c5c33 100644 --- 
a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -8,6 +8,7 @@ #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/Support/MathExtras.h" #if defined(_MSC_VER) && !defined(__clang__) // from https://gist.github.com/pps83/3210a2f980fd02bb2ba2e5a1fc4a2ef0 @@ -469,6 +470,17 @@ Value getSmemVecAddr(const LinearLayout ®Layout, Value baseToAllocBaseDist = dot(rewriter, loc, smemOffsets, smemStrides); smemOffset = b.sub(smemOffset, baseToAllocBaseDist); } + if (auto paddedLayout = + dyn_cast(sharedEnc)) { + Value padOffset = b.i32_val(0); + for (auto [interval, padding] : llvm::zip_equal( + paddedLayout.getIntervals(), paddedLayout.getPaddings())) { + Value iVal = b.i32_val(llvm::Log2_32(interval)); + Value pVal = b.i32_val(llvm::Log2_32(padding)); + padOffset = b.add(padOffset, b.shl(b.ashr(smemOffset, iVal), pVal)); + } + smemOffset = b.add(smemOffset, padOffset); + } auto ptrTy = smemBase.getType(); auto vecAddr = b.gep(ptrTy, elemLlvmTy, smemBase, smemOffset, LLVM::GEPNoWrapFlags::inbounds); @@ -477,17 +489,6 @@ Value getSmemVecAddr(const LinearLayout ®Layout, } // namespace -bool emitTransferBetweenRegistersAndShared( - LinearLayout ®Layout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy, - std::optional maxVecElems, const SharedMemoryObject &smemObj, - Location loc, RewriterBase &rewriter, const TargetInfoBase &target, - std::function perVectorCallback) { - auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc); - return emitTransferBetweenRegistersAndShared( - regLayout, sharedTy, elemLlvmTy, maxVecElems, smemObj, loc, rewriter, - target, laneId, warpId, perVectorCallback); -} - bool emitTransferBetweenRegistersAndShared( LinearLayout ®Layout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy, std::optional maxVecElems, const SharedMemoryObject &smemObj, @@ -503,9 +504,10 @@ bool 
emitTransferBetweenRegistersAndShared( StringAttr kWarp = str_attr("warp"); auto shape = sharedTy.getShape(); - LinearLayout sharedLayout = - triton::gpu::toLinearLayout(shape, sharedTy.getEncoding()); - LinearLayout regToSharedLayout = regLayout.invertAndCompose(sharedLayout); + PaddedLayout sharedLayout = + triton::gpu::toPaddedLayout(shape, sharedTy.getEncoding()); + LinearLayout regToSharedLayout = + regLayout.invertAndCompose(sharedLayout.getLinearMapping()); // TODO(jlebar): We don't currently support loading from shared memory in a // different CTA. We'd need to emit `mapa.shared::cluster` instructions. @@ -530,9 +532,10 @@ bool emitTransferBetweenRegistersAndShared( // // It's OK if the vector width we choose here is wider than the hardware // supports; LLVM will legalize it. - const int vecElems = - std::min(regToSharedLayout.getNumConsecutiveInOut(), - maxVecElems.value_or(std::numeric_limits::max())); + const int vecElems = std::min( + {regToSharedLayout.getNumConsecutiveInOut(), + sharedLayout.getMinInterval().value_or(std::numeric_limits::max()), + maxVecElems.value_or(std::numeric_limits::max())}); auto withCTAOffset = triton::gpu::getNumCTAs(sharedTy.getEncoding()) > 1; Value blockId = @@ -572,9 +575,10 @@ bool emitTransferBetweenRegistersAndShared( std::function perVectorCallback) { auto regLayout = triton::gpu::toLinearLayout(registerTy.getShape(), registerTy.getEncoding()); + auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc); return emitTransferBetweenRegistersAndShared( regLayout, sharedTy, elemLlvmTy, maxVecElems, smemObj, loc, rewriter, - target, perVectorCallback); + target, laneId, warpId, perVectorCallback); } SmallVector loadSharedToDistributed(triton::gpu::LocalLoadOp localLoadOp, @@ -762,10 +766,13 @@ bool isSimpleSharedMemoryAccess(ArrayRef shape, ArrayRef allocShape, triton::gpu::SharedEncodingTrait sharedEnc) { auto rank = shape.size(); + auto paddedLayout = + dyn_cast(sharedEnc); auto swizzledLayout = dyn_cast(sharedEnc); auto 
nvmmaLayout = dyn_cast(sharedEnc); - bool noSwizzling = (swizzledLayout && swizzledLayout.getMaxPhase() == 1) || + bool noSwizzling = paddedLayout || + (swizzledLayout && swizzledLayout.getMaxPhase() == 1) || (nvmmaLayout && nvmmaLayout.getSwizzlingByteWidth() == 0); return /*no swizzling*/ noSwizzling || /*swizzling but same shape*/ shape == allocShape || diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp index 0ac56a8a78ef..0739496baa03 100644 --- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp +++ b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp @@ -1,7 +1,5 @@ #include -#include "triton/Dialect/Triton/IR/Dialect.h" -#include "triton/Dialect/Triton/IR/Utility.h" #include "triton/Dialect/TritonGPU/IR/Attributes.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" @@ -11,13 +9,10 @@ #include "triton/Tools/LinearLayout.h" #include "triton/Tools/StrUtil.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" -using mlir::triton::ScaleDotElemType; - namespace mlir::triton::gpu { namespace { @@ -274,6 +269,15 @@ LinearLayout getCoreMatrixLinearLayout(NVMMASharedEncodingAttr shared, } // namespace +PaddedLayout +PaddedSharedEncodingAttr::toPaddedLayout(ArrayRef shape) const { + auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get( + getContext(), /*vec=*/1, /*perPhase=*/1, /*maxPhase=*/1, getOrder(), + getCTALayout()); + LinearLayout ll = swizzledSharedToLinearLayout(shape, nonSwizzleAttr); + return PaddedLayout(ll, getIntervals(), getPaddings()); +} + LinearLayout nvmmaSharedToLinearLayout(ArrayRef shape, NVMMASharedEncodingAttr shared, bool disableSwizzle) { @@ -1125,6 +1129,16 @@ LinearLayout toLinearLayout(ArrayRef shape, Attribute layout) { layout); } +PaddedLayout toPaddedLayout(ArrayRef shape, Attribute 
layout) { + auto *ctx = layout.getContext(); + if (auto paddedLayout = dyn_cast(layout)) { + return paddedLayout.toPaddedLayout(shape); + } + auto ll = + ctx->getLoadedDialect()->toLinearLayout(shape, layout); + return PaddedLayout(ll, /*intervals=*/{}, /*paddings=*/{}); +} + LinearLayout getLayoutWithinBlock(const LinearLayout &layout) { assert(!layout.getInDimNames().empty()); MLIRContext *ctx = layout.getInDimNames().begin()->getContext(); diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp index eea4b0f6bf56..0958e3292e4a 100644 --- a/lib/Tools/LinearLayout.cpp +++ b/lib/Tools/LinearLayout.cpp @@ -8,7 +8,6 @@ #include "third_party/f2reduce/f2reduce.h" #include "triton/Tools/LayoutUtils.h" #include "triton/Tools/StrUtil.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/StringRef.h" @@ -1338,4 +1337,19 @@ std::string ColumnAction::toString() const { return ret; } +PaddedLayout::PaddedLayout(LinearLayout linearMapping, + ArrayRef intervals, + ArrayRef paddings) + : linearMapping(std::move(linearMapping)) { + intervalPads.reserve(intervals.size()); + for (auto [i, p] : llvm::zip_equal(intervals, paddings)) + intervalPads.emplace_back(i, p); +} + +std::optional PaddedLayout::getMinInterval() const { + if (intervalPads.empty()) + return std::nullopt; + return *llvm::min_element(llvm::make_first_range(intervalPads)); +} + } // namespace mlir::triton diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp index 221d3b849d1b..77fc628b446e 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp @@ -154,7 +154,10 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter, auto aTensorTy = 
cast(tensor.getType()); ArrayRef shape = aTensorTy.getShape(); - auto sharedLayout = cast(aTensorTy.getEncoding()); + auto sharedLayout = + dyn_cast(aTensorTy.getEncoding()); + if (!sharedLayout) + return Value(); auto order = sharedLayout.getOrder(); // Rely on the linear layout conversion logic in this case, since only slowest diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp index dcc70aa64198..a0c4f2083b19 100644 --- a/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp +++ b/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp @@ -271,10 +271,11 @@ struct TransLocalLoadOpConversion SmallVector outVals; SmallVector elemsI32; mlir::Type retTy = dstTy; + auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc); bool valid = emitTransferBetweenRegistersAndShared( ldsTransLayout, srcTy, llvmElemTy, /*maxVecElems=*/std::nullopt, smemObj, loc, rewriter, targetInfo, - [&](VectorType vecTy, Value vecAddr) { + laneId, warpId, [&](VectorType vecTy, Value vecAddr) { if (bitwidth == 16) { auto dsReadOp = rewriter.create(loc, vecTy, vecAddr); From ddebf3cd01f5f97785ec78b5c86a34797fa3b09f Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Tue, 17 Jun 2025 23:47:21 +0000 Subject: [PATCH 03/23] Add new padded shared layout attr builder --- .../triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td index a436cb4a0e61..c5abb281930c 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td @@ -450,7 +450,15 @@ attributes too, for example, paddings.push_back(padding); } return get(context, intervals, paddings, order, ctaLayout); - }]>, + }]>, + AttrBuilder<(ins "ArrayRef":$shape, "ArrayRef":$order, + "unsigned":$dotKWidth, 
"unsigned":$elemBitWidth, + "CTALayoutAttr":$ctaLayout), + [{ + unsigned inner = getShapePerCTA(ctaLayout.getCTASplitNum(), shape)[order[0]]; + unsigned threadNumBytes = std::max(dotKWidth * elemBitWidth / 8u, 1u); + return get(context, {{inner, threadNumBytes}}, order, ctaLayout); + }]>, ]; let extraClassDeclaration = extraBaseClassDeclaration#[{ From 03b802a6f5df66da068074053ae699c328f1842c Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Mon, 16 Jun 2025 03:18:37 +0000 Subject: [PATCH 04/23] Fix LLVM lowering issues --- include/triton/Tools/LinearLayout.h | 2 ++ lib/Conversion/TritonGPUToLLVM/Utility.cpp | 17 ++++++++++------- lib/Dialect/TritonGPU/IR/Dialect.cpp | 11 ++++++----- lib/Tools/LinearLayout.cpp | 6 ++++++ 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h index 6359975c2f03..d62107c2d071 100644 --- a/include/triton/Tools/LinearLayout.h +++ b/include/triton/Tools/LinearLayout.h @@ -849,6 +849,8 @@ class PaddedLayout { std::optional getMinInterval() const; + bool hasNoPadding() const; + private: LinearLayout linearMapping; SmallVector> intervalPads; diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index 24db4e9c5c33..4b180958f478 100644 --- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -504,10 +504,10 @@ bool emitTransferBetweenRegistersAndShared( StringAttr kWarp = str_attr("warp"); auto shape = sharedTy.getShape(); - PaddedLayout sharedLayout = + PaddedLayout paddedLayout = triton::gpu::toPaddedLayout(shape, sharedTy.getEncoding()); LinearLayout regToSharedLayout = - regLayout.invertAndCompose(sharedLayout.getLinearMapping()); + regLayout.invertAndCompose(paddedLayout.getLinearMapping()); // TODO(jlebar): We don't currently support loading from shared memory in a // different CTA. We'd need to emit `mapa.shared::cluster` instructions. 
@@ -534,7 +534,7 @@ bool emitTransferBetweenRegistersAndShared( // supports; LLVM will legalize it. const int vecElems = std::min( {regToSharedLayout.getNumConsecutiveInOut(), - sharedLayout.getMinInterval().value_or(std::numeric_limits::max()), + paddedLayout.getMinInterval().value_or(std::numeric_limits::max()), maxVecElems.value_or(std::numeric_limits::max())}); auto withCTAOffset = triton::gpu::getNumCTAs(sharedTy.getEncoding()) > 1; @@ -549,10 +549,13 @@ bool emitTransferBetweenRegistersAndShared( // take out the "block" dimension. // Thus we use `pseudoinvert` instead of `invert` here for simplicity. auto allocShape = sharedTy.getAllocShape(); - LinearLayout invertAllocSharedLayout = - triton::gpu::toLinearLayout(allocShape.take_back(sharedTy.getRank()), - sharedTy.getEncoding()) - .pseudoinvert(); + auto invertAllocSharedLayout = LinearLayout::empty(); + if (paddedLayout.hasNoPadding()) { + invertAllocSharedLayout = + triton::gpu::toLinearLayout(allocShape.take_back(sharedTy.getRank()), + sharedTy.getEncoding()) + .pseudoinvert(); + } int numElems = regToSharedLayout.getInDimSize(kRegister); auto vecTy = vec_ty(elemLlvmTy, vecElems); diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index c85d1783fcd4..aceefd55078e 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -176,18 +176,19 @@ SmallVector getRepOrder(RankedTensorType type) { // This one's not terribly bad as we don't broadcast ShareEncodings SmallVector getOrder(SharedEncodingTrait layout, ArrayRef shape) { - if (auto swizzledLayout = - mlir::dyn_cast(layout)) { + if (auto swizzledLayout = dyn_cast(layout)) { return llvm::to_vector(swizzledLayout.getOrder()); } - if (auto sharedLayout = mlir::dyn_cast(layout)) { + if (auto paddedLayout = dyn_cast(layout)) { + return llvm::to_vector(paddedLayout.getOrder()); + } + if (auto sharedLayout = dyn_cast(layout)) { if (shape.size() == 1) { return {0}; } return 
getMatrixOrder(shape.size(), !sharedLayout.getTransposed()); } - if (auto sharedLayout = - mlir::dyn_cast(layout)) { + if (auto sharedLayout = dyn_cast(layout)) { return llvm::to_vector(sharedLayout.getOrder()); } llvm::report_fatal_error("Unimplemented usage of getOrder for MemDescType"); diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp index 0958e3292e4a..214407c6d628 100644 --- a/lib/Tools/LinearLayout.cpp +++ b/lib/Tools/LinearLayout.cpp @@ -1352,4 +1352,10 @@ std::optional PaddedLayout::getMinInterval() const { return *llvm::min_element(llvm::make_first_range(intervalPads)); } +bool PaddedLayout::hasNoPadding() const { + return intervalPads.empty() || + llvm::all_of(llvm::make_second_range(intervalPads), + [](unsigned v) { return v == 0; }); +} + } // namespace mlir::triton From 3bdcc7eed39cd6ddb2d0d6c76566f23fd0081463 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Tue, 17 Jun 2025 18:27:36 +0000 Subject: [PATCH 05/23] Fix more llvm lowering issues --- .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 4 ++++ lib/Analysis/Allocation.cpp | 16 +++++++++----- lib/Dialect/TritonGPU/IR/Dialect.cpp | 22 ++++++++++++++++++- .../TritonGPU/IR/LinearLayoutConversions.cpp | 9 -------- 4 files changed, 36 insertions(+), 15 deletions(-) diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td index c5abb281930c..949b46b42751 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td @@ -467,6 +467,10 @@ attributes too, for example, PaddedLayout toPaddedLayout(ArrayRef shape) const; + // Returns the total number of elements including padding given the input + // tensor shape. 
+ int64_t getPaddedSize(ArrayRef shape) const; + SmallVector getCTAsPerCGA() const; SmallVector getCTAOrder() const; SmallVector getCTASplitNum() const; diff --git a/lib/Analysis/Allocation.cpp b/lib/Analysis/Allocation.cpp index 7b897aaacc11..9dbe0c87108d 100644 --- a/lib/Analysis/Allocation.cpp +++ b/lib/Analysis/Allocation.cpp @@ -260,12 +260,18 @@ class AllocationAnalysis { auto alloc = dyn_cast(op); if (!alloc || !alloc.isSharedMemoryAlloc()) return; - // Bytes could be a different value once we support padding or other - // allocation policies. auto allocType = alloc.getType(); - auto shapePerCTA = gpu::getAllocationShapePerCTA(allocType); - auto bytes = - product(shapePerCTA) * allocType.getElementTypeBitWidth() / 8; + int64_t bytes = 0; + if (auto paddedLayout = + dyn_cast(allocType.getEncoding())) { + SmallVector unpaddedShape = gpu::getShapePerCTA(allocType); + bytes = paddedLayout.getPaddedSize(unpaddedShape) * + allocType.getElementTypeBitWidth() / 8; + } else { + auto shapePerCTA = gpu::getAllocationShapePerCTA(allocType); + bytes = product(shapePerCTA) * + allocType.getElementTypeBitWidth() / 8; + } auto alignment = alloc.getAlignmentOrDefault(); allocation->addBuffer(alloc, bytes, diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index aceefd55078e..0c46393930e8 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -316,7 +316,7 @@ SmallVector getShapePerCTA(Attribute layout, ArrayRef shape) { SmallVector getAllocationShapePerCTA(Attribute layout, ArrayRef shapeLogical) { SmallVector shape(shapeLogical); - if (auto sharedMMALayout = mlir::dyn_cast(layout)) { + if (auto sharedMMALayout = dyn_cast(layout)) { if (sharedMMALayout.getFp4Padded()) { auto packedAxis = getOrder(sharedMMALayout, shapeLogical)[0]; shape[packedAxis] *= 2; @@ -1700,6 +1700,26 @@ LogicalResult PaddedSharedEncodingAttr::verify( return verifyLayoutOrder(emitError, order); } +PaddedLayout 
+PaddedSharedEncodingAttr::toPaddedLayout(ArrayRef shape) const { + auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get( + getContext(), /*vec=*/1, /*perPhase=*/1, /*maxPhase=*/1, getOrder(), + getCTALayout()); + LinearLayout ll = toLinearLayout(shape, nonSwizzleAttr); + return PaddedLayout(ll, getIntervals(), getPaddings()); +} + +int64_t PaddedSharedEncodingAttr::getPaddedSize(ArrayRef shape) const { + int64_t unpaddedSize = product(shape); + int64_t paddingSize = 0; + for (auto [interval, padding] : + llvm::zip_equal(getIntervals(), getPaddings())) { + paddingSize += (unpaddedSize >> llvm::Log2_32(interval)) + << llvm::Log2_32(padding); + } + return unpaddedSize + paddingSize; +} + //===----------------------------------------------------------------------===// // NVMMAShared encoding //===----------------------------------------------------------------------===// diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp index 0739496baa03..1102f2881ac2 100644 --- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp +++ b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp @@ -269,15 +269,6 @@ LinearLayout getCoreMatrixLinearLayout(NVMMASharedEncodingAttr shared, } // namespace -PaddedLayout -PaddedSharedEncodingAttr::toPaddedLayout(ArrayRef shape) const { - auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get( - getContext(), /*vec=*/1, /*perPhase=*/1, /*maxPhase=*/1, getOrder(), - getCTALayout()); - LinearLayout ll = swizzledSharedToLinearLayout(shape, nonSwizzleAttr); - return PaddedLayout(ll, getIntervals(), getPaddings()); -} - LinearLayout nvmmaSharedToLinearLayout(ArrayRef shape, NVMMASharedEncodingAttr shared, bool disableSwizzle) { From ae67bba58372728e24773a3a214f0f88066a6765 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Tue, 17 Jun 2025 22:57:06 +0000 Subject: [PATCH 06/23] Add allocation tests --- test/Analysis/test-allocation.mlir | 63 ++++++++++++++++++++++++++++++ 1 file 
changed, 63 insertions(+) diff --git a/test/Analysis/test-allocation.mlir b/test/Analysis/test-allocation.mlir index 3400039ed352..27739e0e561d 100644 --- a/test/Analysis/test-allocation.mlir +++ b/test/Analysis/test-allocation.mlir @@ -21,6 +21,9 @@ #NVMMA_SHARED_64 = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 16}> #NVMMA_SHARED_128 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}> #NVMMA_SHARED_FP4PADDED = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 8, fp4Padded = true}> +#PADDED_SHARED_0 = #ttg.padded_shared<[256:+8] {order = [1, 0]}> +#PADDED_SHARED_1 = #ttg.padded_shared<[128:+4, 256:+8] {order = [1, 0]}> +#PADDED_SHARED_2 = #ttg.padded_shared<[64:+2, 128:+4, 256:+8] {order = [1, 0]}> #smem = #ttg.shared_memory @@ -937,4 +940,64 @@ tt.func @nvmma_alignment(%lb : index, %ub : index, %step : index, %A : !tt.ptr !ttg.memdesc<1x255xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable> + // expected-remark @+2 {{offset = 0, size = 528}} + // (256 + 8) * 2B = 528B + %alloc1 = ttg.local_alloc : () -> !ttg.memdesc<1x256xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable> + // expected-remark @+2 {{offset = 0, size = 530}} + // (257 + 8) * 2B = 530B + %alloc2 = ttg.local_alloc : () -> !ttg.memdesc<1x257xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable> + // expected-remark @+2 {{offset = 0, size = 1038}} + // (511 + 8) * 2B = 1038B + %alloc3 = ttg.local_alloc : () -> !ttg.memdesc<1x511xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable> + // expected-remark @+2 {{offset = 0, size = 1056}} + // (512 + 8 * 2) * 2B = 1056B + %alloc4 = ttg.local_alloc : () -> !ttg.memdesc<1x512xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable> + // expected-remark @+2 {{offset = 0, size = 1058}} + // (513 + 8 * 2) * 2B = 1058B + %alloc5 = ttg.local_alloc : () -> !ttg.memdesc<1x513xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable> + // expected-remark @+2 {{offset = 0, 
size = 528}} + // (16 * 16 + 8) * 2B = 528B + %alloc6 = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable> + // expected-remark @+2 {{offset = 0, size = 1056}} + // (16 * 32 + 8 * 2) * 2B = 1056B + %alloc7 = ttg.local_alloc : () -> !ttg.memdesc<16x32xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable> + // expected-remark @+2 {{offset = 0, size = 1008}} + // (31 * 16 + 8) * 2B = 1008B + %alloc8 = ttg.local_alloc : () -> !ttg.memdesc<31x16xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable> + tt.return +} + +// expected-remark @below {{padded_shared_layout_element_type}} +// expected-remark @below {{size = 16896}} +tt.func @padded_shared_layout_element_type() { + // expected-remark @+2 {{offset = 0, size = 4224}} + // (16 * 256 + 8 * 16) * 1B = 4224B + %alloc0 = ttg.local_alloc : () -> !ttg.memdesc<16x256xi8, #PADDED_SHARED_0, #ttg.shared_memory, mutable> + // expected-remark @+2 {{offset = 0, size = 8448}} + // (16 * 256 + 8 * 16) * 2B = 8448B + %alloc1 = ttg.local_alloc : () -> !ttg.memdesc<16x256xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable> + // expected-remark @+2 {{offset = 0, size = 16896}} + // (16 * 256 + 8 * 16) * 4B = 16896B + %alloc2 = ttg.local_alloc : () -> !ttg.memdesc<16x256xf32, #PADDED_SHARED_0, #ttg.shared_memory, mutable> + tt.return +} + +// expected-remark @below {{padded_shared_layout_multi_tier}} +// expected-remark @below {{size = 4480}} +tt.func @padded_shared_layout_multi_tier() { + // expected-remark @+2 {{offset = 0, size = 4352}} + // (16 * 256 + 4 * 32 + 8 * 16) * 1B = 4352B + %alloc0 = ttg.local_alloc : () -> !ttg.memdesc<16x256xi8, #PADDED_SHARED_1, #ttg.shared_memory, mutable> + // expected-remark @+2 {{offset = 0, size = 4480}} + // (16 * 256 + 2 * 64 + 4 * 32 + 8 * 16) * 1B = 4480B + %alloc1 = ttg.local_alloc : () -> !ttg.memdesc<16x256xi8, #PADDED_SHARED_2, #ttg.shared_memory, mutable> + tt.return +} } From fbb041e559c6fc85c34cb07fcf161a7f1e88fa1c Mon Sep 17 00:00:00 2001 From: 
Lei Zhang Date: Wed, 18 Jun 2025 00:46:30 +0000 Subject: [PATCH 07/23] Fix a bunch of small issues --- .../TritonGPU/IR/LinearLayoutConversions.h | 7 ++-- .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 18 +++++----- include/triton/Tools/LinearLayout.h | 26 ++++++++++---- lib/Analysis/Allocation.cpp | 9 +++-- lib/Conversion/TritonGPUToLLVM/Utility.cpp | 36 +++++++++++-------- lib/Dialect/TritonGPU/IR/Dialect.cpp | 6 ++-- .../TritonGPU/IR/LinearLayoutConversions.cpp | 14 ++++---- lib/Tools/LinearLayout.cpp | 16 ++++----- 8 files changed, 74 insertions(+), 58 deletions(-) diff --git a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h index 8bec87dd57be..d7ef07bc5bff 100644 --- a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h +++ b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h @@ -49,8 +49,6 @@ class AMDMfmaEncodingAttr; // Returns std::nullopt if the given layout can't be converted to an LL. LinearLayout toLinearLayout(ArrayRef shape, Attribute layout); -PaddedLayout toPaddedLayout(ArrayRef shape, Attribute layout); - // Convert the shared encoding of a tensor with `nvmma_shared` layout to a // LinearLayout that maps from a linear shared memory offset to tensor index. // @@ -300,5 +298,10 @@ LinearLayout nvidiaMmaTile(MLIRContext *ctx, ArrayRef tileShape, // the two can be done using transferWithinWarp, without involving LDS std::optional chooseMfmaLikeStoreLayout(RankedTensorType valType); +// Convert the given layout to a linear layout with potential additional +// physical memory paddings. 
+PaddedLinearLayout toPaddedLinearLayout(ArrayRef shape, + Attribute layout); + } // namespace mlir::triton::gpu #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td index 949b46b42751..335ced1f857d 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td @@ -373,7 +373,7 @@ When vec=2, elements are swizzled in pairs of 2. In other words, the element at }]>, ]; - let extraClassDeclaration = extraBaseClassDeclaration#[{ + let extraClassDeclaration = extraBaseClassDeclaration # [{ unsigned getRank() const { return getCTAOrder().size(); } int32_t getAlignment() const; SmallVector getCTAsPerCGA() const; @@ -399,23 +399,23 @@ shared memory bank conflicts. Formally, given a layout: padded_shared<[:+, :+, ...]> We insert a padding of `` elements after every `` elements. -Multi interval-pad pairs are supported for flexibility of multi tiered padding -schemes; they compose in an additive manner. So for a 1-D tensor element at -index i, the corresponding shared memory location index is +Multi interval-padding pairs are supported for flexibility of multi tiered +padding schemes; they compose in an additive manner. So for a 1-D tensor element +at index i, the corresponding shared memory location index is i + \sum_{k} (i / interval_k) * pad_k = 1 `` and `` all need to be power of two. Some concrete examples, using `eM` to mean tensor elements and `pN` to mean padding: -1. Single interval-pad pair: +1. Single interval-padding pair: #ttg.padded_shared<[2:+2]> [e0, e1, p0, p1, e2, e3, p2, p3, ...] -2. Double interval-pad pairs: +2. Double interval-padding pairs: #ttg.padded_shared<[2:+1, 4:+2]> [e0, e1, p0, @@ -424,7 +424,7 @@ padding: e6, e7, p5, p6, p7, ...] 
-In addition to interval-pad pairs, this encoding requires an `order` to +In addition to interval-padding pairs, this encoding requires an `order` to specify the logical tensor dimenions from the fastest-to slowest-varying. It may optionally support CGA level organization like other encoding attributes too, for example, @@ -461,11 +461,11 @@ attributes too, for example, }]>, ]; - let extraClassDeclaration = extraBaseClassDeclaration#[{ + let extraClassDeclaration = extraBaseClassDeclaration # [{ unsigned getRank() const { return getOrder().size(); } int32_t getAlignment() const { return 16; } - PaddedLayout toPaddedLayout(ArrayRef shape) const; + PaddedLinearLayout toPaddedLinearLayout(ArrayRef shape) const; // Returns the total number of elements including padding given the input // tensor shape. diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h index d62107c2d071..cdae51375083 100644 --- a/include/triton/Tools/LinearLayout.h +++ b/include/triton/Tools/LinearLayout.h @@ -840,19 +840,33 @@ class ColumnAction { std::string toString() const; }; -class PaddedLayout { +// A utility class to describe a particular padding schema with 1) a linear +// layout to desribe the N-D logical element mapping and 2) a list of +// interval-padding pairs to describe the 1-D physical shared memory padding +// schedules. In a degenerated cases, we can have no interval-padding pairs so +// it will just be a normal linear layout. +// +// In Triton we use linear layout basically throughout; so this class is useful +// to leverage common linear layout facilities and code paths as much as +// possible, while factoring in shared memory padding wherever necessary--only +// at the very final steps when we allocating the physical shared memory or +// creating pointers indexing into them. All steps before can still reason with +// linear layout. Thus this utility class keeps these two parts separate. 
+class PaddedLinearLayout { public: - PaddedLayout(LinearLayout linearMapping, ArrayRef intervals, - ArrayRef paddings); + PaddedLinearLayout(LinearLayout linear, ArrayRef intervals, + ArrayRef paddings); - const LinearLayout &getLinearMapping() const { return linearMapping; } + const LinearLayout &getLinear() const { return linear; } + // Returns the minimal interval that would trigger padding. std::optional getMinInterval() const; - bool hasNoPadding() const; + // Returns true if this is not a degenerated case and indeed requires padding. + bool hasPadding() const; private: - LinearLayout linearMapping; + LinearLayout linear; SmallVector> intervalPads; }; diff --git a/lib/Analysis/Allocation.cpp b/lib/Analysis/Allocation.cpp index 9dbe0c87108d..9325e2309713 100644 --- a/lib/Analysis/Allocation.cpp +++ b/lib/Analysis/Allocation.cpp @@ -261,17 +261,16 @@ class AllocationAnalysis { if (!alloc || !alloc.isSharedMemoryAlloc()) return; auto allocType = alloc.getType(); - int64_t bytes = 0; + int64_t numElems = 0; if (auto paddedLayout = dyn_cast(allocType.getEncoding())) { SmallVector unpaddedShape = gpu::getShapePerCTA(allocType); - bytes = paddedLayout.getPaddedSize(unpaddedShape) * - allocType.getElementTypeBitWidth() / 8; + numElems = paddedLayout.getPaddedSize(unpaddedShape); } else { auto shapePerCTA = gpu::getAllocationShapePerCTA(allocType); - bytes = product(shapePerCTA) * - allocType.getElementTypeBitWidth() / 8; + numElems = product(shapePerCTA); } + int64_t bytes = numElems * allocType.getElementTypeBitWidth() / 8; auto alignment = alloc.getAlignmentOrDefault(); allocation->addBuffer(alloc, bytes, diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index 4b180958f478..451080607efb 100644 --- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -397,6 +397,10 @@ Value getSmemVecAddr(const LinearLayout ®Layout, // We propose case 2 (see comments below), which provides 
a more general // solution for all swizzled shared memory scenarios, including the edge case // mentioned above. + // + // Padded shared layout falls into case 1--we can rely on the logic for case 1 + // to get the 1-D offset into shared memory. Then we just need to add the + // padding offset. if (isSimpleSharedMemoryAccess(shape, allocShape, sharedEnc)) { // Case 1 smemOffset = applyLinearLayout(loc, rewriter, regToSharedLayout, {{kRegister, regId}, @@ -425,6 +429,18 @@ Value getSmemVecAddr(const LinearLayout ®Layout, smemOffset = dot(rewriter, loc, smemOffsets, applyPermutation(smemStrides, smemOrder)); } + if (auto paddedLayout = + dyn_cast(sharedEnc)) { + // Apply the offset needed for padding. + Value padOffset = b.i32_val(0); + for (auto [interval, padding] : llvm::zip_equal( + paddedLayout.getIntervals(), paddedLayout.getPaddings())) { + Value iVal = b.i32_val(llvm::Log2_32(interval)); + Value pVal = b.i32_val(llvm::Log2_32(padding)); + padOffset = b.add(padOffset, b.shl(b.ashr(smemOffset, iVal), pVal)); + } + smemOffset = b.add(smemOffset, padOffset); + } } else { // Case 2 -> rank-reduced swizzling assert(rank >= 2 && "Swizzling only applies to tensors with rank >= 2"); assert((isa(sharedEnc)) { - Value padOffset = b.i32_val(0); - for (auto [interval, padding] : llvm::zip_equal( - paddedLayout.getIntervals(), paddedLayout.getPaddings())) { - Value iVal = b.i32_val(llvm::Log2_32(interval)); - Value pVal = b.i32_val(llvm::Log2_32(padding)); - padOffset = b.add(padOffset, b.shl(b.ashr(smemOffset, iVal), pVal)); - } - smemOffset = b.add(smemOffset, padOffset); - } auto ptrTy = smemBase.getType(); auto vecAddr = b.gep(ptrTy, elemLlvmTy, smemBase, smemOffset, LLVM::GEPNoWrapFlags::inbounds); @@ -504,10 +509,10 @@ bool emitTransferBetweenRegistersAndShared( StringAttr kWarp = str_attr("warp"); auto shape = sharedTy.getShape(); - PaddedLayout paddedLayout = - triton::gpu::toPaddedLayout(shape, sharedTy.getEncoding()); + PaddedLinearLayout paddedLayout = + 
triton::gpu::toPaddedLinearLayout(shape, sharedTy.getEncoding()); LinearLayout regToSharedLayout = - regLayout.invertAndCompose(paddedLayout.getLinearMapping()); + regLayout.invertAndCompose(paddedLayout.getLinear()); // TODO(jlebar): We don't currently support loading from shared memory in a // different CTA. We'd need to emit `mapa.shared::cluster` instructions. @@ -550,7 +555,8 @@ bool emitTransferBetweenRegistersAndShared( // Thus we use `pseudoinvert` instead of `invert` here for simplicity. auto allocShape = sharedTy.getAllocShape(); auto invertAllocSharedLayout = LinearLayout::empty(); - if (paddedLayout.hasNoPadding()) { + if (!paddedLayout.hasPadding()) { + // For now this is only needed for the cases where we have swizzling. invertAllocSharedLayout = triton::gpu::toLinearLayout(allocShape.take_back(sharedTy.getRank()), sharedTy.getEncoding()) diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index 0c46393930e8..592483d5885f 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -1700,13 +1700,13 @@ LogicalResult PaddedSharedEncodingAttr::verify( return verifyLayoutOrder(emitError, order); } -PaddedLayout -PaddedSharedEncodingAttr::toPaddedLayout(ArrayRef shape) const { +PaddedLinearLayout +PaddedSharedEncodingAttr::toPaddedLinearLayout(ArrayRef shape) const { auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get( getContext(), /*vec=*/1, /*perPhase=*/1, /*maxPhase=*/1, getOrder(), getCTALayout()); LinearLayout ll = toLinearLayout(shape, nonSwizzleAttr); - return PaddedLayout(ll, getIntervals(), getPaddings()); + return PaddedLinearLayout(ll, getIntervals(), getPaddings()); } int64_t PaddedSharedEncodingAttr::getPaddedSize(ArrayRef shape) const { diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp index 1102f2881ac2..38d0a11effbb 100644 --- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp +++ 
b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp @@ -1120,14 +1120,12 @@ LinearLayout toLinearLayout(ArrayRef shape, Attribute layout) { layout); } -PaddedLayout toPaddedLayout(ArrayRef shape, Attribute layout) { - auto *ctx = layout.getContext(); - if (auto paddedLayout = dyn_cast(layout)) { - return paddedLayout.toPaddedLayout(shape); - } - auto ll = - ctx->getLoadedDialect()->toLinearLayout(shape, layout); - return PaddedLayout(ll, /*intervals=*/{}, /*paddings=*/{}); +PaddedLinearLayout toPaddedLinearLayout(ArrayRef shape, + Attribute layout) { + if (auto paddedLayout = dyn_cast(layout)) + return paddedLayout.toPaddedLinearLayout(shape); + auto ll = toLinearLayout(shape, layout); + return PaddedLinearLayout(ll, /*intervals=*/{}, /*paddings=*/{}); } LinearLayout getLayoutWithinBlock(const LinearLayout &layout) { diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp index 214407c6d628..35ae36cfa98e 100644 --- a/lib/Tools/LinearLayout.cpp +++ b/lib/Tools/LinearLayout.cpp @@ -1337,25 +1337,21 @@ std::string ColumnAction::toString() const { return ret; } -PaddedLayout::PaddedLayout(LinearLayout linearMapping, - ArrayRef intervals, - ArrayRef paddings) - : linearMapping(std::move(linearMapping)) { +PaddedLinearLayout::PaddedLinearLayout(LinearLayout linear, + ArrayRef intervals, + ArrayRef paddings) + : linear(std::move(linear)) { intervalPads.reserve(intervals.size()); for (auto [i, p] : llvm::zip_equal(intervals, paddings)) intervalPads.emplace_back(i, p); } -std::optional PaddedLayout::getMinInterval() const { +std::optional PaddedLinearLayout::getMinInterval() const { if (intervalPads.empty()) return std::nullopt; return *llvm::min_element(llvm::make_first_range(intervalPads)); } -bool PaddedLayout::hasNoPadding() const { - return intervalPads.empty() || - llvm::all_of(llvm::make_second_range(intervalPads), - [](unsigned v) { return v == 0; }); -} +bool PaddedLinearLayout::hasPadding() const { return !intervalPads.empty(); } } // namespace 
mlir::triton From d59edb862999d060b2cb30da794c152f3fa2fd41 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Wed, 18 Jun 2025 03:21:34 +0000 Subject: [PATCH 08/23] Add linear layout conversion test --- .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 7 ++++-- .../TritonGPU/LinearLayoutConversionsTest.cpp | 25 +++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td index 335ced1f857d..0a390b398957 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td @@ -433,10 +433,13 @@ attributes too, for example, CTAOrder = [0, 1]}> }]; - let parameters = (ins ArrayRefParameter<"unsigned">:$intervals, + let parameters = (ins + ArrayRefParameter<"unsigned">:$intervals, ArrayRefParameter<"unsigned">:$paddings, // Order of logical tensor dimensions; fastest-varying first. - ArrayRefParameter<"unsigned">:$order, "CTALayoutAttr":$CTALayout); + ArrayRefParameter<"unsigned">:$order, + "CTALayoutAttr":$CTALayout + ); let builders = [AttrBuilder<(ins "ArrayRef>":$intervalPads, diff --git a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp index dbab14c1091e..a369feb665f2 100644 --- a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp +++ b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp @@ -3,6 +3,7 @@ #include "mlir/IR/MLIRContext.h" #include "triton/Dialect/TritonGPU/IR/Attributes.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" +#include "triton/Tools/LinearLayout.h" #include "triton/Tools/StrUtil.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/Support/Signals.h" @@ -99,6 +100,15 @@ class LinearLayoutConversionsTest : public ::testing::Test { CTALayoutAttr::get(&ctx, cpg, cSplit, cOrd)); } + PaddedSharedEncodingAttr + paddedShared(ArrayRef intervals, ArrayRef paddings, + ArrayRef 
ord, ArrayRef cpg, + ArrayRef cSplit, ArrayRef cOrd) { + return PaddedSharedEncodingAttr::get( + &ctx, intervals, paddings, ord, + CTALayoutAttr::get(&ctx, cpg, cSplit, cOrd)); + } + NVMMASharedEncodingAttr nvmmaShared(unsigned swizzleSizeInBytes, bool transposed, unsigned elementBitWidth, ArrayRef cpg, @@ -3002,6 +3012,21 @@ TEST_F(LinearLayoutConversionsTest, MMAv5Fp4Padded) { {S("dim0"), S("dim1")})); } +TEST_F(LinearLayoutConversionsTest, PaddedShared) { + PaddedLinearLayout pll = + toPaddedLinearLayout({32, 64}, paddedShared({128, 256}, {4, 8}, {1, 0}, + {1, 1}, {1, 1}, {1, 0})); + // The expected linear layout mapping part should just be an identity. + auto expectedLL = LinearLayout( + // clang-format off + {{S("offset"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {0, 32}, + {1, 0}, {2, 0}, {4, 0}, {8, 0}, {16, 0}}}, + {S("block"), {}}}, + {S("dim0"), S("dim1")}); + // clang-format on + EXPECT_EQ(pll.getLinear(), expectedLL); +} + } // anonymous namespace } // namespace mlir::triton::gpu From b62287084cc2c8f272e551082cc439fa4a66ef15 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Sun, 15 Jun 2025 16:37:42 -0700 Subject: [PATCH 09/23] Wire up StreamPipeline usage --- include/triton/Tools/Sys/GetEnv.hpp | 1 + python/src/passes.h | 6 +++ python/triton/knobs.py | 1 + third_party/amd/backend/compiler.py | 4 +- .../include/TritonAMDGPUTransforms/Passes.td | 25 +++++---- .../TritonAMDGPUTransforms/StreamPipeline.cpp | 51 +++++++++++-------- third_party/amd/python/triton_amd.cc | 4 +- 7 files changed, 56 insertions(+), 36 deletions(-) diff --git a/include/triton/Tools/Sys/GetEnv.hpp b/include/triton/Tools/Sys/GetEnv.hpp index 113126787c28..d54faf3a4da6 100644 --- a/include/triton/Tools/Sys/GetEnv.hpp +++ b/include/triton/Tools/Sys/GetEnv.hpp @@ -37,6 +37,7 @@ inline const std::set CACHE_INVALIDATING_ENV_VARS = { "TRITON_HIP_USE_ASYNC_COPY", "TRITON_HIP_USE_BLOCK_PINGPONG", "TRITON_HIP_USE_IN_THREAD_TRANSPOSE", + "TRITON_HIP_USE_PADDED_SHARED_LAYOUT", 
"TRITON_LLVM_DEBUG_ONLY", "TRITON_ENABLE_ASAN", "TRITON_OVERRIDE_ARCH", diff --git a/python/src/passes.h b/python/src/passes.h index 629fe362d8b2..66fdf0df1d1c 100644 --- a/python/src/passes.h +++ b/python/src/passes.h @@ -36,3 +36,9 @@ #define ADD_PASS_OPTION_WRAPPER_4(name, builder, ty0, ty1, ty2, ty3) \ m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2, \ ty3 val3) { pm.addPass(builder({val0, val1, val2, val3})); }) + +#define ADD_PASS_OPTION_WRAPPER_5(name, builder, ty0, ty1, ty2, ty3, ty4) \ + m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2, \ + ty3 val3, ty4 val4) { \ + pm.addPass(builder({val0, val1, val2, val3, val4})); \ + }) diff --git a/python/triton/knobs.py b/python/triton/knobs.py index b3c70c7174c0..e82fbc93535a 100644 --- a/python/triton/knobs.py +++ b/python/triton/knobs.py @@ -446,6 +446,7 @@ class amd_knobs(base_knobs): global_prefetch: env_int = env_int("TRITON_HIP_GLOBAL_PREFETCH") local_prefetch: env_int = env_int("TRITON_HIP_LOCAL_PREFETCH") use_async_copy: env_bool = env_bool("TRITON_HIP_USE_ASYNC_COPY") + use_padded_shared_layout : env_bool = env_bool("TRITON_HIP_USE_PADDED_SHARED_LAYOUT") scalarize_packed_fops: env_bool = env_bool("AMDGCN_SCALARIZE_PACKED_FOPS") diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py index bff54c2604bd..6269adccfd72 100644 --- a/third_party/amd/backend/compiler.py +++ b/third_party/amd/backend/compiler.py @@ -237,12 +237,14 @@ def make_ttgir(mod, metadata, options): global_prefetch = knobs.amd.global_prefetch local_prefetch = knobs.amd.local_prefetch use_async_copy = knobs.amd.use_async_copy + use_padded_shared_layout = knobs.amd.use_padded_shared_layout # The `local-prefetch` scheduling variant requires turning on buffer ops. 
if options.schedule_hint == "local-prefetch": global_prefetch = local_prefetch = 1 - amd.passes.ttgpuir.add_stream_pipeline(pm, options.num_stages, global_prefetch, local_prefetch, use_async_copy) + amd.passes.ttgpuir.add_stream_pipeline(pm, options.num_stages, global_prefetch, local_prefetch, + use_async_copy, use_padded_shared_layout) if use_async_copy: amd.passes.ttgpuir.add_coalesce_async_copy(pm, options.arch) passes.common.add_canonicalizer(pm) diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td index 0d88e56c2e23..e9479a02cc92 100644 --- a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td +++ b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td @@ -13,19 +13,18 @@ def TritonAMDGPUStreamPipeline : Pass<"tritonamdgpu-stream-pipeline", "mlir::Mod let dependentDialects = ["mlir::triton::amdgpu::TritonAMDGPUDialect"]; - let options = [ - Option<"numStages", "num_stages", - "int32_t", /*default*/"2", - "Number of Pipeline stages">, - Option<"globalPrefetch", "global_prefetch", - "int32_t", /*default*/"0", - "Set global prefetch stage count">, - Option<"localPrefetch", "local_prefetch", - "int32_t", /*default*/"0", - "Set local prefetch stage count">, - Option<"useAsyncCopy", "use_async_copy", - "bool", /*default*/"false", - "Use AsyncCopyGlobalToLocal to directly load to shared memory">, + let options = + [Option<"numStages", "num_stages", "int32_t", /*default*/ "2", + "Number of Pipeline stages">, + Option<"globalPrefetch", "global_prefetch", "int32_t", /*default*/ "0", + "Set global prefetch stage count">, + Option<"localPrefetch", "local_prefetch", "int32_t", /*default*/ "0", + "Set local prefetch stage count">, + Option<"useAsyncCopy", "use_async_copy", "bool", /*default*/ "false", + "Use AsyncCopyGlobalToLocal to directly load to shared memory">, + Option<"usePaddedSharedLayout", "use_padded_shared_layout", "bool", + /*default*/ "false", + "Use padded shared layout for 
shared memory">, ]; } diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp index 0c9af9384958..f8ce51a6a595 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp @@ -8,6 +8,7 @@ #include "triton/Dialect/Triton/IR/OpInterfaces.h" #include "triton/Dialect/TritonGPU/IR/Attributes.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" +#include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h" #include "triton/Dialect/TritonGPU/Transforms/PipelineExpander.h" #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h" #include "triton/Dialect/TritonGPU/Transforms/Schedule.h" @@ -122,7 +123,7 @@ enum SchedType { struct LoadInfo { // Shared layout is used for loads feeding into dot ops. - ttg::SwizzledSharedEncodingAttr sharedEncoding = nullptr; + ttg::SharedEncodingTrait sharedEncoding = nullptr; // The distance of this load's stage to its use' stage. int distToUse = 0; Operation *use = nullptr; @@ -407,22 +408,22 @@ static ttg::AMDMfmaEncodingAttr getDotEncoding(Value inputValue, // If all the transitive uses of the given value have are used by a convert to // the same dot operand encoding, return true and get the shared encoding that // needs to be used to be compatible with users' layouts. 
-static std::optional -getSharedEncIfAllUsersAreDotEnc(Value loadedValue) { - ttg::SwizzledSharedEncodingAttr attr; +static std::optional +getSharedEncIfAllUsersAreDotEnc(bool usePaddedLayout, Value loadedValue) { + ttg::SharedEncodingTrait attr; for (Operation *user : loadedValue.getUsers()) { LDBG(" getSharedEncIfAllUsersAreDotEnc current user: " << *user); if (user->getNumResults() != 1) return std::nullopt; - ttg::SwizzledSharedEncodingAttr tempAttr; + ttg::SharedEncodingTrait tempAttr; Value userResult = user->getResult(0); Type userResType = userResult.getType(); if (auto memDesc = dyn_cast(userResType)) { // First time we find a shared encoding in the chain, save it and try to // use it if it is compatible with the other users. - tempAttr = cast(memDesc.getEncoding()); - if (!getSharedEncIfAllUsersAreDotEnc(userResult).has_value()) + tempAttr = cast(memDesc.getEncoding()); + if (!getSharedEncIfAllUsersAreDotEnc(usePaddedLayout, userResult)) return std::nullopt; } else { if (!isa(user)) @@ -449,9 +450,15 @@ getSharedEncIfAllUsersAreDotEnc(Value loadedValue) { auto userResEnc = cast(userResType).getEncoding(); if (auto dotOpEnc = dyn_cast(userResEnc)) { - tempAttr = ttg::SwizzledSharedEncodingAttr::get( - loadedValue.getContext(), dotOpEnc, srcTy.getShape(), sharedOrder, - ctaLayout, bitWidth, /*needTrans=*/false); + if (usePaddedLayout) { + tempAttr = ttg::PaddedSharedEncodingAttr::get( + loadedValue.getContext(), srcTy.getShape(), sharedOrder, + dotOpEnc.getKWidth(), bitWidth, ctaLayout); + } else { + tempAttr = ttg::SwizzledSharedEncodingAttr::get( + loadedValue.getContext(), dotOpEnc, srcTy.getShape(), sharedOrder, + ctaLayout, bitWidth, /*needTrans=*/false); + } } else if (auto llEnc = dyn_cast(userResEnc)) { // We use linear layout directly for scaled dot fp8 operands. 
For such // cases, we need to look further down the def-use chain to find the dot @@ -481,7 +488,8 @@ getSharedEncIfAllUsersAreDotEnc(Value loadedValue) { // "1" for the load op used by the load op used by the dot op, and so on. FailureOr> findPipelineableLoads(scf::ForOp forOp, - tt::ModuleAxisInfoAnalysis &axisInfoAnalysis) { + tt::ModuleAxisInfoAnalysis &axisInfoAnalysis, + bool usePaddedLayout) { llvm::MapVector loadToInfo; DenseSet seen; // Recursively visit the given op and its operands to discover all load ops @@ -503,7 +511,7 @@ findPipelineableLoads(scf::ForOp forOp, "Block ptr should have been lowered before this pass."); auto ptr = loadOp.getPtr(); if (auto tensorTy = dyn_cast(ptr.getType())) { - ttg::SwizzledSharedEncodingAttr sharedEncoding = nullptr; + ttg::SharedEncodingTrait sharedEncoding = nullptr; // Store memory layouts if possible. if (isa(use)) { unsigned vecContiguity = axisInfoAnalysis.getContiguity(ptr); @@ -518,9 +526,9 @@ findPipelineableLoads(scf::ForOp forOp, // Limit shared memory sharing to width >= 32 elements. LDBG("Load " << *loadOp << " has width " << width); if (width >= 32) { - sharedEncoding = - getSharedEncIfAllUsersAreDotEnc(op->getResult(0)) - .value_or(nullptr); + sharedEncoding = getSharedEncIfAllUsersAreDotEnc( + usePaddedLayout, op->getResult(0)) + .value_or(nullptr); } else if (isaFamily != triton::AMD::ISAFamily::CDNA4) { LDBG("Skip width<32 load " << loadOp << " for arch " << arch); return; @@ -795,6 +803,7 @@ SmallVector> createAndScheduleStreamOps( LogicalResult preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages, int stages[SCHED_SIZE], bool useAsyncCopy, + bool usePaddedLayout, tt::PipeliningOption &options) { triton::AMD::ModuleAxisInfoAnalysis axisInfoAnalysis( forOp->getParentOfType()); @@ -805,7 +814,7 @@ LogicalResult preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages, // Schedule the loads and root ops (dot ops) in the loop. This will give us // a scaffold for the final schedule. 
FailureOr> loadToInfo = - findPipelineableLoads(forOp, axisInfoAnalysis); + findPipelineableLoads(forOp, axisInfoAnalysis, usePaddedLayout); if (failed(loadToInfo)) return failure(); @@ -875,7 +884,8 @@ LogicalResult preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages, } LogicalResult pipelineLoop(scf::ForOp forOp, int numStages, int globalPrefetch, - int localPrefetch, bool useAsyncCopy) { + int localPrefetch, bool useAsyncCopy, + bool usePaddedLayout) { int lastStage = numStages - 1; int stages[SCHED_SIZE]; @@ -903,8 +913,8 @@ LogicalResult pipelineLoop(scf::ForOp forOp, int numStages, int globalPrefetch, } }; - if (failed(preprocessLoopAndBuildSchedule(forOp, numStages, stages, - useAsyncCopy, options))) + if (failed(preprocessLoopAndBuildSchedule( + forOp, numStages, stages, useAsyncCopy, usePaddedLayout, options))) return failure(); LDBG("Loop before sending to expander:\n" << *forOp); @@ -1002,7 +1012,8 @@ struct PipelinePass : impl::TritonAMDGPUStreamPipelineBase { if (!checkPrecondition(forOp)) continue; (void)pipelineLoop(forOp, tt::getNumStagesOrDefault(forOp, numStages), - globalPrefetch, localPrefetch, useAsyncCopy); + globalPrefetch, localPrefetch, useAsyncCopy, + usePaddedSharedLayout); } if (useAsyncCopy) { diff --git a/third_party/amd/python/triton_amd.cc b/third_party/amd/python/triton_amd.cc index cab32eda84cb..c56503981a5e 100644 --- a/third_party/amd/python/triton_amd.cc +++ b/third_party/amd/python/triton_amd.cc @@ -77,9 +77,9 @@ void init_triton_amd_passes_ttgpuir(py::module &&m) { ADD_PASS_WRAPPER_0("add_fold_true_cmpi", mlir::createTritonAMDFoldTrueCmpI); ADD_PASS_OPTION_WRAPPER_1("add_block_pingpong", mlir::createTritonAMDGPUBlockPingpong, int32_t); - ADD_PASS_OPTION_WRAPPER_4("add_stream_pipeline", + ADD_PASS_OPTION_WRAPPER_5("add_stream_pipeline", mlir::createTritonAMDGPUStreamPipeline, int, int, - int, bool); + int, bool, bool); ADD_PASS_OPTION_WRAPPER_1("add_coalesce_async_copy", mlir::createTritonAMDGPUCoalesceAsyncCopy, 
std::string); From 961ecc4178a7c3f85da92e6499d431b9ffeaaa0e Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Wed, 18 Jun 2025 03:28:13 +0000 Subject: [PATCH 10/23] Revert "Wire up StreamPipeline usage" This reverts commit b36f6c3f53883ea762d55b4c39131c243f6e53d5. --- include/triton/Tools/Sys/GetEnv.hpp | 1 - python/src/passes.h | 6 --- python/triton/knobs.py | 1 - third_party/amd/backend/compiler.py | 4 +- .../include/TritonAMDGPUTransforms/Passes.td | 25 ++++----- .../TritonAMDGPUTransforms/StreamPipeline.cpp | 51 ++++++++----------- third_party/amd/python/triton_amd.cc | 4 +- 7 files changed, 36 insertions(+), 56 deletions(-) diff --git a/include/triton/Tools/Sys/GetEnv.hpp b/include/triton/Tools/Sys/GetEnv.hpp index d54faf3a4da6..113126787c28 100644 --- a/include/triton/Tools/Sys/GetEnv.hpp +++ b/include/triton/Tools/Sys/GetEnv.hpp @@ -37,7 +37,6 @@ inline const std::set CACHE_INVALIDATING_ENV_VARS = { "TRITON_HIP_USE_ASYNC_COPY", "TRITON_HIP_USE_BLOCK_PINGPONG", "TRITON_HIP_USE_IN_THREAD_TRANSPOSE", - "TRITON_HIP_USE_PADDED_SHARED_LAYOUT", "TRITON_LLVM_DEBUG_ONLY", "TRITON_ENABLE_ASAN", "TRITON_OVERRIDE_ARCH", diff --git a/python/src/passes.h b/python/src/passes.h index 66fdf0df1d1c..629fe362d8b2 100644 --- a/python/src/passes.h +++ b/python/src/passes.h @@ -36,9 +36,3 @@ #define ADD_PASS_OPTION_WRAPPER_4(name, builder, ty0, ty1, ty2, ty3) \ m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2, \ ty3 val3) { pm.addPass(builder({val0, val1, val2, val3})); }) - -#define ADD_PASS_OPTION_WRAPPER_5(name, builder, ty0, ty1, ty2, ty3, ty4) \ - m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2, \ - ty3 val3, ty4 val4) { \ - pm.addPass(builder({val0, val1, val2, val3, val4})); \ - }) diff --git a/python/triton/knobs.py b/python/triton/knobs.py index e82fbc93535a..b3c70c7174c0 100644 --- a/python/triton/knobs.py +++ b/python/triton/knobs.py @@ -446,7 +446,6 @@ class amd_knobs(base_knobs): global_prefetch: env_int = 
env_int("TRITON_HIP_GLOBAL_PREFETCH") local_prefetch: env_int = env_int("TRITON_HIP_LOCAL_PREFETCH") use_async_copy: env_bool = env_bool("TRITON_HIP_USE_ASYNC_COPY") - use_padded_shared_layout : env_bool = env_bool("TRITON_HIP_USE_PADDED_SHARED_LAYOUT") scalarize_packed_fops: env_bool = env_bool("AMDGCN_SCALARIZE_PACKED_FOPS") diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py index 6269adccfd72..bff54c2604bd 100644 --- a/third_party/amd/backend/compiler.py +++ b/third_party/amd/backend/compiler.py @@ -237,14 +237,12 @@ def make_ttgir(mod, metadata, options): global_prefetch = knobs.amd.global_prefetch local_prefetch = knobs.amd.local_prefetch use_async_copy = knobs.amd.use_async_copy - use_padded_shared_layout = knobs.amd.use_padded_shared_layout # The `local-prefetch` scheduling variant requires turning on buffer ops. if options.schedule_hint == "local-prefetch": global_prefetch = local_prefetch = 1 - amd.passes.ttgpuir.add_stream_pipeline(pm, options.num_stages, global_prefetch, local_prefetch, - use_async_copy, use_padded_shared_layout) + amd.passes.ttgpuir.add_stream_pipeline(pm, options.num_stages, global_prefetch, local_prefetch, use_async_copy) if use_async_copy: amd.passes.ttgpuir.add_coalesce_async_copy(pm, options.arch) passes.common.add_canonicalizer(pm) diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td index e9479a02cc92..0d88e56c2e23 100644 --- a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td +++ b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td @@ -13,18 +13,19 @@ def TritonAMDGPUStreamPipeline : Pass<"tritonamdgpu-stream-pipeline", "mlir::Mod let dependentDialects = ["mlir::triton::amdgpu::TritonAMDGPUDialect"]; - let options = - [Option<"numStages", "num_stages", "int32_t", /*default*/ "2", - "Number of Pipeline stages">, - Option<"globalPrefetch", "global_prefetch", "int32_t", /*default*/ "0", - "Set global prefetch 
stage count">, - Option<"localPrefetch", "local_prefetch", "int32_t", /*default*/ "0", - "Set local prefetch stage count">, - Option<"useAsyncCopy", "use_async_copy", "bool", /*default*/ "false", - "Use AsyncCopyGlobalToLocal to directly load to shared memory">, - Option<"usePaddedSharedLayout", "use_padded_shared_layout", "bool", - /*default*/ "false", - "Use padded shared layout for shared memory">, + let options = [ + Option<"numStages", "num_stages", + "int32_t", /*default*/"2", + "Number of Pipeline stages">, + Option<"globalPrefetch", "global_prefetch", + "int32_t", /*default*/"0", + "Set global prefetch stage count">, + Option<"localPrefetch", "local_prefetch", + "int32_t", /*default*/"0", + "Set local prefetch stage count">, + Option<"useAsyncCopy", "use_async_copy", + "bool", /*default*/"false", + "Use AsyncCopyGlobalToLocal to directly load to shared memory">, ]; } diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp index f8ce51a6a595..0c9af9384958 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp @@ -8,7 +8,6 @@ #include "triton/Dialect/Triton/IR/OpInterfaces.h" #include "triton/Dialect/TritonGPU/IR/Attributes.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" -#include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h" #include "triton/Dialect/TritonGPU/Transforms/PipelineExpander.h" #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h" #include "triton/Dialect/TritonGPU/Transforms/Schedule.h" @@ -123,7 +122,7 @@ enum SchedType { struct LoadInfo { // Shared layout is used for loads feeding into dot ops. - ttg::SharedEncodingTrait sharedEncoding = nullptr; + ttg::SwizzledSharedEncodingAttr sharedEncoding = nullptr; // The distance of this load's stage to its use' stage. 
int distToUse = 0; Operation *use = nullptr; @@ -408,22 +407,22 @@ static ttg::AMDMfmaEncodingAttr getDotEncoding(Value inputValue, // If all the transitive uses of the given value have are used by a convert to // the same dot operand encoding, return true and get the shared encoding that // needs to be used to be compatible with users' layouts. -static std::optional -getSharedEncIfAllUsersAreDotEnc(bool usePaddedLayout, Value loadedValue) { - ttg::SharedEncodingTrait attr; +static std::optional +getSharedEncIfAllUsersAreDotEnc(Value loadedValue) { + ttg::SwizzledSharedEncodingAttr attr; for (Operation *user : loadedValue.getUsers()) { LDBG(" getSharedEncIfAllUsersAreDotEnc current user: " << *user); if (user->getNumResults() != 1) return std::nullopt; - ttg::SharedEncodingTrait tempAttr; + ttg::SwizzledSharedEncodingAttr tempAttr; Value userResult = user->getResult(0); Type userResType = userResult.getType(); if (auto memDesc = dyn_cast(userResType)) { // First time we find a shared encoding in the chain, save it and try to // use it if it is compatible with the other users. 
- tempAttr = cast(memDesc.getEncoding()); - if (!getSharedEncIfAllUsersAreDotEnc(usePaddedLayout, userResult)) + tempAttr = cast(memDesc.getEncoding()); + if (!getSharedEncIfAllUsersAreDotEnc(userResult).has_value()) return std::nullopt; } else { if (!isa(user)) @@ -450,15 +449,9 @@ getSharedEncIfAllUsersAreDotEnc(bool usePaddedLayout, Value loadedValue) { auto userResEnc = cast(userResType).getEncoding(); if (auto dotOpEnc = dyn_cast(userResEnc)) { - if (usePaddedLayout) { - tempAttr = ttg::PaddedSharedEncodingAttr::get( - loadedValue.getContext(), srcTy.getShape(), sharedOrder, - dotOpEnc.getKWidth(), bitWidth, ctaLayout); - } else { - tempAttr = ttg::SwizzledSharedEncodingAttr::get( - loadedValue.getContext(), dotOpEnc, srcTy.getShape(), sharedOrder, - ctaLayout, bitWidth, /*needTrans=*/false); - } + tempAttr = ttg::SwizzledSharedEncodingAttr::get( + loadedValue.getContext(), dotOpEnc, srcTy.getShape(), sharedOrder, + ctaLayout, bitWidth, /*needTrans=*/false); } else if (auto llEnc = dyn_cast(userResEnc)) { // We use linear layout directly for scaled dot fp8 operands. For such // cases, we need to look further down the def-use chain to find the dot @@ -488,8 +481,7 @@ getSharedEncIfAllUsersAreDotEnc(bool usePaddedLayout, Value loadedValue) { // "1" for the load op used by the load op used by the dot op, and so on. FailureOr> findPipelineableLoads(scf::ForOp forOp, - tt::ModuleAxisInfoAnalysis &axisInfoAnalysis, - bool usePaddedLayout) { + tt::ModuleAxisInfoAnalysis &axisInfoAnalysis) { llvm::MapVector loadToInfo; DenseSet seen; // Recursively visit the given op and its operands to discover all load ops @@ -511,7 +503,7 @@ findPipelineableLoads(scf::ForOp forOp, "Block ptr should have been lowered before this pass."); auto ptr = loadOp.getPtr(); if (auto tensorTy = dyn_cast(ptr.getType())) { - ttg::SharedEncodingTrait sharedEncoding = nullptr; + ttg::SwizzledSharedEncodingAttr sharedEncoding = nullptr; // Store memory layouts if possible. 
if (isa(use)) { unsigned vecContiguity = axisInfoAnalysis.getContiguity(ptr); @@ -526,9 +518,9 @@ findPipelineableLoads(scf::ForOp forOp, // Limit shared memory sharing to width >= 32 elements. LDBG("Load " << *loadOp << " has width " << width); if (width >= 32) { - sharedEncoding = getSharedEncIfAllUsersAreDotEnc( - usePaddedLayout, op->getResult(0)) - .value_or(nullptr); + sharedEncoding = + getSharedEncIfAllUsersAreDotEnc(op->getResult(0)) + .value_or(nullptr); } else if (isaFamily != triton::AMD::ISAFamily::CDNA4) { LDBG("Skip width<32 load " << loadOp << " for arch " << arch); return; @@ -803,7 +795,6 @@ SmallVector> createAndScheduleStreamOps( LogicalResult preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages, int stages[SCHED_SIZE], bool useAsyncCopy, - bool usePaddedLayout, tt::PipeliningOption &options) { triton::AMD::ModuleAxisInfoAnalysis axisInfoAnalysis( forOp->getParentOfType()); @@ -814,7 +805,7 @@ LogicalResult preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages, // Schedule the loads and root ops (dot ops) in the loop. This will give us // a scaffold for the final schedule. 
FailureOr> loadToInfo = - findPipelineableLoads(forOp, axisInfoAnalysis, usePaddedLayout); + findPipelineableLoads(forOp, axisInfoAnalysis); if (failed(loadToInfo)) return failure(); @@ -884,8 +875,7 @@ LogicalResult preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages, } LogicalResult pipelineLoop(scf::ForOp forOp, int numStages, int globalPrefetch, - int localPrefetch, bool useAsyncCopy, - bool usePaddedLayout) { + int localPrefetch, bool useAsyncCopy) { int lastStage = numStages - 1; int stages[SCHED_SIZE]; @@ -913,8 +903,8 @@ LogicalResult pipelineLoop(scf::ForOp forOp, int numStages, int globalPrefetch, } }; - if (failed(preprocessLoopAndBuildSchedule( - forOp, numStages, stages, useAsyncCopy, usePaddedLayout, options))) + if (failed(preprocessLoopAndBuildSchedule(forOp, numStages, stages, + useAsyncCopy, options))) return failure(); LDBG("Loop before sending to expander:\n" << *forOp); @@ -1012,8 +1002,7 @@ struct PipelinePass : impl::TritonAMDGPUStreamPipelineBase { if (!checkPrecondition(forOp)) continue; (void)pipelineLoop(forOp, tt::getNumStagesOrDefault(forOp, numStages), - globalPrefetch, localPrefetch, useAsyncCopy, - usePaddedSharedLayout); + globalPrefetch, localPrefetch, useAsyncCopy); } if (useAsyncCopy) { diff --git a/third_party/amd/python/triton_amd.cc b/third_party/amd/python/triton_amd.cc index c56503981a5e..cab32eda84cb 100644 --- a/third_party/amd/python/triton_amd.cc +++ b/third_party/amd/python/triton_amd.cc @@ -77,9 +77,9 @@ void init_triton_amd_passes_ttgpuir(py::module &&m) { ADD_PASS_WRAPPER_0("add_fold_true_cmpi", mlir::createTritonAMDFoldTrueCmpI); ADD_PASS_OPTION_WRAPPER_1("add_block_pingpong", mlir::createTritonAMDGPUBlockPingpong, int32_t); - ADD_PASS_OPTION_WRAPPER_5("add_stream_pipeline", + ADD_PASS_OPTION_WRAPPER_4("add_stream_pipeline", mlir::createTritonAMDGPUStreamPipeline, int, int, - int, bool, bool); + int, bool); ADD_PASS_OPTION_WRAPPER_1("add_coalesce_async_copy", mlir::createTritonAMDGPUCoalesceAsyncCopy, 
std::string); From b1c6f946e710af66da3c0336519257bdecf68cb4 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Wed, 18 Jun 2025 23:48:36 +0000 Subject: [PATCH 11/23] Add some more tests --- test/Conversion/amd/tritongpu_to_llvm.mlir | 29 ++++++++++++++++++++++ test/TritonGPU/invalid-attributes.mlir | 10 ++++++++ 2 files changed, 39 insertions(+) diff --git a/test/Conversion/amd/tritongpu_to_llvm.mlir b/test/Conversion/amd/tritongpu_to_llvm.mlir index aea48d2a4d05..d4e758a24c18 100644 --- a/test/Conversion/amd/tritongpu_to_llvm.mlir +++ b/test/Conversion/amd/tritongpu_to_llvm.mlir @@ -380,3 +380,32 @@ module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.n tt.return } } + +// ----- + +// CHECK-LABEL: padded_shared_layout +#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}> +#shared = #ttg.padded_shared<[128:+4, 256:+8] {order = [1, 0]}> +#smem = #ttg.shared_memory +module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} { + tt.func @padded_shared_layout(%arg0: tensor<64x64xf16, #blocked>) { + // CHECK-DAG: %[[CST0:.+]] = llvm.mlir.constant(0 : i32) + // CHECK-DAG: %[[CST2:.+]] = llvm.mlir.constant(2 : i32) + // CHECK-DAG: %[[CST3:.+]] = llvm.mlir.constant(3 : i32) + // CHECK-DAG: %[[CST7:.+]] = llvm.mlir.constant(7 : i32) + // CHECK-DAG: %[[CST8:.+]] = llvm.mlir.constant(8 : i32) + + // CHECK: %[[SHR0:.+]] = llvm.ashr %[[XOR:.+]], %[[CST7]] : i32 + // CHECK-NEXT: %[[SHL0:.+]] = llvm.shl %[[SHR0]], %[[CST2]] : i32 + // CHECK-NEXT: %[[ADD0:.+]] = llvm.add %[[SHL0]], %[[CST0]] : i32 + // CHECK-NEXT: %[[SHR1:.+]] = llvm.ashr %[[XOR]], %[[CST8]] : i32 + // CHECK-NEXT: %[[SHL1:.+]] = llvm.shl %[[SHR1]], %14 : i32 + // CHECK-NEXT: %[[ADD1:.+]] = llvm.add %[[ADD0]], %[[SHL1]] : i32 + // CHECK-NEXT: %[[ADD2:.+]] = llvm.add %[[XOR]], %[[ADD1]] : i32 + // 
CHECK-NEXT: llvm.getelementptr inbounds %{{.+}}[%[[ADD2]]] + + // CHECK-COUNT-16: llvm.store {{.*}} : vector<1xf16>, !llvm.ptr<3> + %0 = ttg.local_alloc %arg0 : (tensor<64x64xf16, #blocked>) -> !ttg.memdesc<64x64xf16, #shared, #smem, mutable> + tt.return + } +} diff --git a/test/TritonGPU/invalid-attributes.mlir b/test/TritonGPU/invalid-attributes.mlir index 3a2aac907096..22938b6055b7 100644 --- a/test/TritonGPU/invalid-attributes.mlir +++ b/test/TritonGPU/invalid-attributes.mlir @@ -84,11 +84,21 @@ // ----- +// expected-error@+1 {{interval values must all be power of two}} +#shared = #ttg.padded_shared<[0:+2]> + +// ----- + // expected-error@+1 {{padding values must all be power of two}} #shared = #ttg.padded_shared<[2:+3]> // ----- +// expected-error@+1 {{padding values must all be power of two}} +#shared = #ttg.padded_shared<[2:+0]> + +// ----- + // expected-error@+1 {{interval values cannot have duplicates}} #shared = #ttg.padded_shared<[2:+1, 2:+4]> From 017b88816fad76ba31d37602414d49f399957a0d Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 19 Jun 2025 00:33:57 +0000 Subject: [PATCH 12/23] Improve PaddedLinearLayout a bit --- include/triton/Tools/LinearLayout.h | 8 +++++--- lib/Tools/LinearLayout.cpp | 15 ++------------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h index cdae51375083..d4d7525c5f05 100644 --- a/include/triton/Tools/LinearLayout.h +++ b/include/triton/Tools/LinearLayout.h @@ -855,7 +855,8 @@ class ColumnAction { class PaddedLinearLayout { public: PaddedLinearLayout(LinearLayout linear, ArrayRef intervals, - ArrayRef paddings); + ArrayRef paddings) + : linear(std::move(linear)), intervals(intervals), paddings(paddings) {} const LinearLayout &getLinear() const { return linear; } @@ -863,11 +864,12 @@ class PaddedLinearLayout { std::optional getMinInterval() const; // Returns true if this is not a degenerated case and indeed requires padding. 
- bool hasPadding() const; + bool hasPadding() const { return !intervals.empty(); } private: LinearLayout linear; - SmallVector> intervalPads; + SmallVector intervals; + SmallVector paddings; }; } // namespace mlir::triton diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp index 35ae36cfa98e..9e990f141ce1 100644 --- a/lib/Tools/LinearLayout.cpp +++ b/lib/Tools/LinearLayout.cpp @@ -1337,21 +1337,10 @@ std::string ColumnAction::toString() const { return ret; } -PaddedLinearLayout::PaddedLinearLayout(LinearLayout linear, - ArrayRef intervals, - ArrayRef paddings) - : linear(std::move(linear)) { - intervalPads.reserve(intervals.size()); - for (auto [i, p] : llvm::zip_equal(intervals, paddings)) - intervalPads.emplace_back(i, p); -} - std::optional PaddedLinearLayout::getMinInterval() const { - if (intervalPads.empty()) + if (intervals.empty()) return std::nullopt; - return *llvm::min_element(llvm::make_first_range(intervalPads)); + return *llvm::min_element(intervals); } -bool PaddedLinearLayout::hasPadding() const { return !intervalPads.empty(); } - } // namespace mlir::triton From c0f88a888a2f1fb83c0b72e2d3c4e91bf8034666 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 19 Jun 2025 00:46:27 +0000 Subject: [PATCH 13/23] Fix lit test --- test/Conversion/amd/tritongpu_to_llvm.mlir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Conversion/amd/tritongpu_to_llvm.mlir b/test/Conversion/amd/tritongpu_to_llvm.mlir index d4e758a24c18..16784f2150b7 100644 --- a/test/Conversion/amd/tritongpu_to_llvm.mlir +++ b/test/Conversion/amd/tritongpu_to_llvm.mlir @@ -399,7 +399,7 @@ module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.n // CHECK-NEXT: %[[SHL0:.+]] = llvm.shl %[[SHR0]], %[[CST2]] : i32 // CHECK-NEXT: %[[ADD0:.+]] = llvm.add %[[SHL0]], %[[CST0]] : i32 // CHECK-NEXT: %[[SHR1:.+]] = llvm.ashr %[[XOR]], %[[CST8]] : i32 - // CHECK-NEXT: %[[SHL1:.+]] = llvm.shl %[[SHR1]], %14 : i32 + // CHECK-NEXT: %[[SHL1:.+]] 
= llvm.shl %[[SHR1]], %[[CST3]] : i32 // CHECK-NEXT: %[[ADD1:.+]] = llvm.add %[[ADD0]], %[[SHL1]] : i32 // CHECK-NEXT: %[[ADD2:.+]] = llvm.add %[[XOR]], %[[ADD1]] : i32 // CHECK-NEXT: llvm.getelementptr inbounds %{{.+}}[%[[ADD2]]] From a66fa0d44f3a37ff5480b782c403517a8abd605f Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 19 Jun 2025 03:33:55 +0000 Subject: [PATCH 14/23] Move builder out to cpp --- .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 28 ++++++------------- lib/Dialect/TritonGPU/IR/Dialect.cpp | 23 +++++++++++++++ 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td index 0a390b398957..ff3340d9f79b 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td @@ -441,27 +441,15 @@ attributes too, for example, "CTALayoutAttr":$CTALayout ); - let builders = - [AttrBuilder<(ins "ArrayRef>":$intervalPads, - "ArrayRef":$order, "CTALayoutAttr":$ctaLayout), - [{ - SmallVector intervals, paddings; - intervals.reserve(intervalPads.size()); - paddings.reserve(intervalPads.size()); - for (auto [interval, padding] : intervalPads) { - intervals.push_back(interval); - paddings.push_back(padding); - } - return get(context, intervals, paddings, order, ctaLayout); - }]>, - AttrBuilder<(ins "ArrayRef":$shape, "ArrayRef":$order, + let builders = [ + AttrBuilder<(ins "ArrayRef>":$intervalPads, + "ArrayRef":$order, "CTALayoutAttr":$ctaLayout)>, + // Defines a padded shared layout that avoids bank conflicts for a block + // with given |shape| and |order| that is loaded by dot operand with + // kWidth |dotKWidth|. 
+ AttrBuilder<(ins "ArrayRef":$shape, "ArrayRef":$order, "unsigned":$dotKWidth, "unsigned":$elemBitWidth, - "CTALayoutAttr":$ctaLayout), - [{ - unsigned inner = getShapePerCTA(ctaLayout.getCTASplitNum(), shape)[order[0]]; - unsigned threadNumBytes = std::max(dotKWidth * elemBitWidth / 8u, 1u); - return get(context, {{inner, threadNumBytes}}, order, ctaLayout); - }]>, + "CTALayoutAttr":$ctaLayout)>, ]; let extraClassDeclaration = extraBaseClassDeclaration # [{ diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index 592483d5885f..d64b49790ea0 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -1700,6 +1700,29 @@ LogicalResult PaddedSharedEncodingAttr::verify( return verifyLayoutOrder(emitError, order); } +PaddedSharedEncodingAttr PaddedSharedEncodingAttr::get( + MLIRContext *context, ArrayRef> intervalPads, + ArrayRef order, CTALayoutAttr ctaLayout) { + SmallVector intervals, paddings; + intervals.reserve(intervalPads.size()); + paddings.reserve(intervalPads.size()); + for (auto [interval, padding] : intervalPads) { + intervals.push_back(interval); + paddings.push_back(padding); + } + return get(context, intervals, paddings, order, ctaLayout); +} + +PaddedSharedEncodingAttr +PaddedSharedEncodingAttr::get(MLIRContext *context, ArrayRef shape, + ArrayRef order, unsigned dotKWidth, + unsigned elemBitWidth, CTALayoutAttr ctaLayout) { + unsigned innerD = getShapePerCTA(ctaLayout.getCTASplitNum(), shape)[order[0]]; + unsigned threadNumBytes = std::max(dotKWidth * elemBitWidth / 8u, 1u); + threadNumBytes = llvm::alignTo(threadNumBytes, 4); // Assume 32-bit per bank + return get(context, {{innerD, threadNumBytes}}, order, ctaLayout); +} + PaddedLinearLayout PaddedSharedEncodingAttr::toPaddedLinearLayout(ArrayRef shape) const { auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get( From b5f258ef48dad7c627b6bf3965e77a6ad0a0af0b Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 19 Jun 2025 
03:53:52 +0000 Subject: [PATCH 15/23] Improve wording for PaddedLinearLayout once more --- include/triton/Tools/LinearLayout.h | 17 +++++++---------- lib/Conversion/TritonGPUToLLVM/Utility.cpp | 2 +- .../TritonGPU/LinearLayoutConversionsTest.cpp | 2 +- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h index d4d7525c5f05..63946b5d17b9 100644 --- a/include/triton/Tools/LinearLayout.h +++ b/include/triton/Tools/LinearLayout.h @@ -840,11 +840,12 @@ class ColumnAction { std::string toString() const; }; -// A utility class to describe a particular padding schema with 1) a linear -// layout to desribe the N-D logical element mapping and 2) a list of -// interval-padding pairs to describe the 1-D physical shared memory padding -// schedules. In a degenerated cases, we can have no interval-padding pairs so -// it will just be a normal linear layout. +// A utility struct to describe either a swizzled or padded shared layout. +// +// For the former, we only use the linear layout field. For the latter, we use +// both 1) a linear layout to desribe the identityStandardND logical element +// mapping and 2) a list of interval-padding pairs to describe the 1-D physical +// shared memory padding schedules. // // In Triton we use linear layout basically throughout; so this class is useful // to leverage common linear layout facilities and code paths as much as @@ -852,21 +853,17 @@ class ColumnAction { // at the very final steps when we allocating the physical shared memory or // creating pointers indexing into them. All steps before can still reason with // linear layout. Thus this utility class keeps these two parts separate. 
-class PaddedLinearLayout { -public: +struct PaddedLinearLayout { PaddedLinearLayout(LinearLayout linear, ArrayRef intervals, ArrayRef paddings) : linear(std::move(linear)), intervals(intervals), paddings(paddings) {} - const LinearLayout &getLinear() const { return linear; } - // Returns the minimal interval that would trigger padding. std::optional getMinInterval() const; // Returns true if this is not a degenerated case and indeed requires padding. bool hasPadding() const { return !intervals.empty(); } -private: LinearLayout linear; SmallVector intervals; SmallVector paddings; diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index 451080607efb..510d8dbb9ed2 100644 --- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -512,7 +512,7 @@ bool emitTransferBetweenRegistersAndShared( PaddedLinearLayout paddedLayout = triton::gpu::toPaddedLinearLayout(shape, sharedTy.getEncoding()); LinearLayout regToSharedLayout = - regLayout.invertAndCompose(paddedLayout.getLinear()); + regLayout.invertAndCompose(paddedLayout.linear); // TODO(jlebar): We don't currently support loading from shared memory in a // different CTA. We'd need to emit `mapa.shared::cluster` instructions. 
diff --git a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp index a369feb665f2..f295e8c2c130 100644 --- a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp +++ b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp @@ -3024,7 +3024,7 @@ TEST_F(LinearLayoutConversionsTest, PaddedShared) { {S("block"), {}}}, {S("dim0"), S("dim1")}); // clang-format on - EXPECT_EQ(pll.getLinear(), expectedLL); + EXPECT_EQ(pll.linear, expectedLL); } } // anonymous namespace From 2de5b2f65ce2cf77bc8cf3ceed1a57e9861e3065 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 19 Jun 2025 04:03:18 +0000 Subject: [PATCH 16/23] Rename to SwizzledOrPaddedLayout --- .../triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h | 4 ++-- include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 2 +- include/triton/Tools/LinearLayout.h | 6 +++--- lib/Conversion/TritonGPUToLLVM/Utility.cpp | 4 ++-- lib/Dialect/TritonGPU/IR/Dialect.cpp | 6 +++--- lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp | 8 ++++---- lib/Tools/LinearLayout.cpp | 2 +- .../Dialect/TritonGPU/LinearLayoutConversionsTest.cpp | 6 +++--- 8 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h index d7ef07bc5bff..925c23d2f927 100644 --- a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h +++ b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h @@ -300,8 +300,8 @@ std::optional chooseMfmaLikeStoreLayout(RankedTensorType valType); // Convert the given layout to a linear layout with potential additional // physical memory paddings. 
-PaddedLinearLayout toPaddedLinearLayout(ArrayRef shape, - Attribute layout); +SwizzledOrPaddedLayout toSwizzledOrPaddedLayout(ArrayRef shape, + Attribute layout); } // namespace mlir::triton::gpu #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td index ff3340d9f79b..e3191f05c69a 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td @@ -456,7 +456,7 @@ attributes too, for example, unsigned getRank() const { return getOrder().size(); } int32_t getAlignment() const { return 16; } - PaddedLinearLayout toPaddedLinearLayout(ArrayRef shape) const; + SwizzledOrPaddedLayout toSwizzledOrPaddedLayout(ArrayRef shape) const; // Returns the total number of elements including padding given the input // tensor shape. diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h index 63946b5d17b9..c4643e35cfe4 100644 --- a/include/triton/Tools/LinearLayout.h +++ b/include/triton/Tools/LinearLayout.h @@ -853,9 +853,9 @@ class ColumnAction { // at the very final steps when we allocating the physical shared memory or // creating pointers indexing into them. All steps before can still reason with // linear layout. Thus this utility class keeps these two parts separate. -struct PaddedLinearLayout { - PaddedLinearLayout(LinearLayout linear, ArrayRef intervals, - ArrayRef paddings) +struct SwizzledOrPaddedLayout { + SwizzledOrPaddedLayout(LinearLayout linear, ArrayRef intervals, + ArrayRef paddings) : linear(std::move(linear)), intervals(intervals), paddings(paddings) {} // Returns the minimal interval that would trigger padding. 
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index 510d8dbb9ed2..752ddb34328c 100644 --- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -509,8 +509,8 @@ bool emitTransferBetweenRegistersAndShared( StringAttr kWarp = str_attr("warp"); auto shape = sharedTy.getShape(); - PaddedLinearLayout paddedLayout = - triton::gpu::toPaddedLinearLayout(shape, sharedTy.getEncoding()); + SwizzledOrPaddedLayout paddedLayout = + triton::gpu::toSwizzledOrPaddedLayout(shape, sharedTy.getEncoding()); LinearLayout regToSharedLayout = regLayout.invertAndCompose(paddedLayout.linear); diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index d64b49790ea0..5e73b2b9a284 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -1723,13 +1723,13 @@ PaddedSharedEncodingAttr::get(MLIRContext *context, ArrayRef shape, return get(context, {{innerD, threadNumBytes}}, order, ctaLayout); } -PaddedLinearLayout -PaddedSharedEncodingAttr::toPaddedLinearLayout(ArrayRef shape) const { +SwizzledOrPaddedLayout PaddedSharedEncodingAttr::toSwizzledOrPaddedLayout( + ArrayRef shape) const { auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get( getContext(), /*vec=*/1, /*perPhase=*/1, /*maxPhase=*/1, getOrder(), getCTALayout()); LinearLayout ll = toLinearLayout(shape, nonSwizzleAttr); - return PaddedLinearLayout(ll, getIntervals(), getPaddings()); + return SwizzledOrPaddedLayout(ll, getIntervals(), getPaddings()); } int64_t PaddedSharedEncodingAttr::getPaddedSize(ArrayRef shape) const { diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp index 38d0a11effbb..57c76ed69bfd 100644 --- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp +++ b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp @@ -1120,12 +1120,12 @@ LinearLayout toLinearLayout(ArrayRef shape, 
Attribute layout) { layout); } -PaddedLinearLayout toPaddedLinearLayout(ArrayRef shape, - Attribute layout) { +SwizzledOrPaddedLayout toSwizzledOrPaddedLayout(ArrayRef shape, + Attribute layout) { if (auto paddedLayout = dyn_cast(layout)) - return paddedLayout.toPaddedLinearLayout(shape); + return paddedLayout.toSwizzledOrPaddedLayout(shape); auto ll = toLinearLayout(shape, layout); - return PaddedLinearLayout(ll, /*intervals=*/{}, /*paddings=*/{}); + return SwizzledOrPaddedLayout(ll, /*intervals=*/{}, /*paddings=*/{}); } LinearLayout getLayoutWithinBlock(const LinearLayout &layout) { diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp index 9e990f141ce1..f3630b851aef 100644 --- a/lib/Tools/LinearLayout.cpp +++ b/lib/Tools/LinearLayout.cpp @@ -1337,7 +1337,7 @@ std::string ColumnAction::toString() const { return ret; } -std::optional PaddedLinearLayout::getMinInterval() const { +std::optional SwizzledOrPaddedLayout::getMinInterval() const { if (intervals.empty()) return std::nullopt; return *llvm::min_element(intervals); diff --git a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp index f295e8c2c130..b781ec7711cb 100644 --- a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp +++ b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp @@ -3013,9 +3013,9 @@ TEST_F(LinearLayoutConversionsTest, MMAv5Fp4Padded) { } TEST_F(LinearLayoutConversionsTest, PaddedShared) { - PaddedLinearLayout pll = - toPaddedLinearLayout({32, 64}, paddedShared({128, 256}, {4, 8}, {1, 0}, - {1, 1}, {1, 1}, {1, 0})); + SwizzledOrPaddedLayout pll = toSwizzledOrPaddedLayout( + {32, 64}, + paddedShared({128, 256}, {4, 8}, {1, 0}, {1, 1}, {1, 1}, {1, 0})); // The expected linear layout mapping part should just be an identity. 
auto expectedLL = LinearLayout( // clang-format off From 89d069dfef00d8d8cfdf3d170c2f2d46b564ffbf Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 19 Jun 2025 15:18:25 +0000 Subject: [PATCH 17/23] Revert "Rename to SwizzledOrPaddedLayout" This reverts commit 2de5b2f65ce2cf77bc8cf3ceed1a57e9861e3065. --- .../triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h | 4 ++-- include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 2 +- include/triton/Tools/LinearLayout.h | 6 +++--- lib/Conversion/TritonGPUToLLVM/Utility.cpp | 4 ++-- lib/Dialect/TritonGPU/IR/Dialect.cpp | 6 +++--- lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp | 8 ++++---- lib/Tools/LinearLayout.cpp | 2 +- .../Dialect/TritonGPU/LinearLayoutConversionsTest.cpp | 6 +++--- 8 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h index 925c23d2f927..d7ef07bc5bff 100644 --- a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h +++ b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h @@ -300,8 +300,8 @@ std::optional chooseMfmaLikeStoreLayout(RankedTensorType valType); // Convert the given layout to a linear layout with potential additional // physical memory paddings. 
-SwizzledOrPaddedLayout toSwizzledOrPaddedLayout(ArrayRef shape, - Attribute layout); +PaddedLinearLayout toPaddedLinearLayout(ArrayRef shape, + Attribute layout); } // namespace mlir::triton::gpu #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td index e3191f05c69a..ff3340d9f79b 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td @@ -456,7 +456,7 @@ attributes too, for example, unsigned getRank() const { return getOrder().size(); } int32_t getAlignment() const { return 16; } - SwizzledOrPaddedLayout toSwizzledOrPaddedLayout(ArrayRef shape) const; + PaddedLinearLayout toPaddedLinearLayout(ArrayRef shape) const; // Returns the total number of elements including padding given the input // tensor shape. diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h index c4643e35cfe4..63946b5d17b9 100644 --- a/include/triton/Tools/LinearLayout.h +++ b/include/triton/Tools/LinearLayout.h @@ -853,9 +853,9 @@ class ColumnAction { // at the very final steps when we allocating the physical shared memory or // creating pointers indexing into them. All steps before can still reason with // linear layout. Thus this utility class keeps these two parts separate. -struct SwizzledOrPaddedLayout { - SwizzledOrPaddedLayout(LinearLayout linear, ArrayRef intervals, - ArrayRef paddings) +struct PaddedLinearLayout { + PaddedLinearLayout(LinearLayout linear, ArrayRef intervals, + ArrayRef paddings) : linear(std::move(linear)), intervals(intervals), paddings(paddings) {} // Returns the minimal interval that would trigger padding. 
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index 752ddb34328c..510d8dbb9ed2 100644 --- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -509,8 +509,8 @@ bool emitTransferBetweenRegistersAndShared( StringAttr kWarp = str_attr("warp"); auto shape = sharedTy.getShape(); - SwizzledOrPaddedLayout paddedLayout = - triton::gpu::toSwizzledOrPaddedLayout(shape, sharedTy.getEncoding()); + PaddedLinearLayout paddedLayout = + triton::gpu::toPaddedLinearLayout(shape, sharedTy.getEncoding()); LinearLayout regToSharedLayout = regLayout.invertAndCompose(paddedLayout.linear); diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index 5e73b2b9a284..d64b49790ea0 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -1723,13 +1723,13 @@ PaddedSharedEncodingAttr::get(MLIRContext *context, ArrayRef shape, return get(context, {{innerD, threadNumBytes}}, order, ctaLayout); } -SwizzledOrPaddedLayout PaddedSharedEncodingAttr::toSwizzledOrPaddedLayout( - ArrayRef shape) const { +PaddedLinearLayout +PaddedSharedEncodingAttr::toPaddedLinearLayout(ArrayRef shape) const { auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get( getContext(), /*vec=*/1, /*perPhase=*/1, /*maxPhase=*/1, getOrder(), getCTALayout()); LinearLayout ll = toLinearLayout(shape, nonSwizzleAttr); - return SwizzledOrPaddedLayout(ll, getIntervals(), getPaddings()); + return PaddedLinearLayout(ll, getIntervals(), getPaddings()); } int64_t PaddedSharedEncodingAttr::getPaddedSize(ArrayRef shape) const { diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp index 57c76ed69bfd..38d0a11effbb 100644 --- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp +++ b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp @@ -1120,12 +1120,12 @@ LinearLayout toLinearLayout(ArrayRef shape, 
Attribute layout) { layout); } -SwizzledOrPaddedLayout toSwizzledOrPaddedLayout(ArrayRef shape, - Attribute layout) { +PaddedLinearLayout toPaddedLinearLayout(ArrayRef shape, + Attribute layout) { if (auto paddedLayout = dyn_cast(layout)) - return paddedLayout.toSwizzledOrPaddedLayout(shape); + return paddedLayout.toPaddedLinearLayout(shape); auto ll = toLinearLayout(shape, layout); - return SwizzledOrPaddedLayout(ll, /*intervals=*/{}, /*paddings=*/{}); + return PaddedLinearLayout(ll, /*intervals=*/{}, /*paddings=*/{}); } LinearLayout getLayoutWithinBlock(const LinearLayout &layout) { diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp index f3630b851aef..9e990f141ce1 100644 --- a/lib/Tools/LinearLayout.cpp +++ b/lib/Tools/LinearLayout.cpp @@ -1337,7 +1337,7 @@ std::string ColumnAction::toString() const { return ret; } -std::optional SwizzledOrPaddedLayout::getMinInterval() const { +std::optional PaddedLinearLayout::getMinInterval() const { if (intervals.empty()) return std::nullopt; return *llvm::min_element(intervals); diff --git a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp index b781ec7711cb..f295e8c2c130 100644 --- a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp +++ b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp @@ -3013,9 +3013,9 @@ TEST_F(LinearLayoutConversionsTest, MMAv5Fp4Padded) { } TEST_F(LinearLayoutConversionsTest, PaddedShared) { - SwizzledOrPaddedLayout pll = toSwizzledOrPaddedLayout( - {32, 64}, - paddedShared({128, 256}, {4, 8}, {1, 0}, {1, 1}, {1, 1}, {1, 0})); + PaddedLinearLayout pll = + toPaddedLinearLayout({32, 64}, paddedShared({128, 256}, {4, 8}, {1, 0}, + {1, 1}, {1, 1}, {1, 0})); // The expected linear layout mapping part should just be an identity. 
auto expectedLL = LinearLayout( // clang-format off From 28c3428bdb7de70a8752177a2617be54f58c2c56 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 19 Jun 2025 15:18:27 +0000 Subject: [PATCH 18/23] Revert "Improve wording for PaddedLinearLayout once more" This reverts commit b5f258ef48dad7c627b6bf3965e77a6ad0a0af0b. --- include/triton/Tools/LinearLayout.h | 17 ++++++++++------- lib/Conversion/TritonGPUToLLVM/Utility.cpp | 2 +- .../TritonGPU/LinearLayoutConversionsTest.cpp | 2 +- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h index 63946b5d17b9..d4d7525c5f05 100644 --- a/include/triton/Tools/LinearLayout.h +++ b/include/triton/Tools/LinearLayout.h @@ -840,12 +840,11 @@ class ColumnAction { std::string toString() const; }; -// A utility struct to describe either a swizzled or padded shared layout. -// -// For the former, we only use the linear layout field. For the latter, we use -// both 1) a linear layout to desribe the identityStandardND logical element -// mapping and 2) a list of interval-padding pairs to describe the 1-D physical -// shared memory padding schedules. +// A utility class to describe a particular padding schema with 1) a linear +// layout to desribe the N-D logical element mapping and 2) a list of +// interval-padding pairs to describe the 1-D physical shared memory padding +// schedules. In a degenerated cases, we can have no interval-padding pairs so +// it will just be a normal linear layout. // // In Triton we use linear layout basically throughout; so this class is useful // to leverage common linear layout facilities and code paths as much as @@ -853,17 +852,21 @@ class ColumnAction { // at the very final steps when we allocating the physical shared memory or // creating pointers indexing into them. All steps before can still reason with // linear layout. Thus this utility class keeps these two parts separate. 
-struct PaddedLinearLayout { +class PaddedLinearLayout { +public: PaddedLinearLayout(LinearLayout linear, ArrayRef intervals, ArrayRef paddings) : linear(std::move(linear)), intervals(intervals), paddings(paddings) {} + const LinearLayout &getLinear() const { return linear; } + // Returns the minimal interval that would trigger padding. std::optional getMinInterval() const; // Returns true if this is not a degenerated case and indeed requires padding. bool hasPadding() const { return !intervals.empty(); } +private: LinearLayout linear; SmallVector intervals; SmallVector paddings; diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index 510d8dbb9ed2..451080607efb 100644 --- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -512,7 +512,7 @@ bool emitTransferBetweenRegistersAndShared( PaddedLinearLayout paddedLayout = triton::gpu::toPaddedLinearLayout(shape, sharedTy.getEncoding()); LinearLayout regToSharedLayout = - regLayout.invertAndCompose(paddedLayout.linear); + regLayout.invertAndCompose(paddedLayout.getLinear()); // TODO(jlebar): We don't currently support loading from shared memory in a // different CTA. We'd need to emit `mapa.shared::cluster` instructions. 
diff --git a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp index f295e8c2c130..a369feb665f2 100644 --- a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp +++ b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp @@ -3024,7 +3024,7 @@ TEST_F(LinearLayoutConversionsTest, PaddedShared) { {S("block"), {}}}, {S("dim0"), S("dim1")}); // clang-format on - EXPECT_EQ(pll.linear, expectedLL); + EXPECT_EQ(pll.getLinear(), expectedLL); } } // anonymous namespace From 0637bc5a0f3a283d283f5d864f92451f74eaf915 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 19 Jun 2025 15:19:01 +0000 Subject: [PATCH 19/23] Revert "Improve PaddedLinearLayout a bit" This reverts commit 017b88816fad76ba31d37602414d49f399957a0d. --- include/triton/Tools/LinearLayout.h | 8 +++----- lib/Tools/LinearLayout.cpp | 15 +++++++++++++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h index d4d7525c5f05..cdae51375083 100644 --- a/include/triton/Tools/LinearLayout.h +++ b/include/triton/Tools/LinearLayout.h @@ -855,8 +855,7 @@ class ColumnAction { class PaddedLinearLayout { public: PaddedLinearLayout(LinearLayout linear, ArrayRef intervals, - ArrayRef paddings) - : linear(std::move(linear)), intervals(intervals), paddings(paddings) {} + ArrayRef paddings); const LinearLayout &getLinear() const { return linear; } @@ -864,12 +863,11 @@ class PaddedLinearLayout { std::optional getMinInterval() const; // Returns true if this is not a degenerated case and indeed requires padding. 
- bool hasPadding() const { return !intervals.empty(); } + bool hasPadding() const; private: LinearLayout linear; - SmallVector intervals; - SmallVector paddings; + SmallVector> intervalPads; }; } // namespace mlir::triton diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp index 9e990f141ce1..35ae36cfa98e 100644 --- a/lib/Tools/LinearLayout.cpp +++ b/lib/Tools/LinearLayout.cpp @@ -1337,10 +1337,21 @@ std::string ColumnAction::toString() const { return ret; } +PaddedLinearLayout::PaddedLinearLayout(LinearLayout linear, + ArrayRef intervals, + ArrayRef paddings) + : linear(std::move(linear)) { + intervalPads.reserve(intervals.size()); + for (auto [i, p] : llvm::zip_equal(intervals, paddings)) + intervalPads.emplace_back(i, p); +} + std::optional PaddedLinearLayout::getMinInterval() const { - if (intervals.empty()) + if (intervalPads.empty()) return std::nullopt; - return *llvm::min_element(intervals); + return *llvm::min_element(llvm::make_first_range(intervalPads)); } +bool PaddedLinearLayout::hasPadding() const { return !intervalPads.empty(); } + } // namespace mlir::triton From 8fa8d8db1d198babe4675162d4119e24dff10136 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 19 Jun 2025 00:03:43 +0000 Subject: [PATCH 20/23] Drop PaddedLinearLayout --- .../TritonGPU/IR/LinearLayoutConversions.h | 5 ---- .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 4 ++- include/triton/Tools/LinearLayout.h | 30 ------------------- lib/Conversion/TritonGPUToLLVM/Utility.cpp | 28 +++++++++++------ lib/Dialect/TritonGPU/IR/Dialect.cpp | 9 ------ .../TritonGPU/IR/LinearLayoutConversions.cpp | 8 ----- lib/Tools/LinearLayout.cpp | 17 ----------- .../TritonGPU/LinearLayoutConversionsTest.cpp | 15 ---------- 8 files changed, 22 insertions(+), 94 deletions(-) diff --git a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h index d7ef07bc5bff..e458d425be39 100644 --- 
a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h +++ b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h @@ -298,10 +298,5 @@ LinearLayout nvidiaMmaTile(MLIRContext *ctx, ArrayRef tileShape, // the two can be done using transferWithinWarp, without involving LDS std::optional chooseMfmaLikeStoreLayout(RankedTensorType valType); -// Convert the given layout to a linear layout with potential additional -// physical memory paddings. -PaddedLinearLayout toPaddedLinearLayout(ArrayRef shape, - Attribute layout); - } // namespace mlir::triton::gpu #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td index ff3340d9f79b..e708194349d4 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td @@ -456,7 +456,9 @@ attributes too, for example, unsigned getRank() const { return getOrder().size(); } int32_t getAlignment() const { return 16; } - PaddedLinearLayout toPaddedLinearLayout(ArrayRef shape) const; + unsigned getMinInterval() const { + return *llvm::min_element(getIntervals()); + } // Returns the total number of elements including padding given the input // tensor shape. diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h index cdae51375083..30735db0a308 100644 --- a/include/triton/Tools/LinearLayout.h +++ b/include/triton/Tools/LinearLayout.h @@ -840,36 +840,6 @@ class ColumnAction { std::string toString() const; }; -// A utility class to describe a particular padding schema with 1) a linear -// layout to desribe the N-D logical element mapping and 2) a list of -// interval-padding pairs to describe the 1-D physical shared memory padding -// schedules. In a degenerated cases, we can have no interval-padding pairs so -// it will just be a normal linear layout. 
-// -// In Triton we use linear layout basically throughout; so this class is useful -// to leverage common linear layout facilities and code paths as much as -// possible, while factoring in shared memory padding wherever necessary--only -// at the very final steps when we allocating the physical shared memory or -// creating pointers indexing into them. All steps before can still reason with -// linear layout. Thus this utility class keeps these two parts separate. -class PaddedLinearLayout { -public: - PaddedLinearLayout(LinearLayout linear, ArrayRef intervals, - ArrayRef paddings); - - const LinearLayout &getLinear() const { return linear; } - - // Returns the minimal interval that would trigger padding. - std::optional getMinInterval() const; - - // Returns true if this is not a degenerated case and indeed requires padding. - bool hasPadding() const; - -private: - LinearLayout linear; - SmallVector> intervalPads; -}; - } // namespace mlir::triton #endif // TRITON_TOOLS_LINEARLAYOUT_H diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index 451080607efb..170fcdddec86 100644 --- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -7,6 +7,7 @@ #include "triton/Dialect/TritonGPU/IR/Attributes.h" #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" +#include "triton/Tools/LayoutUtils.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/MathExtras.h" @@ -509,10 +510,17 @@ bool emitTransferBetweenRegistersAndShared( StringAttr kWarp = str_attr("warp"); auto shape = sharedTy.getShape(); - PaddedLinearLayout paddedLayout = - triton::gpu::toPaddedLinearLayout(shape, sharedTy.getEncoding()); - LinearLayout regToSharedLayout = - regLayout.invertAndCompose(paddedLayout.getLinear()); + auto paddedLayout = + dyn_cast(sharedTy.getEncoding()); + auto sharedLL = LinearLayout::empty(); + if (paddedLayout) { + 
SmallVector dims(shape); + sharedLL = identityStandardND(Builder(ctx).getStringAttr("offset"), dims, + paddedLayout.getOrder()); + } else { + sharedLL = triton::gpu::toLinearLayout(shape, sharedTy.getEncoding()); + } + LinearLayout regToSharedLayout = regLayout.invertAndCompose(sharedLL); // TODO(jlebar): We don't currently support loading from shared memory in a // different CTA. We'd need to emit `mapa.shared::cluster` instructions. @@ -537,10 +545,12 @@ bool emitTransferBetweenRegistersAndShared( // // It's OK if the vector width we choose here is wider than the hardware // supports; LLVM will legalize it. - const int vecElems = std::min( - {regToSharedLayout.getNumConsecutiveInOut(), - paddedLayout.getMinInterval().value_or(std::numeric_limits::max()), - maxVecElems.value_or(std::numeric_limits::max())}); + int vecElems = + std::min({regToSharedLayout.getNumConsecutiveInOut(), + maxVecElems.value_or(std::numeric_limits::max())}); + if (paddedLayout) { + vecElems = std::min(vecElems, int(paddedLayout.getMinInterval())); + } auto withCTAOffset = triton::gpu::getNumCTAs(sharedTy.getEncoding()) > 1; Value blockId = @@ -555,7 +565,7 @@ bool emitTransferBetweenRegistersAndShared( // Thus we use `pseudoinvert` instead of `invert` here for simplicity. auto allocShape = sharedTy.getAllocShape(); auto invertAllocSharedLayout = LinearLayout::empty(); - if (!paddedLayout.hasPadding()) { + if (!paddedLayout) { // For now this is only needed for the cases where we have swizzling. 
invertAllocSharedLayout = triton::gpu::toLinearLayout(allocShape.take_back(sharedTy.getRank()), diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index d64b49790ea0..fd023bd9da7d 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -1723,15 +1723,6 @@ PaddedSharedEncodingAttr::get(MLIRContext *context, ArrayRef shape, return get(context, {{innerD, threadNumBytes}}, order, ctaLayout); } -PaddedLinearLayout -PaddedSharedEncodingAttr::toPaddedLinearLayout(ArrayRef shape) const { - auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get( - getContext(), /*vec=*/1, /*perPhase=*/1, /*maxPhase=*/1, getOrder(), - getCTALayout()); - LinearLayout ll = toLinearLayout(shape, nonSwizzleAttr); - return PaddedLinearLayout(ll, getIntervals(), getPaddings()); -} - int64_t PaddedSharedEncodingAttr::getPaddedSize(ArrayRef shape) const { int64_t unpaddedSize = product(shape); int64_t paddingSize = 0; diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp index 38d0a11effbb..f06526a1a63b 100644 --- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp +++ b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp @@ -1120,14 +1120,6 @@ LinearLayout toLinearLayout(ArrayRef shape, Attribute layout) { layout); } -PaddedLinearLayout toPaddedLinearLayout(ArrayRef shape, - Attribute layout) { - if (auto paddedLayout = dyn_cast(layout)) - return paddedLayout.toPaddedLinearLayout(shape); - auto ll = toLinearLayout(shape, layout); - return PaddedLinearLayout(ll, /*intervals=*/{}, /*paddings=*/{}); -} - LinearLayout getLayoutWithinBlock(const LinearLayout &layout) { assert(!layout.getInDimNames().empty()); MLIRContext *ctx = layout.getInDimNames().begin()->getContext(); diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp index 35ae36cfa98e..d65859e2a1ad 100644 --- a/lib/Tools/LinearLayout.cpp +++ b/lib/Tools/LinearLayout.cpp @@ -1337,21 
+1337,4 @@ std::string ColumnAction::toString() const { return ret; } -PaddedLinearLayout::PaddedLinearLayout(LinearLayout linear, - ArrayRef intervals, - ArrayRef paddings) - : linear(std::move(linear)) { - intervalPads.reserve(intervals.size()); - for (auto [i, p] : llvm::zip_equal(intervals, paddings)) - intervalPads.emplace_back(i, p); -} - -std::optional PaddedLinearLayout::getMinInterval() const { - if (intervalPads.empty()) - return std::nullopt; - return *llvm::min_element(llvm::make_first_range(intervalPads)); -} - -bool PaddedLinearLayout::hasPadding() const { return !intervalPads.empty(); } - } // namespace mlir::triton diff --git a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp index a369feb665f2..3e40a3970b47 100644 --- a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp +++ b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp @@ -3012,21 +3012,6 @@ TEST_F(LinearLayoutConversionsTest, MMAv5Fp4Padded) { {S("dim0"), S("dim1")})); } -TEST_F(LinearLayoutConversionsTest, PaddedShared) { - PaddedLinearLayout pll = - toPaddedLinearLayout({32, 64}, paddedShared({128, 256}, {4, 8}, {1, 0}, - {1, 1}, {1, 1}, {1, 0})); - // The expected linear layout mapping part should just be an identity. 
- auto expectedLL = LinearLayout( - // clang-format off - {{S("offset"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {0, 32}, - {1, 0}, {2, 0}, {4, 0}, {8, 0}, {16, 0}}}, - {S("block"), {}}}, - {S("dim0"), S("dim1")}); - // clang-format on - EXPECT_EQ(pll.getLinear(), expectedLL); -} - } // anonymous namespace } // namespace mlir::triton::gpu From e176ed37ea8259e7cf1c971c986d27a5197d6775 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 19 Jun 2025 22:08:46 +0000 Subject: [PATCH 21/23] Use reshapeOuts --- lib/Conversion/TritonGPUToLLVM/Utility.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp index 170fcdddec86..aa495599081e 100644 --- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp +++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp @@ -7,7 +7,7 @@ #include "triton/Dialect/TritonGPU/IR/Attributes.h" #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" -#include "triton/Tools/LayoutUtils.h" +#include "triton/Tools/LinearLayout.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/MathExtras.h" @@ -508,19 +508,19 @@ bool emitTransferBetweenRegistersAndShared( StringAttr kRegister = str_attr("register"); StringAttr kLane = str_attr("lane"); StringAttr kWarp = str_attr("warp"); + StringAttr kOffset = str_attr("offset"); auto shape = sharedTy.getShape(); auto paddedLayout = dyn_cast(sharedTy.getEncoding()); - auto sharedLL = LinearLayout::empty(); + LinearLayout regToSharedLayout = LinearLayout::empty(); if (paddedLayout) { - SmallVector dims(shape); - sharedLL = identityStandardND(Builder(ctx).getStringAttr("offset"), dims, - paddedLayout.getOrder()); + regToSharedLayout = + regLayout.reshapeOuts({{kOffset, regLayout.getTotalOutDimSize()}}); } else { - sharedLL = triton::gpu::toLinearLayout(shape, sharedTy.getEncoding()); + auto sharedLL = triton::gpu::toLinearLayout(shape, 
sharedTy.getEncoding()); + regToSharedLayout = regLayout.invertAndCompose(sharedLL); } - LinearLayout regToSharedLayout = regLayout.invertAndCompose(sharedLL); // TODO(jlebar): We don't currently support loading from shared memory in a // different CTA. We'd need to emit `mapa.shared::cluster` instructions. From 25221f410334b726fb251549af70a293363c99e5 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Fri, 20 Jun 2025 15:10:36 +0000 Subject: [PATCH 22/23] Drop a builder for now --- .../triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 6 ------ lib/Dialect/TritonGPU/IR/Dialect.cpp | 10 ---------- 2 files changed, 16 deletions(-) diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td index e708194349d4..a8bdf8a54f6d 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td @@ -444,12 +444,6 @@ attributes too, for example, let builders = [ AttrBuilder<(ins "ArrayRef>":$intervalPads, "ArrayRef":$order, "CTALayoutAttr":$ctaLayout)>, - // Defines a padded shared layout that avoids bank conflicts for a block - // with given |shape| and |order| that is loaded by dot operand with - // kWidth |dotKWidth|. 
- AttrBuilder<(ins "ArrayRef":$shape, "ArrayRef":$order, - "unsigned":$dotKWidth, "unsigned":$elemBitWidth, - "CTALayoutAttr":$ctaLayout)>, ]; let extraClassDeclaration = extraBaseClassDeclaration # [{ diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index fe99d70db412..6719783b6581 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -1730,16 +1730,6 @@ PaddedSharedEncodingAttr PaddedSharedEncodingAttr::get( return get(context, intervals, paddings, order, ctaLayout); } -PaddedSharedEncodingAttr -PaddedSharedEncodingAttr::get(MLIRContext *context, ArrayRef shape, - ArrayRef order, unsigned dotKWidth, - unsigned elemBitWidth, CTALayoutAttr ctaLayout) { - unsigned innerD = getShapePerCTA(ctaLayout.getCTASplitNum(), shape)[order[0]]; - unsigned threadNumBytes = std::max(dotKWidth * elemBitWidth / 8u, 1u); - threadNumBytes = llvm::alignTo(threadNumBytes, 4); // Assume 32-bit per bank - return get(context, {{innerD, threadNumBytes}}, order, ctaLayout); -} - int64_t PaddedSharedEncodingAttr::getPaddedSize(ArrayRef shape) const { int64_t unpaddedSize = product(shape); int64_t paddingSize = 0; From c068a7fd3b944e082985b66c5b5bcf61d2654b27 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Fri, 20 Jun 2025 15:17:01 +0000 Subject: [PATCH 23/23] Drop not used code --- .../Dialect/TritonGPU/LinearLayoutConversionsTest.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp index 3e40a3970b47..dbab14c1091e 100644 --- a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp +++ b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp @@ -3,7 +3,6 @@ #include "mlir/IR/MLIRContext.h" #include "triton/Dialect/TritonGPU/IR/Attributes.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" -#include "triton/Tools/LinearLayout.h" #include "triton/Tools/StrUtil.h" #include 
"llvm/ADT/ArrayRef.h" #include "llvm/Support/Signals.h" @@ -100,15 +99,6 @@ class LinearLayoutConversionsTest : public ::testing::Test { CTALayoutAttr::get(&ctx, cpg, cSplit, cOrd)); } - PaddedSharedEncodingAttr - paddedShared(ArrayRef intervals, ArrayRef paddings, - ArrayRef ord, ArrayRef cpg, - ArrayRef cSplit, ArrayRef cOrd) { - return PaddedSharedEncodingAttr::get( - &ctx, intervals, paddings, ord, - CTALayoutAttr::get(&ctx, cpg, cSplit, cOrd)); - } - NVMMASharedEncodingAttr nvmmaShared(unsigned swizzleSizeInBytes, bool transposed, unsigned elementBitWidth, ArrayRef cpg,