Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
4173051
[Backend] Add a PaddedSharedEncodingAttr definition
antiagainst Jun 15, 2025
8e8bb88
Support PaddedSharedEncodingAttr in LLVM lowering
antiagainst Jun 15, 2025
ddebf3c
Add new padded shared layout attr builder
antiagainst Jun 17, 2025
03b802a
Fix LLVM lowering issues
antiagainst Jun 16, 2025
3bdcc7e
Fix more llvm lowering issues
antiagainst Jun 17, 2025
ae67bba
Add allocation tests
antiagainst Jun 17, 2025
fbb041e
Fix a bunch of small issues
antiagainst Jun 18, 2025
d59edb8
Add linear layout conversion test
antiagainst Jun 18, 2025
b622870
Wire up StreamPipeline usage
antiagainst Jun 15, 2025
961ecc4
Revert "Wire up StreamPipeline usage"
antiagainst Jun 18, 2025
eafb2fd
Merge remote-tracking branch 'origin/main' into padded-shared
antiagainst Jun 18, 2025
b1c6f94
Add some more tests
antiagainst Jun 18, 2025
017b888
Improve PaddedLinearLayout a bit
antiagainst Jun 19, 2025
c0f88a8
Fix lit test
antiagainst Jun 19, 2025
a66fa0d
Move builder out to cpp
antiagainst Jun 19, 2025
b5f258e
Improve wording for PaddedLinearLayout once more
antiagainst Jun 19, 2025
2de5b2f
Rename to SwizzledOrPaddedLayout
antiagainst Jun 19, 2025
89d069d
Revert "Rename to SwizzledOrPaddedLayout"
antiagainst Jun 19, 2025
28c3428
Revert "Improve wording for PaddedLinearLayout once more"
antiagainst Jun 19, 2025
0637bc5
Revert "Improve PaddedLinearLayout a bit"
antiagainst Jun 19, 2025
8fa8d8d
Drop PaddedLinearLayout
antiagainst Jun 19, 2025
e176ed3
Use reshapeOuts
antiagainst Jun 19, 2025
cd9fbad
Merge remote-tracking branch 'origin/main' into padded-shared
antiagainst Jun 19, 2025
25221f4
Drop a builder for now
antiagainst Jun 20, 2025
c068a7f
Drop not used code
antiagainst Jun 20, 2025
72a1f56
Merge remote-tracking branch 'origin/main' into padded-shared
antiagainst Jun 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions include/triton/Conversion/TritonGPUToLLVM/Utility.h
Original file line number Diff line number Diff line change
Expand Up @@ -537,12 +537,6 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
const TargetInfoBase &target,
std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);

[[nodiscard]] bool emitTransferBetweenRegistersAndShared(
LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);

[[nodiscard]] bool emitTransferBetweenRegistersAndShared(
LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
Expand Down
99 changes: 91 additions & 8 deletions include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
Original file line number Diff line number Diff line change
Expand Up @@ -167,21 +167,22 @@ def SharedEncodingTrait : AttrInterface<"SharedEncodingTrait"> {
];
}

def SwizzledSharedEncodingAttr :
TritonGPU_Attr<"SwizzledSharedEncoding", "swizzled_shared_encoding", [SharedEncodingTrait, LayoutEncodingTrait]> {
def SwizzledSharedEncodingAttr
: TritonGPU_Attr<"SwizzledSharedEncoding", "swizzled_shared_encoding",
[SharedEncodingTrait, LayoutEncodingTrait]> {
let mnemonic = "swizzled_shared";

let description = [{
An encoding for tensors whose elements may be simultaneously accessed by
different cuda threads in the programs, via shared memory. In other words,
different GPU threads in the programs, via shared memory. In other words,
for all indices i \in Z^d, \mathcal{L}(i) = {0, 1, ..., 32*num_warps - 1}.

In order to avoid shared memory bank conflicts, elements may be swizzled.
Here are some examples. In all cases, the input tensor is [0, 1, ..., n-1].

1. Basic swizzling

#shared<{vec=1, perPhase=1, maxPhase=4, order=[1,0]}>
#ttg.swizzled_shared<{vec=1, perPhase=1, maxPhase=4, order=[1,0]}>
[ 0, 1, 2, 3], // xor with 0
[ 5, 4, 7, 6], // xor with 1
[10, 11, 8, 9], // xor with 2
Expand All @@ -192,7 +193,7 @@ out[r][c^r]).

2. Multiple rows per phase

#shared<{vec=1, perPhase=2, maxPhase=4, order=[1,0]}>
#ttg.swizzled_shared<{vec=1, perPhase=2, maxPhase=4, order=[1,0]}>
[ 0, 1, 2, 3], // phase 0 (xor with 0)
[ 4, 5, 6, 7],
[ 9, 8, 11, 10], // phase 1 (xor with 1)
Expand All @@ -203,7 +204,7 @@ means that pairs of 2 rows get the same swizzling.

3. Max-phase applied

$shared<{vec=1, perPhase=1, maxPhase=2, order=[1,0]}>
#ttg.swizzled_shared<{vec=1, perPhase=1, maxPhase=2, order=[1,0]}>
[ 0, 1, 2, 3], // phase 0 (xor with 0)
[ 5, 4, 7, 6], // phase 1 (xor with 1)
[ 8, 9, 10, 11], // phase 0
Expand All @@ -218,7 +219,7 @@ effect of limiting the maximum value of the xor to m-1.

4. Max-phase and per-phase

#shared<{vec=1, perPhase=2, maxPhase=2, order=[1,0]}>
#ttg.swizzled_shared<{vec=1, perPhase=2, maxPhase=2, order=[1,0]}>
[ 0, 1, 2, 3], // phase 0 (xor with 0)
[ 4, 5, 6, 7], // phase 0
[ 9, 8, 11, 10], // phase 1 (xor with 1)
Expand All @@ -234,7 +235,7 @@ maximum value of maxPhase-1. In other words, elements of row r are xor'ed with

5. Adding vec

#shared<{vec=2, perPhase=1, maxPhase=4, order=[1,0]}>
#ttg.swizzled_shared<{vec=2, perPhase=1, maxPhase=4, order=[1,0]}>
[ 0, 1, 2, 3, 4, 5, 6, 7],
[10, 11, 8, 9, 14, 15, 12, 13],
[20, 21, 22, 23, 16, 17, 18, 19],
Expand Down Expand Up @@ -383,6 +384,88 @@ When vec=2, elements are swizzled in pairs of 2. In other words, the element at
let genVerifyDecl = 1;
}

// Shared memory encoding that avoids bank conflicts by inserting padding
// elements at fixed intervals instead of swizzling.
//
// NOTE(review): the record name below is spelled "PaddeddSharedEncodingAttr"
// (double "d"), whereas the C++ attribute class is PaddedSharedEncodingAttr.
// The spelling is kept as-is because other .td files may refer to this record
// by name; rename it once all references are confirmed.
def PaddeddSharedEncodingAttr
    : TritonGPU_Attr<"PaddedSharedEncoding", "padded_shared_encoding",
                     [SharedEncodingTrait, LayoutEncodingTrait]> {
  let mnemonic = "padded_shared";

  let description = [{
An encoding for tensors whose elements may be simultaneously accessed by
different GPU threads in the programs, via shared memory. In other words,
for all indices i \in Z^d, \mathcal{L}(i) = {0, 1, ..., 32*num_warps - 1}.
Compared to SwizzledSharedEncodingAttr, this encoding uses padding to avoid
shared memory bank conflicts.

Formally, given a layout:
  padded_shared<[<interval_0>:+<pad_0>, <interval_1>:+<pad_1>, ...]>
We insert a padding of `<pad_i>` elements after every `<interval_i>` elements.
Multiple interval-padding pairs are supported to allow flexible multi-tiered
padding schemes; they compose in an additive manner. So for a 1-D tensor element
at index i, the corresponding shared memory location index is
  i + \sum_{k} (i / interval_k) * pad_k
where `/` denotes integer (floor) division.
Every `<interval_i>` and `<pad_i>` must be a power of two.

Some concrete examples, using `eM` to mean tensor elements and `pN` to mean
padding:

1. Single interval-padding pair:

   #ttg.padded_shared<[2:+2]>
   [e0, e1, p0, p1,
    e2, e3, p2, p3,
    ...]

2. Double interval-padding pairs:

   #ttg.padded_shared<[2:+1, 4:+2]>
   [e0, e1, p0,
    e2, e3, p1, p2, p3,
    e4, e5, p4,
    e6, e7, p5, p6, p7,
    ...]

In addition to interval-padding pairs, this encoding requires an `order` to
specify the logical tensor dimensions from the fastest- to slowest-varying.
It may optionally support CGA level organization like other encoding
attributes too, for example,
  #ttg.padded_shared<[2:+1, 4:+2] {
     order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1],
     CTAOrder = [0, 1]}>
  }];

  let parameters = (ins
    // intervals[k] and paddings[k] together form the k-th interval-padding
    // pair: paddings[k] elements are inserted after every intervals[k]
    // elements.
    ArrayRefParameter<"unsigned">:$intervals,
    ArrayRefParameter<"unsigned">:$paddings,
    // Order of logical tensor dimensions; fastest-varying first.
    ArrayRefParameter<"unsigned">:$order,
    "CTALayoutAttr":$CTALayout
  );

  let builders = [
    // Builds the attribute from (interval, padding) pairs rather than from two
    // parallel arrays.
    AttrBuilder<(ins "ArrayRef<std::pair<unsigned, unsigned>>":$intervalPads,
                     "ArrayRef<unsigned>":$order, "CTALayoutAttr":$ctaLayout)>,
  ];

  let extraClassDeclaration = extraBaseClassDeclaration # [{
    // The tensor rank, taken from the number of entries in `order`.
    unsigned getRank() const { return getOrder().size(); }
    int32_t getAlignment() const { return 16; }

    // Returns the smallest interval among all interval-padding pairs.
    // NOTE(review): dereferences the min_element iterator directly, so this
    // assumes `intervals` is non-empty -- presumably enforced by the verifier;
    // confirm.
    unsigned getMinInterval() const {
      return *llvm::min_element(getIntervals());
    }

    // Returns the total number of elements including padding given the input
    // tensor shape.
    int64_t getPaddedSize(ArrayRef<int64_t> shape) const;

    SmallVector<unsigned> getCTAsPerCGA() const;
    SmallVector<unsigned> getCTAOrder() const;
    SmallVector<unsigned> getCTASplitNum() const;
  }];
  let hasCustomAssemblyFormat = 1;
  let genVerifyDecl = 1;
}

def NVMMASharedEncodingAttr :
TritonGPU_Attr<"NVMMASharedEncoding", "nvmma_shared_encoding", [SharedEncodingTrait, LayoutEncodingTrait]> {
let mnemonic = "nvmma_shared";
Expand Down
15 changes: 10 additions & 5 deletions lib/Analysis/Allocation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,12 +260,17 @@ class AllocationAnalysis {
auto alloc = dyn_cast<gpu::LocalAllocOp>(op);
if (!alloc || !alloc.isSharedMemoryAlloc())
return;
// Bytes could be a different value once we support padding or other
// allocation policies.
auto allocType = alloc.getType();
auto shapePerCTA = gpu::getAllocationShapePerCTA(allocType);
auto bytes =
product<int64_t>(shapePerCTA) * allocType.getElementTypeBitWidth() / 8;
int64_t numElems = 0;
if (auto paddedLayout =
dyn_cast<gpu::PaddedSharedEncodingAttr>(allocType.getEncoding())) {
SmallVector<int64_t> unpaddedShape = gpu::getShapePerCTA(allocType);
numElems = paddedLayout.getPaddedSize(unpaddedShape);
Comment on lines +265 to +268
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be better to do it inside getAllocationShapePerCTA

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually was trying to do that. Then I realized it's not that compatible--getAllocationShapePerCTA assumes the original ranked shape, while after factoring in padding fundamentally we only have a 1-D size. Also getAllocationShapePerCTA is used quite a few places that assumes original rank. So ends up I'm doing it this way given only when doing allocation or the final pointer indexing we care about the exact physical memory.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually makes sense.

} else {
auto shapePerCTA = gpu::getAllocationShapePerCTA(allocType);
numElems = product<int64_t>(shapePerCTA);
}
int64_t bytes = numElems * allocType.getElementTypeBitWidth() / 8;

auto alignment = alloc.getAlignmentOrDefault();
allocation->addBuffer<BufferT::BufferKind::Explicit>(alloc, bytes,
Expand Down
72 changes: 49 additions & 23 deletions lib/Conversion/TritonGPUToLLVM/Utility.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
#include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
#include "triton/Tools/LayoutUtils.h"
#include "triton/Tools/LinearLayout.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/MathExtras.h"

#if defined(_MSC_VER) && !defined(__clang__)
// from https://gist.github.com/pps83/3210a2f980fd02bb2ba2e5a1fc4a2ef0
Expand Down Expand Up @@ -408,6 +410,10 @@ Value getSmemVecAddr(const LinearLayout &regLayout,
// We propose case 2 (see comments below), which provides a more general
// solution for all swizzled shared memory scenarios, including the edge case
// mentioned above.
//
// Padded shared layout falls into case 1--we can rely on the logic for case 1
// to get the 1-D offset into shared memory. Then we just need to add the
// padding offset.
if (isSimpleSharedMemoryAccess(shape, allocShape, sharedEnc)) { // Case 1
smemOffset = applyLinearLayout(loc, rewriter, regToSharedLayout,
{{kRegister, regId},
Expand Down Expand Up @@ -436,6 +442,18 @@ Value getSmemVecAddr(const LinearLayout &regLayout,
smemOffset = dot(rewriter, loc, smemOffsets,
applyPermutation(smemStrides, smemOrder));
}
if (auto paddedLayout =
dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedEnc)) {
// Apply the offset needed for padding.
Value padOffset = b.i32_val(0);
for (auto [interval, padding] : llvm::zip_equal(
paddedLayout.getIntervals(), paddedLayout.getPaddings())) {
Value iVal = b.i32_val(llvm::Log2_32(interval));
Value pVal = b.i32_val(llvm::Log2_32(padding));
padOffset = b.add(padOffset, b.shl(b.ashr(smemOffset, iVal), pVal));
}
smemOffset = b.add(smemOffset, padOffset);
}
} else { // Case 2 -> rank-reduced swizzling
assert(rank >= 2 && "Swizzling only applies to tensors with rank >= 2");
assert((isa<triton::gpu::SwizzledSharedEncodingAttr,
Expand Down Expand Up @@ -628,17 +646,6 @@ SmallVector<Value> lowerLocalLdSt(Location loc, MLIRContext *ctx,
rewriter, targetInfo);
}

// Convenience overload: looks up the lane and warp ids via getLaneAndWarpId
// and forwards everything to the overload that takes them explicitly. The
// result of that overload is returned unchanged.
bool emitTransferBetweenRegistersAndShared(
    LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
    std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
  // Obtain the ids once here so that callers of this overload do not have to.
  auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);

  const bool transferEmitted = emitTransferBetweenRegistersAndShared(
      regLayout, sharedTy, elemLlvmTy, maxVecElems, smemObj, loc, rewriter,
      target, laneId, warpId, perVectorCallback);
  return transferEmitted;
}

bool emitTransferBetweenRegistersAndShared(
LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
Expand All @@ -652,11 +659,19 @@ bool emitTransferBetweenRegistersAndShared(
StringAttr kRegister = str_attr("register");
StringAttr kLane = str_attr("lane");
StringAttr kWarp = str_attr("warp");
StringAttr kOffset = str_attr("offset");

auto shape = sharedTy.getShape();
LinearLayout sharedLayout =
triton::gpu::toLinearLayout(shape, sharedTy.getEncoding());
LinearLayout regToSharedLayout = regLayout.invertAndCompose(sharedLayout);
auto paddedLayout =
dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedTy.getEncoding());
LinearLayout regToSharedLayout = LinearLayout::empty();
if (paddedLayout) {
regToSharedLayout =
regLayout.reshapeOuts({{kOffset, regLayout.getTotalOutDimSize()}});
} else {
auto sharedLL = triton::gpu::toLinearLayout(shape, sharedTy.getEncoding());
regToSharedLayout = regLayout.invertAndCompose(sharedLL);
}

// TODO(jlebar): We don't currently support loading from shared memory in a
// different CTA. We'd need to emit `mapa.shared::cluster` instructions.
Expand All @@ -681,9 +696,12 @@ bool emitTransferBetweenRegistersAndShared(
//
// It's OK if the vector width we choose here is wider than the hardware
// supports; LLVM will legalize it.
const int vecElems =
std::min(regToSharedLayout.getNumConsecutiveInOut(),
maxVecElems.value_or(std::numeric_limits<int>::max()));
int vecElems =
std::min({regToSharedLayout.getNumConsecutiveInOut(),
maxVecElems.value_or(std::numeric_limits<int>::max())});
if (paddedLayout) {
vecElems = std::min(vecElems, int(paddedLayout.getMinInterval()));
}

auto withCTAOffset = triton::gpu::getNumCTAs(sharedTy.getEncoding()) > 1;
Value blockId =
Expand All @@ -697,10 +715,14 @@ bool emitTransferBetweenRegistersAndShared(
// take out the "block" dimension.
// Thus we use `pseudoinvert` instead of `invert` here for simplicity.
auto allocShape = sharedTy.getAllocShape();
LinearLayout invertAllocSharedLayout =
triton::gpu::toLinearLayout(allocShape.take_back(sharedTy.getRank()),
sharedTy.getEncoding())
.pseudoinvert();
auto invertAllocSharedLayout = LinearLayout::empty();
if (!paddedLayout) {
// For now this is only needed for the cases where we have swizzling.
invertAllocSharedLayout =
triton::gpu::toLinearLayout(allocShape.take_back(sharedTy.getRank()),
sharedTy.getEncoding())
.pseudoinvert();
}

int numElems = regToSharedLayout.getInDimSize(kRegister);
auto vecTy = vec_ty(elemLlvmTy, vecElems);
Expand All @@ -723,9 +745,10 @@ bool emitTransferBetweenRegistersAndShared(
std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
auto regLayout = triton::gpu::toLinearLayout(registerTy.getShape(),
registerTy.getEncoding());
auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
return emitTransferBetweenRegistersAndShared(
regLayout, sharedTy, elemLlvmTy, maxVecElems, smemObj, loc, rewriter,
target, perVectorCallback);
target, laneId, warpId, perVectorCallback);
}

SmallVector<Value> loadSharedToDistributed(triton::gpu::LocalLoadOp localLoadOp,
Expand Down Expand Up @@ -913,10 +936,13 @@ bool isSimpleSharedMemoryAccess(ArrayRef<int64_t> shape,
ArrayRef<int64_t> allocShape,
triton::gpu::SharedEncodingTrait sharedEnc) {
auto rank = shape.size();
auto paddedLayout =
dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedEnc);
auto swizzledLayout =
dyn_cast<triton::gpu::SwizzledSharedEncodingAttr>(sharedEnc);
auto nvmmaLayout = dyn_cast<triton::gpu::NVMMASharedEncodingAttr>(sharedEnc);
bool noSwizzling = (swizzledLayout && swizzledLayout.getMaxPhase() == 1) ||
bool noSwizzling = paddedLayout ||
(swizzledLayout && swizzledLayout.getMaxPhase() == 1) ||
(nvmmaLayout && nvmmaLayout.getSwizzlingByteWidth() == 0);
return /*no swizzling*/ noSwizzling ||
/*swizzling but same shape*/ shape == allocShape ||
Expand Down
Loading
Loading