From e1a509297e5c91f3822177f40fe9c4d9ccb16e99 Mon Sep 17 00:00:00 2001 From: SJW Date: Tue, 7 May 2024 20:30:58 +0000 Subject: [PATCH 1/3] [Alloc] Fixed alignment for shared memory allocation * Increment buffers just past max interference buffer --- lib/Analysis/Allocation.cpp | 46 ++++++++++++++++-------------- test/Analysis/test-allocation.mlir | 4 +-- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/lib/Analysis/Allocation.cpp b/lib/Analysis/Allocation.cpp index bed37bbbf779..2a4053450a0e 100644 --- a/lib/Analysis/Allocation.cpp +++ b/lib/Analysis/Allocation.cpp @@ -483,8 +483,7 @@ class AllocationAnalysis { buffers.emplace_back(bufferIter.first); } - DenseMap bufferStart; - calculateStarts(buffers, bufferStart); + calculateStarts(buffers); // NOTE: The original paper doesn't consider interference between // the bumped ranges. Buffers that previously do not interfere with @@ -494,16 +493,15 @@ class AllocationAnalysis { // increase the buffer offset and keep reducing conflicts, we will // eventually reach a fixed point. GraphT interference; - buildInterferenceGraph(buffers, bufferStart, interference); + buildInterferenceGraph(buffers, interference); do { - allocate(buffers, interference, bufferStart); - buildInterferenceGraph(buffers, bufferStart, interference); + allocate(buffers, interference); + buildInterferenceGraph(buffers, interference); } while (!interference.empty()); } /// Computes the initial shared memory offsets. - void calculateStarts(const SmallVector &buffers, - DenseMap &bufferStart) { + void calculateStarts(const SmallVector &buffers) { // v = values in shared memory // t = triplet of (size, start, end) // shared memory space @@ -545,12 +543,13 @@ class AllocationAnalysis { auto xRange = bufferRange.lookup(buffer); // TODO(Keren): A buffer's size shouldn't be determined here, have to // clean it up - size_t alignment = buffer->alignment; - size_t alignSize = ((size + alignment - 1) / alignment) * alignment; - bufferStart[buffer] = alignSize; - tripleMap.insert({alignSize + xSize, - Interval{std::max(range.start(), xRange.start()), - std::min(range.end(), xRange.end())}}); + size_t offset = size; + if (size_t diff = offset % buffer->alignment) + offset += buffer->alignment - diff; + buffer->offset = offset; + tripleMap.insert( + {offset + xSize, Interval{std::max(range.start(), xRange.start()), + std::min(range.end(), xRange.end())}}); // We could either insert (range.start, xRange.start) or (range.start, // xRange.end), both are correct and determine the potential buffer // offset, and the graph coloring algorithm will solve the interference, @@ -567,7 +566,6 @@ class AllocationAnalysis { /// Builds a graph of all shared memory values. Edges are created between /// shared memory values that are overlapping. void buildInterferenceGraph(const SmallVector &buffers, - const DenseMap &bufferStart, GraphT &interference) { // Reset interference graph interference.clear(); @@ -575,8 +573,8 @@ class AllocationAnalysis { for (auto y : buffers) { if (x == y) continue; - auto xStart = bufferStart.lookup(x); - auto yStart = bufferStart.lookup(y); + auto xStart = x->offset; + auto yStart = y->offset; auto xSize = x->size; auto ySize = y->size; Interval xSizeRange = {xStart, xStart + xSize}; @@ -593,8 +591,7 @@ class AllocationAnalysis { /// Finalizes shared memory offsets considering interference. void allocate(const SmallVector &buffers, - const GraphT &interference, - DenseMap &bufferStart) { + const GraphT &interference) { // Reset shared memory size allocation->sharedMemorySize = 0; // First-fit graph coloring @@ -625,12 +622,17 @@ class AllocationAnalysis { // TODO(Keren): We are wasting memory here. // Nodes with color2 can actually start with 24. for (auto x : buffers) { - size_t adj = 0; + size_t newOffset = 0; for (auto y : interference.lookup(x)) { - adj = std::max(adj, bufferStart.lookup(y) + y->size); + newOffset = std::max(newOffset, y->offset + y->size); + } + if (colors.lookup(x) != 0) { + if (size_t diff = newOffset % x->alignment) { + // fix alignment + newOffset += x->alignment - diff; + } + x->offset = newOffset; } - x->offset = bufferStart.lookup(x) + colors.lookup(x) * adj; - bufferStart[x] = x->offset; allocation->sharedMemorySize = std::max(allocation->sharedMemorySize, x->offset + x->size); } diff --git a/test/Analysis/test-allocation.mlir b/test/Analysis/test-allocation.mlir index 2107fc754a18..738ad11b344a 100644 --- a/test/Analysis/test-allocation.mlir +++ b/test/Analysis/test-allocation.mlir @@ -200,7 +200,7 @@ tt.func @multi_color(%A : !tt.ptr) { %5 = triton_gpu.local_load %cst_5 : !tt.memdesc<4x8xf16, #A_SHARED> -> tensor<4x8xf16, #AL> // CHECK-NEXT: offset = 1024, size = 512 %cst_6 = triton_gpu.local_alloc : () -> !tt.memdesc<8x32xf16, #A_SHARED> - // CHECK-NEXT: offset = 3104, size = 128 + // CHECK-NEXT: offset = 1792, size = 128 %cst_7 = triton_gpu.local_alloc : () -> !tt.memdesc<2x32xf16, #A_SHARED> %6 = triton_gpu.local_load %cst_0 : !tt.memdesc<4x4xf16, #A_SHARED> -> tensor<4x4xf16, #AL> // CHECK-NEXT: offset = 1024, size = 512 @@ -217,7 +217,7 @@ tt.func @multi_color(%A : !tt.ptr) { %10 = triton_gpu.local_load %cst_7 : !tt.memdesc<2x32xf16, #A_SHARED> -> tensor<2x32xf16, #AL> %cst_12 = arith.constant dense<0.000000e+00> : tensor<4x16xf16, #AL> %cst_13 = arith.constant dense<0.000000e+00> : tensor<8x32xf16, #AL> - // CHECK-NEXT: size = 3232 + // CHECK-NEXT: size = 1920 tt.return } From 9e67626fd83a0fca3579b5dca51c0c190f6d35e6 Mon Sep 17 00:00:00 2001 From: SJW Date: Wed, 8 May 2024 03:53:45 +0000 Subject: [PATCH 2/3] * added BufferT::setOffsetAligned --- include/triton/Analysis/Allocation.h | 9 +++++++++ lib/Analysis/Allocation.cpp | 22 +++++++--------------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/include/triton/Analysis/Allocation.h b/include/triton/Analysis/Allocation.h index cb71f34319b8..8905032e1390 100644 --- a/include/triton/Analysis/Allocation.h +++ b/include/triton/Analysis/Allocation.h @@ -160,6 +160,15 @@ class Allocation { size_t offset = 0) : kind(kind), id(nextId++), size(size), alignment(alignment), offset(offset) {} + + size_t setOffsetAligned(size_t newOffset) { + if (size_t diff = newOffset % alignment) { + // fix alignment + newOffset += alignment - diff; + } + offset = newOffset; + return offset; + } }; /// Op -> Scratch Buffer diff --git a/lib/Analysis/Allocation.cpp b/lib/Analysis/Allocation.cpp index 2a4053450a0e..9bc8b69d2404 100644 --- a/lib/Analysis/Allocation.cpp +++ b/lib/Analysis/Allocation.cpp @@ -525,7 +525,7 @@ class AllocationAnalysis { SmallVector xBuffers = buffers; while (!xBuffers.empty()) { auto tripleIt = tripleMap.begin(); - auto size = tripleIt->first; + auto offset = tripleIt->first; auto range = tripleIt->second; tripleMap.erase(tripleIt); auto bufferIt = @@ -543,21 +543,18 @@ class AllocationAnalysis { auto xRange = bufferRange.lookup(buffer); // TODO(Keren): A buffer's size shouldn't be determined here, have to // clean it up - size_t offset = size; - if (size_t diff = offset % buffer->alignment) - offset += buffer->alignment - diff; - buffer->offset = offset; + size_t alignOffset = buffer->setOffsetAligned(offset); tripleMap.insert( - {offset + xSize, Interval{std::max(range.start(), xRange.start()), + {alignOffset + xSize, Interval{std::max(range.start(), xRange.start()), std::min(range.end(), xRange.end())}}); // We could either insert (range.start, xRange.start) or (range.start, // xRange.end), both are correct and determine the potential buffer // offset, and the graph coloring algorithm will solve the interference, // if any if (range.start() < xRange.start()) - tripleMap.insert({size, Interval{range.start(), xRange.end()}}); + tripleMap.insert({offset, Interval{range.start(), xRange.end()}}); if (xRange.end() < range.end()) - tripleMap.insert({size, Interval{xRange.start(), range.end()}}); + tripleMap.insert({offset, Interval{xRange.start(), range.end()}}); xBuffers.erase(bufferIt); } } @@ -626,13 +623,8 @@ class AllocationAnalysis { for (auto y : interference.lookup(x)) { newOffset = std::max(newOffset, y->offset + y->size); } - if (colors.lookup(x) != 0) { - if (size_t diff = newOffset % x->alignment) { - // fix alignment - newOffset += x->alignment - diff; - } - x->offset = newOffset; - } + if (colors.lookup(x) != 0) + x->setOffsetAligned(newOffset); allocation->sharedMemorySize = std::max(allocation->sharedMemorySize, x->offset + x->size); } From 31be19f5ba1427f583a51dab0cf28e252ff538eb Mon Sep 17 00:00:00 2001 From: SJW Date: Wed, 8 May 2024 12:46:13 +0000 Subject: [PATCH 3/3] * cleanup and format --- include/triton/Analysis/Allocation.h | 7 +------ lib/Analysis/Allocation.cpp | 6 +++--- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/include/triton/Analysis/Allocation.h b/include/triton/Analysis/Allocation.h index 8905032e1390..a9e02b420844 100644 --- a/include/triton/Analysis/Allocation.h +++ b/include/triton/Analysis/Allocation.h @@ -162,12 +162,7 @@ class Allocation { offset(offset) {} size_t setOffsetAligned(size_t newOffset) { - if (size_t diff = newOffset % alignment) { - // fix alignment - newOffset += alignment - diff; - } - offset = newOffset; - return offset; + return offset = llvm::alignTo(newOffset, alignment); } }; diff --git a/lib/Analysis/Allocation.cpp b/lib/Analysis/Allocation.cpp index 9bc8b69d2404..1e6e38749f4c 100644 --- a/lib/Analysis/Allocation.cpp +++ b/lib/Analysis/Allocation.cpp @@ -544,9 +544,9 @@ class AllocationAnalysis { // TODO(Keren): A buffer's size shouldn't be determined here, have to // clean it up size_t alignOffset = buffer->setOffsetAligned(offset); - tripleMap.insert( - {alignOffset + xSize, Interval{std::max(range.start(), xRange.start()), - std::min(range.end(), xRange.end())}}); + tripleMap.insert({alignOffset + xSize, + Interval{std::max(range.start(), xRange.start()), + std::min(range.end(), xRange.end())}}); // We could either insert (range.start, xRange.start) or (range.start, // xRange.end), both are correct and determine the potential buffer // offset, and the graph coloring algorithm will solve the interference,