From 4173051347387cf14dcbcb6078886494a460ef0a Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Sat, 14 Jun 2025 22:21:07 -0700
Subject: [PATCH 01/23] [Backend] Add a PaddedSharedEncodingAttr definition

---
 .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 101 ++++++++++-
 lib/Dialect/TritonGPU/IR/Dialect.cpp          | 166 +++++++++++++++---
 test/TritonGPU/invalid-attributes.mlir        |  30 ++++
 3 files changed, 263 insertions(+), 34 deletions(-)
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
index d97f18785543..a8d3143312d4 100644
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -167,13 +167,14 @@ def SharedEncodingTrait : AttrInterface<"SharedEncodingTrait"> {
   ];
 }
 
-def SwizzledSharedEncodingAttr :
-  TritonGPU_Attr<"SwizzledSharedEncoding", "swizzled_shared_encoding", [SharedEncodingTrait, LayoutEncodingTrait]> {
+def SwizzledSharedEncodingAttr
+    : TritonGPU_Attr<"SwizzledSharedEncoding", "swizzled_shared_encoding",
+                     [SharedEncodingTrait, LayoutEncodingTrait]> {
   let mnemonic = "swizzled_shared";
 
   let description = [{
 An encoding for tensors whose elements may be simultaneously accessed by
-different cuda threads in the programs, via shared memory. In other words,
+different GPU threads in the programs, via shared memory. In other words,
 for all indices i \in Z^d, \mathcal{L}(i) = {0, 1, ..., 32*num_warps - 1}.
 
 In order to avoid shared memory bank conflicts, elements may be swizzled.
@@ -181,7 +182,7 @@ Here are some examples.  In all cases, the input tensor is [0, 1, ..., n-1].
 
 1. Basic swizzling
 
-  #shared<{vec=1, perPhase=1, maxPhase=4, order=[1,0]}>
+  #ttg.swizzled_shared<{vec=1, perPhase=1, maxPhase=4, order=[1,0]}>
   [ 0,  1,  2,  3],  // xor with 0
   [ 5,  4,  7,  6],  // xor with 1
   [10, 11,  8,  9],  // xor with 2
@@ -192,7 +193,7 @@ out[r][c^r]).
 
 2. Multiple rows per phase
 
-  #shared<{vec=1, perPhase=2, maxPhase=4, order=[1,0]}>
+  #ttg.swizzled_shared<{vec=1, perPhase=2, maxPhase=4, order=[1,0]}>
   [ 0,  1,  2,  3],  // phase 0 (xor with 0)
   [ 4,  5,  6,  7],
   [ 9,  8, 11, 10],  // phase 1 (xor with 1)
@@ -203,7 +204,7 @@ means that pairs of 2 rows get the same swizzling.
 
 3. Max-phase applied
 
-  $shared<{vec=1, perPhase=1, maxPhase=2, order=[1,0]}>
+  #ttg.swizzled_shared<{vec=1, perPhase=1, maxPhase=2, order=[1,0]}>
   [ 0,  1,  2,  3],  // phase 0 (xor with 0)
   [ 5,  4,  7,  6],  // phase 1 (xor with 1)
   [ 8,  9, 10, 11],  // phase 0
@@ -218,7 +219,7 @@ effect of limiting the maximum value of the xor to m-1.
 
 4. Max-phase and per-phase
 
-  #shared<{vec=1, perPhase=2, maxPhase=2, order=[1,0]}>
+  #ttg.swizzled_shared<{vec=1, perPhase=2, maxPhase=2, order=[1,0]}>
   [ 0,  1,  2,  3],  // phase 0 (xor with 0)
   [ 4,  5,  6,  7],  // phase 0
   [ 9,  8, 11, 10],  // phase 1 (xor with 1)
@@ -234,7 +235,7 @@ maximum value of maxPhase-1.  In other words, elements of row r are xor'ed with
 
 5. Adding vec
 
-  #shared<{vec=2, perPhase=1, maxPhase=4, order=[1,0]}>
+  #ttg.swizzled_shared<{vec=2, perPhase=1, maxPhase=4, order=[1,0]}>
   [ 0,  1,  2,  3,  4,  5,  6,  7],
   [10, 11,  8,  9, 14, 15, 12, 13],
   [20, 21, 22, 23, 16, 17, 18, 19],
@@ -372,7 +373,7 @@ When vec=2, elements are swizzled in pairs of 2.  In other words, the element at
     }]>,
   ];
 
-  let extraClassDeclaration = extraBaseClassDeclaration # [{
+  let extraClassDeclaration = extraBaseClassDeclaration#[{
     unsigned getRank() const { return getCTAOrder().size(); }
     int32_t getAlignment() const;
     SmallVector<unsigned> getCTAsPerCGA() const;
@@ -383,6 +384,88 @@ When vec=2, elements are swizzled in pairs of 2.  In other words, the element at
   let genVerifyDecl = 1;
 }
 
+def PaddedSharedEncodingAttr
+    : TritonGPU_Attr<"PaddedSharedEncoding", "padded_shared_encoding",
+                     [SharedEncodingTrait, LayoutEncodingTrait]> {
+  let mnemonic = "padded_shared";
+
+  let description = [{
+An encoding for tensors whose elements may be simultaneously accessed by
+different GPU threads in the programs, via shared memory. In other words,
+for all indices i \in Z^d, \mathcal{L}(i) = {0, 1, ..., 32*num_warps - 1}.
+Compared to SwizzledSharedEncodingAttr, this encoding uses padding to avoid
+shared memory bank conflicts.
+
+Formally, given a layout:
+    padded_shared<[<interval_0>:+<pad_0>, <interval_1>:+<pad_1>, ...]>
+We insert a padding of `<pad_i>` elements after every `<interval_i>` elements.
+Multiple interval-pad pairs are supported for flexibility of multi-tiered padding
+schemes; they compose in an additive manner. So for a 1-D tensor element at
+index i, the corresponding shared memory location index is
+    i + \sum_{k} (i / interval_k) * pad_k
+`<interval_i>` and `<pad_i>` must all be powers of two.
+
+Some concrete examples, using `eM` to mean tensor elements and `pN` to mean
+padding:
+
+1. Single interval-pad pair:
+
+   #ttg.padded_shared<[2:+2]>
+   [e0, e1, p0, p1,
+    e2, e3, p2, p3,
+    ...]
+
+2. Double interval-pad pairs:
+
+   #ttg.padded_shared<[2:+1, 4:+2]>
+   [e0, e1, p0,
+    e2, e3, p1, p2, p3,
+    e4, e5, p4,
+    e6, e7, p5, p6, p7,
+    ...]
+
+In addition to interval-pad pairs, this encoding requires an `order` to
+specify the logical tensor dimensions from the fastest- to slowest-varying.
+It may optionally support CGA level organization like other encoding
+attributes too, for example,
+    #ttg.padded_shared<[2:+1, 4:+2] {
+        order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1],
+        CTAOrder = [0, 1]}>
+  }];
+
+  let parameters = (ins ArrayRefParameter<"unsigned">:$intervals,
+      ArrayRefParameter<"unsigned">:$paddings,
+      // Order of logical tensor dimensions; fastest-varying first.
+      ArrayRefParameter<"unsigned">:$order, "CTALayoutAttr":$CTALayout);
+
+  let builders =
+      [AttrBuilder<(ins "ArrayRef<std::pair<unsigned, unsigned>>":$intervalPads,
+                       "ArrayRef<unsigned>":$order, "CTALayoutAttr":$ctaLayout),
+                   [{
+        SmallVector<unsigned> intervals, paddings;
+        intervals.reserve(intervalPads.size());
+        paddings.reserve(intervalPads.size());
+        for (auto [interval, padding] : intervalPads) {
+            intervals.push_back(interval);
+            paddings.push_back(padding);
+        }
+        return get(context, intervals, paddings, order, ctaLayout);
+    }]>,
+  ];
+
+  let extraClassDeclaration = extraBaseClassDeclaration#[{
+    int32_t getAlignment() const { return 16; }
+
+    unsigned getRank() const { return getOrder().size(); }
+
+    SmallVector<unsigned> getCTAsPerCGA() const;
+    SmallVector<unsigned> getCTAOrder() const;
+    SmallVector<unsigned> getCTASplitNum() const;
+  }];
+  let hasCustomAssemblyFormat = 1;
+  let genVerifyDecl = 1;
+}
+
 def NVMMASharedEncodingAttr :
   TritonGPU_Attr<"NVMMASharedEncoding", "nvmma_shared_encoding", [SharedEncodingTrait, LayoutEncodingTrait]> {
   let mnemonic = "nvmma_shared";
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
index 90f0f502e1ae..c85d1783fcd4 100644
--- a/lib/Dialect/TritonGPU/IR/Dialect.cpp
+++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -5,6 +5,7 @@
 
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/OpImplementation.h"
+#include "mlir/IR/OperationSupport.h"
 #include "mlir/Support/LLVM.h"
 #include "triton/Analysis/Utility.h"
 #include "triton/Dialect/Triton/IR/Interfaces.h"
@@ -20,7 +21,9 @@
 #include "triton/Tools/LinearLayout.h"
 #include "triton/Tools/StrUtil.h"
 #include "triton/Tools/Sys/GetEnv.hpp"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/MathExtras.h"
 
 // Include TableGen'erated code
 #include "triton/Dialect/TritonGPU/IR/Dialect.cpp.inc"
@@ -641,6 +644,16 @@ SmallVector<unsigned> SwizzledSharedEncodingAttr::getCTASplitNum() const {
   return SmallVector<unsigned>(getCTALayout().getCTASplitNum());
 }
 
+SmallVector<unsigned> PaddedSharedEncodingAttr::getCTAsPerCGA() const {
+  return llvm::to_vector(getCTALayout().getCTAsPerCGA());
+}
+SmallVector<unsigned> PaddedSharedEncodingAttr::getCTAOrder() const {
+  return llvm::to_vector(getCTALayout().getCTAOrder());
+}
+SmallVector<unsigned> PaddedSharedEncodingAttr::getCTASplitNum() const {
+  return llvm::to_vector(getCTALayout().getCTASplitNum());
+}
+
 int32_t AMDRotatingSharedEncodingAttr::getAlignment() const { return 16; }
 
 SmallVector<unsigned> AMDRotatingSharedEncodingAttr::getCTAsPerCGA() const {
@@ -1492,6 +1505,35 @@ void SliceEncodingAttr::print(mlir::AsmPrinter &printer) const {
 // Helper shared encoding functions
 //===----------------------------------------------------------------------===//
 
+std::optional<CTALayoutAttr>
+parseCTAAttrs(AsmParser &parser, NamedAttrList attrList, unsigned rank) {
+  std::optional<SmallVector<unsigned>> CTAsPerCGA;
+  std::optional<SmallVector<unsigned>> CTASplitNum;
+  std::optional<SmallVector<unsigned>> CTAOrder;
+
+  for (const NamedAttribute &attr : attrList) {
+    if (attr.getName() == "CTAsPerCGA") {
+      if (parseIntArrayAttr(parser, attr, CTAsPerCGA.emplace(), "CTAsPerCGA")
+              .failed())
+        return {};
+    } else if (attr.getName() == "CTASplitNum") {
+      if (parseIntArrayAttr(parser, attr, CTASplitNum.emplace(), "CTASplitNum")
+              .failed())
+        return {};
+    } else if (attr.getName() == "CTAOrder") {
+      if (parseIntArrayAttr(parser, attr, CTAOrder.emplace(), "CTAOrder")
+              .failed())
+        return {};
+    } else {
+      parser.emitError(parser.getNameLoc(), "unexpected key: ")
+          << attr.getName().strref();
+      return {};
+    }
+  }
+
+  return getCTALayoutOrError(parser, CTAsPerCGA, CTASplitNum, CTAOrder, rank);
+}
+
 template <typename SpecificEncoding>
 Attribute parseSwizzledEncoding(AsmParser &parser, Type type) {
   if (parser.parseLess().failed())
@@ -1507,9 +1549,7 @@ Attribute parseSwizzledEncoding(AsmParser &parser, Type type) {
   unsigned perPhase = 0;
   unsigned maxPhase = 0;
   SmallVector<unsigned> order;
-  std::optional<SmallVector<unsigned>> CTAsPerCGA;
-  std::optional<SmallVector<unsigned>> CTASplitNum;
-  std::optional<SmallVector<unsigned>> CTAOrder;
+  NamedAttrList remainingAttrs;
   for (const NamedAttribute &attr : dict) {
     if (attr.getName() == "vec") {
       if (parseUInt(parser, attr, vec, "vec").failed())
@@ -1523,32 +1563,15 @@ Attribute parseSwizzledEncoding(AsmParser &parser, Type type) {
     } else if (attr.getName() == "order") {
       if (parseIntArrayAttr(parser, attr, order, "order").failed())
         return {};
-    } else if (attr.getName() == "CTAsPerCGA") {
-      if (parseIntArrayAttr(parser, attr, CTAsPerCGA.emplace(), "CTAsPerCGA")
-              .failed())
-        return {};
-    } else if (attr.getName() == "CTASplitNum") {
-      if (parseIntArrayAttr(parser, attr, CTASplitNum.emplace(), "CTASplitNum")
-              .failed())
-        return {};
-    } else if (attr.getName() == "CTAOrder") {
-      if (parseIntArrayAttr(parser, attr, CTAOrder.emplace(), "CTAOrder")
-              .failed())
-        return {};
     } else {
-      parser.emitError(parser.getNameLoc(), "unexpected key: ")
-          << attr.getName().strref();
-      return {};
+      remainingAttrs.push_back(attr);
     }
   }
 
-  std::optional<CTALayoutAttr> CTALayout = getCTALayoutOrError(
-      parser, CTAsPerCGA, CTASplitNum, CTAOrder, /*rank=*/order.size());
-  if (!CTALayout.has_value())
-    return {};
-
-  return parser.getChecked<SpecificEncoding>(parser.getContext(), vec, perPhase,
-                                             maxPhase, order, *CTALayout);
+  if (auto CTALayout = parseCTAAttrs(parser, remainingAttrs, order.size()))
+    return parser.getChecked<SpecificEncoding>(
+        parser.getContext(), vec, perPhase, maxPhase, order, *CTALayout);
+  return {};
 }
 
 //===----------------------------------------------------------------------===//
@@ -1583,6 +1606,99 @@ void SwizzledSharedEncodingAttr::print(AsmPrinter &printer) const {
   printer << "}>";
 }
 
+//===----------------------------------------------------------------------===//
+// PaddedShared encoding
+//===----------------------------------------------------------------------===//
+
+Attribute PaddedSharedEncodingAttr::parse(AsmParser &parser, Type type) {
+  // <[
+  if (failed(parser.parseLess()) || failed(parser.parseLSquare()))
+    return {};
+
+  // <interval_i>:+<padding_i>
+  SmallVector<unsigned, 4> intervals, paddings;
+  auto parseIntervalPaddingPair = [&]() {
+    unsigned interval = 0, padding = 0;
+    if (failed(parser.parseInteger(interval)) || failed(parser.parseColon()) ||
+        failed(parser.parsePlus()) || failed(parser.parseInteger(padding)))
+      return failure();
+    intervals.push_back(interval);
+    paddings.push_back(padding);
+    return success();
+  };
+  // ]
+  if (failed(parser.parseCommaSeparatedList(parseIntervalPaddingPair)) ||
+      failed(parser.parseRSquare()))
+    return {};
+
+  // {<attr-dict>}>
+  NamedAttrList attrList;
+  if (failed(parser.parseOptionalAttrDict(attrList)) ||
+      failed(parser.parseGreater()))
+    return {};
+
+  // Decode order and CTA attributes
+  SmallVector<unsigned> order;
+  NamedAttrList remainingAttrs;
+  for (const NamedAttribute &attr : attrList) {
+    if (attr.getName() == "order") {
+      if (parseIntArrayAttr(parser, attr, order, "order").failed())
+        return {};
+    } else {
+      remainingAttrs.push_back(attr);
+    }
+  }
+  if (auto ctaLayout = parseCTAAttrs(parser, remainingAttrs, order.size()))
+    return parser.getChecked<PaddedSharedEncodingAttr>(
+        parser.getContext(), intervals, paddings, order, *ctaLayout);
+  return {};
+}
+
+void PaddedSharedEncodingAttr::print(AsmPrinter &printer) const {
+  printer << "<[";
+  llvm::interleaveComma(llvm::zip(getIntervals(), getPaddings()), printer,
+                        [&](std::tuple<unsigned, unsigned> intervalPad) {
+                          printer << std::get<0>(intervalPad) << ":+"
+                                  << std::get<1>(intervalPad);
+                        });
+  printer << "] {order = [" << getOrder() << "]";
+  maybePrintCTALayout(getContext(), printer, getCTALayout(),
+                      /*rank=*/getOrder().size());
+  printer << "}>";
+}
+
+LogicalResult PaddedSharedEncodingAttr::verify(
+    function_ref<InFlightDiagnostic()> emitError, ArrayRef<unsigned> intervals,
+    ArrayRef<unsigned> paddings, ArrayRef<unsigned> order,
+    CTALayoutAttr ctaLayout) {
+  if (intervals.size() != paddings.size())
+    return emitError() << "intervals size (" << intervals.size()
+                       << ") must match paddings size (" << paddings.size()
+                       << ")";
+
+  if (intervals.empty())
+    return emitError() << "must have at least one interval-padding pair";
+
+  if (!llvm::all_of(intervals, llvm::isPowerOf2_32))
+    return emitError() << "interval values must all be power of two";
+  if (!llvm::all_of(paddings, llvm::isPowerOf2_32))
+    return emitError() << "padding values must all be power of two";
+
+  llvm::SmallSet<unsigned, 4> intervalValues(intervals.begin(),
+                                             intervals.end());
+  if (intervalValues.size() != intervals.size())
+    return emitError() << "interval values cannot have duplicates";
+
+  if (order.empty())
+    return emitError() << "order cannot be empty";
+
+  if (order.size() != ctaLayout.getRank())
+    return emitError() << "order size (" << order.size()
+                       << ") must match CTALayout rank (" << ctaLayout.getRank()
+                       << ")";
+  return verifyLayoutOrder(emitError, order);
+}
+
 //===----------------------------------------------------------------------===//
 // NVMMAShared encoding
 //===----------------------------------------------------------------------===//
diff --git a/test/TritonGPU/invalid-attributes.mlir b/test/TritonGPU/invalid-attributes.mlir
index df693a6ea81c..3a2aac907096 100644
--- a/test/TritonGPU/invalid-attributes.mlir
+++ b/test/TritonGPU/invalid-attributes.mlir
@@ -76,3 +76,33 @@
 
 // expected-error@+1 {{(M, N) cases other than (32, 32) or (16, 16) unimplemented}}
 #mfma = #ttg.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 1, 1], instrShape = [16, 8], isTransposed = false}>
+
+// -----
+
+// expected-error@+1 {{interval values must all be power of two}}
+#shared = #ttg.padded_shared<[3:+2]>
+
+// -----
+
+// expected-error@+1 {{padding values must all be power of two}}
+#shared = #ttg.padded_shared<[2:+3]>
+
+// -----
+
+// expected-error@+1 {{interval values cannot have duplicates}}
+#shared = #ttg.padded_shared<[2:+1, 2:+4]>
+
+// -----
+
+// expected-error@+1 {{order cannot be empty}}
+#shared = #ttg.padded_shared<[2:+1, 4:+2]>
+
+// -----
+
+// expected-error@+1 {{unexpected key: unknown}}
+#shared = #ttg.padded_shared<[2:+1, 4:+2] {order = [1, 0], unknown = 5}>
+
+// -----
+
+// expected-error@+1 {{order size (3) must match CTALayout rank (2)}}
+#shared = #ttg.padded_shared<[2:+1, 4:+2] {order = [2, 1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1]}>

From 8e8bb8817e0737bf21b1e3509d7044bb1b42dc06 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Sun, 15 Jun 2025 11:31:50 -0700
Subject: [PATCH 02/23] Support PaddedSharedEncodingAttr in LLVM lowering

---
 .../Conversion/TritonGPUToLLVM/Utility.h      |  6 ---
 .../TritonGPU/IR/LinearLayoutConversions.h    |  2 +
 .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td |  3 +-
 include/triton/Tools/LinearLayout.h           | 14 ++++++
 lib/Conversion/TritonGPUToLLVM/Utility.cpp    | 45 +++++++++++--------
 .../TritonGPU/IR/LinearLayoutConversions.cpp  | 24 +++++++---
 lib/Tools/LinearLayout.cpp                    | 16 ++++++-
 .../SharedToDotOperandWMMA.cpp                |  5 ++-
 .../lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp |  3 +-
 9 files changed, 84 insertions(+), 34 deletions(-)

diff --git a/include/triton/Conversion/TritonGPUToLLVM/Utility.h b/include/triton/Conversion/TritonGPUToLLVM/Utility.h
index 04ca702fc932..fc8ec9d3efd1 100644
--- a/include/triton/Conversion/TritonGPUToLLVM/Utility.h
+++ b/include/triton/Conversion/TritonGPUToLLVM/Utility.h
@@ -537,12 +537,6 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
     const TargetInfoBase &target,
     std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
 
-[[nodiscard]] bool emitTransferBetweenRegistersAndShared(
-    LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
-    std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
-    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback);
-
 [[nodiscard]] bool emitTransferBetweenRegistersAndShared(
     LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
     std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
diff --git a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
index e458d425be39..8bec87dd57be 100644
--- a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
+++ b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
@@ -49,6 +49,8 @@ class AMDMfmaEncodingAttr;
 // Returns std::nullopt if the given layout can't be converted to an LL.
 LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout);
 
+PaddedLayout toPaddedLayout(ArrayRef<int64_t> shape, Attribute layout);
+
 // Convert the shared encoding of a tensor with `nvmma_shared` layout to a
 // LinearLayout that maps from a linear shared memory offset to tensor index.
 //
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
index a8d3143312d4..a436cb4a0e61 100644
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -454,9 +454,10 @@ attributes too, for example,
   ];
 
   let extraClassDeclaration = extraBaseClassDeclaration#[{
+    unsigned getRank() const { return getOrder().size(); }
     int32_t getAlignment() const { return 16; }
 
-    unsigned getRank() const { return getOrder().size(); }
+    PaddedLayout toPaddedLayout(ArrayRef<int64_t> shape) const;
 
     SmallVector<unsigned> getCTAsPerCGA() const;
     SmallVector<unsigned> getCTAOrder() const;
diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h
index 30735db0a308..6359975c2f03 100644
--- a/include/triton/Tools/LinearLayout.h
+++ b/include/triton/Tools/LinearLayout.h
@@ -840,6 +840,20 @@ class ColumnAction {
   std::string toString() const;
 };
 
+class PaddedLayout {
+public:
+  PaddedLayout(LinearLayout linearMapping, ArrayRef<unsigned> intervals,
+               ArrayRef<unsigned> paddings);
+
+  const LinearLayout &getLinearMapping() const { return linearMapping; }
+
+  std::optional<int32_t> getMinInterval() const;
+
+private:
+  LinearLayout linearMapping;
+  SmallVector<std::pair<unsigned, unsigned>> intervalPads;
+};
+
 } // namespace mlir::triton
 
 #endif // TRITON_TOOLS_LINEARLAYOUT_H
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
index 6323bbff6047..24db4e9c5c33 100644
--- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -8,6 +8,7 @@
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/MathExtras.h"
 
 #if defined(_MSC_VER) && !defined(__clang__)
 // from https://gist.github.com/pps83/3210a2f980fd02bb2ba2e5a1fc4a2ef0
@@ -469,6 +470,17 @@ Value getSmemVecAddr(const LinearLayout &regLayout,
     Value baseToAllocBaseDist = dot(rewriter, loc, smemOffsets, smemStrides);
     smemOffset = b.sub(smemOffset, baseToAllocBaseDist);
   }
+  if (auto paddedLayout =
+          dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedEnc)) {
+    Value padOffset = b.i32_val(0);
+    for (auto [interval, padding] : llvm::zip_equal(
+             paddedLayout.getIntervals(), paddedLayout.getPaddings())) {
+      Value iVal = b.i32_val(llvm::Log2_32(interval));
+      Value pVal = b.i32_val(llvm::Log2_32(padding));
+      padOffset = b.add(padOffset, b.shl(b.ashr(smemOffset, iVal), pVal));
+    }
+    smemOffset = b.add(smemOffset, padOffset);
+  }
   auto ptrTy = smemBase.getType();
   auto vecAddr = b.gep(ptrTy, elemLlvmTy, smemBase, smemOffset,
                        LLVM::GEPNoWrapFlags::inbounds);
@@ -477,17 +489,6 @@ Value getSmemVecAddr(const LinearLayout &regLayout,
 
 } // namespace
 
-bool emitTransferBetweenRegistersAndShared(
-    LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
-    std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
-    Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
-    std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
-  auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
-  return emitTransferBetweenRegistersAndShared(
-      regLayout, sharedTy, elemLlvmTy, maxVecElems, smemObj, loc, rewriter,
-      target, laneId, warpId, perVectorCallback);
-}
-
 bool emitTransferBetweenRegistersAndShared(
     LinearLayout &regLayout, triton::gpu::MemDescType sharedTy, Type elemLlvmTy,
     std::optional<int32_t> maxVecElems, const SharedMemoryObject &smemObj,
@@ -503,9 +504,10 @@ bool emitTransferBetweenRegistersAndShared(
   StringAttr kWarp = str_attr("warp");
 
   auto shape = sharedTy.getShape();
-  LinearLayout sharedLayout =
-      triton::gpu::toLinearLayout(shape, sharedTy.getEncoding());
-  LinearLayout regToSharedLayout = regLayout.invertAndCompose(sharedLayout);
+  PaddedLayout sharedLayout =
+      triton::gpu::toPaddedLayout(shape, sharedTy.getEncoding());
+  LinearLayout regToSharedLayout =
+      regLayout.invertAndCompose(sharedLayout.getLinearMapping());
 
   // TODO(jlebar): We don't currently support loading from shared memory in a
   // different CTA.  We'd need to emit `mapa.shared::cluster` instructions.
@@ -530,9 +532,10 @@ bool emitTransferBetweenRegistersAndShared(
   //
   // It's OK if the vector width we choose here is wider than the hardware
   // supports; LLVM will legalize it.
-  const int vecElems =
-      std::min(regToSharedLayout.getNumConsecutiveInOut(),
-               maxVecElems.value_or(std::numeric_limits<int>::max()));
+  const int vecElems = std::min(
+      {regToSharedLayout.getNumConsecutiveInOut(),
+       sharedLayout.getMinInterval().value_or(std::numeric_limits<int>::max()),
+       maxVecElems.value_or(std::numeric_limits<int>::max())});
 
   auto withCTAOffset = triton::gpu::getNumCTAs(sharedTy.getEncoding()) > 1;
   Value blockId =
@@ -572,9 +575,10 @@ bool emitTransferBetweenRegistersAndShared(
     std::function<void(VectorType, Value /*shmemAddr*/)> perVectorCallback) {
   auto regLayout = triton::gpu::toLinearLayout(registerTy.getShape(),
                                                registerTy.getEncoding());
+  auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
   return emitTransferBetweenRegistersAndShared(
       regLayout, sharedTy, elemLlvmTy, maxVecElems, smemObj, loc, rewriter,
-      target, perVectorCallback);
+      target, laneId, warpId, perVectorCallback);
 }
 
 SmallVector<Value> loadSharedToDistributed(triton::gpu::LocalLoadOp localLoadOp,
@@ -762,10 +766,13 @@ bool isSimpleSharedMemoryAccess(ArrayRef<int64_t> shape,
                                 ArrayRef<int64_t> allocShape,
                                 triton::gpu::SharedEncodingTrait sharedEnc) {
   auto rank = shape.size();
+  auto paddedLayout =
+      dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedEnc);
   auto swizzledLayout =
       dyn_cast<triton::gpu::SwizzledSharedEncodingAttr>(sharedEnc);
   auto nvmmaLayout = dyn_cast<triton::gpu::NVMMASharedEncodingAttr>(sharedEnc);
-  bool noSwizzling = (swizzledLayout && swizzledLayout.getMaxPhase() == 1) ||
+  bool noSwizzling = paddedLayout ||
+                     (swizzledLayout && swizzledLayout.getMaxPhase() == 1) ||
                      (nvmmaLayout && nvmmaLayout.getSwizzlingByteWidth() == 0);
   return /*no swizzling*/ noSwizzling ||
          /*swizzling but same shape*/ shape == allocShape ||
diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
index 0ac56a8a78ef..0739496baa03 100644
--- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
+++ b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
@@ -1,7 +1,5 @@
 #include <vector>
 
-#include "triton/Dialect/Triton/IR/Dialect.h"
-#include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
@@ -11,13 +9,10 @@
 #include "triton/Tools/LinearLayout.h"
 #include "triton/Tools/StrUtil.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 
-using mlir::triton::ScaleDotElemType;
-
 namespace mlir::triton::gpu {
 namespace {
 
@@ -274,6 +269,15 @@ LinearLayout getCoreMatrixLinearLayout(NVMMASharedEncodingAttr shared,
 
 } // namespace
 
+PaddedLayout
+PaddedSharedEncodingAttr::toPaddedLayout(ArrayRef<int64_t> shape) const {
+  auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get(
+      getContext(), /*vec=*/1, /*perPhase=*/1, /*maxPhase=*/1, getOrder(),
+      getCTALayout());
+  LinearLayout ll = swizzledSharedToLinearLayout(shape, nonSwizzleAttr);
+  return PaddedLayout(ll, getIntervals(), getPaddings());
+}
+
 LinearLayout nvmmaSharedToLinearLayout(ArrayRef<int64_t> shape,
                                        NVMMASharedEncodingAttr shared,
                                        bool disableSwizzle) {
@@ -1125,6 +1129,16 @@ LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout) {
                                                                    layout);
 }
 
+PaddedLayout toPaddedLayout(ArrayRef<int64_t> shape, Attribute layout) {
+  auto *ctx = layout.getContext();
+  if (auto paddedLayout = dyn_cast<PaddedSharedEncodingAttr>(layout)) {
+    return paddedLayout.toPaddedLayout(shape);
+  }
+  auto ll =
+      ctx->getLoadedDialect<TritonGPUDialect>()->toLinearLayout(shape, layout);
+  return PaddedLayout(ll, /*intervals=*/{}, /*paddings=*/{});
+}
+
 LinearLayout getLayoutWithinBlock(const LinearLayout &layout) {
   assert(!layout.getInDimNames().empty());
   MLIRContext *ctx = layout.getInDimNames().begin()->getContext();
diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp
index eea4b0f6bf56..0958e3292e4a 100644
--- a/lib/Tools/LinearLayout.cpp
+++ b/lib/Tools/LinearLayout.cpp
@@ -8,7 +8,6 @@
 #include "third_party/f2reduce/f2reduce.h"
 #include "triton/Tools/LayoutUtils.h"
 #include "triton/Tools/StrUtil.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/StringRef.h"
@@ -1338,4 +1337,19 @@ std::string ColumnAction::toString() const {
   return ret;
 }
 
+PaddedLayout::PaddedLayout(LinearLayout linearMapping,
+                           ArrayRef<unsigned> intervals,
+                           ArrayRef<unsigned> paddings)
+    : linearMapping(std::move(linearMapping)) {
+  intervalPads.reserve(intervals.size());
+  for (auto [i, p] : llvm::zip_equal(intervals, paddings))
+    intervalPads.emplace_back(i, p);
+}
+
+std::optional<int32_t> PaddedLayout::getMinInterval() const {
+  if (intervalPads.empty())
+    return std::nullopt;
+  return *llvm::min_element(llvm::make_first_range(intervalPads));
+}
+
 } // namespace mlir::triton
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp
index 221d3b849d1b..77fc628b446e 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandWMMA.cpp
@@ -154,7 +154,10 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter,
 
   auto aTensorTy = cast<triton::gpu::MemDescType>(tensor.getType());
   ArrayRef<int64_t> shape = aTensorTy.getShape();
-  auto sharedLayout = cast<SwizzledSharedEncodingAttr>(aTensorTy.getEncoding());
+  auto sharedLayout =
+      dyn_cast<SwizzledSharedEncodingAttr>(aTensorTy.getEncoding());
+  if (!sharedLayout)
+    return Value();
   auto order = sharedLayout.getOrder();
 
   // Rely on the linear layout conversion logic in this case, since only slowest
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp
index dcc70aa64198..a0c4f2083b19 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp
@@ -271,10 +271,11 @@ struct TransLocalLoadOpConversion
     SmallVector<Value> outVals;
     SmallVector<Value> elemsI32;
     mlir::Type retTy = dstTy;
+    auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
     bool valid = emitTransferBetweenRegistersAndShared(
         ldsTransLayout, srcTy, llvmElemTy,
         /*maxVecElems=*/std::nullopt, smemObj, loc, rewriter, targetInfo,
-        [&](VectorType vecTy, Value vecAddr) {
+        laneId, warpId, [&](VectorType vecTy, Value vecAddr) {
           if (bitwidth == 16) {
             auto dsReadOp =
                 rewriter.create<ROCDL::ds_read_tr16_b64>(loc, vecTy, vecAddr);

From ddebf3cd01f5f97785ec78b5c86a34797fa3b09f Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Tue, 17 Jun 2025 23:47:21 +0000
Subject: [PATCH 03/23] Add new padded shared layout attr builder

---
 .../triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td   | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
index a436cb4a0e61..c5abb281930c 100644
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -450,7 +450,15 @@ attributes too, for example,
             paddings.push_back(padding);
         }
         return get(context, intervals, paddings, order, ctaLayout);
-    }]>,
+      }]>,
+       AttrBuilder<(ins "ArrayRef<int64_t>":$shape, "ArrayRef<unsigned>":$order,
+                       "unsigned":$dotKWidth, "unsigned":$elemBitWidth,
+                       "CTALayoutAttr":$ctaLayout),
+                   [{
+        unsigned inner = getShapePerCTA(ctaLayout.getCTASplitNum(), shape)[order[0]];
+        unsigned threadNumBytes = std::max(dotKWidth * elemBitWidth / 8u, 1u);
+        return get(context, {{inner, threadNumBytes}}, order, ctaLayout);
+      }]>,
   ];
 
   let extraClassDeclaration = extraBaseClassDeclaration#[{

From 03b802a6f5df66da068074053ae699c328f1842c Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Mon, 16 Jun 2025 03:18:37 +0000
Subject: [PATCH 04/23] Fix LLVM lowering issues

---
 include/triton/Tools/LinearLayout.h        |  2 ++
 lib/Conversion/TritonGPUToLLVM/Utility.cpp | 17 ++++++++++-------
 lib/Dialect/TritonGPU/IR/Dialect.cpp       | 11 ++++++-----
 lib/Tools/LinearLayout.cpp                 |  6 ++++++
 4 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h
index 6359975c2f03..d62107c2d071 100644
--- a/include/triton/Tools/LinearLayout.h
+++ b/include/triton/Tools/LinearLayout.h
@@ -849,6 +849,8 @@ class PaddedLayout {
 
   std::optional<int32_t> getMinInterval() const;
 
+  bool hasNoPadding() const;
+
 private:
   LinearLayout linearMapping;
   SmallVector<std::pair<unsigned, unsigned>> intervalPads;
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
index 24db4e9c5c33..4b180958f478 100644
--- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -504,10 +504,10 @@ bool emitTransferBetweenRegistersAndShared(
   StringAttr kWarp = str_attr("warp");
 
   auto shape = sharedTy.getShape();
-  PaddedLayout sharedLayout =
+  PaddedLayout paddedLayout =
       triton::gpu::toPaddedLayout(shape, sharedTy.getEncoding());
   LinearLayout regToSharedLayout =
-      regLayout.invertAndCompose(sharedLayout.getLinearMapping());
+      regLayout.invertAndCompose(paddedLayout.getLinearMapping());
 
   // TODO(jlebar): We don't currently support loading from shared memory in a
   // different CTA.  We'd need to emit `mapa.shared::cluster` instructions.
@@ -534,7 +534,7 @@ bool emitTransferBetweenRegistersAndShared(
   // supports; LLVM will legalize it.
   const int vecElems = std::min(
       {regToSharedLayout.getNumConsecutiveInOut(),
-       sharedLayout.getMinInterval().value_or(std::numeric_limits<int>::max()),
+       paddedLayout.getMinInterval().value_or(std::numeric_limits<int>::max()),
        maxVecElems.value_or(std::numeric_limits<int>::max())});
 
   auto withCTAOffset = triton::gpu::getNumCTAs(sharedTy.getEncoding()) > 1;
@@ -549,10 +549,13 @@ bool emitTransferBetweenRegistersAndShared(
   // take out the "block" dimension.
   // Thus we use `pseudoinvert` instead of `invert` here for simplicity.
   auto allocShape = sharedTy.getAllocShape();
-  LinearLayout invertAllocSharedLayout =
-      triton::gpu::toLinearLayout(allocShape.take_back(sharedTy.getRank()),
-                                  sharedTy.getEncoding())
-          .pseudoinvert();
+  auto invertAllocSharedLayout = LinearLayout::empty();
+  if (paddedLayout.hasNoPadding()) {
+    invertAllocSharedLayout =
+        triton::gpu::toLinearLayout(allocShape.take_back(sharedTy.getRank()),
+                                    sharedTy.getEncoding())
+            .pseudoinvert();
+  }
 
   int numElems = regToSharedLayout.getInDimSize(kRegister);
   auto vecTy = vec_ty(elemLlvmTy, vecElems);
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
index c85d1783fcd4..aceefd55078e 100644
--- a/lib/Dialect/TritonGPU/IR/Dialect.cpp
+++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -176,18 +176,19 @@ SmallVector<unsigned> getRepOrder(RankedTensorType type) {
 // This one's not terribly bad as we don't broadcast ShareEncodings
 SmallVector<unsigned> getOrder(SharedEncodingTrait layout,
                                ArrayRef<int64_t> shape) {
-  if (auto swizzledLayout =
-          mlir::dyn_cast<SwizzledSharedEncodingAttr>(layout)) {
+  if (auto swizzledLayout = dyn_cast<SwizzledSharedEncodingAttr>(layout)) {
     return llvm::to_vector(swizzledLayout.getOrder());
   }
-  if (auto sharedLayout = mlir::dyn_cast<NVMMASharedEncodingAttr>(layout)) {
+  if (auto paddedLayout = dyn_cast<PaddedSharedEncodingAttr>(layout)) {
+    return llvm::to_vector(paddedLayout.getOrder());
+  }
+  if (auto sharedLayout = dyn_cast<NVMMASharedEncodingAttr>(layout)) {
     if (shape.size() == 1) {
       return {0};
     }
     return getMatrixOrder(shape.size(), !sharedLayout.getTransposed());
   }
-  if (auto sharedLayout =
-          mlir::dyn_cast<AMDRotatingSharedEncodingAttr>(layout)) {
+  if (auto sharedLayout = dyn_cast<AMDRotatingSharedEncodingAttr>(layout)) {
     return llvm::to_vector(sharedLayout.getOrder());
   }
   llvm::report_fatal_error("Unimplemented usage of getOrder for MemDescType");
diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp
index 0958e3292e4a..214407c6d628 100644
--- a/lib/Tools/LinearLayout.cpp
+++ b/lib/Tools/LinearLayout.cpp
@@ -1352,4 +1352,10 @@ std::optional<int32_t> PaddedLayout::getMinInterval() const {
   return *llvm::min_element(llvm::make_first_range(intervalPads));
 }
 
+bool PaddedLayout::hasNoPadding() const {
+  return intervalPads.empty() ||
+         llvm::all_of(llvm::make_second_range(intervalPads),
+                      [](unsigned v) { return v == 0; });
+}
+
 } // namespace mlir::triton

From 3bdcc7eed39cd6ddb2d0d6c76566f23fd0081463 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Tue, 17 Jun 2025 18:27:36 +0000
Subject: [PATCH 05/23] Fix more llvm lowering issues

---
 .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td |  4 ++++
 lib/Analysis/Allocation.cpp                   | 16 +++++++++-----
 lib/Dialect/TritonGPU/IR/Dialect.cpp          | 22 ++++++++++++++++++-
 .../TritonGPU/IR/LinearLayoutConversions.cpp  |  9 --------
 4 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
index c5abb281930c..949b46b42751 100644
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -467,6 +467,10 @@ attributes too, for example,
 
     PaddedLayout toPaddedLayout(ArrayRef<int64_t> shape) const;
 
+    // Returns the total number of elements including padding given the input
+    // tensor shape.
+    int64_t getPaddedSize(ArrayRef<int64_t> shape) const;
+
     SmallVector<unsigned> getCTAsPerCGA() const;
     SmallVector<unsigned> getCTAOrder() const;
     SmallVector<unsigned> getCTASplitNum() const;
diff --git a/lib/Analysis/Allocation.cpp b/lib/Analysis/Allocation.cpp
index 7b897aaacc11..9dbe0c87108d 100644
--- a/lib/Analysis/Allocation.cpp
+++ b/lib/Analysis/Allocation.cpp
@@ -260,12 +260,18 @@ class AllocationAnalysis {
     auto alloc = dyn_cast<gpu::LocalAllocOp>(op);
     if (!alloc || !alloc.isSharedMemoryAlloc())
       return;
-    // Bytes could be a different value once we support padding or other
-    // allocation policies.
     auto allocType = alloc.getType();
-    auto shapePerCTA = gpu::getAllocationShapePerCTA(allocType);
-    auto bytes =
-        product<int64_t>(shapePerCTA) * allocType.getElementTypeBitWidth() / 8;
+    int64_t bytes = 0;
+    if (auto paddedLayout =
+            dyn_cast<gpu::PaddedSharedEncodingAttr>(allocType.getEncoding())) {
+      SmallVector<int64_t> unpaddedShape = gpu::getShapePerCTA(allocType);
+      bytes = paddedLayout.getPaddedSize(unpaddedShape) *
+              allocType.getElementTypeBitWidth() / 8;
+    } else {
+      auto shapePerCTA = gpu::getAllocationShapePerCTA(allocType);
+      bytes = product<int64_t>(shapePerCTA) *
+              allocType.getElementTypeBitWidth() / 8;
+    }
 
     auto alignment = alloc.getAlignmentOrDefault();
     allocation->addBuffer<BufferT::BufferKind::Explicit>(alloc, bytes,
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
index aceefd55078e..0c46393930e8 100644
--- a/lib/Dialect/TritonGPU/IR/Dialect.cpp
+++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -316,7 +316,7 @@ SmallVector<int64_t> getShapePerCTA(Attribute layout, ArrayRef<int64_t> shape) {
 SmallVector<int64_t> getAllocationShapePerCTA(Attribute layout,
                                               ArrayRef<int64_t> shapeLogical) {
   SmallVector<int64_t> shape(shapeLogical);
-  if (auto sharedMMALayout = mlir::dyn_cast<NVMMASharedEncodingAttr>(layout)) {
+  if (auto sharedMMALayout = dyn_cast<NVMMASharedEncodingAttr>(layout)) {
     if (sharedMMALayout.getFp4Padded()) {
       auto packedAxis = getOrder(sharedMMALayout, shapeLogical)[0];
       shape[packedAxis] *= 2;
@@ -1700,6 +1700,26 @@ LogicalResult PaddedSharedEncodingAttr::verify(
   return verifyLayoutOrder(emitError, order);
 }
 
+PaddedLayout
+PaddedSharedEncodingAttr::toPaddedLayout(ArrayRef<int64_t> shape) const {
+  auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get(
+      getContext(), /*vec=*/1, /*perPhase=*/1, /*maxPhase=*/1, getOrder(),
+      getCTALayout());
+  LinearLayout ll = toLinearLayout(shape, nonSwizzleAttr);
+  return PaddedLayout(ll, getIntervals(), getPaddings());
+}
+
+int64_t PaddedSharedEncodingAttr::getPaddedSize(ArrayRef<int64_t> shape) const {
+  int64_t unpaddedSize = product(shape);
+  int64_t paddingSize = 0;
+  for (auto [interval, padding] :
+       llvm::zip_equal(getIntervals(), getPaddings())) {
+    paddingSize += (unpaddedSize >> llvm::Log2_32(interval))
+                   << llvm::Log2_32(padding);
+  }
+  return unpaddedSize + paddingSize;
+}
+
 //===----------------------------------------------------------------------===//
 // NVMMAShared encoding
 //===----------------------------------------------------------------------===//
diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
index 0739496baa03..1102f2881ac2 100644
--- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
+++ b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
@@ -269,15 +269,6 @@ LinearLayout getCoreMatrixLinearLayout(NVMMASharedEncodingAttr shared,
 
 } // namespace
 
-PaddedLayout
-PaddedSharedEncodingAttr::toPaddedLayout(ArrayRef<int64_t> shape) const {
-  auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get(
-      getContext(), /*vec=*/1, /*perPhase=*/1, /*maxPhase=*/1, getOrder(),
-      getCTALayout());
-  LinearLayout ll = swizzledSharedToLinearLayout(shape, nonSwizzleAttr);
-  return PaddedLayout(ll, getIntervals(), getPaddings());
-}
-
 LinearLayout nvmmaSharedToLinearLayout(ArrayRef<int64_t> shape,
                                        NVMMASharedEncodingAttr shared,
                                        bool disableSwizzle) {

From ae67bba58372728e24773a3a214f0f88066a6765 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Tue, 17 Jun 2025 22:57:06 +0000
Subject: [PATCH 06/23] Add allocation tests

---
 test/Analysis/test-allocation.mlir | 63 ++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/test/Analysis/test-allocation.mlir b/test/Analysis/test-allocation.mlir
index 3400039ed352..27739e0e561d 100644
--- a/test/Analysis/test-allocation.mlir
+++ b/test/Analysis/test-allocation.mlir
@@ -21,6 +21,9 @@
 #NVMMA_SHARED_64 = #ttg.nvmma_shared<{swizzlingByteWidth = 64, transposed = false, elementBitWidth = 16}>
 #NVMMA_SHARED_128 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
 #NVMMA_SHARED_FP4PADDED = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 8, fp4Padded = true}>
+#PADDED_SHARED_0 = #ttg.padded_shared<[256:+8] {order = [1, 0]}>
+#PADDED_SHARED_1 = #ttg.padded_shared<[128:+4, 256:+8] {order = [1, 0]}>
+#PADDED_SHARED_2 = #ttg.padded_shared<[64:+2, 128:+4, 256:+8] {order = [1, 0]}>
 
 #smem = #ttg.shared_memory
 
@@ -937,4 +940,64 @@ tt.func @nvmma_alignment(%lb : index, %ub : index, %step : index, %A : !tt.ptr<f
   tt.return
 }
 
+
+// expected-remark @below {{padded_shared_layout_size}}
+// expected-remark @below {{size = 1058}}
+tt.func @padded_shared_layout_size() {
+  // expected-remark @+2 {{offset = 0, size = 510}}
+  // 255 * 2B = 510B
+  %alloc0 = ttg.local_alloc : () -> !ttg.memdesc<1x255xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
+  // expected-remark @+2 {{offset = 0, size = 528}}
+  // (256 + 8) * 2B = 528B
+  %alloc1 = ttg.local_alloc : () -> !ttg.memdesc<1x256xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
+  // expected-remark @+2 {{offset = 0, size = 530}}
+  // (257 + 8) * 2B = 530B
+  %alloc2 = ttg.local_alloc : () -> !ttg.memdesc<1x257xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
+  // expected-remark @+2 {{offset = 0, size = 1038}}
+  // (511 + 8) * 2B = 1038B
+  %alloc3 = ttg.local_alloc : () -> !ttg.memdesc<1x511xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
+  // expected-remark @+2 {{offset = 0, size = 1056}}
+  // (512 + 8 * 2) * 2B = 1056B
+  %alloc4 = ttg.local_alloc : () -> !ttg.memdesc<1x512xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
+  // expected-remark @+2 {{offset = 0, size = 1058}}
+  // (513 + 8 * 2) * 2B = 1058B
+  %alloc5 = ttg.local_alloc : () -> !ttg.memdesc<1x513xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
+  // expected-remark @+2 {{offset = 0, size = 528}}
+  // (16 * 16 + 8) * 2B = 528B
+  %alloc6 = ttg.local_alloc : () -> !ttg.memdesc<16x16xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
+  // expected-remark @+2 {{offset = 0, size = 1056}}
+  // (16 * 32 + 8 * 2) * 2B = 1056B
+  %alloc7 = ttg.local_alloc : () -> !ttg.memdesc<16x32xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
+  // expected-remark @+2 {{offset = 0, size = 1008}}
+  // (31 * 16 + 8) * 2B = 1008B
+  %alloc8 = ttg.local_alloc : () -> !ttg.memdesc<31x16xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
+  tt.return
+}
+
+// expected-remark @below {{padded_shared_layout_element_type}}
+// expected-remark @below {{size = 16896}}
+tt.func @padded_shared_layout_element_type() {
+  // expected-remark @+2 {{offset = 0, size = 4224}}
+  // (16 * 256 + 8 * 16) * 1B = 4224B
+  %alloc0 = ttg.local_alloc : () -> !ttg.memdesc<16x256xi8, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
+  // expected-remark @+2 {{offset = 0, size = 8448}}
+  // (16 * 256 + 8 * 16) * 2B = 8448B
+  %alloc1 = ttg.local_alloc : () -> !ttg.memdesc<16x256xf16, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
+  // expected-remark @+2 {{offset = 0, size = 16896}}
+  // (16 * 256 + 8 * 16) * 4B = 16896B
+  %alloc2 = ttg.local_alloc : () -> !ttg.memdesc<16x256xf32, #PADDED_SHARED_0, #ttg.shared_memory, mutable>
+  tt.return
+}
+
+// expected-remark @below {{padded_shared_layout_multi_tier}}
+// expected-remark @below {{size = 4480}}
+tt.func @padded_shared_layout_multi_tier() {
+  // expected-remark @+2 {{offset = 0, size = 4352}}
+  // (16 * 256 + 4 * 32 + 8 * 16) * 1B = 4352B
+  %alloc0 = ttg.local_alloc : () -> !ttg.memdesc<16x256xi8, #PADDED_SHARED_1, #ttg.shared_memory, mutable>
+  // expected-remark @+2 {{offset = 0, size = 4480}}
+  // (16 * 256 + 2 * 64 + 4 * 32 + 8 * 16) * 1B = 4480B
+  %alloc1 = ttg.local_alloc : () -> !ttg.memdesc<16x256xi8, #PADDED_SHARED_2, #ttg.shared_memory, mutable>
+  tt.return
+}
 }

From fbb041e559c6fc85c34cb07fcf161a7f1e88fa1c Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Wed, 18 Jun 2025 00:46:30 +0000
Subject: [PATCH 07/23] Fix a bunch of small issues

---
 .../TritonGPU/IR/LinearLayoutConversions.h    |  7 ++--
 .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 18 +++++-----
 include/triton/Tools/LinearLayout.h           | 26 ++++++++++----
 lib/Analysis/Allocation.cpp                   |  9 +++--
 lib/Conversion/TritonGPUToLLVM/Utility.cpp    | 36 +++++++++++--------
 lib/Dialect/TritonGPU/IR/Dialect.cpp          |  6 ++--
 .../TritonGPU/IR/LinearLayoutConversions.cpp  | 14 ++++----
 lib/Tools/LinearLayout.cpp                    | 16 ++++-----
 8 files changed, 74 insertions(+), 58 deletions(-)

diff --git a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
index 8bec87dd57be..d7ef07bc5bff 100644
--- a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
+++ b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
@@ -49,8 +49,6 @@ class AMDMfmaEncodingAttr;
 // Returns std::nullopt if the given layout can't be converted to an LL.
 LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout);
 
-PaddedLayout toPaddedLayout(ArrayRef<int64_t> shape, Attribute layout);
-
 // Convert the shared encoding of a tensor with `nvmma_shared` layout to a
 // LinearLayout that maps from a linear shared memory offset to tensor index.
 //
@@ -300,5 +298,10 @@ LinearLayout nvidiaMmaTile(MLIRContext *ctx, ArrayRef<unsigned> tileShape,
 // the two can be done using transferWithinWarp, without involving LDS
 std::optional<LinearLayout> chooseMfmaLikeStoreLayout(RankedTensorType valType);
 
+// Convert the given layout to a linear layout with potential additional
+// physical memory paddings.
+PaddedLinearLayout toPaddedLinearLayout(ArrayRef<int64_t> shape,
+                                        Attribute layout);
+
 } // namespace mlir::triton::gpu
 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
index 949b46b42751..335ced1f857d 100644
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -373,7 +373,7 @@ When vec=2, elements are swizzled in pairs of 2.  In other words, the element at
     }]>,
   ];
 
-  let extraClassDeclaration = extraBaseClassDeclaration#[{
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
     unsigned getRank() const { return getCTAOrder().size(); }
     int32_t getAlignment() const;
     SmallVector<unsigned> getCTAsPerCGA() const;
@@ -399,23 +399,23 @@ shared memory bank conflicts.
 Formally, given a layout:
     padded_shared<[<interval_0>:+<pad_0>, <interval_1>:+<pad_1>, ...]>
 We insert a padding of `<pad_i>` elements after every `<interval_i>` elements.
-Multi interval-pad pairs are supported for flexibility of multi tiered padding
-schemes; they compose in an additive manner. So for a 1-D tensor element at
-index i, the corresponding shared memory location index is
+Multi interval-padding pairs are supported for flexibility of multi tiered
+padding schemes; they compose in an additive manner. So for a 1-D tensor element
+at index i, the corresponding shared memory location index is
     i + \sum_{k} (i / interval_k) * pad_k = 1
 `<interval_i>` and `<pad_i>` all need to be power of two.
 
 Some concrete examples, using `eM` to mean tensor elements and `pN` to mean
 padding:
 
-1. Single interval-pad pair:
+1. Single interval-padding pair:
 
    #ttg.padded_shared<[2:+2]>
    [e0, e1, p0, p1,
     e2, e3, p2, p3,
     ...]
 
-2. Double interval-pad pairs:
+2. Double interval-padding pairs:
 
    #ttg.padded_shared<[2:+1, 4:+2]>
    [e0, e1, p0,
@@ -424,7 +424,7 @@ padding:
     e6, e7, p5, p6, p7,
     ...]
 
-In addition to interval-pad pairs, this encoding requires an `order` to
+In addition to interval-padding pairs, this encoding requires an `order` to
 specify the logical tensor dimenions from the fastest-to slowest-varying.
 It may optionally support CGA level organization like other encoding
 attributes too, for example,
@@ -461,11 +461,11 @@ attributes too, for example,
       }]>,
   ];
 
-  let extraClassDeclaration = extraBaseClassDeclaration#[{
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
     unsigned getRank() const { return getOrder().size(); }
     int32_t getAlignment() const { return 16; }
 
-    PaddedLayout toPaddedLayout(ArrayRef<int64_t> shape) const;
+    PaddedLinearLayout toPaddedLinearLayout(ArrayRef<int64_t> shape) const;
 
     // Returns the total number of elements including padding given the input
     // tensor shape.
diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h
index d62107c2d071..cdae51375083 100644
--- a/include/triton/Tools/LinearLayout.h
+++ b/include/triton/Tools/LinearLayout.h
@@ -840,19 +840,33 @@ class ColumnAction {
   std::string toString() const;
 };
 
-class PaddedLayout {
+// A utility class to describe a particular padding schema with 1) a linear
+// layout to describe the N-D logical element mapping and 2) a list of
+// interval-padding pairs to describe the 1-D physical shared memory padding
+// schedules. In the degenerate case, we can have no interval-padding pairs so
+// it will just be a normal linear layout.
+//
+// In Triton we use linear layout basically throughout; so this class is useful
+// to leverage common linear layout facilities and code paths as much as
+// possible, while factoring in shared memory padding wherever necessary--only
+// at the very final steps when we are allocating the physical shared memory or
+// creating pointers indexing into them. All steps before can still reason with
+// linear layout. Thus this utility class keeps these two parts separate.
+class PaddedLinearLayout {
 public:
-  PaddedLayout(LinearLayout linearMapping, ArrayRef<unsigned> intervals,
-               ArrayRef<unsigned> paddings);
+  PaddedLinearLayout(LinearLayout linear, ArrayRef<unsigned> intervals,
+                     ArrayRef<unsigned> paddings);
 
-  const LinearLayout &getLinearMapping() const { return linearMapping; }
+  const LinearLayout &getLinear() const { return linear; }
 
+  // Returns the minimal interval that would trigger padding.
   std::optional<int32_t> getMinInterval() const;
 
-  bool hasNoPadding() const;
+  // Returns true if this is not a degenerate case and indeed requires padding.
+  bool hasPadding() const;
 
 private:
-  LinearLayout linearMapping;
+  LinearLayout linear;
   SmallVector<std::pair<unsigned, unsigned>> intervalPads;
 };
 
diff --git a/lib/Analysis/Allocation.cpp b/lib/Analysis/Allocation.cpp
index 9dbe0c87108d..9325e2309713 100644
--- a/lib/Analysis/Allocation.cpp
+++ b/lib/Analysis/Allocation.cpp
@@ -261,17 +261,16 @@ class AllocationAnalysis {
     if (!alloc || !alloc.isSharedMemoryAlloc())
       return;
     auto allocType = alloc.getType();
-    int64_t bytes = 0;
+    int64_t numElems = 0;
     if (auto paddedLayout =
             dyn_cast<gpu::PaddedSharedEncodingAttr>(allocType.getEncoding())) {
       SmallVector<int64_t> unpaddedShape = gpu::getShapePerCTA(allocType);
-      bytes = paddedLayout.getPaddedSize(unpaddedShape) *
-              allocType.getElementTypeBitWidth() / 8;
+      numElems = paddedLayout.getPaddedSize(unpaddedShape);
     } else {
       auto shapePerCTA = gpu::getAllocationShapePerCTA(allocType);
-      bytes = product<int64_t>(shapePerCTA) *
-              allocType.getElementTypeBitWidth() / 8;
+      numElems = product<int64_t>(shapePerCTA);
     }
+    int64_t bytes = numElems * allocType.getElementTypeBitWidth() / 8;
 
     auto alignment = alloc.getAlignmentOrDefault();
     allocation->addBuffer<BufferT::BufferKind::Explicit>(alloc, bytes,
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
index 4b180958f478..451080607efb 100644
--- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -397,6 +397,10 @@ Value getSmemVecAddr(const LinearLayout &regLayout,
   // We propose case 2 (see comments below), which provides a more general
   // solution for all swizzled shared memory scenarios, including the edge case
   // mentioned above.
+  //
+  // Padded shared layout falls into case 1--we can rely on the logic for case 1
+  // to get the 1-D offset into shared memory. Then we just need to add the
+  // padding offset.
   if (isSimpleSharedMemoryAccess(shape, allocShape, sharedEnc)) { // Case 1
     smemOffset = applyLinearLayout(loc, rewriter, regToSharedLayout,
                                    {{kRegister, regId},
@@ -425,6 +429,18 @@ Value getSmemVecAddr(const LinearLayout &regLayout,
       smemOffset = dot(rewriter, loc, smemOffsets,
                        applyPermutation(smemStrides, smemOrder));
     }
+    if (auto paddedLayout =
+            dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedEnc)) {
+      // Apply the offset needed for padding.
+      Value padOffset = b.i32_val(0);
+      for (auto [interval, padding] : llvm::zip_equal(
+               paddedLayout.getIntervals(), paddedLayout.getPaddings())) {
+        Value iVal = b.i32_val(llvm::Log2_32(interval));
+        Value pVal = b.i32_val(llvm::Log2_32(padding));
+        padOffset = b.add(padOffset, b.shl(b.ashr(smemOffset, iVal), pVal));
+      }
+      smemOffset = b.add(smemOffset, padOffset);
+    }
   } else { // Case 2 -> rank-reduced swizzling
     assert(rank >= 2 && "Swizzling only applies to tensors with rank >= 2");
     assert((isa<triton::gpu::SwizzledSharedEncodingAttr,
@@ -470,17 +486,6 @@ Value getSmemVecAddr(const LinearLayout &regLayout,
     Value baseToAllocBaseDist = dot(rewriter, loc, smemOffsets, smemStrides);
     smemOffset = b.sub(smemOffset, baseToAllocBaseDist);
   }
-  if (auto paddedLayout =
-          dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedEnc)) {
-    Value padOffset = b.i32_val(0);
-    for (auto [interval, padding] : llvm::zip_equal(
-             paddedLayout.getIntervals(), paddedLayout.getPaddings())) {
-      Value iVal = b.i32_val(llvm::Log2_32(interval));
-      Value pVal = b.i32_val(llvm::Log2_32(padding));
-      padOffset = b.add(padOffset, b.shl(b.ashr(smemOffset, iVal), pVal));
-    }
-    smemOffset = b.add(smemOffset, padOffset);
-  }
   auto ptrTy = smemBase.getType();
   auto vecAddr = b.gep(ptrTy, elemLlvmTy, smemBase, smemOffset,
                        LLVM::GEPNoWrapFlags::inbounds);
@@ -504,10 +509,10 @@ bool emitTransferBetweenRegistersAndShared(
   StringAttr kWarp = str_attr("warp");
 
   auto shape = sharedTy.getShape();
-  PaddedLayout paddedLayout =
-      triton::gpu::toPaddedLayout(shape, sharedTy.getEncoding());
+  PaddedLinearLayout paddedLayout =
+      triton::gpu::toPaddedLinearLayout(shape, sharedTy.getEncoding());
   LinearLayout regToSharedLayout =
-      regLayout.invertAndCompose(paddedLayout.getLinearMapping());
+      regLayout.invertAndCompose(paddedLayout.getLinear());
 
   // TODO(jlebar): We don't currently support loading from shared memory in a
   // different CTA.  We'd need to emit `mapa.shared::cluster` instructions.
@@ -550,7 +555,8 @@ bool emitTransferBetweenRegistersAndShared(
   // Thus we use `pseudoinvert` instead of `invert` here for simplicity.
   auto allocShape = sharedTy.getAllocShape();
   auto invertAllocSharedLayout = LinearLayout::empty();
-  if (paddedLayout.hasNoPadding()) {
+  if (!paddedLayout.hasPadding()) {
+    // For now this is only needed for the cases where we have swizzling.
     invertAllocSharedLayout =
         triton::gpu::toLinearLayout(allocShape.take_back(sharedTy.getRank()),
                                     sharedTy.getEncoding())
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
index 0c46393930e8..592483d5885f 100644
--- a/lib/Dialect/TritonGPU/IR/Dialect.cpp
+++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -1700,13 +1700,13 @@ LogicalResult PaddedSharedEncodingAttr::verify(
   return verifyLayoutOrder(emitError, order);
 }
 
-PaddedLayout
-PaddedSharedEncodingAttr::toPaddedLayout(ArrayRef<int64_t> shape) const {
+PaddedLinearLayout
+PaddedSharedEncodingAttr::toPaddedLinearLayout(ArrayRef<int64_t> shape) const {
   auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get(
       getContext(), /*vec=*/1, /*perPhase=*/1, /*maxPhase=*/1, getOrder(),
       getCTALayout());
   LinearLayout ll = toLinearLayout(shape, nonSwizzleAttr);
-  return PaddedLayout(ll, getIntervals(), getPaddings());
+  return PaddedLinearLayout(ll, getIntervals(), getPaddings());
 }
 
 int64_t PaddedSharedEncodingAttr::getPaddedSize(ArrayRef<int64_t> shape) const {
diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
index 1102f2881ac2..38d0a11effbb 100644
--- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
+++ b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
@@ -1120,14 +1120,12 @@ LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout) {
                                                                    layout);
 }
 
-PaddedLayout toPaddedLayout(ArrayRef<int64_t> shape, Attribute layout) {
-  auto *ctx = layout.getContext();
-  if (auto paddedLayout = dyn_cast<PaddedSharedEncodingAttr>(layout)) {
-    return paddedLayout.toPaddedLayout(shape);
-  }
-  auto ll =
-      ctx->getLoadedDialect<TritonGPUDialect>()->toLinearLayout(shape, layout);
-  return PaddedLayout(ll, /*intervals=*/{}, /*paddings=*/{});
+PaddedLinearLayout toPaddedLinearLayout(ArrayRef<int64_t> shape,
+                                        Attribute layout) {
+  if (auto paddedLayout = dyn_cast<PaddedSharedEncodingAttr>(layout))
+    return paddedLayout.toPaddedLinearLayout(shape);
+  auto ll = toLinearLayout(shape, layout);
+  return PaddedLinearLayout(ll, /*intervals=*/{}, /*paddings=*/{});
 }
 
 LinearLayout getLayoutWithinBlock(const LinearLayout &layout) {
diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp
index 214407c6d628..35ae36cfa98e 100644
--- a/lib/Tools/LinearLayout.cpp
+++ b/lib/Tools/LinearLayout.cpp
@@ -1337,25 +1337,21 @@ std::string ColumnAction::toString() const {
   return ret;
 }
 
-PaddedLayout::PaddedLayout(LinearLayout linearMapping,
-                           ArrayRef<unsigned> intervals,
-                           ArrayRef<unsigned> paddings)
-    : linearMapping(std::move(linearMapping)) {
+PaddedLinearLayout::PaddedLinearLayout(LinearLayout linear,
+                                       ArrayRef<unsigned> intervals,
+                                       ArrayRef<unsigned> paddings)
+    : linear(std::move(linear)) {
   intervalPads.reserve(intervals.size());
   for (auto [i, p] : llvm::zip_equal(intervals, paddings))
     intervalPads.emplace_back(i, p);
 }
 
-std::optional<int32_t> PaddedLayout::getMinInterval() const {
+std::optional<int32_t> PaddedLinearLayout::getMinInterval() const {
   if (intervalPads.empty())
     return std::nullopt;
   return *llvm::min_element(llvm::make_first_range(intervalPads));
 }
 
-bool PaddedLayout::hasNoPadding() const {
-  return intervalPads.empty() ||
-         llvm::all_of(llvm::make_second_range(intervalPads),
-                      [](unsigned v) { return v == 0; });
-}
+bool PaddedLinearLayout::hasPadding() const { return !intervalPads.empty(); }
 
 } // namespace mlir::triton

From d59edb862999d060b2cb30da794c152f3fa2fd41 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Wed, 18 Jun 2025 03:21:34 +0000
Subject: [PATCH 08/23] Add linear layout conversion test

---
 .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td |  7 ++++--
 .../TritonGPU/LinearLayoutConversionsTest.cpp | 25 +++++++++++++++++++
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
index 335ced1f857d..0a390b398957 100644
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -433,10 +433,13 @@ attributes too, for example,
         CTAOrder = [0, 1]}>
   }];
 
-  let parameters = (ins ArrayRefParameter<"unsigned">:$intervals,
+  let parameters = (ins
+      ArrayRefParameter<"unsigned">:$intervals,
       ArrayRefParameter<"unsigned">:$paddings,
       // Order of logical tensor dimensions; fastest-varying first.
-      ArrayRefParameter<"unsigned">:$order, "CTALayoutAttr":$CTALayout);
+      ArrayRefParameter<"unsigned">:$order,
+      "CTALayoutAttr":$CTALayout
+  );
 
   let builders =
       [AttrBuilder<(ins "ArrayRef<std::pair<unsigned, unsigned>>":$intervalPads,
diff --git a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
index dbab14c1091e..a369feb665f2 100644
--- a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
+++ b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
@@ -3,6 +3,7 @@
 #include "mlir/IR/MLIRContext.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "triton/Tools/LinearLayout.h"
 #include "triton/Tools/StrUtil.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/Signals.h"
@@ -99,6 +100,15 @@ class LinearLayoutConversionsTest : public ::testing::Test {
         CTALayoutAttr::get(&ctx, cpg, cSplit, cOrd));
   }
 
+  PaddedSharedEncodingAttr
+  paddedShared(ArrayRef<unsigned> intervals, ArrayRef<unsigned> paddings,
+               ArrayRef<unsigned> ord, ArrayRef<unsigned> cpg,
+               ArrayRef<unsigned> cSplit, ArrayRef<unsigned> cOrd) {
+    return PaddedSharedEncodingAttr::get(
+        &ctx, intervals, paddings, ord,
+        CTALayoutAttr::get(&ctx, cpg, cSplit, cOrd));
+  }
+
   NVMMASharedEncodingAttr
   nvmmaShared(unsigned swizzleSizeInBytes, bool transposed,
               unsigned elementBitWidth, ArrayRef<unsigned> cpg,
@@ -3002,6 +3012,21 @@ TEST_F(LinearLayoutConversionsTest, MMAv5Fp4Padded) {
                     {S("dim0"), S("dim1")}));
 }
 
+TEST_F(LinearLayoutConversionsTest, PaddedShared) {
+  PaddedLinearLayout pll =
+      toPaddedLinearLayout({32, 64}, paddedShared({128, 256}, {4, 8}, {1, 0},
+                                                  {1, 1}, {1, 1}, {1, 0}));
+  // The expected linear layout mapping part should just be an identity.
+  auto expectedLL = LinearLayout(
+      // clang-format off
+      {{S("offset"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {0, 32},
+                     {1, 0}, {2, 0}, {4, 0}, {8, 0}, {16, 0}}},
+       {S("block"), {}}},
+      {S("dim0"), S("dim1")});
+  // clang-format on
+  EXPECT_EQ(pll.getLinear(), expectedLL);
+}
+
 } // anonymous namespace
 } // namespace mlir::triton::gpu
 

From b62287084cc2c8f272e551082cc439fa4a66ef15 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Sun, 15 Jun 2025 16:37:42 -0700
Subject: [PATCH 09/23] Wire up StreamPipeline usage

---
 include/triton/Tools/Sys/GetEnv.hpp           |  1 +
 python/src/passes.h                           |  6 +++
 python/triton/knobs.py                        |  1 +
 third_party/amd/backend/compiler.py           |  4 +-
 .../include/TritonAMDGPUTransforms/Passes.td  | 25 +++++----
 .../TritonAMDGPUTransforms/StreamPipeline.cpp | 51 +++++++++++--------
 third_party/amd/python/triton_amd.cc          |  4 +-
 7 files changed, 56 insertions(+), 36 deletions(-)

diff --git a/include/triton/Tools/Sys/GetEnv.hpp b/include/triton/Tools/Sys/GetEnv.hpp
index 113126787c28..d54faf3a4da6 100644
--- a/include/triton/Tools/Sys/GetEnv.hpp
+++ b/include/triton/Tools/Sys/GetEnv.hpp
@@ -37,6 +37,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_HIP_USE_ASYNC_COPY",
     "TRITON_HIP_USE_BLOCK_PINGPONG",
     "TRITON_HIP_USE_IN_THREAD_TRANSPOSE",
+    "TRITON_HIP_USE_PADDED_SHARED_LAYOUT",
     "TRITON_LLVM_DEBUG_ONLY",
     "TRITON_ENABLE_ASAN",
     "TRITON_OVERRIDE_ARCH",
diff --git a/python/src/passes.h b/python/src/passes.h
index 629fe362d8b2..66fdf0df1d1c 100644
--- a/python/src/passes.h
+++ b/python/src/passes.h
@@ -36,3 +36,9 @@
 #define ADD_PASS_OPTION_WRAPPER_4(name, builder, ty0, ty1, ty2, ty3)           \
   m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2,          \
                  ty3 val3) { pm.addPass(builder({val0, val1, val2, val3})); })
+
+#define ADD_PASS_OPTION_WRAPPER_5(name, builder, ty0, ty1, ty2, ty3, ty4)      \
+  m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2,          \
+                 ty3 val3, ty4 val4) {                                         \
+    pm.addPass(builder({val0, val1, val2, val3, val4}));                       \
+  })
diff --git a/python/triton/knobs.py b/python/triton/knobs.py
index b3c70c7174c0..e82fbc93535a 100644
--- a/python/triton/knobs.py
+++ b/python/triton/knobs.py
@@ -446,6 +446,7 @@ class amd_knobs(base_knobs):
     global_prefetch: env_int = env_int("TRITON_HIP_GLOBAL_PREFETCH")
     local_prefetch: env_int = env_int("TRITON_HIP_LOCAL_PREFETCH")
     use_async_copy: env_bool = env_bool("TRITON_HIP_USE_ASYNC_COPY")
+    use_padded_shared_layout : env_bool = env_bool("TRITON_HIP_USE_PADDED_SHARED_LAYOUT")
     scalarize_packed_fops: env_bool = env_bool("AMDGCN_SCALARIZE_PACKED_FOPS")
 
 
diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py
index bff54c2604bd..6269adccfd72 100644
--- a/third_party/amd/backend/compiler.py
+++ b/third_party/amd/backend/compiler.py
@@ -237,12 +237,14 @@ def make_ttgir(mod, metadata, options):
         global_prefetch = knobs.amd.global_prefetch
         local_prefetch = knobs.amd.local_prefetch
         use_async_copy = knobs.amd.use_async_copy
+        use_padded_shared_layout = knobs.amd.use_padded_shared_layout
 
         # The `local-prefetch` scheduling variant requires turning on buffer ops.
         if options.schedule_hint == "local-prefetch":
             global_prefetch = local_prefetch = 1
 
-        amd.passes.ttgpuir.add_stream_pipeline(pm, options.num_stages, global_prefetch, local_prefetch, use_async_copy)
+        amd.passes.ttgpuir.add_stream_pipeline(pm, options.num_stages, global_prefetch, local_prefetch,
+                                               use_async_copy, use_padded_shared_layout)
         if use_async_copy:
             amd.passes.ttgpuir.add_coalesce_async_copy(pm, options.arch)
         passes.common.add_canonicalizer(pm)
diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
index 0d88e56c2e23..e9479a02cc92 100644
--- a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
+++ b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
@@ -13,19 +13,18 @@ def TritonAMDGPUStreamPipeline : Pass<"tritonamdgpu-stream-pipeline", "mlir::Mod
 
   let dependentDialects = ["mlir::triton::amdgpu::TritonAMDGPUDialect"];
 
-  let options = [
-    Option<"numStages", "num_stages",
-           "int32_t", /*default*/"2",
-           "Number of Pipeline stages">,
-    Option<"globalPrefetch", "global_prefetch",
-           "int32_t", /*default*/"0",
-           "Set global prefetch stage count">,
-    Option<"localPrefetch", "local_prefetch",
-           "int32_t", /*default*/"0",
-           "Set local prefetch stage count">,
-    Option<"useAsyncCopy", "use_async_copy",
-           "bool", /*default*/"false",
-           "Use AsyncCopyGlobalToLocal to directly load to shared memory">,
+  let options =
+      [Option<"numStages", "num_stages", "int32_t", /*default*/ "2",
+              "Number of Pipeline stages">,
+       Option<"globalPrefetch", "global_prefetch", "int32_t", /*default*/ "0",
+              "Set global prefetch stage count">,
+       Option<"localPrefetch", "local_prefetch", "int32_t", /*default*/ "0",
+              "Set local prefetch stage count">,
+       Option<"useAsyncCopy", "use_async_copy", "bool", /*default*/ "false",
+              "Use AsyncCopyGlobalToLocal to directly load to shared memory">,
+       Option<"usePaddedSharedLayout", "use_padded_shared_layout", "bool",
+              /*default*/ "false",
+              "Use padded shared layout for shared memory">,
   ];
 }
 
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp
index 0c9af9384958..f8ce51a6a595 100644
--- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp
@@ -8,6 +8,7 @@
 #include "triton/Dialect/Triton/IR/OpInterfaces.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipelineExpander.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
 #include "triton/Dialect/TritonGPU/Transforms/Schedule.h"
@@ -122,7 +123,7 @@ enum SchedType {
 
 struct LoadInfo {
   // Shared layout is used for loads feeding into dot ops.
-  ttg::SwizzledSharedEncodingAttr sharedEncoding = nullptr;
+  ttg::SharedEncodingTrait sharedEncoding = nullptr;
   // The distance of this load's stage to its use' stage.
   int distToUse = 0;
   Operation *use = nullptr;
@@ -407,22 +408,22 @@ static ttg::AMDMfmaEncodingAttr getDotEncoding(Value inputValue,
 // If all the transitive uses of the given value have are used by a convert to
 // the same dot operand encoding, return true and get the shared encoding that
 // needs to be used to be compatible with users' layouts.
-static std::optional<ttg::SwizzledSharedEncodingAttr>
-getSharedEncIfAllUsersAreDotEnc(Value loadedValue) {
-  ttg::SwizzledSharedEncodingAttr attr;
+static std::optional<ttg::SharedEncodingTrait>
+getSharedEncIfAllUsersAreDotEnc(bool usePaddedLayout, Value loadedValue) {
+  ttg::SharedEncodingTrait attr;
   for (Operation *user : loadedValue.getUsers()) {
     LDBG(" getSharedEncIfAllUsersAreDotEnc current user: " << *user);
     if (user->getNumResults() != 1)
       return std::nullopt;
 
-    ttg::SwizzledSharedEncodingAttr tempAttr;
+    ttg::SharedEncodingTrait tempAttr;
     Value userResult = user->getResult(0);
     Type userResType = userResult.getType();
     if (auto memDesc = dyn_cast<ttg::MemDescType>(userResType)) {
       // First time we find a shared encoding in the chain, save it and try to
       // use it if it is compatible with the other users.
-      tempAttr = cast<ttg::SwizzledSharedEncodingAttr>(memDesc.getEncoding());
-      if (!getSharedEncIfAllUsersAreDotEnc(userResult).has_value())
+      tempAttr = cast<ttg::SharedEncodingTrait>(memDesc.getEncoding());
+      if (!getSharedEncIfAllUsersAreDotEnc(usePaddedLayout, userResult))
         return std::nullopt;
     } else {
       if (!isa<ttg::LocalLoadOp, ttg::ConvertLayoutOp>(user))
@@ -449,9 +450,15 @@ getSharedEncIfAllUsersAreDotEnc(Value loadedValue) {
 
       auto userResEnc = cast<ttg::TensorOrMemDesc>(userResType).getEncoding();
       if (auto dotOpEnc = dyn_cast<ttg::DotOperandEncodingAttr>(userResEnc)) {
-        tempAttr = ttg::SwizzledSharedEncodingAttr::get(
-            loadedValue.getContext(), dotOpEnc, srcTy.getShape(), sharedOrder,
-            ctaLayout, bitWidth, /*needTrans=*/false);
+        if (usePaddedLayout) {
+          tempAttr = ttg::PaddedSharedEncodingAttr::get(
+              loadedValue.getContext(), srcTy.getShape(), sharedOrder,
+              dotOpEnc.getKWidth(), bitWidth, ctaLayout);
+        } else {
+          tempAttr = ttg::SwizzledSharedEncodingAttr::get(
+              loadedValue.getContext(), dotOpEnc, srcTy.getShape(), sharedOrder,
+              ctaLayout, bitWidth, /*needTrans=*/false);
+        }
       } else if (auto llEnc = dyn_cast<ttg::LinearEncodingAttr>(userResEnc)) {
         // We use linear layout directly for scaled dot fp8 operands. For such
         // cases, we need to look further down the def-use chain to find the dot
@@ -481,7 +488,8 @@ getSharedEncIfAllUsersAreDotEnc(Value loadedValue) {
 // "1" for the load op used by the load op used by the dot op, and so on.
 FailureOr<llvm::MapVector<Operation *, LoadInfo>>
 findPipelineableLoads(scf::ForOp forOp,
-                      tt::ModuleAxisInfoAnalysis &axisInfoAnalysis) {
+                      tt::ModuleAxisInfoAnalysis &axisInfoAnalysis,
+                      bool usePaddedLayout) {
   llvm::MapVector<Operation *, LoadInfo> loadToInfo;
   DenseSet<Operation *> seen;
   // Recursively visit the given op and its operands to discover all load ops
@@ -503,7 +511,7 @@ findPipelineableLoads(scf::ForOp forOp,
                  "Block ptr should have been lowered before this pass.");
           auto ptr = loadOp.getPtr();
           if (auto tensorTy = dyn_cast<RankedTensorType>(ptr.getType())) {
-            ttg::SwizzledSharedEncodingAttr sharedEncoding = nullptr;
+            ttg::SharedEncodingTrait sharedEncoding = nullptr;
             // Store memory layouts if possible.
             if (isa<tt::DotOpInterface>(use)) {
               unsigned vecContiguity = axisInfoAnalysis.getContiguity(ptr);
@@ -518,9 +526,9 @@ findPipelineableLoads(scf::ForOp forOp,
               // Limit shared memory sharing to width >= 32 elements.
               LDBG("Load " << *loadOp << " has width " << width);
               if (width >= 32) {
-                sharedEncoding =
-                    getSharedEncIfAllUsersAreDotEnc(op->getResult(0))
-                        .value_or(nullptr);
+                sharedEncoding = getSharedEncIfAllUsersAreDotEnc(
+                                     usePaddedLayout, op->getResult(0))
+                                     .value_or(nullptr);
               } else if (isaFamily != triton::AMD::ISAFamily::CDNA4) {
                 LDBG("Skip width<32 load " << loadOp << " for arch " << arch);
                 return;
@@ -795,6 +803,7 @@ SmallVector<std::pair<Operation *, Value>> createAndScheduleStreamOps(
 LogicalResult preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages,
                                              int stages[SCHED_SIZE],
                                              bool useAsyncCopy,
+                                             bool usePaddedLayout,
                                              tt::PipeliningOption &options) {
   triton::AMD::ModuleAxisInfoAnalysis axisInfoAnalysis(
       forOp->getParentOfType<ModuleOp>());
@@ -805,7 +814,7 @@ LogicalResult preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages,
   // Schedule the loads and root ops (dot ops) in the loop. This will give us
   // a scaffold for the final schedule.
   FailureOr<llvm::MapVector<Operation *, LoadInfo>> loadToInfo =
-      findPipelineableLoads(forOp, axisInfoAnalysis);
+      findPipelineableLoads(forOp, axisInfoAnalysis, usePaddedLayout);
   if (failed(loadToInfo))
     return failure();
 
@@ -875,7 +884,8 @@ LogicalResult preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages,
 }
 
 LogicalResult pipelineLoop(scf::ForOp forOp, int numStages, int globalPrefetch,
-                           int localPrefetch, bool useAsyncCopy) {
+                           int localPrefetch, bool useAsyncCopy,
+                           bool usePaddedLayout) {
 
   int lastStage = numStages - 1;
   int stages[SCHED_SIZE];
@@ -903,8 +913,8 @@ LogicalResult pipelineLoop(scf::ForOp forOp, int numStages, int globalPrefetch,
     }
   };
 
-  if (failed(preprocessLoopAndBuildSchedule(forOp, numStages, stages,
-                                            useAsyncCopy, options)))
+  if (failed(preprocessLoopAndBuildSchedule(
+          forOp, numStages, stages, useAsyncCopy, usePaddedLayout, options)))
     return failure();
   LDBG("Loop before sending to expander:\n" << *forOp);
 
@@ -1002,7 +1012,8 @@ struct PipelinePass : impl::TritonAMDGPUStreamPipelineBase<PipelinePass> {
       if (!checkPrecondition(forOp))
         continue;
       (void)pipelineLoop(forOp, tt::getNumStagesOrDefault(forOp, numStages),
-                         globalPrefetch, localPrefetch, useAsyncCopy);
+                         globalPrefetch, localPrefetch, useAsyncCopy,
+                         usePaddedSharedLayout);
     }
 
     if (useAsyncCopy) {
diff --git a/third_party/amd/python/triton_amd.cc b/third_party/amd/python/triton_amd.cc
index cab32eda84cb..c56503981a5e 100644
--- a/third_party/amd/python/triton_amd.cc
+++ b/third_party/amd/python/triton_amd.cc
@@ -77,9 +77,9 @@ void init_triton_amd_passes_ttgpuir(py::module &&m) {
   ADD_PASS_WRAPPER_0("add_fold_true_cmpi", mlir::createTritonAMDFoldTrueCmpI);
   ADD_PASS_OPTION_WRAPPER_1("add_block_pingpong",
                             mlir::createTritonAMDGPUBlockPingpong, int32_t);
-  ADD_PASS_OPTION_WRAPPER_4("add_stream_pipeline",
+  ADD_PASS_OPTION_WRAPPER_5("add_stream_pipeline",
                             mlir::createTritonAMDGPUStreamPipeline, int, int,
-                            int, bool);
+                            int, bool, bool);
   ADD_PASS_OPTION_WRAPPER_1("add_coalesce_async_copy",
                             mlir::createTritonAMDGPUCoalesceAsyncCopy,
                             std::string);

From 961ecc4178a7c3f85da92e6499d431b9ffeaaa0e Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Wed, 18 Jun 2025 03:28:13 +0000
Subject: [PATCH 10/23] Revert "Wire up StreamPipeline usage"

This reverts commit b62287084cc2c8f272e551082cc439fa4a66ef15.
---
 include/triton/Tools/Sys/GetEnv.hpp           |  1 -
 python/src/passes.h                           |  6 ---
 python/triton/knobs.py                        |  1 -
 third_party/amd/backend/compiler.py           |  4 +-
 .../include/TritonAMDGPUTransforms/Passes.td  | 25 ++++-----
 .../TritonAMDGPUTransforms/StreamPipeline.cpp | 51 ++++++++-----------
 third_party/amd/python/triton_amd.cc          |  4 +-
 7 files changed, 36 insertions(+), 56 deletions(-)

diff --git a/include/triton/Tools/Sys/GetEnv.hpp b/include/triton/Tools/Sys/GetEnv.hpp
index d54faf3a4da6..113126787c28 100644
--- a/include/triton/Tools/Sys/GetEnv.hpp
+++ b/include/triton/Tools/Sys/GetEnv.hpp
@@ -37,7 +37,6 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_HIP_USE_ASYNC_COPY",
     "TRITON_HIP_USE_BLOCK_PINGPONG",
     "TRITON_HIP_USE_IN_THREAD_TRANSPOSE",
-    "TRITON_HIP_USE_PADDED_SHARED_LAYOUT",
     "TRITON_LLVM_DEBUG_ONLY",
     "TRITON_ENABLE_ASAN",
     "TRITON_OVERRIDE_ARCH",
diff --git a/python/src/passes.h b/python/src/passes.h
index 66fdf0df1d1c..629fe362d8b2 100644
--- a/python/src/passes.h
+++ b/python/src/passes.h
@@ -36,9 +36,3 @@
 #define ADD_PASS_OPTION_WRAPPER_4(name, builder, ty0, ty1, ty2, ty3)           \
   m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2,          \
                  ty3 val3) { pm.addPass(builder({val0, val1, val2, val3})); })
-
-#define ADD_PASS_OPTION_WRAPPER_5(name, builder, ty0, ty1, ty2, ty3, ty4)      \
-  m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2,          \
-                 ty3 val3, ty4 val4) {                                         \
-    pm.addPass(builder({val0, val1, val2, val3, val4}));                       \
-  })
diff --git a/python/triton/knobs.py b/python/triton/knobs.py
index e82fbc93535a..b3c70c7174c0 100644
--- a/python/triton/knobs.py
+++ b/python/triton/knobs.py
@@ -446,7 +446,6 @@ class amd_knobs(base_knobs):
     global_prefetch: env_int = env_int("TRITON_HIP_GLOBAL_PREFETCH")
     local_prefetch: env_int = env_int("TRITON_HIP_LOCAL_PREFETCH")
     use_async_copy: env_bool = env_bool("TRITON_HIP_USE_ASYNC_COPY")
-    use_padded_shared_layout : env_bool = env_bool("TRITON_HIP_USE_PADDED_SHARED_LAYOUT")
     scalarize_packed_fops: env_bool = env_bool("AMDGCN_SCALARIZE_PACKED_FOPS")
 
 
diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py
index 6269adccfd72..bff54c2604bd 100644
--- a/third_party/amd/backend/compiler.py
+++ b/third_party/amd/backend/compiler.py
@@ -237,14 +237,12 @@ def make_ttgir(mod, metadata, options):
         global_prefetch = knobs.amd.global_prefetch
         local_prefetch = knobs.amd.local_prefetch
         use_async_copy = knobs.amd.use_async_copy
-        use_padded_shared_layout = knobs.amd.use_padded_shared_layout
 
         # The `local-prefetch` scheduling variant requires turning on buffer ops.
         if options.schedule_hint == "local-prefetch":
             global_prefetch = local_prefetch = 1
 
-        amd.passes.ttgpuir.add_stream_pipeline(pm, options.num_stages, global_prefetch, local_prefetch,
-                                               use_async_copy, use_padded_shared_layout)
+        amd.passes.ttgpuir.add_stream_pipeline(pm, options.num_stages, global_prefetch, local_prefetch, use_async_copy)
         if use_async_copy:
             amd.passes.ttgpuir.add_coalesce_async_copy(pm, options.arch)
         passes.common.add_canonicalizer(pm)
diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
index e9479a02cc92..0d88e56c2e23 100644
--- a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
+++ b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td
@@ -13,18 +13,19 @@ def TritonAMDGPUStreamPipeline : Pass<"tritonamdgpu-stream-pipeline", "mlir::Mod
 
   let dependentDialects = ["mlir::triton::amdgpu::TritonAMDGPUDialect"];
 
-  let options =
-      [Option<"numStages", "num_stages", "int32_t", /*default*/ "2",
-              "Number of Pipeline stages">,
-       Option<"globalPrefetch", "global_prefetch", "int32_t", /*default*/ "0",
-              "Set global prefetch stage count">,
-       Option<"localPrefetch", "local_prefetch", "int32_t", /*default*/ "0",
-              "Set local prefetch stage count">,
-       Option<"useAsyncCopy", "use_async_copy", "bool", /*default*/ "false",
-              "Use AsyncCopyGlobalToLocal to directly load to shared memory">,
-       Option<"usePaddedSharedLayout", "use_padded_shared_layout", "bool",
-              /*default*/ "false",
-              "Use padded shared layout for shared memory">,
+  let options = [
+    Option<"numStages", "num_stages",
+           "int32_t", /*default*/"2",
+           "Number of Pipeline stages">,
+    Option<"globalPrefetch", "global_prefetch",
+           "int32_t", /*default*/"0",
+           "Set global prefetch stage count">,
+    Option<"localPrefetch", "local_prefetch",
+           "int32_t", /*default*/"0",
+           "Set local prefetch stage count">,
+    Option<"useAsyncCopy", "use_async_copy",
+           "bool", /*default*/"false",
+           "Use AsyncCopyGlobalToLocal to directly load to shared memory">,
   ];
 }
 
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp
index f8ce51a6a595..0c9af9384958 100644
--- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp
@@ -8,7 +8,6 @@
 #include "triton/Dialect/Triton/IR/OpInterfaces.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
-#include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipelineExpander.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"
 #include "triton/Dialect/TritonGPU/Transforms/Schedule.h"
@@ -123,7 +122,7 @@ enum SchedType {
 
 struct LoadInfo {
   // Shared layout is used for loads feeding into dot ops.
-  ttg::SharedEncodingTrait sharedEncoding = nullptr;
+  ttg::SwizzledSharedEncodingAttr sharedEncoding = nullptr;
   // The distance of this load's stage to its use' stage.
   int distToUse = 0;
   Operation *use = nullptr;
@@ -408,22 +407,22 @@ static ttg::AMDMfmaEncodingAttr getDotEncoding(Value inputValue,
 // If all the transitive uses of the given value have are used by a convert to
 // the same dot operand encoding, return true and get the shared encoding that
 // needs to be used to be compatible with users' layouts.
-static std::optional<ttg::SharedEncodingTrait>
-getSharedEncIfAllUsersAreDotEnc(bool usePaddedLayout, Value loadedValue) {
-  ttg::SharedEncodingTrait attr;
+static std::optional<ttg::SwizzledSharedEncodingAttr>
+getSharedEncIfAllUsersAreDotEnc(Value loadedValue) {
+  ttg::SwizzledSharedEncodingAttr attr;
   for (Operation *user : loadedValue.getUsers()) {
     LDBG(" getSharedEncIfAllUsersAreDotEnc current user: " << *user);
     if (user->getNumResults() != 1)
       return std::nullopt;
 
-    ttg::SharedEncodingTrait tempAttr;
+    ttg::SwizzledSharedEncodingAttr tempAttr;
     Value userResult = user->getResult(0);
     Type userResType = userResult.getType();
     if (auto memDesc = dyn_cast<ttg::MemDescType>(userResType)) {
       // First time we find a shared encoding in the chain, save it and try to
       // use it if it is compatible with the other users.
-      tempAttr = cast<ttg::SharedEncodingTrait>(memDesc.getEncoding());
-      if (!getSharedEncIfAllUsersAreDotEnc(usePaddedLayout, userResult))
+      tempAttr = cast<ttg::SwizzledSharedEncodingAttr>(memDesc.getEncoding());
+      if (!getSharedEncIfAllUsersAreDotEnc(userResult).has_value())
         return std::nullopt;
     } else {
       if (!isa<ttg::LocalLoadOp, ttg::ConvertLayoutOp>(user))
@@ -450,15 +449,9 @@ getSharedEncIfAllUsersAreDotEnc(bool usePaddedLayout, Value loadedValue) {
 
       auto userResEnc = cast<ttg::TensorOrMemDesc>(userResType).getEncoding();
       if (auto dotOpEnc = dyn_cast<ttg::DotOperandEncodingAttr>(userResEnc)) {
-        if (usePaddedLayout) {
-          tempAttr = ttg::PaddedSharedEncodingAttr::get(
-              loadedValue.getContext(), srcTy.getShape(), sharedOrder,
-              dotOpEnc.getKWidth(), bitWidth, ctaLayout);
-        } else {
-          tempAttr = ttg::SwizzledSharedEncodingAttr::get(
-              loadedValue.getContext(), dotOpEnc, srcTy.getShape(), sharedOrder,
-              ctaLayout, bitWidth, /*needTrans=*/false);
-        }
+        tempAttr = ttg::SwizzledSharedEncodingAttr::get(
+            loadedValue.getContext(), dotOpEnc, srcTy.getShape(), sharedOrder,
+            ctaLayout, bitWidth, /*needTrans=*/false);
       } else if (auto llEnc = dyn_cast<ttg::LinearEncodingAttr>(userResEnc)) {
         // We use linear layout directly for scaled dot fp8 operands. For such
         // cases, we need to look further down the def-use chain to find the dot
@@ -488,8 +481,7 @@ getSharedEncIfAllUsersAreDotEnc(bool usePaddedLayout, Value loadedValue) {
 // "1" for the load op used by the load op used by the dot op, and so on.
 FailureOr<llvm::MapVector<Operation *, LoadInfo>>
 findPipelineableLoads(scf::ForOp forOp,
-                      tt::ModuleAxisInfoAnalysis &axisInfoAnalysis,
-                      bool usePaddedLayout) {
+                      tt::ModuleAxisInfoAnalysis &axisInfoAnalysis) {
   llvm::MapVector<Operation *, LoadInfo> loadToInfo;
   DenseSet<Operation *> seen;
   // Recursively visit the given op and its operands to discover all load ops
@@ -511,7 +503,7 @@ findPipelineableLoads(scf::ForOp forOp,
                  "Block ptr should have been lowered before this pass.");
           auto ptr = loadOp.getPtr();
           if (auto tensorTy = dyn_cast<RankedTensorType>(ptr.getType())) {
-            ttg::SharedEncodingTrait sharedEncoding = nullptr;
+            ttg::SwizzledSharedEncodingAttr sharedEncoding = nullptr;
             // Store memory layouts if possible.
             if (isa<tt::DotOpInterface>(use)) {
               unsigned vecContiguity = axisInfoAnalysis.getContiguity(ptr);
@@ -526,9 +518,9 @@ findPipelineableLoads(scf::ForOp forOp,
               // Limit shared memory sharing to width >= 32 elements.
               LDBG("Load " << *loadOp << " has width " << width);
               if (width >= 32) {
-                sharedEncoding = getSharedEncIfAllUsersAreDotEnc(
-                                     usePaddedLayout, op->getResult(0))
-                                     .value_or(nullptr);
+                sharedEncoding =
+                    getSharedEncIfAllUsersAreDotEnc(op->getResult(0))
+                        .value_or(nullptr);
               } else if (isaFamily != triton::AMD::ISAFamily::CDNA4) {
                 LDBG("Skip width<32 load " << loadOp << " for arch " << arch);
                 return;
@@ -803,7 +795,6 @@ SmallVector<std::pair<Operation *, Value>> createAndScheduleStreamOps(
 LogicalResult preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages,
                                              int stages[SCHED_SIZE],
                                              bool useAsyncCopy,
-                                             bool usePaddedLayout,
                                              tt::PipeliningOption &options) {
   triton::AMD::ModuleAxisInfoAnalysis axisInfoAnalysis(
       forOp->getParentOfType<ModuleOp>());
@@ -814,7 +805,7 @@ LogicalResult preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages,
   // Schedule the loads and root ops (dot ops) in the loop. This will give us
   // a scaffold for the final schedule.
   FailureOr<llvm::MapVector<Operation *, LoadInfo>> loadToInfo =
-      findPipelineableLoads(forOp, axisInfoAnalysis, usePaddedLayout);
+      findPipelineableLoads(forOp, axisInfoAnalysis);
   if (failed(loadToInfo))
     return failure();
 
@@ -884,8 +875,7 @@ LogicalResult preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages,
 }
 
 LogicalResult pipelineLoop(scf::ForOp forOp, int numStages, int globalPrefetch,
-                           int localPrefetch, bool useAsyncCopy,
-                           bool usePaddedLayout) {
+                           int localPrefetch, bool useAsyncCopy) {
 
   int lastStage = numStages - 1;
   int stages[SCHED_SIZE];
@@ -913,8 +903,8 @@ LogicalResult pipelineLoop(scf::ForOp forOp, int numStages, int globalPrefetch,
     }
   };
 
-  if (failed(preprocessLoopAndBuildSchedule(
-          forOp, numStages, stages, useAsyncCopy, usePaddedLayout, options)))
+  if (failed(preprocessLoopAndBuildSchedule(forOp, numStages, stages,
+                                            useAsyncCopy, options)))
     return failure();
   LDBG("Loop before sending to expander:\n" << *forOp);
 
@@ -1012,8 +1002,7 @@ struct PipelinePass : impl::TritonAMDGPUStreamPipelineBase<PipelinePass> {
       if (!checkPrecondition(forOp))
         continue;
       (void)pipelineLoop(forOp, tt::getNumStagesOrDefault(forOp, numStages),
-                         globalPrefetch, localPrefetch, useAsyncCopy,
-                         usePaddedSharedLayout);
+                         globalPrefetch, localPrefetch, useAsyncCopy);
     }
 
     if (useAsyncCopy) {
diff --git a/third_party/amd/python/triton_amd.cc b/third_party/amd/python/triton_amd.cc
index c56503981a5e..cab32eda84cb 100644
--- a/third_party/amd/python/triton_amd.cc
+++ b/third_party/amd/python/triton_amd.cc
@@ -77,9 +77,9 @@ void init_triton_amd_passes_ttgpuir(py::module &&m) {
   ADD_PASS_WRAPPER_0("add_fold_true_cmpi", mlir::createTritonAMDFoldTrueCmpI);
   ADD_PASS_OPTION_WRAPPER_1("add_block_pingpong",
                             mlir::createTritonAMDGPUBlockPingpong, int32_t);
-  ADD_PASS_OPTION_WRAPPER_5("add_stream_pipeline",
+  ADD_PASS_OPTION_WRAPPER_4("add_stream_pipeline",
                             mlir::createTritonAMDGPUStreamPipeline, int, int,
-                            int, bool, bool);
+                            int, bool);
   ADD_PASS_OPTION_WRAPPER_1("add_coalesce_async_copy",
                             mlir::createTritonAMDGPUCoalesceAsyncCopy,
                             std::string);

From b1c6f946e710af66da3c0336519257bdecf68cb4 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Wed, 18 Jun 2025 23:48:36 +0000
Subject: [PATCH 11/23] Add some more tests

---
 test/Conversion/amd/tritongpu_to_llvm.mlir | 29 ++++++++++++++++++++++
 test/TritonGPU/invalid-attributes.mlir     | 10 ++++++++
 2 files changed, 39 insertions(+)

diff --git a/test/Conversion/amd/tritongpu_to_llvm.mlir b/test/Conversion/amd/tritongpu_to_llvm.mlir
index aea48d2a4d05..d4e758a24c18 100644
--- a/test/Conversion/amd/tritongpu_to_llvm.mlir
+++ b/test/Conversion/amd/tritongpu_to_llvm.mlir
@@ -380,3 +380,32 @@ module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.n
     tt.return
   }
 }
+
+// -----
+
+// CHECK-LABEL: padded_shared_layout
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [2, 2], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#shared = #ttg.padded_shared<[128:+4, 256:+8] {order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @padded_shared_layout(%arg0: tensor<64x64xf16, #blocked>) {
+    // CHECK-DAG: %[[CST0:.+]] = llvm.mlir.constant(0 : i32)
+    // CHECK-DAG: %[[CST2:.+]] = llvm.mlir.constant(2 : i32)
+    // CHECK-DAG: %[[CST3:.+]] = llvm.mlir.constant(3 : i32)
+    // CHECK-DAG: %[[CST7:.+]] = llvm.mlir.constant(7 : i32)
+    // CHECK-DAG: %[[CST8:.+]] = llvm.mlir.constant(8 : i32)
+
+    //      CHECK: %[[SHR0:.+]] = llvm.ashr %[[XOR:.+]], %[[CST7]] : i32
+    // CHECK-NEXT: %[[SHL0:.+]] = llvm.shl %[[SHR0]], %[[CST2]] : i32
+    // CHECK-NEXT: %[[ADD0:.+]] = llvm.add %[[SHL0]], %[[CST0]] : i32
+    // CHECK-NEXT: %[[SHR1:.+]] = llvm.ashr %[[XOR]], %[[CST8]] : i32
+    // CHECK-NEXT: %[[SHL1:.+]] = llvm.shl %[[SHR1]], %14 : i32
+    // CHECK-NEXT: %[[ADD1:.+]] = llvm.add %[[ADD0]], %[[SHL1]] : i32
+    // CHECK-NEXT: %[[ADD2:.+]] = llvm.add %[[XOR]], %[[ADD1]] : i32
+    // CHECK-NEXT: llvm.getelementptr inbounds %{{.+}}[%[[ADD2]]]
+
+    // CHECK-COUNT-16: llvm.store {{.*}} : vector<1xf16>, !llvm.ptr<3>
+    %0 = ttg.local_alloc %arg0 : (tensor<64x64xf16, #blocked>) -> !ttg.memdesc<64x64xf16, #shared, #smem, mutable>
+    tt.return
+  }
+}
diff --git a/test/TritonGPU/invalid-attributes.mlir b/test/TritonGPU/invalid-attributes.mlir
index 3a2aac907096..22938b6055b7 100644
--- a/test/TritonGPU/invalid-attributes.mlir
+++ b/test/TritonGPU/invalid-attributes.mlir
@@ -84,11 +84,21 @@
 
 // -----
 
+// expected-error@+1 {{interval values must all be power of two}}
+#shared = #ttg.padded_shared<[0:+2]>
+
+// -----
+
 // expected-error@+1 {{padding values must all be power of two}}
 #shared = #ttg.padded_shared<[2:+3]>
 
 // -----
 
+// expected-error@+1 {{padding values must all be power of two}}
+#shared = #ttg.padded_shared<[2:+0]>
+
+// -----
+
 // expected-error@+1 {{interval values cannot have duplicates}}
 #shared = #ttg.padded_shared<[2:+1, 2:+4]>
 

From 017b88816fad76ba31d37602414d49f399957a0d Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Thu, 19 Jun 2025 00:33:57 +0000
Subject: [PATCH 12/23] Improve PaddedLinearLayout a bit

---
 include/triton/Tools/LinearLayout.h |  8 +++++---
 lib/Tools/LinearLayout.cpp          | 15 ++-------------
 2 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h
index cdae51375083..d4d7525c5f05 100644
--- a/include/triton/Tools/LinearLayout.h
+++ b/include/triton/Tools/LinearLayout.h
@@ -855,7 +855,8 @@ class ColumnAction {
 class PaddedLinearLayout {
 public:
   PaddedLinearLayout(LinearLayout linear, ArrayRef<unsigned> intervals,
-                     ArrayRef<unsigned> paddings);
+                     ArrayRef<unsigned> paddings)
+      : linear(std::move(linear)), intervals(intervals), paddings(paddings) {}
 
   const LinearLayout &getLinear() const { return linear; }
 
@@ -863,11 +864,12 @@ class PaddedLinearLayout {
   std::optional<int32_t> getMinInterval() const;
 
   // Returns true if this is not a degenerated case and indeed requires padding.
-  bool hasPadding() const;
+  bool hasPadding() const { return !intervals.empty(); }
 
 private:
   LinearLayout linear;
-  SmallVector<std::pair<unsigned, unsigned>> intervalPads;
+  SmallVector<unsigned, 2> intervals;
+  SmallVector<unsigned, 2> paddings;
 };
 
 } // namespace mlir::triton
diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp
index 35ae36cfa98e..9e990f141ce1 100644
--- a/lib/Tools/LinearLayout.cpp
+++ b/lib/Tools/LinearLayout.cpp
@@ -1337,21 +1337,10 @@ std::string ColumnAction::toString() const {
   return ret;
 }
 
-PaddedLinearLayout::PaddedLinearLayout(LinearLayout linear,
-                                       ArrayRef<unsigned> intervals,
-                                       ArrayRef<unsigned> paddings)
-    : linear(std::move(linear)) {
-  intervalPads.reserve(intervals.size());
-  for (auto [i, p] : llvm::zip_equal(intervals, paddings))
-    intervalPads.emplace_back(i, p);
-}
-
 std::optional<int32_t> PaddedLinearLayout::getMinInterval() const {
-  if (intervalPads.empty())
+  if (intervals.empty())
     return std::nullopt;
-  return *llvm::min_element(llvm::make_first_range(intervalPads));
+  return *llvm::min_element(intervals);
 }
 
-bool PaddedLinearLayout::hasPadding() const { return !intervalPads.empty(); }
-
 } // namespace mlir::triton

From c0f88a888a2f1fb83c0b72e2d3c4e91bf8034666 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Thu, 19 Jun 2025 00:46:27 +0000
Subject: [PATCH 13/23] Fix lit test

---
 test/Conversion/amd/tritongpu_to_llvm.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/Conversion/amd/tritongpu_to_llvm.mlir b/test/Conversion/amd/tritongpu_to_llvm.mlir
index d4e758a24c18..16784f2150b7 100644
--- a/test/Conversion/amd/tritongpu_to_llvm.mlir
+++ b/test/Conversion/amd/tritongpu_to_llvm.mlir
@@ -399,7 +399,7 @@ module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.n
     // CHECK-NEXT: %[[SHL0:.+]] = llvm.shl %[[SHR0]], %[[CST2]] : i32
     // CHECK-NEXT: %[[ADD0:.+]] = llvm.add %[[SHL0]], %[[CST0]] : i32
     // CHECK-NEXT: %[[SHR1:.+]] = llvm.ashr %[[XOR]], %[[CST8]] : i32
-    // CHECK-NEXT: %[[SHL1:.+]] = llvm.shl %[[SHR1]], %14 : i32
+    // CHECK-NEXT: %[[SHL1:.+]] = llvm.shl %[[SHR1]], %[[CST3]] : i32
     // CHECK-NEXT: %[[ADD1:.+]] = llvm.add %[[ADD0]], %[[SHL1]] : i32
     // CHECK-NEXT: %[[ADD2:.+]] = llvm.add %[[XOR]], %[[ADD1]] : i32
     // CHECK-NEXT: llvm.getelementptr inbounds %{{.+}}[%[[ADD2]]]

From a66fa0d44f3a37ff5480b782c403517a8abd605f Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Thu, 19 Jun 2025 03:33:55 +0000
Subject: [PATCH 14/23] Move builder out to cpp

---
 .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 28 ++++++-------------
 lib/Dialect/TritonGPU/IR/Dialect.cpp          | 23 +++++++++++++++
 2 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
index 0a390b398957..ff3340d9f79b 100644
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -441,27 +441,15 @@ attributes too, for example,
       "CTALayoutAttr":$CTALayout
   );
 
-  let builders =
-      [AttrBuilder<(ins "ArrayRef<std::pair<unsigned, unsigned>>":$intervalPads,
-                       "ArrayRef<unsigned>":$order, "CTALayoutAttr":$ctaLayout),
-                   [{
-        SmallVector<unsigned> intervals, paddings;
-        intervals.reserve(intervalPads.size());
-        paddings.reserve(intervalPads.size());
-        for (auto [interval, padding] : intervalPads) {
-            intervals.push_back(interval);
-            paddings.push_back(padding);
-        }
-        return get(context, intervals, paddings, order, ctaLayout);
-      }]>,
-       AttrBuilder<(ins "ArrayRef<int64_t>":$shape, "ArrayRef<unsigned>":$order,
+  let builders = [
+      AttrBuilder<(ins "ArrayRef<std::pair<unsigned, unsigned>>":$intervalPads,
+                       "ArrayRef<unsigned>":$order, "CTALayoutAttr":$ctaLayout)>,
+      // Defines a padded shared layout that avoids bank conflicts for a block
+      // with given |shape| and |order| that is loaded by dot operand with
+      // kWidth |dotKWidth|.
+      AttrBuilder<(ins "ArrayRef<int64_t>":$shape, "ArrayRef<unsigned>":$order,
                        "unsigned":$dotKWidth, "unsigned":$elemBitWidth,
-                       "CTALayoutAttr":$ctaLayout),
-                   [{
-        unsigned inner = getShapePerCTA(ctaLayout.getCTASplitNum(), shape)[order[0]];
-        unsigned threadNumBytes = std::max(dotKWidth * elemBitWidth / 8u, 1u);
-        return get(context, {{inner, threadNumBytes}}, order, ctaLayout);
-      }]>,
+                       "CTALayoutAttr":$ctaLayout)>,
   ];
 
   let extraClassDeclaration = extraBaseClassDeclaration # [{
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
index 592483d5885f..d64b49790ea0 100644
--- a/lib/Dialect/TritonGPU/IR/Dialect.cpp
+++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -1700,6 +1700,29 @@ LogicalResult PaddedSharedEncodingAttr::verify(
   return verifyLayoutOrder(emitError, order);
 }
 
+PaddedSharedEncodingAttr PaddedSharedEncodingAttr::get(
+    MLIRContext *context, ArrayRef<std::pair<unsigned, unsigned>> intervalPads,
+    ArrayRef<unsigned> order, CTALayoutAttr ctaLayout) {
+  SmallVector<unsigned> intervals, paddings;
+  intervals.reserve(intervalPads.size());
+  paddings.reserve(intervalPads.size());
+  for (auto [interval, padding] : intervalPads) {
+    intervals.push_back(interval);
+    paddings.push_back(padding);
+  }
+  return get(context, intervals, paddings, order, ctaLayout);
+}
+
+PaddedSharedEncodingAttr
+PaddedSharedEncodingAttr::get(MLIRContext *context, ArrayRef<int64_t> shape,
+                              ArrayRef<unsigned> order, unsigned dotKWidth,
+                              unsigned elemBitWidth, CTALayoutAttr ctaLayout) {
+  unsigned innerD = getShapePerCTA(ctaLayout.getCTASplitNum(), shape)[order[0]];
+  unsigned threadNumBytes = std::max(dotKWidth * elemBitWidth / 8u, 1u);
+  threadNumBytes = llvm::alignTo(threadNumBytes, 4); // Assume 32-bit per bank
+  return get(context, {{innerD, threadNumBytes}}, order, ctaLayout);
+}
+
 PaddedLinearLayout
 PaddedSharedEncodingAttr::toPaddedLinearLayout(ArrayRef<int64_t> shape) const {
   auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get(

From b5f258ef48dad7c627b6bf3965e77a6ad0a0af0b Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Thu, 19 Jun 2025 03:53:52 +0000
Subject: [PATCH 15/23] Improve wording for PaddedLinearLayout once more

---
 include/triton/Tools/LinearLayout.h             | 17 +++++++----------
 lib/Conversion/TritonGPUToLLVM/Utility.cpp      |  2 +-
 .../TritonGPU/LinearLayoutConversionsTest.cpp   |  2 +-
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h
index d4d7525c5f05..63946b5d17b9 100644
--- a/include/triton/Tools/LinearLayout.h
+++ b/include/triton/Tools/LinearLayout.h
@@ -840,11 +840,12 @@ class ColumnAction {
   std::string toString() const;
 };
 
-// A utility class to describe a particular padding schema with 1) a linear
-// layout to desribe the N-D logical element mapping and 2) a list of
-// interval-padding pairs to describe the 1-D physical shared memory padding
-// schedules. In a degenerated cases, we can have no interval-padding pairs so
-// it will just be a normal linear layout.
+// A utility struct to describe either a swizzled or padded shared layout.
+//
+// For the former, we only use the linear layout field. For the latter, we use
+// both 1) a linear layout to desribe the identityStandardND logical element
+// mapping and 2) a list of interval-padding pairs to describe the 1-D physical
+// shared memory padding schedules.
 //
 // In Triton we use linear layout basically throughout; so this class is useful
 // to leverage common linear layout facilities and code paths as much as
@@ -852,21 +853,17 @@ class ColumnAction {
 // at the very final steps when we allocating the physical shared memory or
 // creating pointers indexing into them. All steps before can still reason with
 // linear layout. Thus this utility class keeps these two parts separate.
-class PaddedLinearLayout {
-public:
+struct PaddedLinearLayout {
   PaddedLinearLayout(LinearLayout linear, ArrayRef<unsigned> intervals,
                      ArrayRef<unsigned> paddings)
       : linear(std::move(linear)), intervals(intervals), paddings(paddings) {}
 
-  const LinearLayout &getLinear() const { return linear; }
-
   // Returns the minimal interval that would trigger padding.
   std::optional<int32_t> getMinInterval() const;
 
   // Returns true if this is not a degenerated case and indeed requires padding.
   bool hasPadding() const { return !intervals.empty(); }
 
-private:
   LinearLayout linear;
   SmallVector<unsigned, 2> intervals;
   SmallVector<unsigned, 2> paddings;
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
index 451080607efb..510d8dbb9ed2 100644
--- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -512,7 +512,7 @@ bool emitTransferBetweenRegistersAndShared(
   PaddedLinearLayout paddedLayout =
       triton::gpu::toPaddedLinearLayout(shape, sharedTy.getEncoding());
   LinearLayout regToSharedLayout =
-      regLayout.invertAndCompose(paddedLayout.getLinear());
+      regLayout.invertAndCompose(paddedLayout.linear);
 
   // TODO(jlebar): We don't currently support loading from shared memory in a
   // different CTA.  We'd need to emit `mapa.shared::cluster` instructions.
diff --git a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
index a369feb665f2..f295e8c2c130 100644
--- a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
+++ b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
@@ -3024,7 +3024,7 @@ TEST_F(LinearLayoutConversionsTest, PaddedShared) {
        {S("block"), {}}},
       {S("dim0"), S("dim1")});
   // clang-format on
-  EXPECT_EQ(pll.getLinear(), expectedLL);
+  EXPECT_EQ(pll.linear, expectedLL);
 }
 
 } // anonymous namespace

From 2de5b2f65ce2cf77bc8cf3ceed1a57e9861e3065 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Thu, 19 Jun 2025 04:03:18 +0000
Subject: [PATCH 16/23] Rename to SwizzledOrPaddedLayout

---
 .../triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h | 4 ++--
 include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td  | 2 +-
 include/triton/Tools/LinearLayout.h                       | 6 +++---
 lib/Conversion/TritonGPUToLLVM/Utility.cpp                | 4 ++--
 lib/Dialect/TritonGPU/IR/Dialect.cpp                      | 6 +++---
 lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp      | 8 ++++----
 lib/Tools/LinearLayout.cpp                                | 2 +-
 .../Dialect/TritonGPU/LinearLayoutConversionsTest.cpp     | 6 +++---
 8 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
index d7ef07bc5bff..925c23d2f927 100644
--- a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
+++ b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
@@ -300,8 +300,8 @@ std::optional<LinearLayout> chooseMfmaLikeStoreLayout(RankedTensorType valType);
 
 // Convert the given layout to a linear layout with potential additional
 // physical memory paddings.
-PaddedLinearLayout toPaddedLinearLayout(ArrayRef<int64_t> shape,
-                                        Attribute layout);
+SwizzledOrPaddedLayout toSwizzledOrPaddedLayout(ArrayRef<int64_t> shape,
+                                                Attribute layout);
 
 } // namespace mlir::triton::gpu
 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
index ff3340d9f79b..e3191f05c69a 100644
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -456,7 +456,7 @@ attributes too, for example,
     unsigned getRank() const { return getOrder().size(); }
     int32_t getAlignment() const { return 16; }
 
-    PaddedLinearLayout toPaddedLinearLayout(ArrayRef<int64_t> shape) const;
+    SwizzledOrPaddedLayout toSwizzledOrPaddedLayout(ArrayRef<int64_t> shape) const;
 
     // Returns the total number of elements including padding given the input
     // tensor shape.
diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h
index 63946b5d17b9..c4643e35cfe4 100644
--- a/include/triton/Tools/LinearLayout.h
+++ b/include/triton/Tools/LinearLayout.h
@@ -853,9 +853,9 @@ class ColumnAction {
 // at the very final steps when we allocating the physical shared memory or
 // creating pointers indexing into them. All steps before can still reason with
 // linear layout. Thus this utility class keeps these two parts separate.
-struct PaddedLinearLayout {
-  PaddedLinearLayout(LinearLayout linear, ArrayRef<unsigned> intervals,
-                     ArrayRef<unsigned> paddings)
+struct SwizzledOrPaddedLayout {
+  SwizzledOrPaddedLayout(LinearLayout linear, ArrayRef<unsigned> intervals,
+                         ArrayRef<unsigned> paddings)
       : linear(std::move(linear)), intervals(intervals), paddings(paddings) {}
 
   // Returns the minimal interval that would trigger padding.
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
index 510d8dbb9ed2..752ddb34328c 100644
--- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -509,8 +509,8 @@ bool emitTransferBetweenRegistersAndShared(
   StringAttr kWarp = str_attr("warp");
 
   auto shape = sharedTy.getShape();
-  PaddedLinearLayout paddedLayout =
-      triton::gpu::toPaddedLinearLayout(shape, sharedTy.getEncoding());
+  SwizzledOrPaddedLayout paddedLayout =
+      triton::gpu::toSwizzledOrPaddedLayout(shape, sharedTy.getEncoding());
   LinearLayout regToSharedLayout =
       regLayout.invertAndCompose(paddedLayout.linear);
 
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
index d64b49790ea0..5e73b2b9a284 100644
--- a/lib/Dialect/TritonGPU/IR/Dialect.cpp
+++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -1723,13 +1723,13 @@ PaddedSharedEncodingAttr::get(MLIRContext *context, ArrayRef<int64_t> shape,
   return get(context, {{innerD, threadNumBytes}}, order, ctaLayout);
 }
 
-PaddedLinearLayout
-PaddedSharedEncodingAttr::toPaddedLinearLayout(ArrayRef<int64_t> shape) const {
+SwizzledOrPaddedLayout PaddedSharedEncodingAttr::toSwizzledOrPaddedLayout(
+    ArrayRef<int64_t> shape) const {
   auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get(
       getContext(), /*vec=*/1, /*perPhase=*/1, /*maxPhase=*/1, getOrder(),
       getCTALayout());
   LinearLayout ll = toLinearLayout(shape, nonSwizzleAttr);
-  return PaddedLinearLayout(ll, getIntervals(), getPaddings());
+  return SwizzledOrPaddedLayout(ll, getIntervals(), getPaddings());
 }
 
 int64_t PaddedSharedEncodingAttr::getPaddedSize(ArrayRef<int64_t> shape) const {
diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
index 38d0a11effbb..57c76ed69bfd 100644
--- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
+++ b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
@@ -1120,12 +1120,12 @@ LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout) {
                                                                    layout);
 }
 
-PaddedLinearLayout toPaddedLinearLayout(ArrayRef<int64_t> shape,
-                                        Attribute layout) {
+SwizzledOrPaddedLayout toSwizzledOrPaddedLayout(ArrayRef<int64_t> shape,
+                                                Attribute layout) {
   if (auto paddedLayout = dyn_cast<PaddedSharedEncodingAttr>(layout))
-    return paddedLayout.toPaddedLinearLayout(shape);
+    return paddedLayout.toSwizzledOrPaddedLayout(shape);
   auto ll = toLinearLayout(shape, layout);
-  return PaddedLinearLayout(ll, /*intervals=*/{}, /*paddings=*/{});
+  return SwizzledOrPaddedLayout(ll, /*intervals=*/{}, /*paddings=*/{});
 }
 
 LinearLayout getLayoutWithinBlock(const LinearLayout &layout) {
diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp
index 9e990f141ce1..f3630b851aef 100644
--- a/lib/Tools/LinearLayout.cpp
+++ b/lib/Tools/LinearLayout.cpp
@@ -1337,7 +1337,7 @@ std::string ColumnAction::toString() const {
   return ret;
 }
 
-std::optional<int32_t> PaddedLinearLayout::getMinInterval() const {
+std::optional<int32_t> SwizzledOrPaddedLayout::getMinInterval() const {
   if (intervals.empty())
     return std::nullopt;
   return *llvm::min_element(intervals);
diff --git a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
index f295e8c2c130..b781ec7711cb 100644
--- a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
+++ b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
@@ -3013,9 +3013,9 @@ TEST_F(LinearLayoutConversionsTest, MMAv5Fp4Padded) {
 }
 
 TEST_F(LinearLayoutConversionsTest, PaddedShared) {
-  PaddedLinearLayout pll =
-      toPaddedLinearLayout({32, 64}, paddedShared({128, 256}, {4, 8}, {1, 0},
-                                                  {1, 1}, {1, 1}, {1, 0}));
+  SwizzledOrPaddedLayout pll = toSwizzledOrPaddedLayout(
+      {32, 64},
+      paddedShared({128, 256}, {4, 8}, {1, 0}, {1, 1}, {1, 1}, {1, 0}));
   // The expected linear layout mapping part should just be an identity.
   auto expectedLL = LinearLayout(
       // clang-format off

From 89d069dfef00d8d8cfdf3d170c2f2d46b564ffbf Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Thu, 19 Jun 2025 15:18:25 +0000
Subject: [PATCH 17/23] Revert "Rename to SwizzledOrPaddedLayout"

This reverts commit 2de5b2f65ce2cf77bc8cf3ceed1a57e9861e3065.
---
 .../triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h | 4 ++--
 include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td  | 2 +-
 include/triton/Tools/LinearLayout.h                       | 6 +++---
 lib/Conversion/TritonGPUToLLVM/Utility.cpp                | 4 ++--
 lib/Dialect/TritonGPU/IR/Dialect.cpp                      | 6 +++---
 lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp      | 8 ++++----
 lib/Tools/LinearLayout.cpp                                | 2 +-
 .../Dialect/TritonGPU/LinearLayoutConversionsTest.cpp     | 6 +++---
 8 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
index 925c23d2f927..d7ef07bc5bff 100644
--- a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
+++ b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
@@ -300,8 +300,8 @@ std::optional<LinearLayout> chooseMfmaLikeStoreLayout(RankedTensorType valType);
 
 // Convert the given layout to a linear layout with potential additional
 // physical memory paddings.
-SwizzledOrPaddedLayout toSwizzledOrPaddedLayout(ArrayRef<int64_t> shape,
-                                                Attribute layout);
+PaddedLinearLayout toPaddedLinearLayout(ArrayRef<int64_t> shape,
+                                        Attribute layout);
 
 } // namespace mlir::triton::gpu
 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
index e3191f05c69a..ff3340d9f79b 100644
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -456,7 +456,7 @@ attributes too, for example,
     unsigned getRank() const { return getOrder().size(); }
     int32_t getAlignment() const { return 16; }
 
-    SwizzledOrPaddedLayout toSwizzledOrPaddedLayout(ArrayRef<int64_t> shape) const;
+    PaddedLinearLayout toPaddedLinearLayout(ArrayRef<int64_t> shape) const;
 
     // Returns the total number of elements including padding given the input
     // tensor shape.
diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h
index c4643e35cfe4..63946b5d17b9 100644
--- a/include/triton/Tools/LinearLayout.h
+++ b/include/triton/Tools/LinearLayout.h
@@ -853,9 +853,9 @@ class ColumnAction {
 // at the very final steps when we allocating the physical shared memory or
 // creating pointers indexing into them. All steps before can still reason with
 // linear layout. Thus this utility class keeps these two parts separate.
-struct SwizzledOrPaddedLayout {
-  SwizzledOrPaddedLayout(LinearLayout linear, ArrayRef<unsigned> intervals,
-                         ArrayRef<unsigned> paddings)
+struct PaddedLinearLayout {
+  PaddedLinearLayout(LinearLayout linear, ArrayRef<unsigned> intervals,
+                     ArrayRef<unsigned> paddings)
       : linear(std::move(linear)), intervals(intervals), paddings(paddings) {}
 
   // Returns the minimal interval that would trigger padding.
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
index 752ddb34328c..510d8dbb9ed2 100644
--- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -509,8 +509,8 @@ bool emitTransferBetweenRegistersAndShared(
   StringAttr kWarp = str_attr("warp");
 
   auto shape = sharedTy.getShape();
-  SwizzledOrPaddedLayout paddedLayout =
-      triton::gpu::toSwizzledOrPaddedLayout(shape, sharedTy.getEncoding());
+  PaddedLinearLayout paddedLayout =
+      triton::gpu::toPaddedLinearLayout(shape, sharedTy.getEncoding());
   LinearLayout regToSharedLayout =
       regLayout.invertAndCompose(paddedLayout.linear);
 
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
index 5e73b2b9a284..d64b49790ea0 100644
--- a/lib/Dialect/TritonGPU/IR/Dialect.cpp
+++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -1723,13 +1723,13 @@ PaddedSharedEncodingAttr::get(MLIRContext *context, ArrayRef<int64_t> shape,
   return get(context, {{innerD, threadNumBytes}}, order, ctaLayout);
 }
 
-SwizzledOrPaddedLayout PaddedSharedEncodingAttr::toSwizzledOrPaddedLayout(
-    ArrayRef<int64_t> shape) const {
+PaddedLinearLayout
+PaddedSharedEncodingAttr::toPaddedLinearLayout(ArrayRef<int64_t> shape) const {
   auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get(
       getContext(), /*vec=*/1, /*perPhase=*/1, /*maxPhase=*/1, getOrder(),
       getCTALayout());
   LinearLayout ll = toLinearLayout(shape, nonSwizzleAttr);
-  return SwizzledOrPaddedLayout(ll, getIntervals(), getPaddings());
+  return PaddedLinearLayout(ll, getIntervals(), getPaddings());
 }
 
 int64_t PaddedSharedEncodingAttr::getPaddedSize(ArrayRef<int64_t> shape) const {
diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
index 57c76ed69bfd..38d0a11effbb 100644
--- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
+++ b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
@@ -1120,12 +1120,12 @@ LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout) {
                                                                    layout);
 }
 
-SwizzledOrPaddedLayout toSwizzledOrPaddedLayout(ArrayRef<int64_t> shape,
-                                                Attribute layout) {
+PaddedLinearLayout toPaddedLinearLayout(ArrayRef<int64_t> shape,
+                                        Attribute layout) {
   if (auto paddedLayout = dyn_cast<PaddedSharedEncodingAttr>(layout))
-    return paddedLayout.toSwizzledOrPaddedLayout(shape);
+    return paddedLayout.toPaddedLinearLayout(shape);
   auto ll = toLinearLayout(shape, layout);
-  return SwizzledOrPaddedLayout(ll, /*intervals=*/{}, /*paddings=*/{});
+  return PaddedLinearLayout(ll, /*intervals=*/{}, /*paddings=*/{});
 }
 
 LinearLayout getLayoutWithinBlock(const LinearLayout &layout) {
diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp
index f3630b851aef..9e990f141ce1 100644
--- a/lib/Tools/LinearLayout.cpp
+++ b/lib/Tools/LinearLayout.cpp
@@ -1337,7 +1337,7 @@ std::string ColumnAction::toString() const {
   return ret;
 }
 
-std::optional<int32_t> SwizzledOrPaddedLayout::getMinInterval() const {
+std::optional<int32_t> PaddedLinearLayout::getMinInterval() const {
   if (intervals.empty())
     return std::nullopt;
   return *llvm::min_element(intervals);
diff --git a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
index b781ec7711cb..f295e8c2c130 100644
--- a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
+++ b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
@@ -3013,9 +3013,9 @@ TEST_F(LinearLayoutConversionsTest, MMAv5Fp4Padded) {
 }
 
 TEST_F(LinearLayoutConversionsTest, PaddedShared) {
-  SwizzledOrPaddedLayout pll = toSwizzledOrPaddedLayout(
-      {32, 64},
-      paddedShared({128, 256}, {4, 8}, {1, 0}, {1, 1}, {1, 1}, {1, 0}));
+  PaddedLinearLayout pll =
+      toPaddedLinearLayout({32, 64}, paddedShared({128, 256}, {4, 8}, {1, 0},
+                                                  {1, 1}, {1, 1}, {1, 0}));
   // The expected linear layout mapping part should just be an identity.
   auto expectedLL = LinearLayout(
       // clang-format off

From 28c3428bdb7de70a8752177a2617be54f58c2c56 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Thu, 19 Jun 2025 15:18:27 +0000
Subject: [PATCH 18/23] Revert "Improve wording for PaddedLinearLayout once
 more"

This reverts commit b5f258ef48dad7c627b6bf3965e77a6ad0a0af0b.
---
 include/triton/Tools/LinearLayout.h             | 17 ++++++++++-------
 lib/Conversion/TritonGPUToLLVM/Utility.cpp      |  2 +-
 .../TritonGPU/LinearLayoutConversionsTest.cpp   |  2 +-
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h
index 63946b5d17b9..d4d7525c5f05 100644
--- a/include/triton/Tools/LinearLayout.h
+++ b/include/triton/Tools/LinearLayout.h
@@ -840,12 +840,11 @@ class ColumnAction {
   std::string toString() const;
 };
 
-// A utility struct to describe either a swizzled or padded shared layout.
-//
-// For the former, we only use the linear layout field. For the latter, we use
-// both 1) a linear layout to desribe the identityStandardND logical element
-// mapping and 2) a list of interval-padding pairs to describe the 1-D physical
-// shared memory padding schedules.
+// A utility class to describe a particular padding schema with 1) a linear
+// layout to desribe the N-D logical element mapping and 2) a list of
+// interval-padding pairs to describe the 1-D physical shared memory padding
+// schedules. In a degenerated cases, we can have no interval-padding pairs so
+// it will just be a normal linear layout.
 //
 // In Triton we use linear layout basically throughout; so this class is useful
 // to leverage common linear layout facilities and code paths as much as
@@ -853,17 +852,21 @@ class ColumnAction {
 // at the very final steps when we allocating the physical shared memory or
 // creating pointers indexing into them. All steps before can still reason with
 // linear layout. Thus this utility class keeps these two parts separate.
-struct PaddedLinearLayout {
+class PaddedLinearLayout {
+public:
   PaddedLinearLayout(LinearLayout linear, ArrayRef<unsigned> intervals,
                      ArrayRef<unsigned> paddings)
       : linear(std::move(linear)), intervals(intervals), paddings(paddings) {}
 
+  const LinearLayout &getLinear() const { return linear; }
+
   // Returns the minimal interval that would trigger padding.
   std::optional<int32_t> getMinInterval() const;
 
   // Returns true if this is not a degenerated case and indeed requires padding.
   bool hasPadding() const { return !intervals.empty(); }
 
+private:
   LinearLayout linear;
   SmallVector<unsigned, 2> intervals;
   SmallVector<unsigned, 2> paddings;
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
index 510d8dbb9ed2..451080607efb 100644
--- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -512,7 +512,7 @@ bool emitTransferBetweenRegistersAndShared(
   PaddedLinearLayout paddedLayout =
       triton::gpu::toPaddedLinearLayout(shape, sharedTy.getEncoding());
   LinearLayout regToSharedLayout =
-      regLayout.invertAndCompose(paddedLayout.linear);
+      regLayout.invertAndCompose(paddedLayout.getLinear());
 
   // TODO(jlebar): We don't currently support loading from shared memory in a
   // different CTA.  We'd need to emit `mapa.shared::cluster` instructions.
diff --git a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
index f295e8c2c130..a369feb665f2 100644
--- a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
+++ b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
@@ -3024,7 +3024,7 @@ TEST_F(LinearLayoutConversionsTest, PaddedShared) {
        {S("block"), {}}},
       {S("dim0"), S("dim1")});
   // clang-format on
-  EXPECT_EQ(pll.linear, expectedLL);
+  EXPECT_EQ(pll.getLinear(), expectedLL);
 }
 
 } // anonymous namespace

From 0637bc5a0f3a283d283f5d864f92451f74eaf915 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Thu, 19 Jun 2025 15:19:01 +0000
Subject: [PATCH 19/23] Revert "Improve PaddedLinearLayout a bit"

This reverts commit 017b88816fad76ba31d37602414d49f399957a0d.
---
 include/triton/Tools/LinearLayout.h |  8 +++-----
 lib/Tools/LinearLayout.cpp          | 15 +++++++++++++--
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h
index d4d7525c5f05..cdae51375083 100644
--- a/include/triton/Tools/LinearLayout.h
+++ b/include/triton/Tools/LinearLayout.h
@@ -855,8 +855,7 @@ class ColumnAction {
 class PaddedLinearLayout {
 public:
   PaddedLinearLayout(LinearLayout linear, ArrayRef<unsigned> intervals,
-                     ArrayRef<unsigned> paddings)
-      : linear(std::move(linear)), intervals(intervals), paddings(paddings) {}
+                     ArrayRef<unsigned> paddings);
 
   const LinearLayout &getLinear() const { return linear; }
 
@@ -864,12 +863,11 @@ class PaddedLinearLayout {
   std::optional<int32_t> getMinInterval() const;
 
   // Returns true if this is not a degenerated case and indeed requires padding.
-  bool hasPadding() const { return !intervals.empty(); }
+  bool hasPadding() const;
 
 private:
   LinearLayout linear;
-  SmallVector<unsigned, 2> intervals;
-  SmallVector<unsigned, 2> paddings;
+  SmallVector<std::pair<unsigned, unsigned>> intervalPads;
 };
 
 } // namespace mlir::triton
diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp
index 9e990f141ce1..35ae36cfa98e 100644
--- a/lib/Tools/LinearLayout.cpp
+++ b/lib/Tools/LinearLayout.cpp
@@ -1337,10 +1337,21 @@ std::string ColumnAction::toString() const {
   return ret;
 }
 
+PaddedLinearLayout::PaddedLinearLayout(LinearLayout linear,
+                                       ArrayRef<unsigned> intervals,
+                                       ArrayRef<unsigned> paddings)
+    : linear(std::move(linear)) {
+  intervalPads.reserve(intervals.size());
+  for (auto [i, p] : llvm::zip_equal(intervals, paddings))
+    intervalPads.emplace_back(i, p);
+}
+
 std::optional<int32_t> PaddedLinearLayout::getMinInterval() const {
-  if (intervals.empty())
+  if (intervalPads.empty())
     return std::nullopt;
-  return *llvm::min_element(intervals);
+  return *llvm::min_element(llvm::make_first_range(intervalPads));
 }
 
+bool PaddedLinearLayout::hasPadding() const { return !intervalPads.empty(); }
+
 } // namespace mlir::triton

From 8fa8d8db1d198babe4675162d4119e24dff10136 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Thu, 19 Jun 2025 00:03:43 +0000
Subject: [PATCH 20/23] Drop PaddedLinearLayout

---
 .../TritonGPU/IR/LinearLayoutConversions.h    |  5 ----
 .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td |  4 ++-
 include/triton/Tools/LinearLayout.h           | 30 -------------------
 lib/Conversion/TritonGPUToLLVM/Utility.cpp    | 28 +++++++++++------
 lib/Dialect/TritonGPU/IR/Dialect.cpp          |  9 ------
 .../TritonGPU/IR/LinearLayoutConversions.cpp  |  8 -----
 lib/Tools/LinearLayout.cpp                    | 17 -----------
 .../TritonGPU/LinearLayoutConversionsTest.cpp | 15 ----------
 8 files changed, 22 insertions(+), 94 deletions(-)

diff --git a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
index d7ef07bc5bff..e458d425be39 100644
--- a/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
+++ b/include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h
@@ -298,10 +298,5 @@ LinearLayout nvidiaMmaTile(MLIRContext *ctx, ArrayRef<unsigned> tileShape,
 // the two can be done using transferWithinWarp, without involving LDS
 std::optional<LinearLayout> chooseMfmaLikeStoreLayout(RankedTensorType valType);
 
-// Convert the given layout to a linear layout with potential additional
-// physical memory paddings.
-PaddedLinearLayout toPaddedLinearLayout(ArrayRef<int64_t> shape,
-                                        Attribute layout);
-
 } // namespace mlir::triton::gpu
 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
index ff3340d9f79b..e708194349d4 100644
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -456,7 +456,9 @@ attributes too, for example,
     unsigned getRank() const { return getOrder().size(); }
     int32_t getAlignment() const { return 16; }
 
-    PaddedLinearLayout toPaddedLinearLayout(ArrayRef<int64_t> shape) const;
+    unsigned getMinInterval() const {
+      return *llvm::min_element(getIntervals());
+    }
 
     // Returns the total number of elements including padding given the input
     // tensor shape.
diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h
index cdae51375083..30735db0a308 100644
--- a/include/triton/Tools/LinearLayout.h
+++ b/include/triton/Tools/LinearLayout.h
@@ -840,36 +840,6 @@ class ColumnAction {
   std::string toString() const;
 };
 
-// A utility class to describe a particular padding schema with 1) a linear
-// layout to desribe the N-D logical element mapping and 2) a list of
-// interval-padding pairs to describe the 1-D physical shared memory padding
-// schedules. In a degenerated cases, we can have no interval-padding pairs so
-// it will just be a normal linear layout.
-//
-// In Triton we use linear layout basically throughout; so this class is useful
-// to leverage common linear layout facilities and code paths as much as
-// possible, while factoring in shared memory padding wherever necessary--only
-// at the very final steps when we allocating the physical shared memory or
-// creating pointers indexing into them. All steps before can still reason with
-// linear layout. Thus this utility class keeps these two parts separate.
-class PaddedLinearLayout {
-public:
-  PaddedLinearLayout(LinearLayout linear, ArrayRef<unsigned> intervals,
-                     ArrayRef<unsigned> paddings);
-
-  const LinearLayout &getLinear() const { return linear; }
-
-  // Returns the minimal interval that would trigger padding.
-  std::optional<int32_t> getMinInterval() const;
-
-  // Returns true if this is not a degenerated case and indeed requires padding.
-  bool hasPadding() const;
-
-private:
-  LinearLayout linear;
-  SmallVector<std::pair<unsigned, unsigned>> intervalPads;
-};
-
 } // namespace mlir::triton
 
 #endif // TRITON_TOOLS_LINEARLAYOUT_H
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
index 451080607efb..170fcdddec86 100644
--- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -7,6 +7,7 @@
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
+#include "triton/Tools/LayoutUtils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/MathExtras.h"
 
@@ -509,10 +510,17 @@ bool emitTransferBetweenRegistersAndShared(
   StringAttr kWarp = str_attr("warp");
 
   auto shape = sharedTy.getShape();
-  PaddedLinearLayout paddedLayout =
-      triton::gpu::toPaddedLinearLayout(shape, sharedTy.getEncoding());
-  LinearLayout regToSharedLayout =
-      regLayout.invertAndCompose(paddedLayout.getLinear());
+  auto paddedLayout =
+      dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedTy.getEncoding());
+  auto sharedLL = LinearLayout::empty();
+  if (paddedLayout) {
+    SmallVector<unsigned> dims(shape);
+    sharedLL = identityStandardND(Builder(ctx).getStringAttr("offset"), dims,
+                                  paddedLayout.getOrder());
+  } else {
+    sharedLL = triton::gpu::toLinearLayout(shape, sharedTy.getEncoding());
+  }
+  LinearLayout regToSharedLayout = regLayout.invertAndCompose(sharedLL);
 
   // TODO(jlebar): We don't currently support loading from shared memory in a
   // different CTA.  We'd need to emit `mapa.shared::cluster` instructions.
@@ -537,10 +545,12 @@ bool emitTransferBetweenRegistersAndShared(
   //
   // It's OK if the vector width we choose here is wider than the hardware
   // supports; LLVM will legalize it.
-  const int vecElems = std::min(
-      {regToSharedLayout.getNumConsecutiveInOut(),
-       paddedLayout.getMinInterval().value_or(std::numeric_limits<int>::max()),
-       maxVecElems.value_or(std::numeric_limits<int>::max())});
+  int vecElems =
+      std::min({regToSharedLayout.getNumConsecutiveInOut(),
+                maxVecElems.value_or(std::numeric_limits<int>::max())});
+  if (paddedLayout) {
+    vecElems = std::min(vecElems, int(paddedLayout.getMinInterval()));
+  }
 
   auto withCTAOffset = triton::gpu::getNumCTAs(sharedTy.getEncoding()) > 1;
   Value blockId =
@@ -555,7 +565,7 @@ bool emitTransferBetweenRegistersAndShared(
   // Thus we use `pseudoinvert` instead of `invert` here for simplicity.
   auto allocShape = sharedTy.getAllocShape();
   auto invertAllocSharedLayout = LinearLayout::empty();
-  if (!paddedLayout.hasPadding()) {
+  if (!paddedLayout) {
     // For now this is only needed for the cases where we have swizzling.
     invertAllocSharedLayout =
         triton::gpu::toLinearLayout(allocShape.take_back(sharedTy.getRank()),
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
index d64b49790ea0..fd023bd9da7d 100644
--- a/lib/Dialect/TritonGPU/IR/Dialect.cpp
+++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -1723,15 +1723,6 @@ PaddedSharedEncodingAttr::get(MLIRContext *context, ArrayRef<int64_t> shape,
   return get(context, {{innerD, threadNumBytes}}, order, ctaLayout);
 }
 
-PaddedLinearLayout
-PaddedSharedEncodingAttr::toPaddedLinearLayout(ArrayRef<int64_t> shape) const {
-  auto nonSwizzleAttr = SwizzledSharedEncodingAttr::get(
-      getContext(), /*vec=*/1, /*perPhase=*/1, /*maxPhase=*/1, getOrder(),
-      getCTALayout());
-  LinearLayout ll = toLinearLayout(shape, nonSwizzleAttr);
-  return PaddedLinearLayout(ll, getIntervals(), getPaddings());
-}
-
 int64_t PaddedSharedEncodingAttr::getPaddedSize(ArrayRef<int64_t> shape) const {
   int64_t unpaddedSize = product(shape);
   int64_t paddingSize = 0;
diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
index 38d0a11effbb..f06526a1a63b 100644
--- a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
+++ b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
@@ -1120,14 +1120,6 @@ LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout) {
                                                                    layout);
 }
 
-PaddedLinearLayout toPaddedLinearLayout(ArrayRef<int64_t> shape,
-                                        Attribute layout) {
-  if (auto paddedLayout = dyn_cast<PaddedSharedEncodingAttr>(layout))
-    return paddedLayout.toPaddedLinearLayout(shape);
-  auto ll = toLinearLayout(shape, layout);
-  return PaddedLinearLayout(ll, /*intervals=*/{}, /*paddings=*/{});
-}
-
 LinearLayout getLayoutWithinBlock(const LinearLayout &layout) {
   assert(!layout.getInDimNames().empty());
   MLIRContext *ctx = layout.getInDimNames().begin()->getContext();
diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp
index 35ae36cfa98e..d65859e2a1ad 100644
--- a/lib/Tools/LinearLayout.cpp
+++ b/lib/Tools/LinearLayout.cpp
@@ -1337,21 +1337,4 @@ std::string ColumnAction::toString() const {
   return ret;
 }
 
-PaddedLinearLayout::PaddedLinearLayout(LinearLayout linear,
-                                       ArrayRef<unsigned> intervals,
-                                       ArrayRef<unsigned> paddings)
-    : linear(std::move(linear)) {
-  intervalPads.reserve(intervals.size());
-  for (auto [i, p] : llvm::zip_equal(intervals, paddings))
-    intervalPads.emplace_back(i, p);
-}
-
-std::optional<int32_t> PaddedLinearLayout::getMinInterval() const {
-  if (intervalPads.empty())
-    return std::nullopt;
-  return *llvm::min_element(llvm::make_first_range(intervalPads));
-}
-
-bool PaddedLinearLayout::hasPadding() const { return !intervalPads.empty(); }
-
 } // namespace mlir::triton
diff --git a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
index a369feb665f2..3e40a3970b47 100644
--- a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
+++ b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
@@ -3012,21 +3012,6 @@ TEST_F(LinearLayoutConversionsTest, MMAv5Fp4Padded) {
                     {S("dim0"), S("dim1")}));
 }
 
-TEST_F(LinearLayoutConversionsTest, PaddedShared) {
-  PaddedLinearLayout pll =
-      toPaddedLinearLayout({32, 64}, paddedShared({128, 256}, {4, 8}, {1, 0},
-                                                  {1, 1}, {1, 1}, {1, 0}));
-  // The expected linear layout mapping part should just be an identity.
-  auto expectedLL = LinearLayout(
-      // clang-format off
-      {{S("offset"), {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {0, 32},
-                     {1, 0}, {2, 0}, {4, 0}, {8, 0}, {16, 0}}},
-       {S("block"), {}}},
-      {S("dim0"), S("dim1")});
-  // clang-format on
-  EXPECT_EQ(pll.getLinear(), expectedLL);
-}
-
 } // anonymous namespace
 } // namespace mlir::triton::gpu
 

From e176ed37ea8259e7cf1c971c986d27a5197d6775 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Thu, 19 Jun 2025 22:08:46 +0000
Subject: [PATCH 21/23] Use reshapeOuts

---
 lib/Conversion/TritonGPUToLLVM/Utility.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
index 170fcdddec86..aa495599081e 100644
--- a/lib/Conversion/TritonGPUToLLVM/Utility.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -7,7 +7,7 @@
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
-#include "triton/Tools/LayoutUtils.h"
+#include "triton/Tools/LinearLayout.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/MathExtras.h"
 
@@ -508,19 +508,19 @@ bool emitTransferBetweenRegistersAndShared(
   StringAttr kRegister = str_attr("register");
   StringAttr kLane = str_attr("lane");
   StringAttr kWarp = str_attr("warp");
+  StringAttr kOffset = str_attr("offset");
 
   auto shape = sharedTy.getShape();
   auto paddedLayout =
       dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedTy.getEncoding());
-  auto sharedLL = LinearLayout::empty();
+  LinearLayout regToSharedLayout = LinearLayout::empty();
   if (paddedLayout) {
-    SmallVector<unsigned> dims(shape);
-    sharedLL = identityStandardND(Builder(ctx).getStringAttr("offset"), dims,
-                                  paddedLayout.getOrder());
+    regToSharedLayout =
+        regLayout.reshapeOuts({{kOffset, regLayout.getTotalOutDimSize()}});
   } else {
-    sharedLL = triton::gpu::toLinearLayout(shape, sharedTy.getEncoding());
+    auto sharedLL = triton::gpu::toLinearLayout(shape, sharedTy.getEncoding());
+    regToSharedLayout = regLayout.invertAndCompose(sharedLL);
   }
-  LinearLayout regToSharedLayout = regLayout.invertAndCompose(sharedLL);
 
   // TODO(jlebar): We don't currently support loading from shared memory in a
   // different CTA.  We'd need to emit `mapa.shared::cluster` instructions.

From 25221f410334b726fb251549af70a293363c99e5 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Fri, 20 Jun 2025 15:10:36 +0000
Subject: [PATCH 22/23] Drop a builder for now

---
 .../triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td   |  6 ------
 lib/Dialect/TritonGPU/IR/Dialect.cpp                   | 10 ----------
 2 files changed, 16 deletions(-)

diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
index e708194349d4..a8bdf8a54f6d 100644
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -444,12 +444,6 @@ attributes too, for example,
   let builders = [
       AttrBuilder<(ins "ArrayRef<std::pair<unsigned, unsigned>>":$intervalPads,
                        "ArrayRef<unsigned>":$order, "CTALayoutAttr":$ctaLayout)>,
-      // Defines a padded shared layout that avoids bank conflicts for a block
-      // with given |shape| and |order| that is loaded by dot operand with
-      // kWidth |dotKWidth|.
-      AttrBuilder<(ins "ArrayRef<int64_t>":$shape, "ArrayRef<unsigned>":$order,
-                       "unsigned":$dotKWidth, "unsigned":$elemBitWidth,
-                       "CTALayoutAttr":$ctaLayout)>,
   ];
 
   let extraClassDeclaration = extraBaseClassDeclaration # [{
diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp
index fe99d70db412..6719783b6581 100644
--- a/lib/Dialect/TritonGPU/IR/Dialect.cpp
+++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp
@@ -1730,16 +1730,6 @@ PaddedSharedEncodingAttr PaddedSharedEncodingAttr::get(
   return get(context, intervals, paddings, order, ctaLayout);
 }
 
-PaddedSharedEncodingAttr
-PaddedSharedEncodingAttr::get(MLIRContext *context, ArrayRef<int64_t> shape,
-                              ArrayRef<unsigned> order, unsigned dotKWidth,
-                              unsigned elemBitWidth, CTALayoutAttr ctaLayout) {
-  unsigned innerD = getShapePerCTA(ctaLayout.getCTASplitNum(), shape)[order[0]];
-  unsigned threadNumBytes = std::max(dotKWidth * elemBitWidth / 8u, 1u);
-  threadNumBytes = llvm::alignTo(threadNumBytes, 4); // Assume 32-bit per bank
-  return get(context, {{innerD, threadNumBytes}}, order, ctaLayout);
-}
-
 int64_t PaddedSharedEncodingAttr::getPaddedSize(ArrayRef<int64_t> shape) const {
   int64_t unpaddedSize = product(shape);
   int64_t paddingSize = 0;

From c068a7fd3b944e082985b66c5b5bcf61d2654b27 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Fri, 20 Jun 2025 15:17:01 +0000
Subject: [PATCH 23/23] Drop unused code

---
 .../Dialect/TritonGPU/LinearLayoutConversionsTest.cpp  | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
index 3e40a3970b47..dbab14c1091e 100644
--- a/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
+++ b/unittest/Dialect/TritonGPU/LinearLayoutConversionsTest.cpp
@@ -3,7 +3,6 @@
 #include "mlir/IR/MLIRContext.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
-#include "triton/Tools/LinearLayout.h"
 #include "triton/Tools/StrUtil.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/Signals.h"
@@ -100,15 +99,6 @@ class LinearLayoutConversionsTest : public ::testing::Test {
         CTALayoutAttr::get(&ctx, cpg, cSplit, cOrd));
   }
 
-  PaddedSharedEncodingAttr
-  paddedShared(ArrayRef<unsigned> intervals, ArrayRef<unsigned> paddings,
-               ArrayRef<unsigned> ord, ArrayRef<unsigned> cpg,
-               ArrayRef<unsigned> cSplit, ArrayRef<unsigned> cOrd) {
-    return PaddedSharedEncodingAttr::get(
-        &ctx, intervals, paddings, ord,
-        CTALayoutAttr::get(&ctx, cpg, cSplit, cOrd));
-  }
-
   NVMMASharedEncodingAttr
   nvmmaShared(unsigned swizzleSizeInBytes, bool transposed,
               unsigned elementBitWidth, ArrayRef<unsigned> cpg,