From 92ad7b5960615d0aec5279159972e5b3b1f3d367 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Sat, 26 Oct 2024 22:53:55 +0000
Subject: [PATCH 1/3] [AMD] NFC: Restructure ReorderInstructions pass

This commit restructures the ReorderInstructions pass to be
more modular and cleaner: the logic is split into utility
functions, and rule applications are guarded by their
intended usage rather than always being globally on.
---
 .../ReorderInstructions.cpp                   | 374 ++++++++++--------
 1 file changed, 205 insertions(+), 169 deletions(-)
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
index e122f15fd901..7bf357b10a93 100644
--- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
@@ -7,14 +7,14 @@
 #include "mlir/Pass/PassManager.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
-#include <deque>
-
-#define GEN_PASS_CLASSES
-#include "TritonAMDGPUTransforms/Passes.h"
+#include "llvm/ADT/STLExtras.h"
 
 using namespace mlir;
 namespace ttg = mlir::triton::gpu;
-namespace tt = mlir::triton;
+
+//===----------------------------------------------------------------------===//
+// Utility functions
+//===----------------------------------------------------------------------===//
 
 static bool isLocalLoadOrDotLayoutConversion(Operation *op) {
   if (isa<ttg::LocalLoadOp>(op))
@@ -61,194 +61,230 @@ findEarlyInsertionPoint(Block *block, Operation *move) {
   return ipnt;
 }
 
+// Return the first user of the given op within the op's own block. A user in a
+// nested block is represented by its ancestor op in the op's block. Return
+// nullptr if there is no such user.
+static Operation *getFirstUseInSameBlock(Operation *op) {
+  SmallVector<Operation *> usersInSameBlock;
+  for (auto user : op->getUsers()) {
+    if (Operation *ancestor = op->getBlock()->findAncestorOpInBlock(*user))
+      usersInSameBlock.push_back(ancestor);
+  }
+  auto minOpIt =
+      llvm::min_element(usersInSameBlock, [](Operation *a, Operation *b) {
+        return a->isBeforeInBlock(b);
+      });
+  return minOpIt != usersInSameBlock.end() ? *minOpIt : nullptr;
+}
+
 // Check if the operation opInsideLoop is inside any scf::ForOp and
 // opOutsideLoop is not inside the same loop.
-bool isCrossLoopBoundary(mlir::Operation *opInsideLoop,
-                         mlir::Operation *opOutsideLoop) {
+static bool isCrossLoopBoundary(mlir::Operation *opInsideLoop,
+                                mlir::Operation *opOutsideLoop) {
   scf::ForOp parentForOp = opInsideLoop->getParentOfType<scf::ForOp>();
   return parentForOp && !parentForOp->isAncestor(opOutsideLoop);
 }
 
-class TritonAMDGPUReorderInstructionsPass
-    : public TritonAMDGPUReorderInstructionsBase<
-          TritonAMDGPUReorderInstructionsPass> {
-public:
-  TritonAMDGPUReorderInstructionsPass() = default;
-
-  Operation *getFirstUse(Operation *op) {
-    std::vector<Operation *> users;
-    for (auto user : op->getUsers()) {
-      if (Operation *ancestor = op->getBlock()->findAncestorOpInBlock(*user))
-        users.push_back(ancestor);
-    }
-    auto minOpIt = std::min_element(users.begin(), users.end(),
-                                    [](mlir::Operation *a, mlir::Operation *b) {
-                                      return a->isBeforeInBlock(b);
-                                    });
-    return minOpIt != users.end() ? *minOpIt : nullptr;
-  }
+//===----------------------------------------------------------------------===//
+// Reorder mechanisms
+//===----------------------------------------------------------------------===//
 
-  void runOnOperation() override {
-    ModuleOp m = getOperation();
+// Sink shared memory loads and layout conversions into loops to decrease
+// register pressure when possible.
+static void sinkLoadConversionIntoLoops(ModuleOp moduleOp) {
+  DenseMap<Operation *, Operation *> opToMove;
+  moduleOp.walk([&](Operation *op) {
+    if (!isLocalLoadOrDotLayoutConversion(op))
+      return;
+    if (!op->hasOneUse())
+      return;
+    Operation *user = *op->getUsers().begin();
+    if (user->getParentOfType<scf::ForOp>() ==
+        op->getParentOfType<scf::ForOp>())
+      return;
+    opToMove[op] = user;
+  });
 
-    // Sink shared memory loads and layout conversions into loops to decrease
-    // register pressure when possible.
-    DenseMap<Operation *, Operation *> opToMove;
-    m.walk([&](Operation *op) {
-      if (!isLocalLoadOrDotLayoutConversion(op))
-        return;
-      if (!op->hasOneUse())
-        return;
-      Operation *user = *op->getUsers().begin();
-      if (user->getParentOfType<scf::ForOp>() ==
-          op->getParentOfType<scf::ForOp>())
+  for (auto &kv : opToMove)
+    kv.first->moveBefore(kv.second);
+}
+
+// Adjust the placement of shared memory writes and reads to immediately follow
+// the definition of their operands in case where shared memory write is in the
+// loop but its operand is not. This is a heuristic for optimizing fused
+// attention by hoisting Q tensor shared memory read/write operations outside of
+// the loop, as Q is a loop invariant and can be loaded once before entering the
+// loop.
+//
+// There are two possible patterns for this adjustment depending on whether the
+// write to shared memory is performed using an optional `local_alloc` argument
+// or a `local_store` instruction.
+//
+// 1) %1 = some_op ... (typically a load or an operation that scales the tensor
+//                      after loading)
+//    %2 = local_alloc %1
+//    %3 = local_load %2
+//
+// 2) %1 = some_op ...
+//    %2 = local_alloc
+//    %3 = local_store %1, %2
+//    %4 = local_load %2
+static void hoistLocalLoad(ModuleOp moduleOp) {
+  moduleOp.walk([&](ttg::LocalLoadOp localLoad) {
+    auto localAlloc = localLoad.getSrc().getDefiningOp<ttg::LocalAllocOp>();
+    if (!localAlloc)
+      return;
+
+    // Case when localAlloc has operands
+    if (localAlloc->getNumOperands() == 1) {
+      if (!localAlloc->hasOneUse())
         return;
-      opToMove.insert({op, user});
-    });
-    for (auto &kv : opToMove)
-      kv.first->moveBefore(kv.second);
-    opToMove.clear();
-
-    // Adjust the placement of LDS writes and reads to immediately follow the
-    // definition of their operands in case where LDS write is in the
-    // loop but it's operand is not. This is a heuristic for optimizing fused
-    // attention by hoisting Q tensor LDS read/write operations outside of the
-    // loop, as Q is a loop invariant and can be loaded once before entering the
-    // loop.
-    // There are two possible patterns for this adjustment depending on
-    // whether the write to LDS is performed using an optional `local_alloc`
-    // argument or a `local_store` instruction.
-    //
-    // clang-format off
-    //
-    // 1) %1 = some_op ... (typically a load or an operation that scales the tensor after loading)
-    //    %2 = local_alloc %1
-    //    %3 = local_load %2
-    //
-    // 2) %1 = some_op ...
-    //    %2 = local_alloc
-    //    %3 = local_store %1, %2
-    //    %4 = local_load %2
-    //
-    // clang-format on
-    m.walk([&](ttg::LocalLoadOp localLoad) {
-      auto localAlloc = localLoad.getSrc().getDefiningOp<ttg::LocalAllocOp>();
-      if (!localAlloc)
+
+      auto srcTensorOp = localAlloc.getSrc().getDefiningOp();
+      // Check if localAlloc is in the loop but its src tensor defining op is
+      // outside of it.
+      if (!srcTensorOp || !isCrossLoopBoundary(localAlloc, srcTensorOp))
         return;
 
-      // Case when localAlloc has operands
-      if (localAlloc->getNumOperands() == 1) {
-        if (!localAlloc->hasOneUse())
-          return;
+      localAlloc->moveAfter(srcTensorOp);
+      localLoad->moveAfter(localAlloc);
+      return;
+    }
 
-        auto srcTensorOp = localAlloc->getOperand(0).getDefiningOp();
-        // Check if localAlloc is in the loop but it's src tensor defining op is
-        // outside of it.
-        if (!srcTensorOp || !isCrossLoopBoundary(localAlloc, srcTensorOp)) {
-          return;
-        }
+    // Case when localAlloc has no operands
+    assert(localAlloc->getNumOperands() < 1);
+    auto allocVal = localAlloc->getResult(0);
 
-        localAlloc->moveAfter(srcTensorOp);
-        localLoad->moveAfter(localAlloc);
-        return;
-      }
+    // Check if the localAlloc has exactly two uses (localStore and localLoad)
+    int numUses = std::distance(allocVal.use_begin(), allocVal.use_end());
+    if (numUses != 2)
+      return;
 
-      // Case when localAlloc has no operands
-      assert(localAlloc->getNumOperands() < 1);
-      auto allocVal = localAlloc->getResult(0);
+    // localStore comes before localLoad in block.
+    Operation *localStore = getFirstUseInSameBlock(localAlloc);
+    if (!isa<ttg::LocalStoreOp>(localStore))
+      return;
 
-      // Check if the localAlloc has exactly two uses (localStore and localLoad)
-      int numUses = std::distance(allocVal.use_begin(), allocVal.use_end());
-      if (numUses != 2)
-        return;
+    auto srcTensorOp = localStore->getOperand(0).getDefiningOp();
+    // Check if localStore is in the loop but its src tensor defining op is
+    // outside of it.
+    if (!srcTensorOp || !isCrossLoopBoundary(localStore, srcTensorOp)) {
+      return;
+    }
 
-      // localStore comes before localLoad in block.
-      Operation *localStore = getFirstUse(localAlloc);
-      if (!isa<ttg::LocalStoreOp>(localStore))
-        return;
+    localAlloc->moveAfter(srcTensorOp);
+    localStore->moveAfter(localAlloc);
+    localLoad->moveAfter(localStore);
+  });
+}
 
-      auto srcTensorOp = localStore->getOperand(0).getDefiningOp();
-      // Check if localStore is in the loop but it's src tensor defining op is
-      // outside of it.
-      if (!srcTensorOp || !isCrossLoopBoundary(localStore, srcTensorOp)) {
-        return;
-      }
+// Sink conversion after the last dealloc but before the first use in its block.
+// This helps to avoid unnecessary shared memory allocation.
+static void moveDownCoversion(ModuleOp moduleOp) {
+  SmallVector<ttg::ConvertLayoutOp> convertOps;
+  moduleOp.walk([&](ttg::ConvertLayoutOp op) { convertOps.push_back(op); });
 
-      localAlloc->moveAfter(srcTensorOp);
-      localStore->moveAfter(localAlloc);
-      localLoad->moveAfter(localStore);
-    });
+  for (auto op : convertOps) {
+    Operation *user = getFirstUseInSameBlock(op);
+    for (auto it = Block::iterator(op), ie = op->getBlock()->end();
+         it != ie && &*it != user; ++it)
+      if (isa<ttg::LocalDeallocOp>(&*it))
+        op->moveAfter(&*it);
+  }
+}
 
-    // Sink conversion after the last dealloc but before the first use ancestor
-    // in its block. This helps to avoid unnecessary shared memory allocation.
-    m.walk([&](triton::gpu::ConvertLayoutOp op) {
-      auto curr = mlir::Block::iterator(op);
-      for (; &*curr != getFirstUse(op); curr++)
-        if (isa<triton::gpu::LocalDeallocOp>(&*curr))
-          op->moveAfter(&*curr);
-    });
+// Move transpositions just after their definition.
+static void moveUpTranspose(ModuleOp moduleOp) {
+  SmallVector<triton::TransOp> transOps;
+  moduleOp.walk([&](triton::TransOp op) { transOps.push_back(op); });
 
-    // Move transpositions just after their definition.
-    m.walk([&](triton::TransOp op) {
-      if (Operation *argOp = op.getSrc().getDefiningOp())
-        op->moveAfter(argOp);
-    });
+  for (auto op : transOps)
+    if (Operation *argOp = op.getSrc().getDefiningOp())
+      op->moveAfter(argOp);
+}
 
-    SmallVector<Operation *> moveOps;
-    // Move global loads early to prefetch. This may increase register pressure
-    // but it enables issuing global loads early.
-    m.walk([&](triton::LoadOp op) { moveOps.push_back(op); });
-    // Move local_stores early if dependence distance greater than
-    // one iteration.
-    // Best perf on GEMM when these precede global loads.
-    m.walk([&](ttg::LocalStoreOp op) { moveOps.push_back(op); });
-
-    for (auto op : llvm::reverse(moveOps)) {
-      // Gather use-def chain in block.
-      Block *block = op->getBlock();
-      bool leadsToLoad = false;
-      SetVector<Operation *> backwardSet;
-
-      BackwardSliceOptions options;
-      options.omitBlockArguments = true;
-      options.inclusive = false;
-      options.filter = [&](Operation *defOp) -> bool {
-        Block *defBlock = defOp->getBlock();
-        if (!block->findAncestorOpInBlock(*defOp))
-          return false;
-        // Check for a `load` dependent path.
-        leadsToLoad |= isa<triton::LoadOp>(defOp);
-        // Only move ops residing in the same block.
-        return defBlock == block;
-      };
-      mlir::getBackwardSlice(op, &backwardSet, options);
-      backwardSet.insert(op);
-
-      // Don't move a local_store if its source is a load from
-      // the same iteration.
-      if (isa<ttg::LocalStoreOp>(op) && leadsToLoad)
-        continue;
-
-      auto ipoint = findEarlyInsertionPoint(block, op);
-      // Remove ops that already precede the insertion point. This is done
-      // before moves happen to avoid `Operation::isBeforeInBlock` N^2
-      // complexity.
-
-      SmallVector<Operation *> dfg = backwardSet.takeVector();
-      if (ipoint != block->end()) {
-        // Move ops to insertion point.
-        llvm::erase_if(
-            dfg, [&](Operation *op) { return !ipoint->isBeforeInBlock(op); });
-        for (auto *dfgop : llvm::reverse(dfg))
-          dfgop->moveAfter(block, ipoint);
-      } else {
-        // Move ops to block begin.
-        for (auto *dfgop : llvm::reverse(dfg))
-          dfgop->moveBefore(block, block->begin());
-      }
+// Schedule global load and local store ops for better GEMM performance.
+static void scheduleGlobalLoadLocalStore(ModuleOp m) {
+  SmallVector<Operation *> moveOps;
+  // Move global loads early to prefetch. This may increase register pressure
+  // but it enables issuing global loads early.
+  m.walk([&](triton::LoadOp op) { moveOps.push_back(op); });
+  // Move local_stores early if dependence distance greater than one iteration.
+  // Best perf on GEMM when these precede global loads.
+  m.walk([&](ttg::LocalStoreOp op) { moveOps.push_back(op); });
+
+  for (auto op : llvm::reverse(moveOps)) {
+    // Gather use-def chain in block.
+    Block *block = op->getBlock();
+    bool leadsToLoad = false;
+    SetVector<Operation *> backwardSet;
+
+    BackwardSliceOptions options;
+    options.omitBlockArguments = true;
+    options.inclusive = false;
+    options.filter = [&](Operation *defOp) -> bool {
+      Block *defBlock = defOp->getBlock();
+      if (!block->findAncestorOpInBlock(*defOp))
+        return false;
+      // Check for a `load` dependent path.
+      leadsToLoad |= isa<triton::LoadOp>(defOp);
+      // Only move ops residing in the same block.
+      return defBlock == block;
+    };
+    mlir::getBackwardSlice(op, &backwardSet, options);
+    backwardSet.insert(op);
+
+    // Don't move a local_store if its source is a load from
+    // the same iteration.
+    if (isa<ttg::LocalStoreOp>(op) && leadsToLoad)
+      continue;
+
+    auto ipoint = findEarlyInsertionPoint(block, op);
+    // Remove ops that already precede the insertion point. This is done
+    // before moves happen to avoid `Operation::isBeforeInBlock` N^2
+    // complexity.
+
+    SmallVector<Operation *> dfg = backwardSet.takeVector();
+    if (ipoint != block->end()) {
+      // Move ops to insertion point.
+      llvm::erase_if(
+          dfg, [&](Operation *op) { return !ipoint->isBeforeInBlock(op); });
+      for (auto *dfgop : llvm::reverse(dfg))
+        dfgop->moveAfter(block, ipoint);
+    } else {
+      // Move ops to block begin.
+      for (auto *dfgop : llvm::reverse(dfg))
+        dfgop->moveBefore(block, block->begin());
     }
   }
+}
+
+//===----------------------------------------------------------------------===//
+// Pass definition
+//===----------------------------------------------------------------------===//
+
+#define GEN_PASS_CLASSES
+#include "TritonAMDGPUTransforms/Passes.h"
+
+namespace {
+struct TritonAMDGPUReorderInstructionsPass
+    : public TritonAMDGPUReorderInstructionsBase<
+          TritonAMDGPUReorderInstructionsPass> {
+  void runOnOperation() override {
+    ModuleOp m = getOperation();
+
+    sinkLoadConversionIntoLoops(m);
+
+    hoistLocalLoad(m);
+
+    moveDownCoversion(m);
+
+    moveUpTranspose(m);
+
+    scheduleGlobalLoadLocalStore(m);
+  }
 };
+} // namespace
 
 std::unique_ptr<Pass> mlir::createTritonAMDGPUReorderInstructionsPass() {
   return std::make_unique<TritonAMDGPUReorderInstructionsPass>();

From 3255955ac1d5172da8fbf56da76c66e93267c159 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Sat, 26 Oct 2024 23:00:54 +0000
Subject: [PATCH 2/3] Drop local_load sink logic given it's reverted in a later
 step

---
 .../ReorderInstructions.cpp                   | 22 ++++++-------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
index 7bf357b10a93..8806356d9c8f 100644
--- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
@@ -16,14 +16,6 @@ namespace ttg = mlir::triton::gpu;
 // Utility functions
 //===----------------------------------------------------------------------===//
 
-static bool isLocalLoadOrDotLayoutConversion(Operation *op) {
-  if (isa<ttg::LocalLoadOp>(op))
-    return true;
-  if (auto cvt = dyn_cast<ttg::ConvertLayoutOp>(op))
-    return isa<ttg::DotOperandEncodingAttr>(cvt.getType().getEncoding());
-  return false;
-}
-
 // Search through block to find earliest insertion point for move op. This can
 // be either an atomic op or last usage of source pointer. Search ends when move
 // op is encountered.
@@ -89,12 +81,13 @@ static bool isCrossLoopBoundary(mlir::Operation *opInsideLoop,
 // Reorder mechanisms
 //===----------------------------------------------------------------------===//
 
-// Sink shared memory loads and layout conversions into loops to decrease
-// register pressure when possible.
-static void sinkLoadConversionIntoLoops(ModuleOp moduleOp) {
+// Sink dot layout conversions into loops to decrease register pressure when
+// possible.
+static void sinkDotConversion(ModuleOp moduleOp) {
   DenseMap<Operation *, Operation *> opToMove;
-  moduleOp.walk([&](Operation *op) {
-    if (!isLocalLoadOrDotLayoutConversion(op))
+  moduleOp.walk([&](ttg::ConvertLayoutOp op) {
+    Attribute encoding = op.getType().getEncoding();
+    if (!isa_and_nonnull<ttg::DotOperandEncodingAttr>(encoding))
       return;
     if (!op->hasOneUse())
       return;
@@ -273,10 +266,9 @@ struct TritonAMDGPUReorderInstructionsPass
   void runOnOperation() override {
     ModuleOp m = getOperation();
 
-    sinkLoadConversionIntoLoops(m);
-
     hoistLocalLoad(m);
 
+    sinkDotConversion(m);
     moveDownCoversion(m);
 
     moveUpTranspose(m);

From c6a6a7f9b985f028d75e9a6345b87bc28ddc3570 Mon Sep 17 00:00:00 2001
From: Lei Zhang <antiagainst@gmail.com>
Date: Sat, 26 Oct 2024 23:16:21 +0000
Subject: [PATCH 3/3] Add guard for logic that is applicable to matmul

---
 .../ReorderInstructions.cpp                   | 26 +++++++++++++++----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
index 8806356d9c8f..22349c50e308 100644
--- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp
@@ -5,6 +5,7 @@
 #include "mlir/IR/Verifier.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
+#include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include "llvm/ADT/STLExtras.h"
@@ -16,6 +17,18 @@ namespace ttg = mlir::triton::gpu;
 // Utility functions
 //===----------------------------------------------------------------------===//
 
+// Return true if the given moduleOp contains a pure matmul problem; i.e.,
+// single dot in the main loop.
+static bool isPureMatmulProblem(ModuleOp moduleOp) {
+  for (auto forOp : moduleOp.getOps<scf::ForOp>()) {
+    int counter = 0;
+    forOp.walk([&counter](triton::DotOp dotOp) { ++counter; });
+    if (counter != 1)
+      return false;
+  }
+  return true;
+}
+
 // Search through block to find earliest insertion point for move op. This can
 // be either an atomic op or last usage of source pointer. Search ends when move
 // op is encountered.
@@ -104,10 +117,12 @@ static void sinkDotConversion(ModuleOp moduleOp) {
 
 // Adjust the placement of shared memory writes and reads to immediately follow
 // the definition of their operands in case where shared memory write is in the
-// loop but its operand is not. This is a heuristic for optimizing fused
-// attention by hoisting Q tensor shared memory read/write operations outside of
-// the loop, as Q is a loop invariant and can be loaded once before entering the
-// loop.
+// loop but its operand is not.
+//
+// This is a heuristic driven by optimizing fused attention by hoisting Q tensor
+// shared memory read/write operations outside of the loop, as Q is a loop
+// invariant and can be loaded once before entering the loop. But it should be
+// generally applicable.
 //
 // There are two possible patterns for this adjustment depending on whether the
 // write to shared memory is performed using an optional `local_alloc` argument
@@ -273,7 +288,8 @@ struct TritonAMDGPUReorderInstructionsPass
 
     moveUpTranspose(m);
 
-    scheduleGlobalLoadLocalStore(m);
+    if (isPureMatmulProblem(m))
+      scheduleGlobalLoadLocalStore(m);
   }
 };
 } // namespace