From d9f5b27a8b467f31bdf6ce7f2f985513c7868f70 Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Tue, 3 Dec 2024 14:16:27 +0000
Subject: [PATCH 1/5] [Layouts] Use least squares solution in invertAndCompose

In this PR, we remove the need fro a few hacks in `invertAndCompose`,
namely the need for `getInjectiveMat` which did not work in cases where
the input and the output had a different number of registers (same with
different number of blocks) and led to the implementation of hacks on
top of it like the gymnastics with `getFreeVariable`.

We now just compute the `invertAndCompose` as the matrix `X` which is
the solution to the system `AX = B`. We add enough asserts to check that
this system has at least one solution (i.e. A is surjective) and we make
explicit the heuristic we use to minimise data-movement (not consider
dimensions that are the same, and otherwise incentivise broadcasting via
choosing the solution of minimal norm).

In the future, this function would be better returning the compact form
of the system, where if a dimension is not present it's because the
conversion is uniform over that dimension, but for that we need to adapt
our lowering algorithms.
---
 .../TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp |  25 +-
 lib/Tools/LinearLayout.cpp                    | 261 ++++++++++--------
 test/Conversion/tritongpu_to_llvm.mlir        |   3 +-
 3 files changed, 155 insertions(+), 134 deletions(-)
diff --git a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
index 84d30097cb70..7e8f6b783609 100644
--- a/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/ConvertLayoutOpToLLVM.cpp
@@ -315,14 +315,10 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
       // TODO(Keren): implement warp shuffle instead of using the general
       // approach that uses shared memory
       return transferWithinBlock(op, srcLayout, dstLayout, adaptor, rewriter);
-    } else if (llvm::is_contained(dims, kRegister) ||
-               dstLayout.getInDimSize(kRegister) !=
-                   srcLayout.getInDimSize(kRegister)) {
+    } else if (llvm::is_contained(dims, kRegister)) {
       // Case 4. Transfer between values in the same thread, in which case we
       //         simply reorder the elements of adaptor.getSrc().
-      return transferWithinThread(
-          op, dstLayout.getFreeVariableMasks()[kRegister],
-          dstLayout.getInDimSize(kRegister), *conversion, adaptor, rewriter);
+      return transferWithinThread(op, *conversion, adaptor, rewriter);
     } else {
       // Cast 5. The two layouts are equivalent. We should probably remove
       // these in RemoveLayoutConversion.
@@ -332,8 +328,8 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
   }
 
   LogicalResult
-  transferWithinThread(ConvertLayoutOp op, int32_t regMasks, int32_t numRegs,
-                       const LinearLayout &conversion, OpAdaptor adaptor,
+  transferWithinThread(ConvertLayoutOp op, const LinearLayout &conversion,
+                       OpAdaptor adaptor,
                        ConversionPatternRewriter &rewriter) const {
     MLIRContext *ctx = op.getContext();
     auto loc = op.getLoc();
@@ -343,16 +339,9 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
     auto srcTy = op.getSrc().getType();
     auto dstTy = op.getType();
     auto inVals = unpackLLElements(loc, adaptor.getSrc(), rewriter);
-    SmallVector<Value> outVals(numRegs);
-    for (int i = 0; i < numRegs; i++) {
-      // Remove free masks from the register index
-      // For example, if idx = 0b00111, and masks = 0b00100, then we get
-      // 0b00011. It means that register 7 (0b111) has the same value as
-      // register 3 (0b011).
-      auto idx = i & (~regMasks);
-      auto srcIdx = conversion.hasInDim(kRegister)
-                        ? conversion.apply({{kRegister, idx}}).begin()->second
-                        : idx;
+    SmallVector<Value> outVals(conversion.getInDimSize(kRegister));
+    for (int i = 0; i < outVals.size(); i++) {
+      auto srcIdx = conversion.apply({{kRegister, i}}).begin()->second;
       outVals[i] = inVals[srcIdx];
     }
     Value result = packLLElements(loc, getTypeConverter(), outVals, rewriter,
diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp
index 3a81231ac863..689e7a50d75a 100644
--- a/lib/Tools/LinearLayout.cpp
+++ b/lib/Tools/LinearLayout.cpp
@@ -112,30 +112,6 @@ std::unique_ptr<uint64_t[]> getMatrix(const LinearLayout &layout) {
   return m;
 }
 
-// Get a matrix for `layout` with its codomain expanded so it's injective, i.e.
-// each input element maps to a unique output element.  We do this by finding
-// columns that are equal to 0 and adding a new row with a 1 in that column.
-std::tuple<std::unique_ptr<uint64_t[]>, int /*numRows*/, int /*numCols*/>
-getInjectiveMat(const LinearLayout &layout) {
-  int numRows = layout.getTotalOutDimSizeLog2();
-  int numCols = layout.getTotalInDimSizeLog2();
-  std::unique_ptr<uint64_t[]> mat = getMatrix(layout);
-
-  // Bits of mat or-reduced along the columns (so there's just one row).
-  uint64_t colBits = 0;
-  for (int r = 0; r < numRows; r++) {
-    colBits |= mat[r];
-  }
-  auto expanded = std::unique_ptr<uint64_t[]>(new uint64_t[numRows + numCols]);
-  std::memcpy(expanded.get(), mat.get(), numRows * sizeof(uint64_t));
-  for (int c = 0; c < numCols; c++) {
-    if ((colBits & (1 << c)) == 0) {
-      expanded[numRows++] = (1 << c);
-    }
-  }
-  return std::make_tuple(std::move(expanded), numRows, numCols);
-}
-
 // Compute the rank of the matrix formed by taking the bases for the given
 // outDim as columns.  In other words, finds the number of linearly-independent
 // bases for this output dimension.
@@ -804,118 +780,175 @@ LinearLayout LinearLayout::compose(const LinearLayout &outer) const {
                       compositionIsSurjective);
 }
 
-LinearLayout LinearLayout::invertAndCompose(const LinearLayout &outer) const {
-  assertDimsEqualIgnoringOrder(getOutDimNames(), outer.getOutDimNames());
-  for (StringAttr outDim : getOutDimNames()) {
-    assert(getOutDimSize(outDim) <= outer.getOutDimSize(outDim));
+namespace {
+std::unique_ptr<uint64_t[]> concatMatrices(const LinearLayout &A,
+                                           const LinearLayout &B) {
+  // In plain words, "convert_layout does not change the shape of a tensor"
+  assert(A.getTotalOutDimSizeLog2() == B.getTotalOutDimSizeLog2() &&
+         "Matrices must have the same number of output dimensions");
+  int numRows = A.getTotalOutDimSizeLog2();
+  int numCols = A.getTotalInDimSizeLog2() + B.getTotalInDimSizeLog2();
+  assert(numCols <= 64 && "LinearLayout too large");
+
+  auto concat = getMatrix(A);
+  auto Bmat = getMatrix(B);
+  auto numColsA = A.getTotalInDimSizeLog2();
+  for (int r = 0; r < numRows; r++) {
+    concat[r] |= Bmat[r] << numColsA;
   }
-  assert(outer.isSurjective());
+  return concat;
+}
 
-  // Make both `this` and `outer` injective.  We need to do this on the
-  // `outer` layout because we can't invert a non-injective function.  We
-  // choose to do so on the `this` layout as well.  The rest of the comment
-  // explains why we make that choice.
-  //
-  // Recall from the header that C = A.invertAndCompose(B) just means that
-  //  A(x) = B(C(x)).
-  //
-  // Sometimes we may have a choice of multiple values for a particular
-  // C(x). For example, if A(1) = B(0) = B(1) = 0, then C(1) can be either 0
-  // or 1.
-  //
-  // We want to choose C such that C(x) != 0 where possible.  For example,
-  // suppose we are transferring from registers to registers and we have the
-  // following layouts.
-  //
-  //   A(thread=1, block=0) = 1
-  //   A(thread=2, block=0) = 2
-  //   A(thread=0, block=1) = 0
-  //
-  //   B(thread=1, block=0) = 2
-  //   B(thread=2, block=0) = 1
-  //   B(thread=0, block=1) = 0
-  //
-  // Notice that A and B both have the same data in each of their two
-  // blocks. So if we want to transfer from A to B, we don't need to cross
-  // blocks, which is expensive.  We want A.invertAndCompose(B) to reflect
-  // that choice.
-  //
-  // Let A' be A with the last line changed to "=4", and similarly for B'.
-  // When transferring from A' to B', we can't cross blocks even if we wanted
-  // to, because the two blocks now have different data.  But also, any
-  // mapping of thread+block from A' to B' is also valid for mapping from A
-  // to B.
-  //
-  // Thus making A and B injective encodes our desire not to cross blocks,
-  // or more generally our desire that C(x) != 0 where possible.
-  auto [matThis, numRowsThis, numColsThis] = getInjectiveMat(*this);
-  auto [matOuter, numRowsOuter, numColsOuter] = getInjectiveMat(
-      outer.transposeOuts(llvm::to_vector(this->getOutDimNames())));
-
-  // Concatenate `matOuter` and `matThis` horizontally (i.e. `matThis`
-  // is to the right of `matOuter`).
-  int combinedNumRows = std::max(numRowsThis, numRowsOuter);
-  int combinedNumCols = numColsThis + numColsOuter;
-  assert(combinedNumCols <= 64 && "Can't handle huge layouts");
-
-  std::unique_ptr<uint64_t[]> m(new uint64_t[combinedNumRows]());
-  for (int r = 0; r < numRowsOuter; r++) {
-    m[r] = matOuter[r];
-  }
-  for (int r = 0; r < numRowsThis; r++) {
-    m[r] |= matThis[r] << numColsOuter;
-  }
-
-  // Perform Gaussian elimination on `m`.  Because `outer` was modified to
-  // be bijective, the first half of the matrix should be the identity
-  // matrix.  The remaining half are the bases for the combined
-  // transformation.
-  //
-  // `stride` is specified in number of 64-bit words per row, and we pack
-  // our matrix so that there's only one uint64_t per row.
-  f2reduce::inplace_rref_strided(m.get(), combinedNumRows, combinedNumCols,
+LinearLayout lstsq(const LinearLayout &A, const LinearLayout &B) {
+  // Solve the least square system AX = B for A = outer, B = *this
+  // and return the least square solution X of minimal norm
+  // A and B may not be surjective, but they have the same image
+  int numRows = A.getTotalOutDimSizeLog2();
+  int numColsA = A.getTotalInDimSizeLog2();
+  int numColsB = B.getTotalInDimSizeLog2();
+  int numCols = numColsA + numColsB;
+  std::unique_ptr<uint64_t[]> combinedMat = concatMatrices(A, B);
+  f2reduce::inplace_rref_strided(combinedMat.get(), numRows, numCols,
                                  /*stride=*/1);
 
-  // Check that the first half of the matrix is indeed the identity.
-  for (int r = 0; r < std::min(numRowsOuter, numColsOuter); r++) {
-    for (int c = 0; c < std::min(numColsOuter, numRowsOuter); c++) {
-      if (((m[r] >> c) & 1) != (r == c ? 1 : 0)) {
-        llvm::report_fatal_error("First half of the matrix was not the "
-                                 "identity, bug in invertAndCompose");
-      }
+  // Compute the pivot columns
+  // Since A and B have the same image, each row will either have a pivot
+  // or will be all zeros
+  SmallVector<int32_t> pivotCols;
+  for (int r = 0; r < numRows; r++) {
+    auto row = combinedMat[r];
+    if (row == 0) {
+      continue;
     }
+    int c = __builtin_ctzll(combinedMat[r]);
+    assert(c < numColsA && "Precondition broken. Im(A) != Im(B)");
+    assert(pivotCols.empty() ||
+           pivotCols.back() < c && "Pivot columns are not in increasing order");
+    pivotCols.push_back(c);
+  }
+
+  // Extract A^{-1}B and complete the mapping using zeros for the unnecessary
+  // fromDims
+  std::unique_ptr<uint64_t[]> retMat(new uint64_t[numColsA]());
+  int j = 0;
+  for (int r = 0; r < numColsA; r++) {
+    auto isPivot = j < pivotCols.size() && pivotCols[j] == r;
+    retMat[r] = isPivot ? combinedMat[j++] >> numColsA : 0;
   }
 
   // We need names for the in/out dim of the flattened layout we're going to
   // read off from `m`.  These could be anything, doesn't matter.
-  StringAttr inDim1D = *getInDimNames().begin();
-  StringAttr outDim1D = *getOutDimNames().begin();
+  StringAttr inDim1D = *A.getInDimNames().begin();
+  StringAttr outDim1D = *A.getOutDimNames().begin();
 
   // Read off the new bases.  These are for a flattened 1D -> 1D
-  // transformation from `this`'s in-dims to `outer`'s in-dims.
-  BasesT newBases;
-  auto &bs = newBases[inDim1D];
-  for (int c = 0; c < numColsThis; c++) {
+  LinearLayout::BasesT retBases;
+  auto &bs = retBases[inDim1D];
+  for (int c = 0; c < numColsB; c++) {
     int32_t basis = 0;
-    for (int r = 0; r < numRowsOuter; r++) {
-      basis |= (m[r] >> (numColsOuter + c) & 1) << r;
+    for (int r = 0; r < numColsA; r++) {
+      basis |= (retMat[r] >> c & 1) << r;
     }
     bs.push_back({basis});
   }
 
-  LinearLayout flatComposed(std::move(newBases),
-                            {{outDim1D, outer.getTotalInDimSize()}},
+  LinearLayout retFlattened(std::move(retBases),
+                            {{outDim1D, A.getTotalInDimSize()}},
                             /*requireSurjective=*/false);
 
   SmallVector<std::pair<StringAttr, int32_t>> retInDims;
   SmallVector<std::pair<StringAttr, int32_t>> retOutDims;
-  for (StringAttr dim : getInDimNames()) {
-    retInDims.push_back({dim, getInDimSize(dim)});
+  for (StringAttr dim : B.getInDimNames()) {
+    retInDims.push_back({dim, B.getInDimSize(dim)});
+  }
+  for (StringAttr dim : A.getInDimNames()) {
+    retOutDims.push_back({dim, A.getInDimSize(dim)});
+  }
+  return retFlattened.reshapeIns(retInDims).reshapeOuts(retOutDims);
+}
+
+} // namespace
+
+LinearLayout LinearLayout::invertAndCompose(const LinearLayout &outer) const {
+  // TODO(Lezcano) Make friend and perhaps rename to `convertFrom`
+  // For this, we need to implement our LLVM lowerings by inverting the "from"
+  // layout, and then iterating over the elements from the "to" layout and
+  // fetching the corresponding element from the "from" layout. This exercises
+  // the broadcasting that we incentivise via choosing the minimum norm solution
+  // in lstsq.
+
+  // The order of dims does not matter. We choose to transpose outer
+  auto outDims = llvm::to_vector(getOutDimNames());
+  assertDimsEqualIgnoringOrder(outDims, outer.getOutDimNames());
+  const auto &B = *this;
+  const auto A = outer.transposeOuts(outDims);
+
+  // We'll write A^{-1} to mean the inverse or the pseudo-inverse of A
+  // We are computing A^{-1}B so from must be surjective so that
+  // it has a left inverse.
+  assert(A.isSurjective());
+
+  // Broadcasting heuristic
+  // Imagine we have two layouts with `warps = [[0, 0],  [0, 0]]`
+  // (broadcasting) on both layouts. We could map any warp to any warp in the
+  // conversion. Now, we want to map them as the identity map, to mark that
+  // nothing needs to be done there (`lstsq` would map all the warps to the
+  // zero warp, minimum norm solution). The heuristic here is as follows:
+  // - If a dimension is the same for both layouts, we want to map it as the
+  // identity
+  //   Equivalently, we don't add it to the conversion
+  // - Otherwise, we just call lstsq (i.e. map all the equivalent elements
+  //   to the same input element) to take advantage of broadcasting in shared
+  //   memory and avoid saving repeated elements in shared memory
+  SmallVector<StringAttr> identityDims;
+  for (auto dim : A.getInDimNames()) {
+    if (B.hasInDim(dim) &&
+        A.sublayout(dim, outDims) == B.sublayout(dim, outDims)) {
+      identityDims.push_back(dim);
+    }
   }
-  for (StringAttr dim : outer.getInDimNames()) {
-    retOutDims.push_back({dim, outer.getInDimSize(dim)});
+  SmallVector<StringAttr> ANonIdentityInDims;
+  SmallVector<StringAttr> BNonIdentityInDims;
+  for (auto dim : A.getInDimNames()) {
+    if (!llvm::is_contained(identityDims, dim)) {
+      ANonIdentityInDims.push_back(dim);
+    }
+  }
+  for (auto dim : B.getInDimNames()) {
+    if (!llvm::is_contained(identityDims, dim)) {
+      BNonIdentityInDims.push_back(dim);
+    }
   }
-  return flatComposed.reshapeIns(retInDims).reshapeOuts(retOutDims);
+
+  auto AReduced = A.sublayout(ANonIdentityInDims, outDims);
+  auto BReduced = B.sublayout(BNonIdentityInDims, outDims);
+
+  // If one is empty, the other must be empty as well
+  assert((AReduced == LinearLayout::empty()) ==
+         (BReduced == LinearLayout::empty()));
+  bool isEmpty = AReduced == LinearLayout::empty();
+
+  auto ret = isEmpty ? LinearLayout::empty() : lstsq(AReduced, BReduced);
+
+  // TODO(Lezcano): We should return the reduced layout instead of re-adding the
+  // identity maps. With this, we'll be able to kill `minimalCvtLayout`
+
+  // Add the identity maps for the dimensions that are the same for both layouts
+  for (auto dim : identityDims) {
+    ret *= LinearLayout::identity1D(A.getInDimSize(dim), dim, dim);
+  }
+
+  // Reshape the result
+  SmallVector<std::pair<StringAttr, int32_t>> inDimsA;
+  SmallVector<std::pair<StringAttr, int32_t>> inDimsB;
+  for (auto dim : A.getInDimNames()) {
+    inDimsA.push_back({dim, A.getInDimSize(dim)});
+  }
+  for (auto dim : B.getInDimNames()) {
+    inDimsB.push_back({dim, B.getInDimSize(dim)});
+  }
+  ret = ret.reshapeIns(inDimsB).reshapeOuts(inDimsA);
+  return ret;
 }
 
 llvm::MapVector<StringAttr, int32_t>
diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir
index 61911356af65..a97ac476cbad 100644
--- a/test/Conversion/tritongpu_to_llvm.mlir
+++ b/test/Conversion/tritongpu_to_llvm.mlir
@@ -1698,8 +1698,7 @@ module attributes {"ttg.target" = "cuda:75", "ttg.num-ctas" = 1 : i32, "ttg.num-
   // CHECK-LABEL: convert_single_element
   // CHECK-NOT: llvm.store
   // CHECK-NOT: llvm.load
-  // CHECK: llvm.insertvalue
-  // CHECK: llvm.extractvalue
+  // CHECK: llvm.return
   tt.func public @convert_single_element() attributes {noinline = false} {
     %cst = arith.constant dense<1.000000e+03> : tensor<1xf32, #blocked1>
     %0 = ttg.convert_layout %cst : tensor<1xf32, #blocked1> -> tensor<1xf32, #blocked>

From 3b7d6906d44a93323bd25c897f4c30d7cfe8dbaf Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Tue, 3 Dec 2024 18:13:12 +0000
Subject: [PATCH 2/5] Add precondition

---
 lib/Tools/LinearLayout.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp
index 689e7a50d75a..a7bf77eaba85 100644
--- a/lib/Tools/LinearLayout.cpp
+++ b/lib/Tools/LinearLayout.cpp
@@ -882,6 +882,10 @@ LinearLayout LinearLayout::invertAndCompose(const LinearLayout &outer) const {
   assertDimsEqualIgnoringOrder(outDims, outer.getOutDimNames());
   const auto &B = *this;
   const auto A = outer.transposeOuts(outDims);
+  for (auto dim : outDims) {
+    assert(A.getOutDimSize(dim) == B.getOutDimSize(dim) &&
+           "Convert layout does not change the shape of a tensor");
+  }
 
   // We'll write A^{-1} to mean the inverse or the pseudo-inverse of A
   // We are computing A^{-1}B so from must be surjective so that

From d698e5a8610f503ce72cffcd3b37371210ae72f4 Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Tue, 3 Dec 2024 22:59:49 +0000
Subject: [PATCH 3/5] Remove free reg workaround

---
 lib/Analysis/Utility.cpp | 38 ++------------------------------------
 1 file changed, 2 insertions(+), 36 deletions(-)

diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp
index e6d921464bc9..3a8be9ee3347 100644
--- a/lib/Analysis/Utility.cpp
+++ b/lib/Analysis/Utility.cpp
@@ -663,42 +663,8 @@ std::optional<LinearLayout> minimalCvtLayout(RankedTensorType srcTy,
   StringAttr kLane = StringAttr::get(ctx, "lane");
   StringAttr kWarp = StringAttr::get(ctx, "warp");
   StringAttr kBlock = StringAttr::get(ctx, "block");
-  auto numSrcRegs = srcLayout->getInDimSize(kRegister);
-  auto numDstRegs = dstLayout->getInDimSize(kRegister);
-  // The `invertAndCompose` function will generate a layout that is injective
-  // by assigning new output dimensions to free variables.  For instance,
-  // consider a scenario where `srcLayout` has a free variable in the lane
-  // dimension, while `dstLayout` has two free variables in the lane
-  // dimension and also a larger number of registers.
-  // The injective form of `srcLayout` will add only a single additional row
-  // to the transformation matrix, whereas the injective form of `dstLayout`
-  // will add two additional rows.  This discrepancy causes misleading results
-  // because the matrices end up with a different number of rows.
-  //
-  // Take `dstLayout ⋅ srcLayout^-1` as an example:
-  //
-  //   - `injective(dstLayout)`: [n, m] → [n + 2, m]
-  //   - `injective(srcLayout)`: [n, m] → [n + 1, m]
-  //   - `injective(srcLayout)^-1`: [n + 1, m] → [m, n + 1]
-  //   - `injective(dstLayout) ⋅ injective(srcLayout)^-1`: [n + 2, m] ⋅ [m, n +
-  //   1] → [n + 2, n + 1]
-  //
-  // Here, the `(n + 1)`-th row added by `dstLayout` represents the free
-  // variable in registers, and the `(n + 2)`-th row represents the free
-  // variable in lanes.  However, the `(n + 1)`-th row added by `srcLayout`
-  // represents the free variable in lanes.  As a result, the `(n + 1)`-th row
-  // in two layouts do not correspond to the same free variable.
-  //
-  // To address this issue, we pad the free variables in `srcLayout` and
-  // `dstLayout` to ensure they have the same number of registers.  This
-  // guarantees that the resulting matrices have the same number of rows,
-  // ensuring consistency in the composition process.
-  auto numRegs = std::max(numSrcRegs, numDstRegs);
-  auto srcLayoutWithFreeRegs = srcLayout->resize(kRegister, numRegs);
-  auto dstLayoutWithFreeRegs = dstLayout->resize(kRegister, numRegs);
-  // comp describes the layout function to create dst from src.
-  LinearLayout comp =
-      dstLayoutWithFreeRegs.invertAndCompose(srcLayoutWithFreeRegs);
+
+  auto comp = dstLayout->invertAndCompose(*srcLayout);
   // We try to quotient by the largest subspace first
   auto dims = SmallVector<StringRef>{"block", "warp", "lane", "register"};
   for (auto dim : dims) {

From 4ae087e5e06f9e46b41d27cebcf9aa718f27be58 Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Tue, 3 Dec 2024 23:00:29 +0000
Subject: [PATCH 4/5] remove resize as it's not needed anymore

---
 include/triton/Tools/LinearLayout.h |  7 ------
 lib/Tools/LinearLayout.cpp          | 15 -------------
 unittest/Tools/LinearLayoutTest.cpp | 34 -----------------------------
 3 files changed, 56 deletions(-)

diff --git a/include/triton/Tools/LinearLayout.h b/include/triton/Tools/LinearLayout.h
index cfc4c0d13bbe..9ddec8881269 100644
--- a/include/triton/Tools/LinearLayout.h
+++ b/include/triton/Tools/LinearLayout.h
@@ -681,13 +681,6 @@ class LinearLayout {
   // (i.e. every input bit affects the output).
   llvm::MapVector<StringAttr, int32_t> getFreeVariableMasks() const;
 
-  // Increase an input dimension without affecting the output dimension.  The
-  // added free variables are mapped to 0, ensuring that the new input
-  // dimensions correspond directly to the existing output space.  The function
-  // errors out if `newInDimSize` is less than the current size or the new size
-  // is not a power of 2.
-  LinearLayout resize(StringAttr inDim, int32_t newInDimSize) const;
-
   std::string toString() const;
 
   friend bool operator==(LinearLayout lhs, LinearLayout rhs);
diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp
index a7bf77eaba85..442b49735559 100644
--- a/lib/Tools/LinearLayout.cpp
+++ b/lib/Tools/LinearLayout.cpp
@@ -1041,21 +1041,6 @@ bool LinearLayout::equalIgnoringOutDimSizes(const LinearLayout &other) const {
   return true;
 }
 
-LinearLayout LinearLayout::resize(StringAttr inDim,
-                                  int32_t newInDimSize) const {
-  BasesT bases = getBases();
-  assert(bases.contains(inDim) && "inDim not in layout");
-  assert(llvm::isPowerOf2_32(newInDimSize) &&
-         "newInDimSize must be a power of 2");
-  assert(newInDimSize >= getInDimSize(inDim) &&
-         "newInDimSize must be >= old size");
-  auto numFreeVariables = llvm::Log2_32(newInDimSize) - getInDimSizeLog2(inDim);
-  for (int i = 0; i < numFreeVariables; i++) {
-    bases[inDim].push_back(std::vector<int32_t>(getNumOutDims(), 0));
-  }
-  return LinearLayout(std::move(bases), llvm::to_vector(getOutDimNames()));
-}
-
 std::string LinearLayout::toString() const {
   // Start with a newline because we print out a bulleted list; it doesn't
   // make sense for the first line of this list to be on the same line as
diff --git a/unittest/Tools/LinearLayoutTest.cpp b/unittest/Tools/LinearLayoutTest.cpp
index 897172fd6d34..e08e111700a2 100644
--- a/unittest/Tools/LinearLayoutTest.cpp
+++ b/unittest/Tools/LinearLayoutTest.cpp
@@ -746,40 +746,6 @@ TEST_F(LinearLayoutTest, QuotientIdentityMultipleDimensions) {
   ASSERT_TRUE(quotientLayout.has_value());
   ASSERT_TRUE(quotientLayout->quotient({S("dim2")}).has_value());
 }
-
-TEST_F(LinearLayoutTest, Resize) {
-  auto init = LinearLayout(
-      {
-          {S("in0"), {{0, 1}, {0, 2}}},
-          {S("in1"), {{1, 0}, {2, 0}}},
-          {S("in2"), {}},
-      },
-      {S("dim0"), S("dim1")});
-  EXPECT_EQ(init.resize(S("in0"), 8),
-            LinearLayout(
-                {
-                    {S("in0"), {{0, 1}, {0, 2}, {0, 0}}},
-                    {S("in1"), {{1, 0}, {2, 0}}},
-                    {S("in2"), {}},
-                },
-                {S("dim0"), S("dim1")}));
-  EXPECT_EQ(init.resize(S("in0"), 4), LinearLayout(
-                                          {
-                                              {S("in0"), {{0, 1}, {0, 2}}},
-                                              {S("in1"), {{1, 0}, {2, 0}}},
-                                              {S("in2"), {}},
-                                          },
-                                          {S("dim0"), S("dim1")}));
-  EXPECT_EQ(init.resize(S("in1"), 8),
-            LinearLayout(
-                {
-                    {S("in0"), {{0, 1}, {0, 2}}},
-                    {S("in1"), {{1, 0}, {2, 0}, {0, 0}}},
-                    {S("in2"), {}},
-                },
-                {S("dim0"), S("dim1")}));
-}
-
 } // anonymous namespace
 } // namespace mlir::triton
 

From 8bee251b197ed218ce2a7ce83838c0124a137152 Mon Sep 17 00:00:00 2001
From: lezcano <lezcano-93@hotmail.com>
Date: Thu, 5 Dec 2024 17:09:19 +0000
Subject: [PATCH 5/5] Address review

---
 lib/Tools/LinearLayout.cpp          | 30 ++++++++++++++--------------
 unittest/Tools/LinearLayoutTest.cpp | 31 +++++++----------------------
 2 files changed, 22 insertions(+), 39 deletions(-)

diff --git a/lib/Tools/LinearLayout.cpp b/lib/Tools/LinearLayout.cpp
index 442b49735559..0ab563908a60 100644
--- a/lib/Tools/LinearLayout.cpp
+++ b/lib/Tools/LinearLayout.cpp
@@ -787,14 +787,13 @@ std::unique_ptr<uint64_t[]> concatMatrices(const LinearLayout &A,
   assert(A.getTotalOutDimSizeLog2() == B.getTotalOutDimSizeLog2() &&
          "Matrices must have the same number of output dimensions");
   int numRows = A.getTotalOutDimSizeLog2();
-  int numCols = A.getTotalInDimSizeLog2() + B.getTotalInDimSizeLog2();
-  assert(numCols <= 64 && "LinearLayout too large");
+  int numColsA = A.getTotalInDimSizeLog2();
 
+  // rref expects the lower bits to be the lower indices of the matrix
   auto concat = getMatrix(A);
-  auto Bmat = getMatrix(B);
-  auto numColsA = A.getTotalInDimSizeLog2();
+  auto BMat = getMatrix(B);
   for (int r = 0; r < numRows; r++) {
-    concat[r] |= Bmat[r] << numColsA;
+    concat[r] |= BMat[r] << numColsA;
   }
   return concat;
 }
@@ -802,7 +801,9 @@ std::unique_ptr<uint64_t[]> concatMatrices(const LinearLayout &A,
 LinearLayout lstsq(const LinearLayout &A, const LinearLayout &B) {
   // Solve the least square system AX = B for A = outer, B = *this
   // and return the least square solution X of minimal norm
-  // A and B may not be surjective, but they have the same image
+  // A and B may not be surjective, but we assume that Im(B) \subset Im(A)
+  // Sketch of the algorithm:
+  // https://github.com/triton-lang/triton/pull/5309#discussion_r1869084111
   int numRows = A.getTotalOutDimSizeLog2();
   int numColsA = A.getTotalInDimSizeLog2();
   int numColsB = B.getTotalInDimSizeLog2();
@@ -820,15 +821,14 @@ LinearLayout lstsq(const LinearLayout &A, const LinearLayout &B) {
     if (row == 0) {
       continue;
     }
-    int c = __builtin_ctzll(combinedMat[r]);
-    assert(c < numColsA && "Precondition broken. Im(A) != Im(B)");
+    int c = __builtin_ctzll(row);
+    assert(c < numColsA && "Precondition broken. Im(B) not contained in Im(A)");
     assert(pivotCols.empty() ||
            pivotCols.back() < c && "Pivot columns are not in increasing order");
     pivotCols.push_back(c);
   }
 
-  // Extract A^{-1}B and complete the mapping using zeros for the unnecessary
-  // fromDims
+  // Extract A^{-1}B and complete the matrix using zeros
   std::unique_ptr<uint64_t[]> retMat(new uint64_t[numColsA]());
   int j = 0;
   for (int r = 0; r < numColsA; r++) {
@@ -870,10 +870,10 @@ LinearLayout lstsq(const LinearLayout &A, const LinearLayout &B) {
 } // namespace
 
 LinearLayout LinearLayout::invertAndCompose(const LinearLayout &outer) const {
-  // TODO(Lezcano) Make friend and perhaps rename to `convertFrom`
-  // For this, we need to implement our LLVM lowerings by inverting the "from"
-  // layout, and then iterating over the elements from the "to" layout and
-  // fetching the corresponding element from the "from" layout. This exercises
+  // TODO(Lezcano) Make friend and perhaps rename to `convertFrom` or `lstsq`
+  // For this, we need to implement our LLVM lowerings by inverting the "outer"
+  // layout, and then iterating over the elements from the "this" layout and
+  // fetching the corresponding element from the "outer" layout. This exercises
   // the broadcasting that we incentivise via choosing the minimum norm solution
   // in lstsq.
 
@@ -888,7 +888,7 @@ LinearLayout LinearLayout::invertAndCompose(const LinearLayout &outer) const {
   }
 
   // We'll write A^{-1} to mean the inverse or the pseudo-inverse of A
-  // We are computing A^{-1}B so from must be surjective so that
+  // We are computing A^{-1}B so A must be surjective so that
   // it has a left inverse.
   assert(A.isSurjective());
 
diff --git a/unittest/Tools/LinearLayoutTest.cpp b/unittest/Tools/LinearLayoutTest.cpp
index e08e111700a2..d6b94e83f012 100644
--- a/unittest/Tools/LinearLayoutTest.cpp
+++ b/unittest/Tools/LinearLayoutTest.cpp
@@ -410,26 +410,6 @@ TEST_F(LinearLayoutTest, InvertAndCompose_NonInjective) {
   EXPECT_EQ(composition.compose(l2), l1);
 }
 
-TEST_F(LinearLayoutTest, InvertAndCompose_SmallerResult) {
-  // The domain of l2 is [0,16), but the codomain of the result is only [0,8),
-  // because there's no value v in the codomain of l1 such that l2^-1(v) >= 8.
-  LinearLayout l1({{S("in1"), {{1}, {2}, {4}}}}, {S("out")});
-  LinearLayout l2({{S("in2"), {{4}, {1}, {2}, {8}}}}, {S("out")});
-  // Pseudo-inverse of l2 is
-  //
-  //  out(1) = 2
-  //  out(2) = 4
-  //  out(4) = 1
-  //  out(8) = 8
-  //
-  // Composing with l1 gives back l2^-1 without the out(8) entry.
-  LinearLayout composition = l1.invertAndCompose(l2);
-  EXPECT_EQ(composition,
-            LinearLayout({{S("in1"), {{2}, {4}, {1}}}}, {{S("in2"), 16}},
-                         /*requireSurjective=*/false));
-  EXPECT_TRUE(composition.compose(l2).equalIgnoringOutDimSizes(l1));
-}
-
 TEST_F(LinearLayoutTest, InvertAndCompose_BroadcastedInDim) {
   LinearLayout l1({{S("in1"), {{2}, {1}, {4}}}, {S("in2"), {{0}}}}, {S("out")});
   LinearLayout l2({{S("in"), {{4}, {1}, {2}}}}, {S("out")});
@@ -514,8 +494,10 @@ TEST_F(LinearLayoutTest, InvertAndCompose_BroadcastedDims) {
   LinearLayout l1({{S("in1"), {{1}, {2}, {4}}}, {S("in2"), {{0}}}}, {S("out")});
   LinearLayout l2({{S("in3"), {{1}, {2}, {4}}}, {S("in4"), {{0}}}}, {S("out")});
   LinearLayout c = l1.invertAndCompose(l2);
-  EXPECT_EQ(c, LinearLayout::identity1D(8, S("in1"), S("in3")) *
-                   LinearLayout::identity1D(2, S("in2"), S("in4")));
+  EXPECT_EQ(c, LinearLayout(
+                   {{S("in1"), {{1, 0}, {2, 0}, {4, 0}}}, {S("in2"), {{0, 0}}}},
+                   {{S("in3"), 8}, {S("in4"), 2}},
+                   /*requireSurjective=*/false));
   EXPECT_EQ(c.compose(l2),
             l1.transposeOuts(llvm::to_vector(l2.getOutDimNames())));
 }
@@ -525,8 +507,9 @@ TEST_F(LinearLayoutTest, InvertAndCompose_BroadcastedDims2) {
   LinearLayout b({{S("in3"), {{2}, {1}}}, {S("in4"), {{0}}}}, {S("out")});
   LinearLayout c = a.invertAndCompose(b);
   EXPECT_EQ(c,
-            LinearLayout({{S("in1"), {{2, 0}, {1, 0}}}, {S("in2"), {{0, 1}}}},
-                         {S("in3"), S("in4")}));
+            LinearLayout({{S("in1"), {{2, 0}, {1, 0}}}, {S("in2"), {{0, 0}}}},
+                         {{S("in3"), 4}, {S("in4"), 2}},
+                         /*requireSurjective=*/false));
   EXPECT_EQ(c.compose(b), a.transposeOuts(llvm::to_vector(b.getOutDimNames())));
 }