From 6b93b35842c5973f3aede340b1bf6fd285cd2e0c Mon Sep 17 00:00:00 2001
From: Jevin Jiang
Date: Fri, 20 Sep 2024 15:00:00 -0700
Subject: [PATCH] [Mosaic:TPU] Efficient relayout with internal scratch

This CL supports all retilings (x * packing1, 128) <-> (y * packing2, 128)
for any dtype. Relayout through internal scratch brings significant
improvements to the existing retilings on TPUv4 and earlier, and to retilings
involving (packing, 128) tiling on TPUv5. It also adds all previously missing
retilings, including retilings that increase the sublane tiling and retilings
of packed types.

PiperOrigin-RevId: 676982957
---
 jaxlib/mosaic/dialect/tpu/tpu.td              |   1 +
 jaxlib/mosaic/dialect/tpu/tpu_dialect.h       |   1 +
 .../tpu/transforms/apply_vector_layout.cc     | 566 ++++++++++++++----
 tests/pallas/tpu_pallas_test.py               |   6 +-
 4 files changed, 439 insertions(+), 135 deletions(-)

diff --git a/jaxlib/mosaic/dialect/tpu/tpu.td b/jaxlib/mosaic/dialect/tpu/tpu.td
index c1fba60f4cc5..ffcc8d52cd05 100644
--- a/jaxlib/mosaic/dialect/tpu/tpu.td
+++ b/jaxlib/mosaic/dialect/tpu/tpu.td
@@ -790,6 +790,7 @@ def ApplyVectorLayoutPass : Pass<"tpu-apply-vector-layout", "::mlir::func::FuncO
     Option<"mxu_contracting_size", "mxu-contracting-size", "int", /*default=*/"128", "">,
     Option<"mxu_noncontracting_size", "mxu-noncontracting-size", "int", /*default=*/"128", "">,
     Option<"max_sublanes_in_scratch", "max-sublanes-in-scratch", "int", /*default=*/"0", "">,
+    Option<"vmem_banks", "vmem-banks", "int", /*default=*/"-1", "">,
   ];
 }

diff --git a/jaxlib/mosaic/dialect/tpu/tpu_dialect.h b/jaxlib/mosaic/dialect/tpu/tpu_dialect.h
index 510bd384d656..00bd15b57153 100644
--- a/jaxlib/mosaic/dialect/tpu/tpu_dialect.h
+++ b/jaxlib/mosaic/dialect/tpu/tpu_dialect.h
@@ -62,6 +62,7 @@ struct ApplyVectorLayoutContext {
   // mxu_shape = {contracting_size, non_contracting_size}
   std::array<int64_t, 2> mxu_shape = {128, 128};
   int64_t max_sublanes_in_scratch = 0;
+  int64_t vmem_banks = -1;  // -1 means "unspecified".
 };

 std::pair<bool, bool> mightCommunicateBetweenChips(Operation* op);

diff --git a/jaxlib/mosaic/dialect/tpu/transforms/apply_vector_layout.cc b/jaxlib/mosaic/dialect/tpu/transforms/apply_vector_layout.cc
index 951ed59865c7..d3e1b59afe16 100644
--- a/jaxlib/mosaic/dialect/tpu/transforms/apply_vector_layout.cc
+++ b/jaxlib/mosaic/dialect/tpu/transforms/apply_vector_layout.cc
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include

 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVectorExtras.h"
@@ -46,6 +47,7 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
+#include "absl/algorithm/container.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
@@ -139,18 +141,21 @@ void moveAllRegions(Operation &src, Operation &dst) {
 //
 // Returns:
 //   A memref of the requested shape and type.
-FailureOr<Value> getInternalScratch(RewriteContext &ctx, OpBuilder &builder,
-                                    Location loc, ArrayRef<int64_t> shape,
-                                    Type elem_ty) {
+FailureOr<TypedValue<MemRefType>> getInternalScratch(
+    RewriteContext &ctx, OpBuilder &builder, Location loc,
+    ArrayRef<int64_t> shape, Type elem_ty, int64_t sublane_tiling = 0) {
   if (shape.empty()) {
     return failure();
   }
   if (shape.back() % ctx.target_shape[1] != 0) {
     return failure();
   }
-  int sublane_count =
+  int packing = 32 / elem_ty.getIntOrFloatBitWidth();
+  int sublane_count = llvm::divideCeil(
       std::accumulate(shape.begin(), shape.end(), 1,
                       std::multiplies<int64_t>()) /
-      ctx.target_shape[1];
+          ctx.target_shape[1],
+      packing);
+
   if (sublane_count > ctx.max_sublanes_in_scratch) {
     return failure();
   }
@@ -159,7 +164,7 @@ FailureOr<Value> getInternalScratch(RewriteContext &ctx, OpBuilder &builder,
   FAILUREOR_ASSIGN_OR_RETURN(
       MemRefType scratch_ref_ty,
       inferMemref(MemRefType::get(shape, elem_ty), ctx.hardware_generation,
-                  /*tpu_tiling_flags=*/{}));
+                  /*tpu_tiling_flags=*/{}, sublane_tiling));
   return builder.create(loc, scratch_ref_ty)
       .getResult();
 }
@@ -4752,30 +4757,6 @@ xla::Array<Value> retileToReducedSublanes(
   return dst_vreg_array;
 }

-// Returns true iff the layout changes involve reduced sublanes per tile.
-//
-// Arguments:
-//  src: The existing layout.
-//  dst: The new layout based on which the retiling is to be carried out.
-bool isSupportedReducedSublanesRetile(
-    const VectorLayout &src, const VectorLayout &dst,
-    const std::array<int64_t, 2> target_shape) {
-  return src.implicit_dim() == dst.implicit_dim() &&
-         llvm::all_of(llvm::zip_equal(src.offsets(), dst.offsets()),
-                      [](auto tup) {
-                        auto [lhs, rhs] = tup;
-                        return lhs.value_or(0) == rhs.value_or(0);
-                      })
-         // TODO (kumudbhandari): We have not tested any tile size where
-         // tile[-1] != TARGET_SHAPE.lanes. It should work but needs to be
-         // tested.
-         && src.tiling()[1] == target_shape[1] &&
-         dst.tiling()[1] == target_shape[1] &&
-         dst.tiling()[0] < src.tiling()[0] &&
-         src.bitwidth() == dst.bitwidth() &&
-         llvm::isPowerOf2_64(src.tiling()[0]) &&
-         llvm::isPowerOf2_64(dst.tiling()[0]);
-}

 // Copy one sublane from a vreg to another vreg.
 //
@@ -5368,13 +5349,353 @@ FailureOr<std::pair<VectorLayout, xla::Array<Value>>> changeOffsets(
   return std::make_pair(dst, std::move(vregs));
 }

-// TODO(b/265133506): Generalize retiling.
+LogicalResult retileToLargeTileWithScratch(
+    RewriteContext &ctx, OpBuilder &builder, const Location loc,
+    xla::Array<Value> &dst_tiles, const std::array<int64_t, 2> &dst_tile,
+    const xla::Array<Value> &src_tiles, const std::array<int64_t, 2> &src_tile,
+    TypedValue<MemRefType> scratch_ref) {
+  if (dst_tile[0] % src_tile[0] != 0) {
+    return failure();
+  }
+  // Number of src vregs needed to assemble one dst vreg.
+  int vregs_per_group = dst_tile[0] / src_tile[0];
+  // Number of sublanes needed per src vreg to assemble one dst vreg.
+  int sl_per_vreg = ctx.target_shape[0] / vregs_per_group;
+  int stride = vregs_per_group;
+
+  xla::Array<int32_t> sublane_offsets(
+      {ctx.target_shape[0] / dst_tile[0], src_tile[0], vregs_per_group}, 0);
+  absl::c_iota(sublane_offsets, 0);
+  // Older hardware has limited support for shuffles, so even if we have bank
+  // conflicts, we just accept them and let the lowering unroll the
+  // loads/stores.
+  bool should_handle_bank_confict =
+      ctx.hardware_generation >= 4 && ctx.vmem_banks > 0 &&
+      ctx.vmem_banks < stride * ctx.target_shape[0];
+  // Add one extra sublane to the stride to avoid bank conflicts.
+  if (should_handle_bank_confict) {
+    // Adjust sublane offsets to match the stride.
+ for (int i = 0; i < sublane_offsets.num_elements(); i += 1) { + *(sublane_offsets.begin() + i) += i / stride; + } + stride += 1; + } + sublane_offsets.TransposeDimensions({0, 2, 1}); + + auto mlirIndexConst = [&](int d) { + return builder.create( + src_tiles.begin()->getLoc(), + builder.getIntegerAttr(builder.getIndexType(), d)); + }; + auto cst_0 = mlirIndexConst(0); + // Each group has exact number of src vregs needed to assemble one dst vreg. + // We can not use circular buffer here because we need to have enough space to + // strided load/store. + int64_t sublanes_per_group = stride * sl_per_vreg * vregs_per_group; + int64_t max_groups_in_scratch = + ctx.max_sublanes_in_scratch / sublanes_per_group; + if (max_groups_in_scratch < 1) { + return emitError(loc, + "scratch space is not enough for retiling to large tile"); + } + int64_t stored_group_cnt = 0; + auto dst_vreg_ty = src_tiles.begin()->getType(); + // Create a new vreg type that can be stored in scratch memref. + auto temp_vreg_ty = + VectorType::get(ctx.target_shape, scratch_ref.getType().getElementType()); + SmallVector sublane_mask(ctx.target_shape[0], true); + // (dst_vreg, load_offset) + std::vector> delayed_loads; + delayed_loads.reserve(max_groups_in_scratch * vregs_per_group); + // We only emit the loads when we run out of scratch space or we are at the + // last vreg of the batch to help bundle scheduling. + auto emit_all_delayed_loads = [&]() { + for (auto [dst_vreg, load_offset] : delayed_loads) { + Value load_op = builder.create( + loc, temp_vreg_ty, scratch_ref, ArrayRef({load_offset, cst_0}), + ArrayRef(sublane_mask), + ArrayRef(sublane_offsets.begin(), sublane_offsets.end())); + *dst_vreg = builder.create(loc, dst_vreg_ty, load_op); + } + delayed_loads.clear(); + }; + + int rank = src_tiles.dimensions().size(); + if (rank != dst_tiles.dimensions().size()) { + return emitError(loc, "src and dst tiles have different ranks"); + } + for (int i = 0; i < rank - 2; ++i) { + if (src_tiles.dim(i) != dst_tiles.dim(i)) { + return emitError(loc, + "Expected src and dst tiles have same dimension " + "sizes on dim") + << i << ", but got " << src_tiles.dim(i) << " vs " + << dst_tiles.dim(i); + } + } + SmallVector src_idx(rank); + dst_tiles.Each([&](absl::Span dst_idx, Value *dst_vreg) { + int64_t dst_row_idx = *(dst_idx.end() - 2); + int64_t dst_col_idx = *(dst_idx.end() - 1); + int64_t vreg_idx_in_group = dst_col_idx % vregs_per_group; + int64_t load_offset = sublanes_per_group * stored_group_cnt + + vreg_idx_in_group * sl_per_vreg * stride; + delayed_loads.push_back( + std::make_pair(dst_vreg, mlirIndexConst(load_offset))); + // When dst vreg is at the last vreg of the group or the current dst + // vregs' row, this indicates we have scheduled delayed loads for all + // the vregs from current group and now we need to store corresponding + // group of src vregs before actually emitting the loads. 
+ if (vreg_idx_in_group == vregs_per_group - 1 || + dst_col_idx == dst_tiles.dimensions().back() - 1) { + auto src_row_idx = dst_row_idx * vregs_per_group; + auto src_col_idx = dst_col_idx / vregs_per_group; + std::copy(dst_idx.begin(), dst_idx.end(), src_idx.begin()); + for (int vi = 0; vi < vregs_per_group; ++vi) { + if (src_row_idx + vi >= src_tiles.dim(rank - 2) || + src_col_idx >= src_tiles.dim(rank - 1)) { + break; + } + *(src_idx.end() - 2) = src_row_idx + vi; + *(src_idx.end() - 1) = src_col_idx; + Value src_vreg = src_tiles(src_idx); + src_vreg = + builder.create(loc, temp_vreg_ty, src_vreg); + Value store_offset = + mlirIndexConst(sublanes_per_group * stored_group_cnt + vi); + builder.create( + loc, src_vreg, scratch_ref, ArrayRef({store_offset, cst_0}), + ArrayRef(sublane_mask), + /*mask=*/nullptr, builder.getI32IntegerAttr(stride)); + } + stored_group_cnt = (stored_group_cnt + 1) % max_groups_in_scratch; + // We emit loads when we run out of scratch space or we are at the + // last vreg of the batch. + if (stored_group_cnt == 0 || + (*(dst_idx.end() - 2) == dst_tiles.dim(rank - 2) - 1 && + *(dst_idx.end() - 1) == dst_tiles.dim(rank - 1) - 1)) { + emit_all_delayed_loads(); + } + } + }); + return success(); +} + +LogicalResult retileToSmallTileWithScratch( + RewriteContext &ctx, OpBuilder &builder, const Location loc, + xla::Array &dst_tiles, const std::array &dst_tile, + const xla::Array &src_tiles, const std::array &src_tile, + TypedValue scratch_ref) { + if (src_tile[0] % dst_tile[0] != 0) { + return failure(); + } + // Number of src vregs needed to assemble one dst vreg. + int vregs_per_group = src_tile[0] / dst_tile[0]; + // Number of sublanes needed per src vreg to assemble one dst vreg. + int sl_per_vreg = ctx.target_shape[0] / vregs_per_group; + int stride = vregs_per_group; + + xla::Array sublane_offsets( + {ctx.target_shape[0] / src_tile[0], dst_tile[0], vregs_per_group}, 0); + absl::c_iota(sublane_offsets, 0); + // The older hardware has limited support for shuffles so even if we have + // bank conflicts, we just accept them and will have the lowering unroll the + // loads/stores. + bool should_handle_bank_confict = + ctx.hardware_generation >= 4 && ctx.vmem_banks > 0 && + ctx.vmem_banks < stride * ctx.target_shape[0]; + bool use_shuffled_load = false; + if (ctx.hardware_generation <= 4) { + if (src_tile[0] == 8) { + // The older hardware does not support shuffled store. However, if the src + // tile is (8, 128), we can convert (shuffled store + strided load) to + // (strided store + shuffled load). + use_shuffled_load = true; + } else if (src_tile[0] == 4) { + // In this case, the trick of replacing a shuffled store with a shuffled + // load does not work. Handling bank conflicts will cause the sublane + // offsets to increase which might make emulation harder, so we avoid + // doing so. + should_handle_bank_confict = false; + } + } + + // Add one extra sublane to stride to avoid bank conflict. + if (should_handle_bank_confict) { + // Adjust sublane offsets to match the stride. + for (int i = 0; i < sublane_offsets.num_elements(); i += 1) { + *(sublane_offsets.begin() + i) += i / stride; + } + stride += 1; + } + sublane_offsets.TransposeDimensions({0, 2, 1}); + auto mlirIndexConst = [&](int d) { + return builder.create( + src_tiles.begin()->getLoc(), + builder.getIntegerAttr(builder.getIndexType(), d)); + }; + auto cst_0 = mlirIndexConst(0); + // Each group has exact number of src vregs needed to assemble one dst vreg. 
+ // We can not use circular buffer here because we need to have enough space + // to strided load/store. + int64_t sublanes_per_group = stride * sl_per_vreg * vregs_per_group; + int64_t max_groups_in_scratch = + ctx.max_sublanes_in_scratch / sublanes_per_group; + if (max_groups_in_scratch < 1) { + return emitError(loc, + "scratch space is not enough for retiling to small tile"); + } + int64_t stored_group_cnt = 0; + auto dst_vreg_ty = src_tiles.begin()->getType(); + // Create a new vreg type that can be stored in scratch memref. + auto temp_vreg_ty = + VectorType::get(ctx.target_shape, scratch_ref.getType().getElementType()); + SmallVector sublane_mask(ctx.target_shape[0], true); + // (dst_vreg, load_offset) + std::vector> delayed_loads; + delayed_loads.reserve(max_groups_in_scratch * vregs_per_group); + // We only emit the loads when we run out of scratch space or we are at the + // last vreg of the batch to help bundle scheduling. + auto emit_all_delayed_loads = [&]() { + for (auto [dst_vreg, load_offset] : delayed_loads) { + Value load_op; + if (use_shuffled_load) { + load_op = builder.create( + loc, temp_vreg_ty, scratch_ref, + ArrayRef({load_offset, cst_0}), ArrayRef(sublane_mask), + ArrayRef(sublane_offsets.begin(), sublane_offsets.end())); + } else { + load_op = builder.create( + loc, temp_vreg_ty, scratch_ref, + ArrayRef({load_offset, cst_0}), ArrayRef(sublane_mask), + builder.getI32IntegerAttr(stride)); + } + *dst_vreg = builder.create(loc, dst_vreg_ty, load_op); + } + delayed_loads.clear(); + }; + int rank = src_tiles.dimensions().size(); + if (rank != dst_tiles.dimensions().size()) { + return emitError(loc, "src and dst tiles have different ranks"); + } + for (int i = 0; i < rank - 2; ++i) { + if (src_tiles.dim(i) != dst_tiles.dim(i)) { + return emitError(loc, + "Expected src and dst tiles have same dimension " + "sizes on dim") + << i << ", but got " << src_tiles.dim(i) << " vs " + << dst_tiles.dim(i); + } + } + SmallVector dst_idx(rank); + src_tiles.Each([&](absl::Span src_idx, Value src_vreg) { + int64_t src_row_idx = *(src_idx.end() - 2); + int64_t src_col_idx = *(src_idx.end() - 1); + int64_t vreg_idx_in_group = src_col_idx % vregs_per_group; + src_vreg = builder.create(loc, temp_vreg_ty, src_vreg); + if (use_shuffled_load) { + Value store_offset = mlirIndexConst( + sublanes_per_group * stored_group_cnt + vreg_idx_in_group); + builder.create( + loc, src_vreg, scratch_ref, ArrayRef({store_offset, cst_0}), + ArrayRef(sublane_mask), + /*mask=*/nullptr, builder.getI32IntegerAttr(stride)); + } else { + Value store_offset = + mlirIndexConst(sublanes_per_group * stored_group_cnt + + vreg_idx_in_group * sl_per_vreg * stride); + builder.create( + loc, src_vreg, scratch_ref, ArrayRef({store_offset, cst_0}), + ArrayRef(sublane_mask), + ArrayRef(sublane_offsets.begin(), sublane_offsets.end())); + } + // When src vreg is at the last vreg of the group or the current src + // vregs' row, this indicates we have stored all the vregs needed to + // assemble a new group of dst vreg. 
+ if (vreg_idx_in_group == vregs_per_group - 1 || + src_col_idx == src_tiles.dimensions().back() - 1) { + auto dst_row_idx = src_row_idx * vregs_per_group; + auto dst_col_idx = src_col_idx / vregs_per_group; + std::copy(src_idx.begin(), src_idx.end(), dst_idx.begin()); + for (int vi = 0; vi < vregs_per_group; ++vi) { + if (dst_row_idx + vi >= dst_tiles.dim(rank - 2) || + dst_col_idx >= dst_tiles.dim(rank - 1)) { + break; + } + *(dst_idx.end() - 2) = dst_row_idx + vi; + *(dst_idx.end() - 1) = dst_col_idx; + Value *dst_vreg = &dst_tiles(dst_idx); + int64_t load_offset = + use_shuffled_load ? (sublanes_per_group * stored_group_cnt + + vi * sl_per_vreg * stride) + : (sublanes_per_group * stored_group_cnt + vi); + delayed_loads.push_back( + std::make_pair(dst_vreg, mlirIndexConst(load_offset))); + } + stored_group_cnt = (stored_group_cnt + 1) % max_groups_in_scratch; + // We emit loads when we run out of scratch space or we are at the + // last vreg of the batch. + if (stored_group_cnt == 0 || + (*(src_idx.end() - 2) == src_tiles.dim(rank - 2) - 1 && + *(src_idx.end() - 1) == src_tiles.dim(rank - 1) - 1)) { + emit_all_delayed_loads(); + } + } + }); + return success(); +} + +// go/mosaic-retiling-in-scratch is the full internal documentation that +// includes more details about the TPU generations. +LogicalResult retileWithScratch(RewriteContext &ctx, OpBuilder &builder, + const Location loc, + xla::Array &dst_tiles, + const std::array &dst_tiling, + const xla::Array &src_tiles, + const std::array &src_tiling, + int packing) { + if (!(src_tiling[1] == ctx.target_shape[1] && + dst_tiling[1] == ctx.target_shape[1] && src_tiling[0] % packing == 0 && + dst_tiling[0] % packing == 0)) { + return failure(); + } + // Try to get i32 vector scratch space. Because we will bitcast vregs to + // i32 vregs before using scratch for retiling. Through this way we can + // handle packed types as well. + auto vi32_scratch_ref = getInternalScratch( + ctx, builder, loc, {ctx.max_sublanes_in_scratch, ctx.target_shape[1]}, + builder.getI32Type(), /*sublane_tiling=*/1); + if (failed(vi32_scratch_ref)) { + return emitError(loc, "Failed to get scratch ref for retiling"); + } + auto ref = vi32_scratch_ref.value(); + std::array vi32_dst_tiling = {dst_tiling[0] / packing, + dst_tiling[1]}; + std::array vi32_src_tiling = {src_tiling[0] / packing, + src_tiling[1]}; + if (src_tiling[0] > dst_tiling[0]) { + return retileToSmallTileWithScratch(ctx, builder, loc, dst_tiles, + vi32_dst_tiling, src_tiles, + vi32_src_tiling, ref); + } + if (src_tiling[0] < dst_tiling[0]) { + return retileToLargeTileWithScratch(ctx, builder, loc, dst_tiles, + vi32_dst_tiling, src_tiles, + vi32_src_tiling, ref); + } + dst_tiles = std::move(src_tiles); + return success(); +} + FailureOr>> changeTiling( RewriteContext &ctx, OpBuilder &builder, const Location loc, VectorType vty, const VectorLayout src, xla::Array vregs, const std::array dst_tiling, bool try_replicate_rows) { + bool has_enough_scratch = ctx.max_sublanes_in_scratch >= + ctx.target_shape[0] * (ctx.target_shape[0] + 1); const auto &target_shape = ctx.target_shape; - if (src.tiling() == dst_tiling) { + const std::array src_tiling = src.tiling(); + if (src_tiling == dst_tiling) { return std::pair(src, std::move(vregs)); } const int packing = src.packing(); @@ -5384,106 +5705,62 @@ FailureOr>> changeTiling( if (!dst.isValid(target_shape)) { return emitError(loc, "Not implemented: invalid offsets in tiling target"); } - // Handle retiling from (packing, 128) to (8 * packing, 128). 
- if (src.offsets() == LayoutOffsets{0, 0} && - src.tiling() == std::array{packing, 128} && - dst_tiling == std::array{8 * packing, 128}) { - bool replicate_sublanes = try_replicate_rows && packing == 1 && - *(vregs.dimensions().end() - 2) == 1; - xla::Array retiled( - dst.tileArrayImplicitShape(vty.getShape(), target_shape)); + auto dst_tiles_shape = + dst.tileArrayImplicitShape(vty.getShape(), target_shape); + // Handle retiling from (1, 128) to (8, 128) for 32-bit data with replicating + // sublanes. + if (try_replicate_rows && packing == 1 && + *(vregs.dimensions().end() - 2) == 1 && + src.offsets() == LayoutOffsets{0, 0} && + src.tiling() == std::array{1, 128} && + dst_tiling == std::array{8, 128}) { + xla::Array retiled(dst_tiles_shape); retiled.Each([&](absl::Span idx, Value *tile) { SmallVector src_idx(idx.begin(), idx.end()); *(src_idx.end() - 2) *= target_shape[0]; *(src_idx.end() - 1) /= target_shape[0]; const int64_t src_sl_idx = *(idx.end() - 1) % target_shape[0]; - if (replicate_sublanes) { - CHECK_EQ(src.getImplicitTiledDims(vty.getShape(), 1)[0], 1); - *tile = - broadcastSublane(builder, vregs(src_idx), src_sl_idx, target_shape); - } else { - for (int dst_sl_idx = 0; - dst_sl_idx < target_shape[0] && - *(src_idx.end() - 2) < *(vregs.dimensions().end() - 2); - ++dst_sl_idx, ++*(src_idx.end() - 2)) { - *tile = copy_one_sublane(builder, vregs(src_idx), src_sl_idx, *tile, - dst_sl_idx, target_shape); - } - } + CHECK_EQ(src.getImplicitTiledDims(vty.getShape(), 1)[0], 1); + *tile = + broadcastSublane(builder, vregs(src_idx), src_sl_idx, target_shape); }); // We have successfully replicated sublanes. - if (replicate_sublanes) { - dst = VectorLayout(bitwidth, {std::nullopt, dst.offsets()[1]}, dst_tiling, - dst.implicit_dim()); - } - return std::pair(dst, std::move(retiled)); - } - // Handle retiling from (m, 128) to (8, 128) for 32-bit data - // where m < 8 and m is a power of 2. - // TODO(b/306692696): Handle any vregs.dimensions(). - if (bitwidth == 32 && src.offsets() == LayoutOffsets{0, 0} && - target_shape[0] % src.tiling()[0] == 0 && - src.tiling()[1] == target_shape[1] && dst.tiling() == target_shape && - *(vregs.dimensions().end() - 2) == 1) { - xla::Array retiled( - dst.tileArrayImplicitShape(vty.getShape(), target_shape)); - retiled.Each([&](const absl::Span idx, - Value *const new_src_tile) { - const int64_t tiles_per_vreg = src.tilesPerVreg(target_shape); - const int64_t dst_col = idx.back(); - const int64_t src_col = dst_col / tiles_per_vreg; - const int64_t start_slane_idx = - src.tiling()[0] * (dst_col % tiles_per_vreg); - SmallVector src_idx(toArrayRef(idx)); - src_idx.back() = src_col; - Value src_tile = vregs(src_idx); - if (start_slane_idx) { - SmallVector slane_idxs; - slane_idxs.reserve(target_shape[0]); - for (int i = 0; i < target_shape[0]; ++i) { - slane_idxs.push_back(start_slane_idx + (i % src.tiling()[0])); - } - const DenseI32ArrayAttr gather_indices = - builder.getDenseI32ArrayAttr(slane_idxs); - *new_src_tile = builder.create(loc, src_tile.getType(), - src_tile, gather_indices, - /*dimension=*/0); - } else { - *new_src_tile = src_tile; - } - }); + dst = VectorLayout(bitwidth, {std::nullopt, dst.offsets()[1]}, dst_tiling, + dst.implicit_dim()); return std::pair(dst, std::move(retiled)); } // (8,128) -> (8 * packing,128) tiling change for packed type. 
if (bitwidth < 32 && 32 % bitwidth == 0 && - src.tiling() == std::array{8, 128} && - dst.tiling() == std::array{8 * dst.packing(), 128}) { - xla::Array retiled( - dst.tileArrayImplicitShape(vty.getShape(), target_shape)); - int vty_packing = dst.packing(); - VectorType vreg_x32 = - vty.getElementType().isSignlessInteger() - ? VectorType::get(target_shape, builder.getI32Type()) - : VectorType::get(target_shape, builder.getF32Type()); - retiled.Each([&](absl::Span idx, Value *tile) { - const int vreg_part = idx.back() % vty_packing; - SmallVector parts; - parts.reserve(vty_packing); - SmallVector src_idx(idx.begin(), idx.end()); - src_idx[src_idx.size() - 2] *= vty_packing; - src_idx[src_idx.size() - 1] /= vty_packing; - for (int i = 0; i < vty_packing; ++i) { - parts.push_back(builder.create( - loc, vreg_x32, vregs(src_idx), vreg_part)); - if (src_idx[src_idx.size() - 2] < - vregs.dim(vregs.num_dimensions() - 2) - 1) { - ++src_idx[src_idx.size() - 2]; + src_tiling == std::array{8, 128} && + dst_tiling == std::array{8 * dst.packing(), 128}) { + // Note: for int4, retiling with scratch is always faster. + if (bitwidth != 4 || !has_enough_scratch) { + xla::Array retiled(dst_tiles_shape); + int vty_packing = dst.packing(); + VectorType vreg_x32 = + vty.getElementType().isSignlessInteger() + ? VectorType::get(target_shape, builder.getI32Type()) + : VectorType::get(target_shape, builder.getF32Type()); + retiled.Each([&](absl::Span idx, Value *tile) { + const int vreg_part = idx.back() % vty_packing; + SmallVector parts; + parts.reserve(vty_packing); + SmallVector src_idx(idx.begin(), idx.end()); + src_idx[src_idx.size() - 2] *= vty_packing; + src_idx[src_idx.size() - 1] /= vty_packing; + for (int i = 0; i < vty_packing; ++i) { + parts.push_back(builder.create( + loc, vreg_x32, vregs(src_idx), vreg_part)); + if (src_idx[src_idx.size() - 2] < + vregs.dim(vregs.num_dimensions() - 2) - 1) { + ++src_idx[src_idx.size() - 2]; + } } - } - *tile = builder.create( - loc, vregs.begin()->getType(), parts, tpu::PackFormat::kCompressed); - }); - return std::pair(dst, std::move(retiled)); + *tile = builder.create( + loc, vregs.begin()->getType(), parts, tpu::PackFormat::kCompressed); + }); + return std::pair(dst, std::move(retiled)); + } } // Handle retiling from (1, 128 * packing) to (packing, 128) for // packed data. @@ -5497,8 +5774,8 @@ FailureOr>> changeTiling( // match corresponding elements without shifting. It's just that // the tiles are not adjacent (no contiguous vreg slice). if (bitwidth < 32 && 32 % bitwidth == 0 && - src.tiling() == std::array{1, 128 * packing} && - dst.tiling() == std::array{packing, 128}) { + src_tiling == std::array{1, 128 * packing} && + dst_tiling == std::array{packing, 128}) { // To illustrate, consider a 2 x 16 16-bit shape laid out in vregs of // 4 sublanes and 2 lanes (this is convenient for to keep the example small // yet non-trivial) with (1, 4) tiling. We will relayout to (2, 2) tiling. @@ -5539,8 +5816,7 @@ FailureOr>> changeTiling( // [(a b) (A B) (c d) (C D) ...]. That is, traverse down each column before // moving to the next one. This is exactly an interleaving of the sublanes // of the vreg parts. - xla::Array retiled( - dst.tileArrayImplicitShape(vty.getShape(), target_shape)); + xla::Array retiled(dst_tiles_shape); const VectorType vreg_x32 = vty.getElementType().isSignlessInteger() ? 
VectorType::get(target_shape, builder.getI32Type()) @@ -5565,13 +5841,41 @@ FailureOr>> changeTiling( }); return std::pair(dst, std::move(retiled)); } - if (isSupportedReducedSublanesRetile(src, dst, target_shape)) { - return std::pair(dst, retileToReducedSublanes(builder, vty.getShape(), src, - vregs, dst, target_shape)); + if (src_tiling[1] == target_shape[1] && dst_tiling[1] == target_shape[1]) { + // TODO(b/368088671): When sublane tiling changes, we should be able to + // preserve some replications from the source layout. But we need to + // make sure they are implemented efficiently and well-tested. For now, we + // just simply use 0 for the replicated offset after retiling. + dst = VectorLayout( + bitwidth, {src.offsets()[0].value_or(0), src.offsets()[1].value_or(0)}, + dst_tiling, dst.implicit_dim()); + + // All clauses in the and expression are based on performance benchmarking. + bool use_alu = !has_enough_scratch || + (ctx.hardware_generation >= 5 && src_tiling[0] != packing && + dst_tiling[0] != packing); + + if (use_alu) { + if (src_tiling[0] > dst_tiling[0]) { + return std::pair( + dst, retileToReducedSublanes(builder, vty.getShape(), src, vregs, + dst, target_shape)); + } else if (!has_enough_scratch) { + // TODO(b/357538782): Implement retileToIncreasedSublanes with ALU ops. + return emitError( + loc, + "Not implemented: retiling to increase sublane tiling with ALU"); + } + } + xla::Array retiled(dst_tiles_shape); + if (failed(retileWithScratch(ctx, builder, loc, retiled, dst_tiling, vregs, + src_tiling, packing))) { + return failure(); + } + return std::pair(dst, std::move(retiled)); } return emitError(loc, "Not implemented: Unsupported tiling change for ") - << vty << ": from " << src << " to tiling (" << dst_tiling[0] << ", " - << dst_tiling[1] << ")"; + << vty << ": from " << src << " to " << dst; } FailureOr>> changeImplicitDim( @@ -5878,6 +6182,7 @@ struct ApplyVectorLayoutPass mxu_contracting_size = ctx.mxu_shape[0]; mxu_noncontracting_size = ctx.mxu_shape[1]; max_sublanes_in_scratch = ctx.max_sublanes_in_scratch; + vmem_banks = ctx.vmem_banks; } void runOnOperation() override { // Fail if hardware_generation has not been set from the default value. @@ -5889,7 +6194,8 @@ struct ApplyVectorLayoutPass .hardware_generation = hardware_generation, .target_shape = {sublane_count, lane_count}, .mxu_shape = {mxu_contracting_size, mxu_noncontracting_size}, - .max_sublanes_in_scratch = max_sublanes_in_scratch}; + .max_sublanes_in_scratch = max_sublanes_in_scratch, + .vmem_banks = vmem_banks}; if (failed(applyLayoutFunc(ctx, getOperation()))) { signalPassFailure(); return; diff --git a/tests/pallas/tpu_pallas_test.py b/tests/pallas/tpu_pallas_test.py index 84403e41b561..87ccaa644e8c 100644 --- a/tests/pallas/tpu_pallas_test.py +++ b/tests/pallas/tpu_pallas_test.py @@ -2464,9 +2464,7 @@ def kernel(x_ref, out_ref): )(x) np.testing.assert_array_equal(out, np.broadcast_to(x, (256, 512))) - @only_passes_in_interpret(unless_generation=4) def test_bfloat16_to_uint32_bitcast(self): - """b/347771903""" x = np.arange(16 * 2 * 256, dtype=jnp.bfloat16).reshape(16, 2, 256) def kernel(x_ref, out_ref): @@ -2475,7 +2473,7 @@ def kernel(x_ref, out_ref): out = self.pallas_call( kernel, out_shape=jax.ShapeDtypeStruct((16, 1, 256), jnp.uint32) )(x) - # FIXME: Add correctness test for result. 
+    np.testing.assert_array_equal(out, state_utils.bitcast(x, jnp.uint32))

   @only_passes_in_interpret()
   def test_roll_partial(self):
@@ -2548,9 +2546,7 @@ def kernel(x_ref, out_ref):

     np.testing.assert_array_equal(out, np.reshape(x, (8, 1, 128)))

-  @only_passes_in_interpret()
   def test_mixed_strides(self):
-    """b/352841329"""
     x = np.zeros((8, 128), dtype=jnp.float32)
     y = np.zeros((8, 2, 128), dtype=jnp.bfloat16)
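
Illustrative usage (not part of the patch): the sketch below shows the kind of
Pallas TPU kernel exercised by the re-enabled test_bfloat16_to_uint32_bitcast
above. The kernel body is an assumption, since the patch only shows the test's
call site; it relies solely on the public pl.pallas_call and pltpu.bitcast APIs,
and the shapes mirror the test in the diff.

import jax
import jax.numpy as jnp
import numpy as np
from jax.experimental import pallas as pl
from jax.experimental.pallas import tpu as pltpu

def kernel(x_ref, out_ref):
  # A bf16 -> uint32 bitcast halves the second-to-last dimension; this is the
  # case the patch stops skipping on real hardware in tpu_pallas_test.py.
  out_ref[...] = pltpu.bitcast(x_ref[...], jnp.uint32)

x = np.arange(16 * 2 * 256, dtype=jnp.bfloat16).reshape(16, 2, 256)
out = pl.pallas_call(
    kernel, out_shape=jax.ShapeDtypeStruct((16, 1, 256), jnp.uint32)
)(x)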