Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
#include "iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLForwardCompat.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
Expand Down Expand Up @@ -61,7 +62,6 @@ getTiledOps(Operation *funcOp, IREE::GPU::TilingLevel tilingLevel) {

void GPUApplyTilingLevelPass::runOnOperation() {
FunctionOpInterface funcOp = getOperation();

if (!llvm::is_contained({IREE::GPU::TilingLevel::Reduction,
IREE::GPU::TilingLevel::Thread,
IREE::GPU::TilingLevel::Subgroup,
Expand Down Expand Up @@ -107,6 +107,7 @@ void GPUApplyTilingLevelPass::runOnOperation() {
// Apply cleanup patterns.
{
RewritePatternSet patterns(context);
IREE::GPU::populateFoldSwizzleHintOpPatterns(patterns);
// Merge consecutive insert/extract slice ops to simplify later loop
// hoisting patterns.
tensor::populateFoldTensorEmptyPatterns(patterns);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ iree_lit_test_suite(
"gpu_distribute_scf_for.mlir",
"gpu_distribute_shared_memory.mlir",
"gpu_expand_dimensions.mlir",
"gpu_fold_swizzle_hint_ops.mlir",
"gpu_fuse_and_hoist_forall.mlir",
"gpu_generalize_named_ops.mlir",
"gpu_greedily_distribute_to_threads.mlir",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ iree_lit_test_suite(
"gpu_distribute_scf_for.mlir"
"gpu_distribute_shared_memory.mlir"
"gpu_expand_dimensions.mlir"
"gpu_fold_swizzle_hint_ops.mlir"
"gpu_fuse_and_hoist_forall.mlir"
"gpu_generalize_named_ops.mlir"
"gpu_greedily_distribute_to_threads.mlir"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -735,3 +735,42 @@ module {
// SERIAL: linalg.generic
// SERIAL: scf.forall.in_parallel
// SERIAL-NOT: mapping

// -----

// Matmul whose two operand copies write into destinations produced by
// iree_codegen.swizzle_hint of tensor.empty. Exercises that tiling the copies
// (thread tiling of the extract_slice chain) composes with the swizzle hint
// fold patterns: the hint should follow the tiled (64x4) empty tensors.
func.func @matmul_transpose_b_with_swizzle(%5: tensor<64x64xf32>, %6: tensor<64x1280xf16>, %7: tensor<64x1280xf16>) -> tensor<64x64xf32> {
  %c4 = arith.constant 4 : index
  %c1280 = arith.constant 1280 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<64x64xf32>) -> tensor<64x64xf32>
  // Swizzled copy destinations, one per matmul input operand.
  %9 = tensor.empty() : tensor<64x1280xf16>
  %swizzle_9 = iree_codegen.swizzle_hint %9[#iree_codegen.xor_shuffle<256, 32>] : tensor<64x1280xf16>
  %10 = tensor.empty() : tensor<64x1280xf16>
  %swizzle_10 = iree_codegen.swizzle_hint %10[#iree_codegen.xor_shuffle<256, 32>] : tensor<64x1280xf16>
  // K-loop stepping by the reduction tile size (4).
  %11 = scf.for %arg0 = %c0 to %c1280 step %c4 iter_args(%arg1 = %8) -> (tensor<64x64xf32>) {
    %extracted_slice = tensor.extract_slice %6[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
    %extracted_slice_0 = tensor.extract_slice %swizzle_9[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
    %12 = linalg.copy {lowering_config = #iree_gpu.lowering_config<{thread = [1, 1]}>} ins(%extracted_slice : tensor<64x4xf16>) outs(%extracted_slice_0 : tensor<64x4xf16>) -> tensor<64x4xf16>
    %extracted_slice_1 = tensor.extract_slice %7[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
    %extracted_slice_2 = tensor.extract_slice %swizzle_10[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
    %13 = linalg.copy {lowering_config = #iree_gpu.lowering_config<{thread = [1, 1]}>} ins(%extracted_slice_1 : tensor<64x4xf16>) outs(%extracted_slice_2 : tensor<64x4xf16>) -> tensor<64x4xf16>
    // Transpose-B contraction: B is indexed (d1, d2), i.e. [N, K].
    %14 = linalg.matmul
      indexing_maps = [
        affine_map<(d0, d1, d2) -> (d0, d2)>,
        affine_map<(d0, d1, d2) -> (d1, d2)>,
        affine_map<(d0, d1, d2) -> (d0, d1)>
      ]
      {lowering_config = #iree_gpu.lowering_config<{thread = [4, 4]}>}
      ins(%12, %13 : tensor<64x4xf16>, tensor<64x4xf16>)
      outs(%arg1 : tensor<64x64xf32>) -> tensor<64x64xf32>
    scf.yield %14 : tensor<64x64xf32>
  }
  return %11 : tensor<64x64xf32>
}

// CHECK-LABEL: func.func @matmul_transpose_b_with_swizzle

// THREAD-LABEL: func.func @matmul_transpose_b_with_swizzle
// THREAD: %[[EMPTY:.+]] = tensor.empty() : tensor<64x4xf16>
// THREAD: iree_codegen.swizzle_hint %[[EMPTY]][#iree_codegen.xor_shuffle<256, 32>] : tensor<64x4xf16>
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// RUN: iree-opt --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level, canonicalize, cse))" %s | FileCheck %s

// Test: tensor.extract_slice of swizzle_hint(tensor.empty) should fold
// to swizzle_hint(tensor.empty) with the sliced shape.
// The swizzle attribute itself is carried over to the new hint unchanged.
func.func @fold_extract_slice_of_swizzle_hint() -> tensor<16x32xf32> {
  %empty = tensor.empty() : tensor<64x64xf32>
  %swizzle = iree_codegen.swizzle_hint %empty[#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
  %slice = tensor.extract_slice %swizzle[0, 0] [16, 32] [1, 1] : tensor<64x64xf32> to tensor<16x32xf32>
  return %slice : tensor<16x32xf32>
}

// CHECK-LABEL: func.func @fold_extract_slice_of_swizzle_hint
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<16x32xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[EMPTY]][#iree_codegen.rotate_rows<64, 4>] : tensor<16x32xf32>
// CHECK: return %[[SWIZZLE]]

// Test: tensor.extract_slice with dynamic sizes should fold correctly.
// The slice's dynamic size operands must become the dynamic sizes of the
// new tensor.empty (checked by the EMPTY capture below).
func.func @fold_extract_slice_dynamic(%size0: index, %size1: index) -> tensor<?x?xf32> {
  %empty = tensor.empty() : tensor<64x64xf32>
  %swizzle = iree_codegen.swizzle_hint %empty[#iree_codegen.xor_shuffle<128, 16>] : tensor<64x64xf32>
  %slice = tensor.extract_slice %swizzle[0, 0] [%size0, %size1] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
  return %slice : tensor<?x?xf32>
}

// CHECK-LABEL: func.func @fold_extract_slice_dynamic
// CHECK-SAME: %[[SIZE0:[A-Za-z0-9]+]]: index
// CHECK-SAME: %[[SIZE1:[A-Za-z0-9]+]]: index
// CHECK: %[[EMPTY:.+]] = tensor.empty(%[[SIZE0]], %[[SIZE1]]) : tensor<?x?xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[EMPTY]][#iree_codegen.xor_shuffle<128, 16>] : tensor<?x?xf32>
// CHECK: return %[[SWIZZLE]]

// Test: tensor.expand_shape of swizzle_hint(tensor.empty) should fold
// to swizzle_hint(tensor.empty) with the expanded shape.
func.func @fold_expand_shape_of_swizzle_hint() -> tensor<4x16x64xf32> {
  %empty = tensor.empty() : tensor<64x64xf32>
  %swizzle = iree_codegen.swizzle_hint %empty[#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
  %expanded = tensor.expand_shape %swizzle [[0, 1], [2]] output_shape [4, 16, 64] : tensor<64x64xf32> into tensor<4x16x64xf32>
  return %expanded : tensor<4x16x64xf32>
}

// CHECK-LABEL: func.func @fold_expand_shape_of_swizzle_hint
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<4x16x64xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[EMPTY]][#iree_codegen.rotate_rows<64, 4>] : tensor<4x16x64xf32>
// CHECK: return %[[SWIZZLE]]

// Test: tensor.collapse_shape of swizzle_hint(tensor.empty) should fold
// to swizzle_hint(tensor.empty) with the collapsed shape.
func.func @fold_collapse_shape_of_swizzle_hint() -> tensor<64x64xf32> {
  %empty = tensor.empty() : tensor<4x16x4x16xf32>
  %swizzle = iree_codegen.swizzle_hint %empty[#iree_codegen.rotate_rows<64, 4>] : tensor<4x16x4x16xf32>
  %collapsed = tensor.collapse_shape %swizzle [[0, 1], [2, 3]] : tensor<4x16x4x16xf32> into tensor<64x64xf32>
  return %collapsed : tensor<64x64xf32>
}

// CHECK-LABEL: func.func @fold_collapse_shape_of_swizzle_hint
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<64x64xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[EMPTY]][#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
// CHECK: return %[[SWIZZLE]]

// Negative test: extract_slice of swizzle_hint without tensor.empty source
// should NOT fold (the hinted buffer is a real value, not just a shape).
func.func @no_fold_extract_slice_non_empty(%arg0: tensor<64x64xf32>) -> tensor<16x32xf32> {
  %swizzle = iree_codegen.swizzle_hint %arg0[#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
  %slice = tensor.extract_slice %swizzle[0, 0] [16, 32] [1, 1] : tensor<64x64xf32> to tensor<16x32xf32>
  return %slice : tensor<16x32xf32>
}

// CHECK-LABEL: func.func @no_fold_extract_slice_non_empty
// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor<64x64xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[ARG0]][#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[SWIZZLE]]
// CHECK: return %[[SLICE]]

// Negative test: expand_shape of swizzle_hint without tensor.empty source
// should NOT fold.
func.func @no_fold_expand_shape_non_empty(%arg0: tensor<64x64xf32>) -> tensor<4x16x64xf32> {
  %swizzle = iree_codegen.swizzle_hint %arg0[#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
  %expanded = tensor.expand_shape %swizzle [[0, 1], [2]] output_shape [4, 16, 64] : tensor<64x64xf32> into tensor<4x16x64xf32>
  return %expanded : tensor<4x16x64xf32>
}

// CHECK-LABEL: func.func @no_fold_expand_shape_non_empty
// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor<64x64xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[ARG0]][#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[SWIZZLE]]
// CHECK: return %[[EXPANDED]]

// Test: XOR shuffle swizzle attribute is preserved through folding
// (companion to the rotate_rows cases above).
func.func @fold_xor_shuffle_swizzle() -> tensor<8x64xf32> {
  %empty = tensor.empty() : tensor<16x128xf32>
  %swizzle = iree_codegen.swizzle_hint %empty[#iree_codegen.xor_shuffle<128, 16>] : tensor<16x128xf32>
  %slice = tensor.extract_slice %swizzle[0, 0] [8, 64] [1, 1] : tensor<16x128xf32> to tensor<8x64xf32>
  return %slice : tensor<8x64xf32>
}

// CHECK-LABEL: func.func @fold_xor_shuffle_swizzle
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x64xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[EMPTY]][#iree_codegen.xor_shuffle<128, 16>] : tensor<8x64xf32>
// CHECK: return %[[SWIZZLE]]

// Test: Rank-reducing extract_slice should work correctly.
// The unit dimension [1, 32] -> tensor<32xf32> is dropped; the new
// tensor.empty takes the rank-reduced result type directly.
func.func @fold_rank_reducing_extract_slice() -> tensor<32xf32> {
  %empty = tensor.empty() : tensor<64x64xf32>
  %swizzle = iree_codegen.swizzle_hint %empty[#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
  %slice = tensor.extract_slice %swizzle[0, 0] [1, 32] [1, 1] : tensor<64x64xf32> to tensor<32xf32>
  return %slice : tensor<32xf32>
}

// CHECK-LABEL: func.func @fold_rank_reducing_extract_slice
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[EMPTY]][#iree_codegen.rotate_rows<64, 4>] : tensor<32xf32>
// CHECK: return %[[SWIZZLE]]

#encoding = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [bf16, bf16, bf16], user_indexing_maps = [affine_map<(m, n, k) -> (m, k)>, affine_map<(m, n, k) -> (k, n)>, affine_map<(m, n, k) -> (m, n)>], iteration_sizes = [?, ?, ?]>
// Test: the fold must preserve the tensor encoding attribute on the result.
// NOTE(review): this function has no FileCheck directives, so as written it
// only verifies the pass does not crash or emit invalid IR on encoded
// tensors — presumably the fold keeps #encoding on the new tensor.empty;
// consider adding CHECK lines to pin that down.
func.func @fold_swizzle_hint_of_encoding() -> tensor<16xbf16,#encoding> {
  %empty = tensor.empty() : tensor<8x16xbf16, #encoding>
  %swizzle = iree_codegen.swizzle_hint %empty[#iree_codegen.rotate_rows<8, 4>] : tensor<8x16xbf16, #encoding>
  %slice = tensor.extract_slice %swizzle[0, 0] [1, 16] [1, 1] : tensor<8x16xbf16, #encoding> to tensor<16xbf16,#encoding>
  return %slice : tensor<16xbf16,#encoding>
}
Original file line number Diff line number Diff line change
Expand Up @@ -173,9 +173,9 @@ def IREECodegen_SwizzleHintOp : Op<IREECodegen_Dialect, "swizzle_hint", [
is otherwise perfectly legal.
}];

let arguments = (ins RankedTensorOrMemRefOf<[AnyType], [1]>:$operand,
let arguments = (ins AnyRankedTensorOrMemRef:$operand,
IREECodegen_AnySwizzleAttr:$swizzle);
let results = (outs RankedTensorOrMemRefOf<[AnyType], [1]>:$result);
let results = (outs AnyRankedTensorOrMemRef:$result);

let assemblyFormat = [{
$operand `[` $swizzle attr-dict `]` `:` type($result)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,24 +101,44 @@ static FailureOr<Value> createSharedAllocDestination(RewriterBase &rewriter,
return failure();
}

auto empty = forallOp.getDpsInits()[0].getDefiningOp<tensor::EmptyOp>();
// Skip swizzle hint ops.
Operation *destination = forallOp.getDpsInits()[0].getDefiningOp();
if (auto swizzleOp = dyn_cast<IREE::Codegen::SwizzleHintOp>(destination)) {
destination = swizzleOp->getOperand(0).getDefiningOp();
}

// Fail if the destination is not a `tensor.empty` op and cannot be trivially
// converted to a `bufferization.alloc_tensor`.
auto empty = dyn_cast<tensor::EmptyOp>(destination);
if (!empty) {
return failure();
}

// Create a `bufferization.alloc_tensor` op with memory space
// `#gpu.address_space<workgroup>`.
Location loc = empty->getLoc();
OpBuilder::InsertionGuard g(rewriter);
rewriter.setInsertionPoint(empty);
Attribute sharedMemoryAddrSpace = gpu::AddressSpaceAttr::get(
rewriter.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace());
auto allocTensor = bufferization::AllocTensorOp::create(
rewriter, empty->getLoc(), cast<TensorType>(empty.getResult().getType()),
rewriter, loc, cast<TensorType>(empty.getResult().getType()),
empty.getDynamicSizes(),
/*copy=*/Value(), /*size_hint=*/Value(),
/*memory_space=*/sharedMemoryAddrSpace);

// If the original `tensor.empty` has a swizzle hint, apply it to the new
// allocation. Note that if there is a swizzle hint, it will be the only user
// of the `tensor.empty` op.
if (auto swizzleHintOp =
dyn_cast<IREE::Codegen::SwizzleHintOp>(*empty->getUsers().begin())) {
assert(swizzleHintOp->hasOneUse() &&
"a tensor.empty op with a swizzle hint applied, should have the "
"swizzle hint as its only user");
auto newSwizzle = IREE::Codegen::SwizzleHintOp::create(
rewriter, loc, allocTensor.getResult(), swizzleHintOp.getSwizzle());
return newSwizzle.getResult();
}
return allocTensor.getResult();
}

Expand Down Expand Up @@ -2065,4 +2085,107 @@ void populateIREEGPULowerValueBarrierPatterns(RewritePatternSet &patterns) {
patterns.add<LowerValueBarrierPattern>(patterns.getContext());
}

//===----------------------------------------------------------------------===//
// SwizzleHintOp Fold Patterns
//===----------------------------------------------------------------------===//

// The following patterns are adapted from the populateFoldTensorEmptyPatterns
// in upstream llvm-project. The main change is to add support for folding with
// swizzle_hint ops from IREE. Once swizzle_hint ops are more widely used and
// proven stable, we could consider upstreaming this extension.

namespace {
/// Folds `tensor.extract_slice(swizzle_hint(tensor.empty))` into
/// `swizzle_hint(tensor.empty)` with the sliced result type, carrying the
/// swizzle attribute over unchanged. Mirrors the upstream
/// FoldEmptyTensorWithExtractSliceOp pattern, with the swizzle hint threaded
/// through the fold.
struct FoldSwizzleHintOpWithExtractSliceOp final
    : OpRewritePattern<tensor::ExtractSliceOp> {
  using Base::Base;
  LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp,
                                PatternRewriter &rewriter) const override {
    // Check for swizzle_hint op source.
    auto swizzleHintOp =
        sliceOp.getSource().getDefiningOp<IREE::Codegen::SwizzleHintOp>();
    if (!swizzleHintOp) {
      return failure();
    }

    // Check for tensor.empty source.
    auto emptyOp = swizzleHintOp.getOperand().getDefiningOp<tensor::EmptyOp>();
    if (!emptyOp) {
      return failure();
    }

    // Check for single use, i.e. the swizzle hint is the empty tensor's only
    // user. Otherwise the empty tensor carries data flow we must not rewrite.
    if (!emptyOp->hasOneUse()) {
      return failure();
    }

    // Create new tensor.empty op. tensor.extract_slice may be rank-reducing;
    // its dynamic sizes must be preserved as well as its result type (static
    // sizes are already baked into `sliceType`, so only the dynamic size
    // operands are forwarded).
    Location loc = sliceOp.getLoc();
    auto sliceType = cast<RankedTensorType>(sliceOp.getType());
    auto tensorType =
        RankedTensorType::get(sliceType.getShape(), sliceType.getElementType(),
                              sliceType.getEncoding());
    auto newEmptyOp =
        tensor::EmptyOp::create(rewriter, loc, tensorType, sliceOp.getSizes());
    // Re-apply the original swizzle attribute on the sliced empty tensor.
    rewriter.replaceOpWithNewOp<IREE::Codegen::SwizzleHintOp>(
        sliceOp, newEmptyOp, swizzleHintOp.getSwizzle());
    return success();
  }
};

/// Folds `ReshapeOp(swizzle_hint(tensor.empty))` into
/// `swizzle_hint(tensor.empty)` with the reshaped result type, where
/// ReshapeOp is tensor.expand_shape or tensor.collapse_shape. Mirrors the
/// upstream FoldEmptyTensorWithReshapeOp pattern, with the swizzle hint
/// threaded through the fold.
template <typename ReshapeOp>
struct FoldSwizzleHintOpWithReshapeOp final : OpRewritePattern<ReshapeOp> {
  using OpRewritePattern<ReshapeOp>::OpRewritePattern;
  LogicalResult matchAndRewrite(ReshapeOp reshapeOp,
                                PatternRewriter &rewriter) const override {
    // Check for swizzle_hint op source.
    auto swizzleHintOp =
        reshapeOp.getSrc()
            .template getDefiningOp<IREE::Codegen::SwizzleHintOp>();
    if (!swizzleHintOp) {
      return failure();
    }
    // Check for tensor.empty source behind the hint.
    auto emptyOp =
        swizzleHintOp.getOperand().template getDefiningOp<tensor::EmptyOp>();
    if (!emptyOp) {
      return failure();
    }

    // Check for single use, i.e. the swizzle hint is the empty tensor's only
    // user.
    if (!emptyOp->hasOneUse()) {
      return failure();
    }

    // Reify result shape so dynamic extents of the reshaped type can feed the
    // new tensor.empty op.
    Location loc = reshapeOp.getLoc();
    ReifiedRankedShapedTypeDims resultShapes;
    if (failed(reifyResultShapes(rewriter, reshapeOp, resultShapes)) ||
        !llvm::hasSingleElement(resultShapes)) {
      return failure();
    }

    // Create new tensor.empty op with the reified shape, then re-apply the
    // original swizzle attribute on it.
    Value emptyTensor =
        tensor::EmptyOp::create(rewriter, loc, resultShapes[0],
                                reshapeOp.getResultType().getElementType(),
                                reshapeOp.getResultType().getEncoding());
    Value newSwizzleHintOp = IREE::Codegen::SwizzleHintOp::create(
        rewriter, loc, emptyTensor, swizzleHintOp.getSwizzle());
    // The reified shape may be more dynamic/static than the reshape's result
    // type; insert a cast to keep the replacement type-exact.
    if (newSwizzleHintOp.getType() != reshapeOp.getResultType()) {
      rewriter.replaceOpWithNewOp<tensor::CastOp>(
          reshapeOp, reshapeOp.getResultType(), newSwizzleHintOp);
    } else {
      rewriter.replaceOp(reshapeOp, newSwizzleHintOp);
    }
    return success();
  }
};

} // namespace

void populateFoldSwizzleHintOpPatterns(RewritePatternSet &patterns) {
  MLIRContext *context = patterns.getContext();
  // Reshape folders: expand_shape / collapse_shape through a swizzle hint.
  patterns.add<FoldSwizzleHintOpWithReshapeOp<tensor::ExpandShapeOp>>(context);
  patterns.add<FoldSwizzleHintOpWithReshapeOp<tensor::CollapseShapeOp>>(context);
  // Slice folder: extract_slice through a swizzle hint.
  patterns.add<FoldSwizzleHintOpWithExtractSliceOp>(context);
}

} // namespace mlir::iree_compiler::IREE::GPU
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,9 @@ void populateIREEGPUVectorUnrollPatterns(
void populateIREEGPUVectorUnrollPatterns(RewritePatternSet &patterns);
void populateIREEGPUVectorizationPatterns(RewritePatternSet &patterns);

// Populate patterns to fold tensor.empty ops through swizzle hint ops.
void populateFoldSwizzleHintOpPatterns(RewritePatternSet &patterns);

} // namespace mlir::iree_compiler::IREE::GPU

#endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_TRANSFORMS_TRANSFORMS_H_
Loading