Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
#include "iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLForwardCompat.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
Expand Down Expand Up @@ -61,7 +62,6 @@ getTiledOps(Operation *funcOp, IREE::GPU::TilingLevel tilingLevel) {

void GPUApplyTilingLevelPass::runOnOperation() {
FunctionOpInterface funcOp = getOperation();

if (!llvm::is_contained({IREE::GPU::TilingLevel::Reduction,
IREE::GPU::TilingLevel::Thread,
IREE::GPU::TilingLevel::Subgroup,
Expand Down Expand Up @@ -107,6 +107,7 @@ void GPUApplyTilingLevelPass::runOnOperation() {
// Apply cleanup patterns.
{
RewritePatternSet patterns(context);
IREE::GPU::populateFoldSwizzleHintOpPatterns(patterns);
// Merge consecutive insert/extract slice ops to simplify later loop
// hoisting patterns.
tensor::populateFoldTensorEmptyPatterns(patterns);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ iree_lit_test_suite(
"gpu_distribute_scf_for.mlir",
"gpu_distribute_shared_memory.mlir",
"gpu_expand_dimensions.mlir",
"gpu_fold_swizzle_hint_ops.mlir",
"gpu_fuse_and_hoist_forall.mlir",
"gpu_generalize_named_ops.mlir",
"gpu_greedily_distribute_to_threads.mlir",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ iree_lit_test_suite(
"gpu_distribute_scf_for.mlir"
"gpu_distribute_shared_memory.mlir"
"gpu_expand_dimensions.mlir"
"gpu_fold_swizzle_hint_ops.mlir"
"gpu_fuse_and_hoist_forall.mlir"
"gpu_generalize_named_ops.mlir"
"gpu_greedily_distribute_to_threads.mlir"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -735,3 +735,42 @@ module {
// SERIAL: linalg.generic
// SERIAL: scf.forall.in_parallel
// SERIAL-NOT: mapping

// -----

// Matmul whose two operand copies write into destinations produced by
// iree_codegen.swizzle_hint of tensor.empty. Exercises that tiling the copies
// (thread tiling of the extract_slice chain) composes with the swizzle hint
// fold patterns: the hint should follow the tiled (64x4) empty tensors.
func.func @matmul_transpose_b_with_swizzle(%5: tensor<64x64xf32>, %6: tensor<64x1280xf16>, %7: tensor<64x1280xf16>) -> tensor<64x64xf32> {
  %c4 = arith.constant 4 : index
  %c1280 = arith.constant 1280 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<64x64xf32>) -> tensor<64x64xf32>
  // Swizzled copy destinations, one per matmul input operand.
  %9 = tensor.empty() : tensor<64x1280xf16>
  %swizzle_9 = iree_codegen.swizzle_hint %9[#iree_codegen.xor_shuffle<256, 32>] : tensor<64x1280xf16>
  %10 = tensor.empty() : tensor<64x1280xf16>
  %swizzle_10 = iree_codegen.swizzle_hint %10[#iree_codegen.xor_shuffle<256, 32>] : tensor<64x1280xf16>
  // K-loop stepping by the reduction tile size (4).
  %11 = scf.for %arg0 = %c0 to %c1280 step %c4 iter_args(%arg1 = %8) -> (tensor<64x64xf32>) {
    %extracted_slice = tensor.extract_slice %6[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
    %extracted_slice_0 = tensor.extract_slice %swizzle_9[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
    %12 = linalg.copy {lowering_config = #iree_gpu.lowering_config<{thread = [1, 1]}>} ins(%extracted_slice : tensor<64x4xf16>) outs(%extracted_slice_0 : tensor<64x4xf16>) -> tensor<64x4xf16>
    %extracted_slice_1 = tensor.extract_slice %7[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
    %extracted_slice_2 = tensor.extract_slice %swizzle_10[0, %arg0] [64, 4] [1, 1] : tensor<64x1280xf16> to tensor<64x4xf16>
    %13 = linalg.copy {lowering_config = #iree_gpu.lowering_config<{thread = [1, 1]}>} ins(%extracted_slice_1 : tensor<64x4xf16>) outs(%extracted_slice_2 : tensor<64x4xf16>) -> tensor<64x4xf16>
    // Transpose-B contraction: B is indexed (d1, d2), i.e. [N, K].
    %14 = linalg.matmul
      indexing_maps = [
        affine_map<(d0, d1, d2) -> (d0, d2)>,
        affine_map<(d0, d1, d2) -> (d1, d2)>,
        affine_map<(d0, d1, d2) -> (d0, d1)>
      ]
      {lowering_config = #iree_gpu.lowering_config<{thread = [4, 4]}>}
      ins(%12, %13 : tensor<64x4xf16>, tensor<64x4xf16>)
      outs(%arg1 : tensor<64x64xf32>) -> tensor<64x64xf32>
    scf.yield %14 : tensor<64x64xf32>
  }
  return %11 : tensor<64x64xf32>
}

// CHECK-LABEL: func.func @matmul_transpose_b_with_swizzle

// THREAD-LABEL: func.func @matmul_transpose_b_with_swizzle
// THREAD: %[[EMPTY:.+]] = tensor.empty() : tensor<64x4xf16>
// THREAD: iree_codegen.swizzle_hint %[[EMPTY]][#iree_codegen.xor_shuffle<256, 32>] : tensor<64x4xf16>
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// RUN: iree-opt --mlir-print-local-scope --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-apply-tiling-level, canonicalize, cse))" %s | FileCheck %s

// Test: tensor.extract_slice of swizzle_hint(tensor.empty) should fold
// to swizzle_hint(tensor.empty) with the sliced shape.
// The swizzle attribute itself is carried over to the new hint unchanged.
func.func @fold_extract_slice_of_swizzle_hint() -> tensor<16x32xf32> {
  %empty = tensor.empty() : tensor<64x64xf32>
  %swizzle = iree_codegen.swizzle_hint %empty[#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
  %slice = tensor.extract_slice %swizzle[0, 0] [16, 32] [1, 1] : tensor<64x64xf32> to tensor<16x32xf32>
  return %slice : tensor<16x32xf32>
}

// CHECK-LABEL: func.func @fold_extract_slice_of_swizzle_hint
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<16x32xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[EMPTY]][#iree_codegen.rotate_rows<64, 4>] : tensor<16x32xf32>
// CHECK: return %[[SWIZZLE]]

// Test: tensor.extract_slice with dynamic sizes should fold correctly.
// The slice's dynamic size operands must become the dynamic sizes of the
// new tensor.empty (checked by the EMPTY capture below).
func.func @fold_extract_slice_dynamic(%size0: index, %size1: index) -> tensor<?x?xf32> {
  %empty = tensor.empty() : tensor<64x64xf32>
  %swizzle = iree_codegen.swizzle_hint %empty[#iree_codegen.xor_shuffle<128, 16>] : tensor<64x64xf32>
  %slice = tensor.extract_slice %swizzle[0, 0] [%size0, %size1] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
  return %slice : tensor<?x?xf32>
}

// CHECK-LABEL: func.func @fold_extract_slice_dynamic
// CHECK-SAME: %[[SIZE0:[A-Za-z0-9]+]]: index
// CHECK-SAME: %[[SIZE1:[A-Za-z0-9]+]]: index
// CHECK: %[[EMPTY:.+]] = tensor.empty(%[[SIZE0]], %[[SIZE1]]) : tensor<?x?xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[EMPTY]][#iree_codegen.xor_shuffle<128, 16>] : tensor<?x?xf32>
// CHECK: return %[[SWIZZLE]]

// Test: tensor.expand_shape of swizzle_hint(tensor.empty) should fold
// to swizzle_hint(tensor.empty) with the expanded shape.
func.func @fold_expand_shape_of_swizzle_hint() -> tensor<4x16x64xf32> {
  %empty = tensor.empty() : tensor<64x64xf32>
  %swizzle = iree_codegen.swizzle_hint %empty[#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
  %expanded = tensor.expand_shape %swizzle [[0, 1], [2]] output_shape [4, 16, 64] : tensor<64x64xf32> into tensor<4x16x64xf32>
  return %expanded : tensor<4x16x64xf32>
}

// CHECK-LABEL: func.func @fold_expand_shape_of_swizzle_hint
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<4x16x64xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[EMPTY]][#iree_codegen.rotate_rows<64, 4>] : tensor<4x16x64xf32>
// CHECK: return %[[SWIZZLE]]

// Test: tensor.collapse_shape of swizzle_hint(tensor.empty) should fold
// to swizzle_hint(tensor.empty) with the collapsed shape.
func.func @fold_collapse_shape_of_swizzle_hint() -> tensor<64x64xf32> {
  %empty = tensor.empty() : tensor<4x16x4x16xf32>
  %swizzle = iree_codegen.swizzle_hint %empty[#iree_codegen.rotate_rows<64, 4>] : tensor<4x16x4x16xf32>
  %collapsed = tensor.collapse_shape %swizzle [[0, 1], [2, 3]] : tensor<4x16x4x16xf32> into tensor<64x64xf32>
  return %collapsed : tensor<64x64xf32>
}

// CHECK-LABEL: func.func @fold_collapse_shape_of_swizzle_hint
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<64x64xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[EMPTY]][#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
// CHECK: return %[[SWIZZLE]]

// Negative test: extract_slice of swizzle_hint without tensor.empty source
// should NOT fold (the hinted buffer is a real value, not just a shape).
func.func @no_fold_extract_slice_non_empty(%arg0: tensor<64x64xf32>) -> tensor<16x32xf32> {
  %swizzle = iree_codegen.swizzle_hint %arg0[#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
  %slice = tensor.extract_slice %swizzle[0, 0] [16, 32] [1, 1] : tensor<64x64xf32> to tensor<16x32xf32>
  return %slice : tensor<16x32xf32>
}

// CHECK-LABEL: func.func @no_fold_extract_slice_non_empty
// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor<64x64xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[ARG0]][#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[SWIZZLE]]
// CHECK: return %[[SLICE]]

// Negative test: expand_shape of swizzle_hint without tensor.empty source
// should NOT fold.
func.func @no_fold_expand_shape_non_empty(%arg0: tensor<64x64xf32>) -> tensor<4x16x64xf32> {
  %swizzle = iree_codegen.swizzle_hint %arg0[#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
  %expanded = tensor.expand_shape %swizzle [[0, 1], [2]] output_shape [4, 16, 64] : tensor<64x64xf32> into tensor<4x16x64xf32>
  return %expanded : tensor<4x16x64xf32>
}

// CHECK-LABEL: func.func @no_fold_expand_shape_non_empty
// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor<64x64xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[ARG0]][#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
// CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[SWIZZLE]]
// CHECK: return %[[EXPANDED]]

// Test: XOR shuffle swizzle attribute is preserved through folding
// (companion to the rotate_rows cases above).
func.func @fold_xor_shuffle_swizzle() -> tensor<8x64xf32> {
  %empty = tensor.empty() : tensor<16x128xf32>
  %swizzle = iree_codegen.swizzle_hint %empty[#iree_codegen.xor_shuffle<128, 16>] : tensor<16x128xf32>
  %slice = tensor.extract_slice %swizzle[0, 0] [8, 64] [1, 1] : tensor<16x128xf32> to tensor<8x64xf32>
  return %slice : tensor<8x64xf32>
}

// CHECK-LABEL: func.func @fold_xor_shuffle_swizzle
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<8x64xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[EMPTY]][#iree_codegen.xor_shuffle<128, 16>] : tensor<8x64xf32>
// CHECK: return %[[SWIZZLE]]

// Test: Rank-reducing extract_slice should work correctly.
// The unit dimension [1, 32] -> tensor<32xf32> is dropped; the new
// tensor.empty takes the rank-reduced result type directly.
func.func @fold_rank_reducing_extract_slice() -> tensor<32xf32> {
  %empty = tensor.empty() : tensor<64x64xf32>
  %swizzle = iree_codegen.swizzle_hint %empty[#iree_codegen.rotate_rows<64, 4>] : tensor<64x64xf32>
  %slice = tensor.extract_slice %swizzle[0, 0] [1, 32] [1, 1] : tensor<64x64xf32> to tensor<32xf32>
  return %slice : tensor<32xf32>
}

// CHECK-LABEL: func.func @fold_rank_reducing_extract_slice
// CHECK: %[[EMPTY:.+]] = tensor.empty() : tensor<32xf32>
// CHECK: %[[SWIZZLE:.+]] = iree_codegen.swizzle_hint %[[EMPTY]][#iree_codegen.rotate_rows<64, 4>] : tensor<32xf32>
// CHECK: return %[[SWIZZLE]]

#encoding = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [bf16, bf16, bf16], user_indexing_maps = [affine_map<(m, n, k) -> (m, k)>, affine_map<(m, n, k) -> (k, n)>, affine_map<(m, n, k) -> (m, n)>], iteration_sizes = [?, ?, ?]>
// Test: the fold must preserve the tensor encoding attribute on the result.
// NOTE(review): this function has no FileCheck directives, so as written it
// only verifies the pass does not crash or emit invalid IR on encoded
// tensors — presumably the fold keeps #encoding on the new tensor.empty;
// consider adding CHECK lines to pin that down.
func.func @fold_swizzle_hint_of_encoding() -> tensor<16xbf16,#encoding> {
  %empty = tensor.empty() : tensor<8x16xbf16, #encoding>
  %swizzle = iree_codegen.swizzle_hint %empty[#iree_codegen.rotate_rows<8, 4>] : tensor<8x16xbf16, #encoding>
  %slice = tensor.extract_slice %swizzle[0, 0] [1, 16] [1, 1] : tensor<8x16xbf16, #encoding> to tensor<16xbf16,#encoding>
  return %slice : tensor<16xbf16,#encoding>
}
Original file line number Diff line number Diff line change
Expand Up @@ -173,9 +173,9 @@ def IREECodegen_SwizzleHintOp : Op<IREECodegen_Dialect, "swizzle_hint", [
is otherwise perfectly legal.
}];

let arguments = (ins RankedTensorOrMemRefOf<[AnyType], [1]>:$operand,
let arguments = (ins AnyRankedTensorOrMemRef:$operand,
IREECodegen_AnySwizzleAttr:$swizzle);
let results = (outs RankedTensorOrMemRefOf<[AnyType], [1]>:$result);
let results = (outs AnyRankedTensorOrMemRef:$result);

let assemblyFormat = [{
$operand `[` $swizzle attr-dict `]` `:` type($result)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,24 +101,44 @@ static FailureOr<Value> createSharedAllocDestination(RewriterBase &rewriter,
return failure();
}

auto empty = forallOp.getDpsInits()[0].getDefiningOp<tensor::EmptyOp>();
// Skip swizzle hint ops.
Operation *destination = forallOp.getDpsInits()[0].getDefiningOp();
if (auto swizzleOp = dyn_cast<IREE::Codegen::SwizzleHintOp>(destination)) {
destination = swizzleOp->getOperand(0).getDefiningOp();
}

// Fail if the destination is not a `tensor.empty` op and cannot be trivially
// converted to a `bufferization.alloc_tensor`.
auto empty = dyn_cast<tensor::EmptyOp>(destination);
if (!empty) {
return failure();
}

// Create a `bufferization.alloc_tensor` op with memory space
// `#gpu.address_space<workgroup>`.
Location loc = empty->getLoc();
OpBuilder::InsertionGuard g(rewriter);
rewriter.setInsertionPoint(empty);
Attribute sharedMemoryAddrSpace = gpu::AddressSpaceAttr::get(
rewriter.getContext(), gpu::GPUDialect::getWorkgroupAddressSpace());
auto allocTensor = bufferization::AllocTensorOp::create(
rewriter, empty->getLoc(), cast<TensorType>(empty.getResult().getType()),
rewriter, loc, cast<TensorType>(empty.getResult().getType()),
empty.getDynamicSizes(),
/*copy=*/Value(), /*size_hint=*/Value(),
/*memory_space=*/sharedMemoryAddrSpace);

// If the original `tensor.empty` has a swizzle hint, apply it to the new
// allocation. Note that if there is a swizzle hint, it will be the only user
// of the `tensor.empty` op.
if (auto swizzleHintOp =
dyn_cast<IREE::Codegen::SwizzleHintOp>(*empty->getUsers().begin())) {
assert(swizzleHintOp->hasOneUse() &&
"a tensor.empty op with a swizzle hint applied, should have the "
"swizzle hint as its only user");
auto newSwizzle = IREE::Codegen::SwizzleHintOp::create(
rewriter, loc, allocTensor.getResult(), swizzleHintOp.getSwizzle());
return newSwizzle.getResult();
}
return allocTensor.getResult();
}

Expand Down Expand Up @@ -2065,4 +2085,107 @@ void populateIREEGPULowerValueBarrierPatterns(RewritePatternSet &patterns) {
patterns.add<LowerValueBarrierPattern>(patterns.getContext());
}

//===----------------------------------------------------------------------===//
// SwizzleHintOp Fold Patterns
//===----------------------------------------------------------------------===//

// The following patterns are adapted from the populateFoldTensorEmptyPatterns
// in upstream llvm-project. The main change is to add support for folding with
// swizzle_hint ops from IREE. Once swizzle_hint ops are more widely used and
// proven stable, we could consider upstreaming this extension.

namespace {
/// Folds `tensor.extract_slice(swizzle_hint(tensor.empty))` into
/// `swizzle_hint(tensor.empty)` with the sliced result type, carrying the
/// swizzle attribute over unchanged. Mirrors the upstream
/// FoldEmptyTensorWithExtractSliceOp pattern, with the swizzle hint threaded
/// through the fold.
struct FoldSwizzleHintOpWithExtractSliceOp final
    : OpRewritePattern<tensor::ExtractSliceOp> {
  using Base::Base;
  LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp,
                                PatternRewriter &rewriter) const override {
    // Check for swizzle_hint op source.
    auto swizzleHintOp =
        sliceOp.getSource().getDefiningOp<IREE::Codegen::SwizzleHintOp>();
    if (!swizzleHintOp) {
      return failure();
    }

    // Check for tensor.empty source.
    auto emptyOp = swizzleHintOp.getOperand().getDefiningOp<tensor::EmptyOp>();
    if (!emptyOp) {
      return failure();
    }

    // Check for single use, i.e. the swizzle hint is the empty tensor's only
    // user. Otherwise the empty tensor carries data flow we must not rewrite.
    if (!emptyOp->hasOneUse()) {
      return failure();
    }

    // Create new tensor.empty op. tensor.extract_slice may be rank-reducing;
    // its dynamic sizes must be preserved as well as its result type (static
    // sizes are already baked into `sliceType`, so only the dynamic size
    // operands are forwarded).
    Location loc = sliceOp.getLoc();
    auto sliceType = cast<RankedTensorType>(sliceOp.getType());
    auto tensorType =
        RankedTensorType::get(sliceType.getShape(), sliceType.getElementType(),
                              sliceType.getEncoding());
    auto newEmptyOp =
        tensor::EmptyOp::create(rewriter, loc, tensorType, sliceOp.getSizes());
    // Re-apply the original swizzle attribute on the sliced empty tensor.
    rewriter.replaceOpWithNewOp<IREE::Codegen::SwizzleHintOp>(
        sliceOp, newEmptyOp, swizzleHintOp.getSwizzle());
    return success();
  }
};

/// Folds `ReshapeOp(swizzle_hint(tensor.empty))` into
/// `swizzle_hint(tensor.empty)` with the reshaped result type, where
/// ReshapeOp is tensor.expand_shape or tensor.collapse_shape. Mirrors the
/// upstream FoldEmptyTensorWithReshapeOp pattern, with the swizzle hint
/// threaded through the fold.
template <typename ReshapeOp>
struct FoldSwizzleHintOpWithReshapeOp final : OpRewritePattern<ReshapeOp> {
  using OpRewritePattern<ReshapeOp>::OpRewritePattern;
  LogicalResult matchAndRewrite(ReshapeOp reshapeOp,
                                PatternRewriter &rewriter) const override {
    // Check for swizzle_hint op source.
    auto swizzleHintOp =
        reshapeOp.getSrc()
            .template getDefiningOp<IREE::Codegen::SwizzleHintOp>();
    if (!swizzleHintOp) {
      return failure();
    }
    // Check for tensor.empty source behind the hint.
    auto emptyOp =
        swizzleHintOp.getOperand().template getDefiningOp<tensor::EmptyOp>();
    if (!emptyOp) {
      return failure();
    }

    // Check for single use, i.e. the swizzle hint is the empty tensor's only
    // user.
    if (!emptyOp->hasOneUse()) {
      return failure();
    }

    // Reify result shape so dynamic extents of the reshaped type can feed the
    // new tensor.empty op.
    Location loc = reshapeOp.getLoc();
    ReifiedRankedShapedTypeDims resultShapes;
    if (failed(reifyResultShapes(rewriter, reshapeOp, resultShapes)) ||
        !llvm::hasSingleElement(resultShapes)) {
      return failure();
    }

    // Create new tensor.empty op with the reified shape, then re-apply the
    // original swizzle attribute on it.
    Value emptyTensor =
        tensor::EmptyOp::create(rewriter, loc, resultShapes[0],
                                reshapeOp.getResultType().getElementType(),
                                reshapeOp.getResultType().getEncoding());
    Value newSwizzleHintOp = IREE::Codegen::SwizzleHintOp::create(
        rewriter, loc, emptyTensor, swizzleHintOp.getSwizzle());
    // The reified shape may be more dynamic/static than the reshape's result
    // type; insert a cast to keep the replacement type-exact.
    if (newSwizzleHintOp.getType() != reshapeOp.getResultType()) {
      rewriter.replaceOpWithNewOp<tensor::CastOp>(
          reshapeOp, reshapeOp.getResultType(), newSwizzleHintOp);
    } else {
      rewriter.replaceOp(reshapeOp, newSwizzleHintOp);
    }
    return success();
  }
};

} // namespace

void populateFoldSwizzleHintOpPatterns(RewritePatternSet &patterns) {
  MLIRContext *context = patterns.getContext();
  // Reshape folders: expand_shape / collapse_shape through a swizzle hint.
  patterns.add<FoldSwizzleHintOpWithReshapeOp<tensor::ExpandShapeOp>>(context);
  patterns.add<FoldSwizzleHintOpWithReshapeOp<tensor::CollapseShapeOp>>(context);
  // Slice folder: extract_slice through a swizzle hint.
  patterns.add<FoldSwizzleHintOpWithExtractSliceOp>(context);
}

} // namespace mlir::iree_compiler::IREE::GPU
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,9 @@ void populateIREEGPUVectorUnrollPatterns(
void populateIREEGPUVectorUnrollPatterns(RewritePatternSet &patterns);
void populateIREEGPUVectorizationPatterns(RewritePatternSet &patterns);

// Populate patterns to fold tensor.empty ops through swizzle hint ops.
void populateFoldSwizzleHintOpPatterns(RewritePatternSet &patterns);

} // namespace mlir::iree_compiler::IREE::GPU

#endif // IREE_COMPILER_CODEGEN_DIALECT_GPU_TRANSFORMS_TRANSFORMS_H_
Loading