diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp index 086f1a4555db..a7f88042d3df 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTensorTile.cpp @@ -144,6 +144,9 @@ class TileConsumerAndFuseInputProducer final auto tilingOp = sliceOp.getSource().getDefiningOp(); if (!tilingOp) continue; + if (isa(sliceOp.getSource().getDefiningOp())) { + continue; + } // Restrict to fully parallel ops for now for simplicity. auto isParallel = [](utils::IteratorType it) { return linalg::isParallelIterator(it); diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTile.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTile.cpp index 6380fa0fba7c..b8019278836d 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTile.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUTile.cpp @@ -117,11 +117,18 @@ static LogicalResult tileAndDistributeToThreads(TilingInterface consumerOp, IRRewriter rewriter(context); SmallVector tileSizesOfr = getAsIndexOpFoldResult(context, tileSizes); + scf::SCFTilingOptions tilingOptions; + tilingOptions.setTileSizes(tileSizesOfr); + scf::SCFTileAndFuseOptions tileAndFuseOptions; + tileAndFuseOptions.setTilingOptions(tilingOptions); + tileAndFuseOptions.setFusionControlFn( + [](tensor::ExtractSliceOp sliceOp, OpResult origProducer, + bool isDestinationOperand) -> std::tuple { + return {!isa(origProducer.getOwner()), false}; + }); FailureOr tileAndFuseResult = - scf::tileConsumerAndFuseProducersUsingSCF( - rewriter, consumerOp, - scf::SCFTileAndFuseOptions().setTilingOptions( - scf::SCFTilingOptions().setTileSizes(tileSizesOfr))); + scf::tileConsumerAndFuseProducersUsingSCF(rewriter, consumerOp, + tileAndFuseOptions); if (failed(tileAndFuseResult)) { return consumerOp.emitOpError("failed tiling and fusing producers"); diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp index 154ab37ecc03..659949ab9d06 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/DispatchABI.cpp @@ -546,7 +546,7 @@ HALDispatchABI::buildScopeAttr(mlir::ModuleOp moduleOp, /*scopeline=*/1, LLVM::DISubprogramFlags::Definition | LLVM::DISubprogramFlags::Optimized, - subroutineTypeAttr); + subroutineTypeAttr, /*retainedNodes =*/{}); } // Returns the most local DISubprogramAttr starting from |forOp|. diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp index 40c8314ba7f5..ce7e3b7d5fc6 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileAndFuse.cpp @@ -183,6 +183,9 @@ LogicalResult applyTileAndFuse(RewriterBase &rewriter, Operation *rootOp, // Traverse the slices in BFS fashion. tensor::ExtractSliceOp candidateSliceOp = candidates.front(); candidates.pop_front(); + if (candidateSliceOp.getSource().getDefiningOp()) { + continue; + } // Materialize the slice of the producer in place. std::optional fusedProducer = diff --git a/third_party/llvm-project b/third_party/llvm-project index 085448c918aa..f6935c777f67 160000 --- a/third_party/llvm-project +++ b/third_party/llvm-project @@ -1 +1 @@ -Subproject commit 085448c918aa3b730cdd3e497892cfeff0ed60a6 +Subproject commit f6935c777f675490ecb2327887dbac5c7d7fce1f