compiler/src/iree/compiler/Codegen/Common/BUILD.bazel (1 addition, 0 deletions)
@@ -195,6 +195,7 @@ iree_compiler_cc_library(
"//compiler/src/iree/compiler/Codegen/Common:FoldTensorExtractOpIncGen",
"//compiler/src/iree/compiler/Codegen/Dialect/CPU/IR:IREECPUDialect",
"//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect",
"//compiler/src/iree/compiler/Codegen/Dialect/Codegen/Transforms:IREECodegenTransforms",
"//compiler/src/iree/compiler/Codegen/Dialect/Codegen/Utils",
"//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect",
"//compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils:KnownTargets",
compiler/src/iree/compiler/Codegen/Common/BlockDynamicDimensions.cpp
@@ -6,6 +6,7 @@

#include "iree/compiler/Codegen/Common/TensorDynamicDimAnalysis.h"
#include "iree/compiler/Codegen/Common/Transforms.h"
#include "iree/compiler/Codegen/Dialect/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
#include "iree/compiler/Dialect/LinalgExt/Transforms/Transforms.h"
@@ -318,6 +319,8 @@ void BlockDynamicDimensionsPass::runOnOperation() {
controlFusionFn);
IREE::LinalgExt::populateFoldReshapeOpsByExpansionPatterns(patterns,
controlFusionFn);
IREE::Codegen::populateFoldReshapeOpsByExpansionPatterns(patterns,
controlFusionFn);
// Add patterns to fold `tensor.empty` operations with their consumers.
tensor::populateFoldTensorEmptyPatterns(patterns);
// Add some additional patterns that can simplify the IR.
@@ -367,6 +370,8 @@ void BlockDynamicDimensionsPass::runOnOperation() {
controlFn);
IREE::LinalgExt::populateFoldReshapeOpsByExpansionPatterns(
bubbleExpandShapePatterns, controlFn);
IREE::Codegen::populateFoldReshapeOpsByExpansionPatterns(
bubbleExpandShapePatterns, controlFn);
// Add patterns to fold the "bubbled-up" `tensor.expand_shape` operation and
// "pushed-down" `tensor.collapse_shape` operation with their interface
// bindings or `tensor.empty` operations.
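The new IREE::Codegen entry point mirrors the existing linalg and IREE::LinalgExt populate functions: each takes a RewritePatternSet and a linalg::ControlFusionFn (a std::function<bool(OpOperand *)> that returns true when the reshape feeding that operand may be propagated), so a single predicate can gate all three pattern families. A minimal sketch of the wiring, assuming an MLIRContext *ctx and an illustrative predicate rather than this pass's actual callback:

// Sketch only: gate all three reshape-propagation families with one predicate.
linalg::ControlFusionFn controlFn = [](OpOperand *fusedOperand) {
  // Illustrative check: only propagate when the operand is produced by an op
  // (not a block argument); the real passes apply their own fusion-control logic.
  return fusedOperand->get().getDefiningOp() != nullptr;
};
RewritePatternSet patterns(ctx);
linalg::populateFoldReshapeOpsByExpansionPatterns(patterns, controlFn);
IREE::LinalgExt::populateFoldReshapeOpsByExpansionPatterns(patterns, controlFn);
IREE::Codegen::populateFoldReshapeOpsByExpansionPatterns(patterns, controlFn);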
compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt (1 addition, 0 deletions)
@@ -229,6 +229,7 @@ iree_cc_library(
iree::compiler::Codegen::Common::FoldTensorExtractOpIncGen
iree::compiler::Codegen::Dialect::CPU::IR::IREECPUDialect
iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect
iree::compiler::Codegen::Dialect::Codegen::Transforms::IREECodegenTransforms
iree::compiler::Codegen::Dialect::Codegen::Utils
iree::compiler::Codegen::Dialect::GPU::IR::IREEGPUDialect
iree::compiler::Codegen::Dialect::GPU::TargetUtils::KnownTargets
compiler/src/iree/compiler/Codegen/Common/PropagateReshapesByExpansion.cpp
@@ -7,6 +7,7 @@
#include "iree/compiler/Codegen/Common/Transforms.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h"
#include "iree/compiler/Codegen/Dialect/Codegen/Transforms/Transforms.h"
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
@@ -412,6 +413,8 @@ void PropagateReshapesByExpansionPass::runOnOperation() {
};
linalg::populateFoldReshapeOpsByExpansionPatterns(bubbleExpandShapePatterns,
bubbleUpExpansionControlFn);
IREE::Codegen::populateFoldReshapeOpsByExpansionPatterns(
bubbleExpandShapePatterns, bubbleUpExpansionControlFn);
// Add patterns to do some additional cleanup (on top of canonicalizations
// that can be done later) of reshape ops.
tensor::populateFoldTensorEmptyPatterns(bubbleExpandShapePatterns);
@@ -459,3 +459,205 @@ func.func @no_swap_rank_reducing_slice(%arg0: tensor<3x6xi8>) -> tensor<3xi16> {
// CHECK-SAME: %[[ARG0:[A-Za-z0-9]+]]: tensor<3x6xi8>
// CHECK-NEXT: %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]]
// CHECK-NEXT: iree_tensor_ext.bitcast %[[SLICE]]

// -----

// Test propagating collapse_shape producer through inner_tiled op.
// Uses standard 2D matmul indexing maps with the MFMA_F32_16x16x16_F16 layout.
// Tensor shapes: LHS[outer_m, outer_k, 16, 16], RHS[outer_k, outer_n, 16, 16], ACC[outer_m, outer_n, 16, 16]
#contraction_accesses = [
affine_map<(m, n, k) -> (m, k)>,
affine_map<(m, n, k) -> (k, n)>,
affine_map<(m, n, k) -> (m, n)>
]
func.func @propagate_collapse_through_inner_tiled(
%src: tensor<2x3x4x16x16xf16>, %rhs: tensor<4x2x16x16xf16>, %out: tensor<6x2x16x16xf32>)
-> tensor<6x2x16x16xf32> {
// Collapse the first two outer dims of LHS: [2,3] -> [6]
%collapsed = tensor.collapse_shape %src [[0, 1], [2], [3], [4]]
: tensor<2x3x4x16x16xf16> into tensor<6x4x16x16xf16>
%result = iree_codegen.inner_tiled ins(%collapsed, %rhs) outs(%out) {
indexing_maps = #contraction_accesses,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
permutations = [array<i64: 0, 1>, array<i64: 1, 0>, array<i64: 0, 1>],
semantics = #iree_gpu.mma_semantics<distributed = false, opaque = true>
} : tensor<6x4x16x16xf16>, tensor<4x2x16x16xf16> into tensor<6x2x16x16xf32>
return %result : tensor<6x2x16x16xf32>
}

// CHECK-LABEL: func @propagate_collapse_through_inner_tiled
// CHECK-SAME: %[[SRC:[A-Za-z0-9]+]]: tensor<2x3x4x16x16xf16>
// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<4x2x16x16xf16>
// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: tensor<6x2x16x16xf32>
// CHECK: %[[EXPANDED_OUT:.+]] = tensor.expand_shape %[[OUT]] {{\[}}[0, 1], [2], [3], [4]{{\]}}
// CHECK-SAME: : tensor<6x2x16x16xf32> into tensor<2x3x2x16x16xf32>
// CHECK: %[[INNER_TILED:.+]] = iree_codegen.inner_tiled
// CHECK-SAME: ins(%[[SRC]], %[[RHS]])
// CHECK-SAME: outs(%[[EXPANDED_OUT]])
// CHECK-SAME: indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>,
// CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
// CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>]
// CHECK-SAME: iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>]
// CHECK-SAME: : tensor<2x3x4x16x16xf16>, tensor<4x2x16x16xf16> into tensor<2x3x2x16x16xf32>
// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[INNER_TILED]] {{\[}}[0, 1], [2], [3], [4]{{\]}}
// CHECK-SAME: : tensor<2x3x2x16x16xf32> into tensor<6x2x16x16xf32>
// CHECK: return %[[COLLAPSED]]

// -----

// Test propagating expand_shape consumer through inner_tiled op.
#contraction_accesses2 = [
affine_map<(m, n, k) -> (m, k)>,
affine_map<(m, n, k) -> (k, n)>,
affine_map<(m, n, k) -> (m, n)>
]
func.func @propagate_expand_through_inner_tiled(
%lhs: tensor<6x4x16x16xf16>, %rhs: tensor<4x2x16x16xf16>, %out: tensor<6x2x16x16xf32>)
-> tensor<2x3x2x16x16xf32> {
%result = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%out) {
indexing_maps = #contraction_accesses2,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
permutations = [array<i64: 0, 1>, array<i64: 1, 0>, array<i64: 0, 1>],
semantics = #iree_gpu.mma_semantics<distributed = false, opaque = true>
} : tensor<6x4x16x16xf16>, tensor<4x2x16x16xf16> into tensor<6x2x16x16xf32>
%expanded = tensor.expand_shape %result [[0, 1], [2], [3], [4]]
output_shape [2, 3, 2, 16, 16] : tensor<6x2x16x16xf32> into tensor<2x3x2x16x16xf32>
return %expanded : tensor<2x3x2x16x16xf32>
}

// CHECK-LABEL: func @propagate_expand_through_inner_tiled
// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<6x4x16x16xf16>
// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<4x2x16x16xf16>
// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: tensor<6x2x16x16xf32>
// CHECK-DAG: %[[EXPANDED_OUT:.+]] = tensor.expand_shape %[[OUT]] {{\[}}[0, 1], [2], [3], [4]{{\]}}
// CHECK-SAME: : tensor<6x2x16x16xf32> into tensor<2x3x2x16x16xf32>
// CHECK-DAG: %[[EXPANDED_LHS:.+]] = tensor.expand_shape %[[LHS]] {{\[}}[0, 1], [2], [3], [4]{{\]}}
// CHECK-SAME: : tensor<6x4x16x16xf16> into tensor<2x3x4x16x16xf16>
// CHECK: %[[INNER_TILED:.+]] = iree_codegen.inner_tiled
// CHECK-SAME: ins(%[[EXPANDED_LHS]], %[[RHS]])
// CHECK-SAME: outs(%[[EXPANDED_OUT]])
// CHECK-SAME: indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
// CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d3, d2)>,
// CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>]
// CHECK-SAME: iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]
// CHECK-SAME: : tensor<2x3x4x16x16xf16>, tensor<4x2x16x16xf16> into tensor<2x3x2x16x16xf32>
// CHECK: return %[[INNER_TILED]]

// -----

// Test that reshape touching inner dimensions is NOT propagated.
#contraction_accesses3 = [
affine_map<(m, n, k) -> (m, k)>,
affine_map<(m, n, k) -> (k, n)>,
affine_map<(m, n, k) -> (m, n)>
]
func.func @no_propagate_inner_dim_reshape(
%src: tensor<6x4x16x2x8xf16>, %rhs: tensor<4x2x16x16xf16>, %out: tensor<6x2x16x16xf32>)
-> tensor<6x2x16x16xf32> {
// The collapse touches inner dims [3, 4], which are part of the inner tile, so it should NOT propagate.
%collapsed = tensor.collapse_shape %src [[0], [1], [2], [3, 4]]
: tensor<6x4x16x2x8xf16> into tensor<6x4x16x16xf16>
%result = iree_codegen.inner_tiled ins(%collapsed, %rhs) outs(%out) {
indexing_maps = #contraction_accesses3,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
permutations = [array<i64: 0, 1>, array<i64: 1, 0>, array<i64: 0, 1>],
semantics = #iree_gpu.mma_semantics<distributed = false, opaque = true>
} : tensor<6x4x16x16xf16>, tensor<4x2x16x16xf16> into tensor<6x2x16x16xf32>
return %result : tensor<6x2x16x16xf32>
}

// CHECK-LABEL: func @no_propagate_inner_dim_reshape
// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape
// CHECK: iree_codegen.inner_tiled ins(%[[COLLAPSED]],

// -----

// Test propagating collapse_shape producer through inner_tiled op with dynamic outer shapes.
#contraction_accesses_dyn1 = [
affine_map<(m, n, k) -> (m, k)>,
affine_map<(m, n, k) -> (k, n)>,
affine_map<(m, n, k) -> (m, n)>
]
func.func @propagate_collapse_through_inner_tiled_dynamic(
%src: tensor<?x3x4x16x16xf16>, %rhs: tensor<4x2x16x16xf16>, %out: tensor<?x2x16x16xf32>)
-> tensor<?x2x16x16xf32> {
// Collapse the first two outer dims of LHS: [?, 3] -> [?*3]
%collapsed = tensor.collapse_shape %src [[0, 1], [2], [3], [4]]
: tensor<?x3x4x16x16xf16> into tensor<?x4x16x16xf16>
%result = iree_codegen.inner_tiled ins(%collapsed, %rhs) outs(%out) {
indexing_maps = #contraction_accesses_dyn1,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
permutations = [array<i64: 0, 1>, array<i64: 1, 0>, array<i64: 0, 1>],
semantics = #iree_gpu.mma_semantics<distributed = false, opaque = true>
} : tensor<?x4x16x16xf16>, tensor<4x2x16x16xf16> into tensor<?x2x16x16xf32>
return %result : tensor<?x2x16x16xf32>
}

// CHECK-LABEL: func @propagate_collapse_through_inner_tiled_dynamic
// CHECK-SAME: %[[SRC:[A-Za-z0-9]+]]: tensor<?x3x4x16x16xf16>
// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<4x2x16x16xf16>
// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: tensor<?x2x16x16xf32>
// CHECK-DAG: %[[DIM:.+]] = tensor.dim %[[SRC]], %c0
// CHECK: %[[EXPANDED_OUT:.+]] = tensor.expand_shape %[[OUT]] {{\[}}[0, 1], [2], [3], [4]{{\]}}
// CHECK-SAME: output_shape [%[[DIM]], 3, 2, 16, 16]
// CHECK-SAME: : tensor<?x2x16x16xf32> into tensor<?x3x2x16x16xf32>
// CHECK: %[[INNER_TILED:.+]] = iree_codegen.inner_tiled
// CHECK-SAME: ins(%[[SRC]], %[[RHS]])
// CHECK-SAME: outs(%[[EXPANDED_OUT]])
// CHECK-SAME: indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>,
// CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d2, d3)>,
// CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>]
// CHECK-SAME: iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>, #linalg.iterator_type<parallel>]
// CHECK-SAME: : tensor<?x3x4x16x16xf16>, tensor<4x2x16x16xf16> into tensor<?x3x2x16x16xf32>
// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[INNER_TILED]] {{\[}}[0, 1], [2], [3], [4]{{\]}}
// CHECK-SAME: : tensor<?x3x2x16x16xf32> into tensor<?x2x16x16xf32>
// CHECK: return %[[COLLAPSED]]

// -----

// Test propagating expand_shape consumer through inner_tiled op with dynamic outer shapes.
#contraction_accesses_dyn2 = [
affine_map<(m, n, k) -> (m, k)>,
affine_map<(m, n, k) -> (k, n)>,
affine_map<(m, n, k) -> (m, n)>
]
func.func @propagate_expand_through_inner_tiled_dynamic(
%lhs: tensor<?x4x16x16xf16>, %rhs: tensor<4x2x16x16xf16>, %out: tensor<?x2x16x16xf32>,
%dyn_dim: index)
-> tensor<?x3x2x16x16xf32> {
%result = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%out) {
indexing_maps = #contraction_accesses_dyn2,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
permutations = [array<i64: 0, 1>, array<i64: 1, 0>, array<i64: 0, 1>],
semantics = #iree_gpu.mma_semantics<distributed = false, opaque = true>
} : tensor<?x4x16x16xf16>, tensor<4x2x16x16xf16> into tensor<?x2x16x16xf32>
%expanded = tensor.expand_shape %result [[0, 1], [2], [3], [4]]
output_shape [%dyn_dim, 3, 2, 16, 16] : tensor<?x2x16x16xf32> into tensor<?x3x2x16x16xf32>
return %expanded : tensor<?x3x2x16x16xf32>
}

// CHECK-LABEL: func @propagate_expand_through_inner_tiled_dynamic
// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<?x4x16x16xf16>
// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<4x2x16x16xf16>
// CHECK-SAME: %[[OUT:[A-Za-z0-9]+]]: tensor<?x2x16x16xf32>
// CHECK-SAME: %[[DYN_DIM:[A-Za-z0-9]+]]: index
// CHECK-DAG: %[[EXPANDED_OUT:.+]] = tensor.expand_shape %[[OUT]] {{\[}}[0, 1], [2], [3], [4]{{\]}}
// CHECK-SAME: output_shape [%[[DYN_DIM]], 3, 2, 16, 16]
// CHECK-SAME: : tensor<?x2x16x16xf32> into tensor<?x3x2x16x16xf32>
// CHECK-DAG: %[[EXPANDED_LHS:.+]] = tensor.expand_shape %[[LHS]] {{\[}}[0, 1], [2], [3], [4]{{\]}}
// CHECK-SAME: output_shape [%[[DYN_DIM]], 3, 4, 16, 16]
// CHECK-SAME: : tensor<?x4x16x16xf16> into tensor<?x3x4x16x16xf16>
// CHECK: %[[INNER_TILED:.+]] = iree_codegen.inner_tiled
// CHECK-SAME: ins(%[[EXPANDED_LHS]], %[[RHS]])
// CHECK-SAME: outs(%[[EXPANDED_OUT]])
// CHECK-SAME: indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>,
// CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d3, d2)>,
// CHECK-SAME: affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>]
// CHECK-SAME: iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]
// CHECK-SAME: : tensor<?x3x4x16x16xf16>, tensor<4x2x16x16xf16> into tensor<?x3x2x16x16xf32>
// CHECK: return %[[INNER_TILED]]
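Reading the CHECK lines above: expanding one outer dimension of the (m, n, k) iteration space into two produces a four-iterator op, and every indexing map that referenced the expanded dimension lists both new dimensions in its place. In illustrative notation for the expand_shape test, with m split into (m0, m1):

  affine_map<(m, n, k) -> (m, k)>             // LHS map before, m of size 6 = 2*3
  affine_map<(m0, m1, n, k) -> (m0, m1, k)>   // LHS map after expansion

When the reshape is a collapse_shape producer, the expanded result is collapsed back afterwards (the trailing tensor.collapse_shape) so downstream users see the original type; when it is an expand_shape consumer, the remaining operands are expanded instead and the reshape itself disappears.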
compiler/src/iree/compiler/Codegen/Dialect/Codegen/Transforms/BUILD.bazel (new file)
@@ -0,0 +1,35 @@
# Copyright 2026 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

load("//build_tools/bazel:build_defs.oss.bzl", "iree_compiler_cc_library")

package(
default_visibility = ["//visibility:public"],
features = ["layering_check"],
licenses = ["notice"], # Apache 2.0
)

iree_compiler_cc_library(
name = "IREECodegenTransforms",
srcs = [
"ReshapeFusion.cpp",
],
hdrs = [
"Transforms.h",
],
deps = [
"//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect",
"@llvm-project//llvm:Support",
"@llvm-project//mlir:DialectUtils",
"@llvm-project//mlir:IR",
"@llvm-project//mlir:LinalgDialect",
"@llvm-project//mlir:LinalgTransforms",
"@llvm-project//mlir:Support",
"@llvm-project//mlir:TensorDialect",
"@llvm-project//mlir:TransformUtils",
"@llvm-project//mlir:Transforms",
],
)
compiler/src/iree/compiler/Codegen/Dialect/Codegen/Transforms/CMakeLists.txt (new file)
@@ -0,0 +1,33 @@
################################################################################
# Autogenerated by build_tools/bazel_to_cmake/bazel_to_cmake.py from #
# compiler/src/iree/compiler/Codegen/Dialect/Codegen/Transforms/BUILD.bazel #
# #
# Use iree_cmake_extra_content from iree/build_defs.oss.bzl to add arbitrary #
# CMake-only content. #
# #
# To disable autogeneration for this file entirely, delete this header. #
################################################################################

iree_add_all_subdirs()

iree_cc_library(
NAME
IREECodegenTransforms
HDRS
"Transforms.h"
SRCS
"ReshapeFusion.cpp"
DEPS
LLVMSupport
MLIRIR
MLIRLinalgDialect
MLIRLinalgTransforms
MLIRSupport
MLIRTensorDialect
MLIRTransformUtils
MLIRTransforms
iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect
PUBLIC
)

### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ###