iree-org · Max191 · Jan 16, 2026 · Jan 5, 2026 · Jan 16, 2026 · Jan 16, 2026
@@ -7,10 +7,12 @@
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/DerivedConfigUtils.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/GPUTileSwizzleUtils.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
 #include "iree/compiler/Dialect/LinalgExt/Utils/MatchUtils.h"
 #include "iree/compiler/Utils/EncodingUtils.h"
 #include "iree/compiler/Utils/Indexing.h"
@@ -782,6 +784,59 @@ MMAAttr::buildUnderlyingOperations(OpBuilder &builder, Location loc,
   return failure();
 }
 
+/// Creates index_hint ops wrapping delinearized lane ID values.
+/// The `delinearizedLaneId` values come from delinearizing the lane ID using
+/// `basis`, with the innermost/fastest-varying dimension last.
+///
+/// Non-final indices get lane_constant hints (uniform across lane groups).
+/// The final index gets lane_increment hint (increments within lane group).
+/// The group size is derived from the innermost basis element.
+/// Indices with a unit basis are ignored, and given a lane_constant hint.
+static SmallVector<Value>
+createTransposeLoadIndexHint(OpBuilder &builder, Location loc,
+                             ValueRange delinearizedLaneId,
+                             ArrayRef<int64_t> basis) {
+  // Need at least 2 dimensions for transpose load pattern.
+  if (delinearizedLaneId.size() < 2) {
+    return SmallVector<Value>(delinearizedLaneId.begin(),
+                              delinearizedLaneId.end());
+  }
+
+  // Find the index of the innermost non-unit (> 1) basis element.
+  // This determines which result gets the lane-increment hint.
+  // Size-1 dimensions produce constant 0 outputs regardless of lane ID,
+  // so they don't contribute to the meaningful group structure.
+  int64_t groupSize = 1;
+  size_t incrementResultIdx = delinearizedLaneId.size() - 1;
+  // The delinearized indices could have N or N + 1 results, and the basis
+  // elements are aligned with the last N results, so iterate backwards
+  // together.
+  for (size_t i = 1; i <= basis.size(); ++i) {
+    groupSize = basis[basis.size() - i];
-  for (size_t i = 1; i <= basis.size(); ++i) {
-    groupSize = basis[basis.size() - i];
+  for (size_t i = 1, e = basis.size(); i <= e; ++i) {
+    groupSize = basis[e - i];
-  for (size_t i = 1; i <= basis.size(); ++i) {
-    groupSize = basis[basis.size() - i];
+  for (size_t i = 1, e = basis.size(); i <= e; ++i) {
+    groupSize = basis[e - i];
+    incrementResultIdx = delinearizedLaneId.size() - i;
+    if (groupSize > 1) {
+      break;
+    }
+  }
+
+  auto laneConstantAttr =
+      IREE::GPU::LaneConstantAttr::get(builder.getContext(), groupSize);
+  auto laneIncrementAttr = IREE::GPU::LaneIncrementAttr::get(
+      builder.getContext(), groupSize, /*step=*/1);
+
+  SmallVector<Value> results;
+  for (auto [i, value] : llvm::enumerate(delinearizedLaneId)) {
+    // The result corresponding to innermost non-unit basis gets lane-increment;
+    // all other results get lane-constant hints.
+    Attribute hint = (i == incrementResultIdx) ? Attribute(laneIncrementAttr)
+                                               : Attribute(laneConstantAttr);
+    auto hintOp = IREE::Codegen::IndexHintOp::create(builder, loc, value, hint);
+    results.push_back(hintOp.getResult());
+  }
+
+  return results;
+}
+
 static LogicalResult populateCanonicalOffsetsSizesAndStrides(
     OpBuilder &builder, Location loc, Value laneId,
     ArrayRef<int64_t> permutation, MMASingleSubgroupLayout subgroupLayout,
@@ -819,6 +874,12 @@ static LogicalResult populateCanonicalOffsetsSizesAndStrides(
   auto splitLaneId = affine::AffineDelinearizeIndexOp::create(
       builder, loc, laneId, vtidBasis, /*hasOuterBound=*/false);
 
+  // Wrap delinearize results with index_hint ops for transpose load.
+  // The delinearize results are already in the correct order
+  // (innermost/fastest-varying dimension is last).
+  SmallVector<Value> hintedSplitLaneId = createTransposeLoadIndexHint(
+      builder, loc, splitLaneId.getResults(), vtidBasis);
+
   // Each thread grabs `element` contiguous data, so the vtid needs to be
   // multiplied by `element` to get the next bunch of data.
   // vtid: virtual thread id
@@ -830,7 +891,7 @@ static LogicalResult populateCanonicalOffsetsSizesAndStrides(
   // worsen the generated code quality.
   for (auto [splitResultIdx, element] :
        llvm::zip_equal(dimToVtid, subgroupLayout.element)) {
-    Value vtid = splitLaneId.getResult(splitResultIdx);
+    Value vtid = hintedSplitLaneId[splitResultIdx];
     int64_t vtidLen = vtidBasis[splitResultIdx - 1];
     if (element != 1) {
       vtid = affine::AffineLinearizeIndexOp::create(

@@ -35,17 +35,19 @@ module attributes { transform.with_named_sequence } {
 //  CHECK-SAME:   %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x16x16xf32>
 //       CHECK:   scf.forall (%[[LANE_ID:.+]]) in (64) shared_outs(%[[ITER_ARG:.+]] = %[[ACC]]) -> (tensor<2x2x16x16xf32>)
 //       CHECK:     %[[ID:.+]]:3 = affine.delinearize_index %[[LANE_ID]] into (4, 16)
-//       CHECK:     %[[ID1:.+]]  = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (4, 4)
-//       CHECK:     %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID]]#2, %[[ID1]]]
+//       CHECK:     %[[ROW:.+]] = iree_codegen.index_hint %[[ID]]#1(#iree_gpu.lane_constant<16>) : index
+//       CHECK:     %[[COL:.+]] = iree_codegen.index_hint %[[ID]]#2(#iree_gpu.lane_increment<16>) : index
+//       CHECK:     %[[ID1:.+]] = affine.linearize_index disjoint [%[[ROW]], %c0] by (4, 4)
+//       CHECK:     %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[COL]], %[[ID1]]]
 //  CHECK-SAME:       [2, 2, 1, 4] [1, 1, 1, 1] : tensor<2x2x16x16xf16> to tensor<2x2x1x4xf16>
-//       CHECK:     %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID1]], %[[ID]]#2]
+//       CHECK:     %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID1]], %[[COL]]]
 //  CHECK-SAME:       [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x16x16xf16> to tensor<2x2x4x1xf16>
-//       CHECK:     %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ITER_ARG]][0, 0, %[[ID1]], %[[ID]]#2]
+//       CHECK:     %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ITER_ARG]][0, 0, %[[ID1]], %[[COL]]]
 //  CHECK-SAME:       [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x16x16xf32> to tensor<2x2x4x1xf32>
 //       CHECK:     %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS_SLICE]], %[[RHS_SLICE]]) outs(%[[ACC_SLICE]])
 //  CHECK-SAME:       : tensor<2x2x1x4xf16>, tensor<2x2x4x1xf16> into tensor<2x2x4x1xf32>
 //       CHECK:     scf.forall.in_parallel
-//       CHECK:       tensor.parallel_insert_slice %[[MMA]] into %[[ITER_ARG]][0, 0, %[[ID1]], %[[ID]]#2]
+//       CHECK:       tensor.parallel_insert_slice %[[MMA]] into %[[ITER_ARG]][0, 0, %[[ID1]], %[[COL]]]
 //  CHECK-SAME:         [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x4x1xf32> into tensor<2x2x16x16xf32>
 //       CHECK:   mapping = [#iree_gpu.lane_id<0>]
 
@@ -87,17 +89,19 @@ module attributes { transform.with_named_sequence } {
 //  CHECK-SAME:   %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x16x16xi32>
 //       CHECK:   scf.forall (%[[LANE_ID:.+]]) in (64) shared_outs(%[[ITER_ARG:.+]] = %[[ACC]]) -> (tensor<2x2x16x16xi32>)
 //       CHECK:     %[[ID:.+]]:3  = affine.delinearize_index %[[LANE_ID]] into (4, 16)
-//       CHECK:     %[[ID1:.+]] = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (4, 8)
-//       CHECK:     %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID]]#2, %[[ID1]]]
+//       CHECK:     %[[ROW:.+]] = iree_codegen.index_hint %[[ID]]#1(#iree_gpu.lane_constant<16>) : index
+//       CHECK:     %[[COL:.+]] = iree_codegen.index_hint %[[ID]]#2(#iree_gpu.lane_increment<16>) : index
+//       CHECK:     %[[ID1:.+]] = affine.linearize_index disjoint [%[[ROW]], %c0] by (4, 8)
+//       CHECK:     %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[COL]], %[[ID1]]]
 //  CHECK-SAME:       [2, 2, 1, 8] [1, 1, 1, 1] : tensor<2x2x16x32xi8> to tensor<2x2x1x8xi8>
-//       CHECK:     %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID]]#2, %[[ID1]]]
+//       CHECK:     %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[COL]], %[[ID1]]]
 //  CHECK-SAME:       [2, 2, 1, 8] [1, 1, 1, 1] : tensor<2x2x16x32xi8> to tensor<2x2x1x8xi8>
-//       CHECK:     %[[ID1_2:.+]] = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (4, 4)
-//       CHECK:     %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ITER_ARG]][0, 0, %[[ID1_2]], %[[ID]]#2]
+//       CHECK:     %[[ID1_2:.+]] = affine.linearize_index disjoint [%[[ROW]], %c0] by (4, 4)
+//       CHECK:     %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ITER_ARG]][0, 0, %[[ID1_2]], %[[COL]]]
 //  CHECK-SAME:       [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x16x16xi32> to tensor<2x2x4x1xi32>
 //       CHECK:     %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS_SLICE]], %[[RHS_SLICE]]) outs(%[[ACC_SLICE]])
 //  CHECK-SAME:       : tensor<2x2x1x8xi8>, tensor<2x2x1x8xi8> into tensor<2x2x4x1xi32>
 //       CHECK:     scf.forall.in_parallel
-//       CHECK:       tensor.parallel_insert_slice %[[MMA]] into %[[ITER_ARG]][0, 0, %[[ID1_2]], %[[ID]]#2]
+//       CHECK:       tensor.parallel_insert_slice %[[MMA]] into %[[ITER_ARG]][0, 0, %[[ID1_2]], %[[COL]]]
 //  CHECK-SAME:         [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x4x1xi32> into tensor<2x2x16x16xi32>
 //       CHECK:   mapping = [#iree_gpu.lane_id<0>]