From 7a815f8ba06fddc894aac7100232bedf4dd19d76 Mon Sep 17 00:00:00 2001
From: Whitney Tsang <whitney.tsang@intel.com>
Date: Thu, 24 Apr 2025 16:57:48 +0000
Subject: [PATCH 1/2] Limit prefetch to only densed memory

Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
---
 test/TritonIntelGPU/loop-pipeline.mlir             | 12 ++++++------
 test/TritonIntelGPU/split-barrier.mlir             |  8 ++++----
 .../Pipeliner/MatmulLoopPipeline.cpp               | 14 ++++++++++++++
 3 files changed, 24 insertions(+), 10 deletions(-)
diff --git a/test/TritonIntelGPU/loop-pipeline.mlir b/test/TritonIntelGPU/loop-pipeline.mlir
index 3624fcd166..4635fedea1 100644
--- a/test/TritonIntelGPU/loop-pipeline.mlir
+++ b/test/TritonIntelGPU/loop-pipeline.mlir
@@ -100,11 +100,11 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32,
       %74 = tt.splat %73 : i32 -> tensor<1x32xi32, #blocked>
       %75 = arith.cmpi slt, %33, %74 : tensor<1x32xi32, #blocked>
       %76 = tt.broadcast %75 : tensor<1x32xi1, #blocked> -> tensor<64x32xi1, #blocked>
-      %77 = tt.load %arg11, %76, %cst_0 : tensor<64x32x!tt.ptr<f16>, #blocked>
+      %77 = tt.load %arg11, %76, %cst_0 {triton_intel_gpu.block_io = "row_major"} : tensor<64x32x!tt.ptr<f16>, #blocked>
       %78 = tt.splat %73 : i32 -> tensor<32x1xi32, #blocked1>
       %79 = arith.cmpi slt, %40, %78 : tensor<32x1xi32, #blocked1>
       %80 = tt.broadcast %79 : tensor<32x1xi1, #blocked1> -> tensor<32x256xi1, #blocked1>
-      %81 = tt.load %arg12, %80, %cst_1 : tensor<32x256x!tt.ptr<f16>, #blocked1>
+      %81 = tt.load %arg12, %80, %cst_1 {triton_intel_gpu.block_io = "row_major"} : tensor<32x256x!tt.ptr<f16>, #blocked1>
       %82 = ttg.convert_layout %77 : tensor<64x32xf16, #blocked> -> tensor<64x32xf16, #dot0>
       %83 = ttg.convert_layout %81 : tensor<32x256xf16, #blocked1> -> tensor<32x256xf16, #dot1>
       %84 = tt.dot %82, %83, %arg10, inputPrecision = tf32 : tensor<64x32xf16, #dot0> * tensor<32x256xf16, #dot1> -> tensor<64x256xf32, #dpas>
@@ -175,8 +175,8 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
     // CHECK:        tt.dot {{.*}} : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<128x256xf32, #[[$DPAS]]>
     // CHECK-NEXT:   scf.yield
     %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c64_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<128x256xf32, #dpas>, !tt.ptr<tensor<128x64xf16, #dot0>>, !tt.ptr<tensor<64x256xf16, #dot1>>)  : i32 {
-      %56 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<128x64xf16, #dot0>>
-      %57 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #dot1>>
+      %56 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<128x64xf16, #dot0>>
+      %57 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<64x256xf16, #dot1>>
       %58 = tt.dot %56, %57, %arg10, inputPrecision = tf32 : tensor<128x64xf16, #dot0> * tensor<64x256xf16, #dot1> -> tensor<128x256xf32, #dpas>
       %59 = tt.advance %arg11, [%c0_i32, %c64_i32] : <tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>>>
       %60 = tt.advance %arg12, [%c64_i32, %c0_i32] : <tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
@@ -248,8 +248,8 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
     // CHECK:        tt.dot {{.*}} : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<128x256xf32, #[[$DPAS]]>
     // CHECK-NEXT:   scf.yield
     %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c64_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<128x256xf32, #dpas>, !tt.ptr<tensor<128x64xf16, #dot0>>, !tt.ptr<tensor<64x256xf16, #dot1>>)  : i32 {
-      %56 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<128x64xf16, #dot0>>
-      %57 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #dot1>>
+      %56 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<128x64xf16, #dot0>>
+      %57 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<64x256xf16, #dot1>>
       %58 = tt.dot %56, %57, %arg10, inputPrecision = tf32 : tensor<128x64xf16, #dot0> * tensor<64x256xf16, #dot1> -> tensor<128x256xf32, #dpas>
       %102 = tt.addptr %arg8, %c4_i32 : !tt.ptr<i32>, i32
       %100 = arith.addi %c0_i32, %c4_i32 : i32
diff --git a/test/TritonIntelGPU/split-barrier.mlir b/test/TritonIntelGPU/split-barrier.mlir
index 9edb69774c..db559c3e8b 100644
--- a/test/TritonIntelGPU/split-barrier.mlir
+++ b/test/TritonIntelGPU/split-barrier.mlir
@@ -33,8 +33,8 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
     // CHECK-NEXT:   scf.yield
     %23:3 = scf.for %arg2 = %c0_i32 to %c64_i32 step %c64_i32 iter_args(%arg3 = %cst, %arg4 = %18, %arg5 = %22) -> (tensor<128x256xf32, #dpas>, !tt.ptr<tensor<128x64xf16, #dot0>>, !tt.ptr<tensor<64x256xf16, #dot1>>)  : i32 {
       %55:3 = scf.for %arg9 = %c0_i32 to %c64_i32 step %c64_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<128x256xf32, #dpas>, !tt.ptr<tensor<128x64xf16, #dot0>>, !tt.ptr<tensor<64x256xf16, #dot1>>)  : i32 {
-      	    %56 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<128x64xf16, #dot0>>
-      	    %57 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #dot1>>
+      	    %56 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<128x64xf16, #dot0>>
+      	    %57 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<64x256xf16, #dot1>>
       	    %58 = tt.dot %56, %57, %arg10, inputPrecision = tf32 : tensor<128x64xf16, #dot0> * tensor<64x256xf16, #dot1> -> tensor<128x256xf32, #dpas>
       	    %59 = tt.advance %arg11, [%c0_i32, %c64_i32] : <tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>>>
       	    %60 = tt.advance %arg12, [%c64_i32, %c0_i32] : <tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
@@ -79,8 +79,8 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
     // SUBGROUP_SCOPE:   spirv.INTEL.ControlBarrierWait <Subgroup> <Subgroup> <None>
     // CHECK-NEXT:   scf.yield
     %23:3 = scf.for %arg9 = %c0_i32 to %c64_i32 step %c64_i32 iter_args(%arg10 = %cst, %arg11 = %18, %arg12 = %22) -> (tensor<128x256xf32, #dpas>, !tt.ptr<tensor<128x64xf16, #dot0>>, !tt.ptr<tensor<64x256xf16, #dot1>>)  : i32 {
-      %56 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<128x64xf16, #dot0>>
-      %57 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #dot1>>
+      %56 = tt.load %arg11 {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<128x64xf16, #dot0>>
+      %57 = tt.load %arg12 {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major"} : !tt.ptr<tensor<64x256xf16, #dot1>>
       %58 = tt.dot %56, %57, %arg10, inputPrecision = tf32 : tensor<128x64xf16, #dot0> * tensor<64x256xf16, #dot1> -> tensor<128x256xf32, #dpas>
       %59 = tt.advance %arg11, [%c0_i32, %c64_i32] : <tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>>>
       %60 = tt.advance %arg12, [%c64_i32, %c0_i32] : <tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp
index 031dd61824..84305abe0e 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp
@@ -132,6 +132,20 @@ static void collectOpsToPipeline(scf::ForOp forOp,
       if (!isBlockPtr && !supportRegularPtr)
         continue;
 
+      // Check if the memory is structed densely. If not, we do not prefetch it
+      // to avoid pollute cache.
+      Attribute blockIOAttr =
+          loadOp->getAttr(mlir::triton::gpu::intel::TritonIntelGPUDialect::
+                              getBlockIOAttrName());
+      if (!blockIOAttr) {
+        LLVM_DEBUG({
+          DBGS() << "skip the Load op to pipeline because the memory is not "
+                    "structured densely: ";
+          DBGS() << "  " << loadOp << "\n";
+        });
+        continue;
+      }
+
       std::optional<LoadDotOperand> loadWithDotOperand = loadDotOperand(loadOp);
       if (loadWithDotOperand.has_value())
         loadOps.push_back(loadWithDotOperand.value());

From 043ede7059137403dc675fbe1c572f0c609d55ed Mon Sep 17 00:00:00 2001
From: Whitney Tsang <whitney.tsang@intel.com>
Date: Thu, 24 Apr 2025 20:22:15 +0000
Subject: [PATCH 2/2] address review comments

Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
---
 .../Pipeliner/MatmulLoopPipeline.cpp                      | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp
index 84305abe0e..0211318cf3 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp
@@ -133,16 +133,12 @@ static void collectOpsToPipeline(scf::ForOp forOp,
         continue;
 
       // Check if the memory is structed densely. If not, we do not prefetch it
-      // to avoid pollute cache.
+      // to avoid polluting the cache.
       Attribute blockIOAttr =
           loadOp->getAttr(mlir::triton::gpu::intel::TritonIntelGPUDialect::
                               getBlockIOAttrName());
       if (!blockIOAttr) {
-        LLVM_DEBUG({
-          DBGS() << "skip the Load op to pipeline because the memory is not "
-                    "structured densely: ";
-          DBGS() << "  " << loadOp << "\n";
-        });
+        LDBG("Skipping LoadOp without block_io attribute" << *loadOp);
         continue;
       }