From 3686f80f52f13575d44fc69161ff968155c0d7b4 Mon Sep 17 00:00:00 2001 From: "Tiotto, Ettore" Date: Tue, 29 Apr 2025 18:18:00 +0000 Subject: [PATCH 1/2] Only prefetch 2D loads Signed-off-by: Tiotto, Ettore --- .../Pipeliner/MatmulLoopPipeline.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp index e6bd165d3d..4cf9d0c375 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp @@ -133,8 +133,8 @@ static void collectOpsToPipeline(scf::ForOp forOp, if (!isBlockPtr && !supportRegularPtr) continue; - // Check if the memory is structed densely. If not, we do not prefetch it - // to avoid polluting the cache. + // In order to avoid polluting the cache, do not prefetch loads unless the + // memory they reference is densely structured. Attribute blockIOAttr = loadOp->getAttr(mlir::triton::gpu::intel::TritonIntelGPUDialect:: getBlockIOAttrName()); @@ -143,6 +143,12 @@ static void collectOpsToPipeline(scf::ForOp forOp, continue; } + // Currently we can only prefetch 2D loads. + if (cast(loadOp.getType()).getRank() != 2) { + LDBG("Skipping LoadOp with non 2D tensor type" << *loadOp); + continue; + } + std::optional loadWithDotOperand = loadDotOperand(loadOp); if (loadWithDotOperand.has_value()) loadOps.push_back(loadWithDotOperand.value()); From 43eaf2abd9116b16065af6d035e25dfa0442d271 Mon Sep 17 00:00:00 2001 From: "Tiotto, Ettore" Date: Tue, 29 Apr 2025 21:18:28 +0000 Subject: [PATCH 2/2] Address code review comments Signed-off-by: Tiotto, Ettore --- test/TritonIntelGPU/loop-pipeline.mlir | 34 ++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/test/TritonIntelGPU/loop-pipeline.mlir b/test/TritonIntelGPU/loop-pipeline.mlir index 0cdf686a98..0103c05ec2 100644 --- a/test/TritonIntelGPU/loop-pipeline.mlir +++ b/test/TritonIntelGPU/loop-pipeline.mlir @@ -377,6 +377,40 @@ module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.sup } {tt.flatten} tt.return } +} +// ----- +// COM: Ensure prefetch operations aren't generated for 3D loads. +#linear = #ttg.linear<{register = [[0, 1, 0], [0, 2, 0], [0, 4, 0], [0, 8, 0], [0, 16, 0], [0, 0, 16], [0, 0, 32], [0, 64, 0]], lane = [[0, 0, 1], [0, 0, 2], [0, 0, 4], [0, 0, 8]], warp = [[0, 0, 0], [0, 0, 0], [0, 32, 0]], block = []}> +#linear1 = #ttg.linear<{register = [[0, 0, 1], [0, 0, 2], [0, 0, 4], [0, 0, 8], [0, 16, 0], [0, 0, 16], [0, 0, 32], [0, 128, 0]], lane = [[0, 1, 0], [0, 2, 0], [0, 4, 0], [0, 8, 0]], warp = [[0, 32, 0], [0, 64, 0], [0, 0, 0]], block = []}> +#linear2 = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [0, 8], [16, 0], [0, 16], [0, 32], [128, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0]], warp = [[32, 0], [64, 0], [0, 0]], block = []}> +#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}> +module attributes {triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block, triton_intel_gpu.target_arch = "spir64", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 16 : i32} { + // CHECK-LABEL: batched_gemm_3d_tma_kernel + tt.func public @batched_gemm_3d_tma_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32, %arg6: i32 {tt.divisibility = 16 : i32}) { + %c1_i32 = arith.constant 1 : i32 + %c0_i32 = arith.constant 0 : i32 + %c64_i32 = arith.constant 64 : i32 + %c1_i64 = arith.constant 1 : i64 + %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma> + %0 = tt.get_program_id x : i32 + %16 = arith.extsi %arg6 : i32 to i64 + %24 = arith.extsi %arg3 : i32 to i64 + %26:1 = scf.for %arg7 = %c0_i32 to %c64_i32 step %c1_i32 iter_args(%arg8 = %c1_i32) -> (i32) : i32 { + // CHECK-NOT: prefetch + %27 = arith.cmpi eq, %arg8, %c1_i32 : i32 + %29 = arith.select %27, %c0_i32, %arg8 : i32 + %33 = tt.make_tensor_ptr %arg0, [%24, %24, %16], [%16, %16, %c1_i64], [%arg8, %arg8, %29] {order = array} : > + %34 = tt.load %33 {triton_intel_gpu.block_io = "row_major"} : !tt.ptr> + %35 = tt.reshape %34 : tensor<1x128x64xf16, #linear> -> tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %36 = tt.make_tensor_ptr %arg1, [%24, %16, %16], [%16, %16, %c1_i64], [%arg8, %arg8, %29] {order = array} : > + %37 = tt.load %36 {triton_intel_gpu.block_io = "row_major"} : !tt.ptr> + %38 = tt.reshape %37 : tensor<1x256x64xf16, #linear1> -> tensor<256x64xf16, #linear2> + %39 = tt.trans %38 {order = array} : tensor<256x64xf16, #linear2> -> tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %40 = tt.dot %35, %39, %cst : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x256xf32, #mma> + scf.yield %29 : i32 + } + tt.return + } }