From 1e9fd7e1bffa50c6495a90b9802a6d0fdf1200a3 Mon Sep 17 00:00:00 2001 From: "Lu,Chengjun" Date: Mon, 10 Mar 2025 15:23:11 +0000 Subject: [PATCH 1/2] Fix issue in prefetching column major matrix. Signed-off-by: Lu,Chengjun --- test/TritonIntelGPU/prefetch-to-llvm.mlir | 10 +++++----- .../lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp | 4 ++++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/test/TritonIntelGPU/prefetch-to-llvm.mlir b/test/TritonIntelGPU/prefetch-to-llvm.mlir index 5d349716dd..1cd0c5d14c 100644 --- a/test/TritonIntelGPU/prefetch-to-llvm.mlir +++ b/test/TritonIntelGPU/prefetch-to-llvm.mlir @@ -73,10 +73,10 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} // CHECK: %[[SUB_GROUP_ID_RAW:.*]] = llvm.call spir_funccc @_Z16get_sub_group_id() {no_unwind, will_return} : () -> i32 // CHECK: %[[SUB_GROUP_ID_EXT:.*]] = llvm.zext %[[SUB_GROUP_ID_RAW]] : i32 to i64 // CHECK: %[[SUB_GROUP_ID:.*]] = llvm.trunc %[[SUB_GROUP_ID_EXT]] : i64 to i32 - // CHECK: %[[VAL_18:.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: %[[VAL_18:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK: %[[VAL_19:.*]] = llvm.urem %[[SUB_GROUP_ID]], %[[VAL_18]] : i32 // CHECK: %[[VAL_20:.*]] = llvm.udiv %[[SUB_GROUP_ID]], %[[VAL_18]] : i32 - // CHECK: %[[CST_8:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK: %[[CST_8:.*]] = llvm.mlir.constant(8 : i32) : i32 // CHECK: %[[VAL_22:.*]] = llvm.urem %[[VAL_20]], %[[CST_8]] : i32 // CHECK: %[[VAL_23:.*]] = llvm.udiv %[[VAL_20]], %[[CST_8]] : i32 // CHECK: %[[OFFSET_0:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)> @@ -94,20 +94,20 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} // CHECK: %[[VAL_24:.*]] = llvm.mul %[[COL_STRIDE_i64]], %[[CST_2]] : i64 // CHECK: %[[COL_MAJOR_PITCH:.*]] = llvm.trunc %[[VAL_24]] : i64 to i32 // CHECK: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[CST_32:.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 // CHECK: %[[VAL_26:.*]] = llvm.mul %[[VAL_19]], %[[CST_32]] : i32 // CHECK: %[[VAL_27:.*]] = llvm.add %[[VAL_26]], %[[CST_0]] : i32 // CHECK: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 // CHECK: %[[VAL_28:.*]] = llvm.urem %[[VAL_27]], %[[CST_32]] : i32 // CHECK: %[[COL_MAJOR_OFFSET_X:.*]] = llvm.add %[[VAL_28]], %[[OFFSET_1]] : i32 // CHECK: %[[CST_0:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: %[[CST_2:.*]] = llvm.mlir.constant(4 : i32) : i32 + // CHECK: %[[CST_2:.*]] = llvm.mlir.constant(2 : i32) : i32 // CHECK: %[[VAL_30:.*]] = llvm.mul %[[VAL_22]], %[[CST_2]] : i32 // CHECK: %[[VAL_31:.*]] = llvm.add %[[VAL_30]], %[[CST_0]] : i32 // CHECK: %[[CST_16:.*]] = llvm.mlir.constant(16 : i32) : i32 // CHECK: %[[VAL_32:.*]] = llvm.urem %[[VAL_31]], %[[CST_16]] : i32 // CHECK: %[[COL_MAJOR_OFFSET_Y:.*]] = llvm.add %[[VAL_32]], %[[OFFSET_0]] : i32 - // CHECK: triton_gen.2Dblockprefetch %[[BASE_]], %[[COL_MAJOR_BASE_WIDTH]], %[[COL_MAJOR_BASE_HEIGHT]], %[[COL_MAJOR_PITCH]], %[[COL_MAJOR_OFFSET_X]], %[[COL_MAJOR_OFFSET_Y]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 4, v_blocks = 1, cache_control = L1C_L3C} + // CHECK: triton_gen.2Dblockprefetch %[[BASE_]], %[[COL_MAJOR_BASE_WIDTH]], %[[COL_MAJOR_BASE_HEIGHT]], %[[COL_MAJOR_PITCH]], %[[COL_MAJOR_OFFSET_X]], %[[COL_MAJOR_OFFSET_Y]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 2, v_blocks = 2, cache_control = L1C_L3C} %columnMajorPtr = tt.make_tensor_ptr %arg0, [%arg4, %arg2], [%c1_i64, %arg5], [%c0_i32, %c0_i32] {order = array} : > ttig.prefetch %columnMajorPtr {cache = 1 : i32, evict = 1 : i32, isVolatile = false, ttig.block_io = "column_major"} : !tt.ptr> diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index 6e3cc4b6b5..4922a31620 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -438,6 +438,10 @@ struct PrefetchOpConversion // Swap the shape to make it row major and then get the tiling // size base on row major shape. std::swap(tensorShape[0], tensorShape[1]); + + // Create the new tensor type with swapped row and col. + tensorType = RankedTensorType::get( + tensorShape, tensorType.getElementType(), tensorType.getEncoding()); } unsigned numWarps = triton::gpu::lookupNumWarps(op); From 8a4287d91f4e8ebc5af99f22909bb485a9276667 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Fri, 11 Jul 2025 14:13:35 +0000 Subject: [PATCH 2/2] [TritonGEN] Lower to GenISA for `2d_block_prefetch_16b_16r8x1c` Signed-off-by: Whitney Tsang --- .../tritongen-2Dblockprefetch-to-llvm.mlir | 14 ++++++++++++++ .../lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp | 5 +++++ 2 files changed, 19 insertions(+) diff --git a/test/TritonGEN/tritongen-2Dblockprefetch-to-llvm.mlir b/test/TritonGEN/tritongen-2Dblockprefetch-to-llvm.mlir index ee4c3ad2ec..45a357abd6 100644 --- a/test/TritonGEN/tritongen-2Dblockprefetch-to-llvm.mlir +++ b/test/TritonGEN/tritongen-2Dblockprefetch-to-llvm.mlir @@ -115,6 +115,20 @@ llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %b // ----- +llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: [[ELEM_BITS:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1 + // CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1 + // CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockPrefetch.isVoid({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}}) + triton_gen.2Dblockprefetch %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=8, tile_height=16, v_blocks=1, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) + llvm.return +} + +// ----- + llvm.func @triton_gen.2Dblockprefetch(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { // CHECK: llvm.mlir.constant(2 : i32) : i32 // CHECK: [[ElemSize:%.*]] = llvm.mlir.constant(2 : i32) : i32 diff --git a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp index 9410cb48a5..30edca0d69 100644 --- a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp +++ b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp @@ -197,6 +197,11 @@ static bool isSPVBuiltinAvailable(TritonGEN::Matrix2DBlockPrefetchOp op) { op.getTileWidth() == 8 && op.getVBlocks() == 1) return false; + // intel_sub_group_2d_block_prefetch_16b_16r8x1c + if (op.getElemSizeInBits() == 16 && op.getTileHeight() == 16 && + op.getTileWidth() == 8 && op.getVBlocks() == 1) + return false; + return true; }