From bb84ed87e11686fb208a40c6fa244e4365be005c Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Tue, 4 Jun 2024 16:43:53 +0100 Subject: [PATCH 1/5] [GEN] Add cache controls decoration to `2d_block_read` builtin calls !1224 converts `triton_gen.2Dblockload` to OCL builtin `2d_block_read` calls. Add cache controls decoration capturing original `cache_control` semantics. Signed-off-by: Victor Perez --- test/TritonGEN/tritongen-to-llvm.mlir | 175 ++++++++++++++++++ .../TritonGENToLLVM/TritonGENToLLVMPass.cpp | 81 ++++++-- 2 files changed, 236 insertions(+), 20 deletions(-) diff --git a/test/TritonGEN/tritongen-to-llvm.mlir b/test/TritonGEN/tritongen-to-llvm.mlir index 1530143f1e..67afe56279 100644 --- a/test/TritonGEN/tritongen-to-llvm.mlir +++ b/test/TritonGEN/tritongen-to-llvm.mlir @@ -290,6 +290,181 @@ llvm.func @triton_gen.dpas.f32(%c : vector<8xf32>, %a : vector<4xf32>, %b : vect // ----- +// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {passthrough = ["nounwind"]} + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { + // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr + // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> + // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> + // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"], triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<1, Uncached, 0>, #triton_gen.load_cache_control<3, Uncached, 0>>} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () + // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1UC_L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} + +// ----- + +// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) attributes {passthrough = ["convergent"]} + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { + // CHECK: [[SIXTY:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[SIXTY]] x i16 : (i32) -> !llvm.ptr + // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> + // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> + // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"], triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<1, Uncached, 0>, #triton_gen.load_cache_control<3, Cached, 0>>} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () + // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1UC_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} + +// ----- + +// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) attributes {passthrough = ["convergent"]} + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { + // CHECK: [[SIXTY:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[SIXTY]] x i16 : (i32) -> !llvm.ptr + // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> + // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> + // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"], triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<1, Cached, 0>, #triton_gen.load_cache_control<3, Uncached, 0>>} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () + // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1C_L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} + +// ----- + +// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) attributes {passthrough = ["convergent"]} + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { + // CHECK: [[SIXTY:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[SIXTY]] x i16 : (i32) -> !llvm.ptr + // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> + // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> + // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"], triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<1, Cached, 0>, #triton_gen.load_cache_control<3, Cached, 0>>} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () + // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1C_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} + +// ----- + +// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) attributes {passthrough = ["convergent"]} + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { + // CHECK: [[SIXTY:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[SIXTY]] x i16 : (i32) -> !llvm.ptr + // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> + // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> + // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"], triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<1, Streaming, 0>, #triton_gen.load_cache_control<3, Uncached, 0>>} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () + // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1S_L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} + +// ----- + +// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) attributes {passthrough = ["convergent"]} + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { + // CHECK: [[SIXTY:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[SIXTY]] x i16 : (i32) -> !llvm.ptr + // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> + // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> + // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"], triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<1, Streaming, 0>, #triton_gen.load_cache_control<3, Cached, 0>>} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () + // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1S_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} + +// ----- + +// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) attributes {passthrough = ["convergent"]} + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { + // CHECK: [[SIXTY:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[SIXTY]] x i16 : (i32) -> !llvm.ptr + // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> + // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> + // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"], triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<1, InvalidateAfterRead, 0>, #triton_gen.load_cache_control<3, Cached, 0>>} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () + // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1IAR_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} + +// ----- + +// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) attributes {passthrough = ["convergent"]} + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { + // CHECK: [[SIXTY:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[SIXTY]] x i16 : (i32) -> !llvm.ptr + // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> + // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> + // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> + // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"]} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () + // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} + +// ----- + +// CHECK: llvm.func spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) -> vector<4xi32> + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { + // CHECK-DAG: [[PTR:%.*]] = llvm.ptrtoint %arg0 : !llvm.ptr to i64 + // CHECK-DAG: [[CST_32:%.*]] = llvm.mlir.constant(32 : i32) : i32 + // CHECK-DAG: [[CST_8a:%.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK-DAG: [[CST_8b:%.*]] = llvm.mlir.constant(8 : i32) : i32 + // CHECK-DAG: [[CST_1:%.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: [[CST_FALSE_1:%.*]] = llvm.mlir.constant(false) : i1 + // CHECK-DAG: [[CST_FALSE_2:%.*]] = llvm.mlir.constant(false) : i1 + // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 + // CHECK-DAG: [[WIDTH:%.*]] = llvm.sub %arg1, [[ONE]] : i32 + // CHECK-DAG: [[HEIGHT:%.*]] = llvm.sub %arg2, [[ONE]] : i32 + // CHECK-DAG: [[PITCH:%.*]] = llvm.sub %arg3, [[ONE]] : i32 + // CHECK-NEXT: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v4i32([[PTR]], [[WIDTH]], [[HEIGHT]], [[PITCH]], %arg4, %arg5, [[CST_32]], [[CST_8a]], [[CST_8b]], [[CST_1]], [[CST_FALSE_1]], [[CST_FALSE_2]], [[ZERO]]) : (i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) -> vector<4xi32> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<4xi32> + llvm.return +} + +// ----- + // CHECK: llvm.func spir_funccc @llvm.genx.GenISA.LSC2DBlockWrite.v8f32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, vector<8xf32>) llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xf32>) { diff --git a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp index 0dfb043578..74ae02a9bd 100644 --- a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp +++ b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp @@ -216,12 +216,60 @@ static bool isOCLBuiltinAvailable(TritonGEN::Matrix2DBlockLoadOp op) { op.getTileWidth() == 32 && op.getVBlocks() == 1) return false; - if (op.getCacheControl() != TritonGEN::LoadCacheControl::DEFAULT) - return false; - return true; } +static SmallVector +loadCacheControlToDecoration(Builder &builder, uint32_t operandNum, + TritonGEN::LoadCacheControl orig) { + const auto build = [&builder, + operandNum](TritonGEN::LoadCacheControlDecorationEnum l1, + TritonGEN::LoadCacheControlDecorationEnum l3) + -> SmallVector { + return {builder.getAttr( + 1, l1, operandNum), + builder.getAttr( + 3, l3, operandNum)}; + }; + switch (orig) { + case TritonGEN::LoadCacheControl::DEFAULT: + return {}; + case TritonGEN::LoadCacheControl::L1UC_L3UC: + return build(TritonGEN::LoadCacheControlDecorationEnum::Uncached, + TritonGEN::LoadCacheControlDecorationEnum::Uncached); + case TritonGEN::LoadCacheControl::L1UC_L3C: + return build(TritonGEN::LoadCacheControlDecorationEnum::Uncached, + TritonGEN::LoadCacheControlDecorationEnum::Cached); + case TritonGEN::LoadCacheControl::L1C_L3UC: + return build(TritonGEN::LoadCacheControlDecorationEnum::Cached, + TritonGEN::LoadCacheControlDecorationEnum::Uncached); + case TritonGEN::LoadCacheControl::L1C_L3C: + return build(TritonGEN::LoadCacheControlDecorationEnum::Cached, + TritonGEN::LoadCacheControlDecorationEnum::Cached); + case TritonGEN::LoadCacheControl::L1S_L3UC: + return build(TritonGEN::LoadCacheControlDecorationEnum::Streaming, + TritonGEN::LoadCacheControlDecorationEnum::Uncached); + case TritonGEN::LoadCacheControl::L1S_L3C: + return build(TritonGEN::LoadCacheControlDecorationEnum::Streaming, + TritonGEN::LoadCacheControlDecorationEnum::Cached); + case TritonGEN::LoadCacheControl::L1IAR_L3C: + return build(TritonGEN::LoadCacheControlDecorationEnum::InvalidateAfterRead, + TritonGEN::LoadCacheControlDecorationEnum::Cached); + } + llvm_unreachable("Unhandled case"); +} + +static std::optional +loadCacheControlToCacheControls(Builder &builder, + TritonGEN::LoadCacheControl orig, + uint32_t operandNum) { + SmallVector decorations = + loadCacheControlToDecoration(builder, operandNum, orig); + if (decorations.empty()) + return {}; + return builder.getAttr(decorations); +} + static Value createGenISA2DBlockRead(TritonGEN::Matrix2DBlockLoadOp op, ConversionPatternRewriter &rewriter) { MLIRContext *context = rewriter.getContext(); @@ -254,23 +302,16 @@ static Value createGenISA2DBlockRead(TritonGEN::Matrix2DBlockLoadOp op, SmallVector args{op.getPtr(), op.getBaseWidth(), op.getBaseHeight(), op.getBasePitch(), byteCoord, dest}; - - MLIRContext *ctx = rewriter.getContext(); - intel::AttrBuilder funcAttrBuilder(*ctx); - intel::AttrBuilder param0AttrBuilder(*ctx); - intel::AttrBuilder param5AttrBuilder(*ctx); - funcAttrBuilder.addPassthroughAttribute(llvm::Attribute::NoUnwind); - param0AttrBuilder.addAttribute(llvm::Attribute::NonNull); - param0AttrBuilder.addAttribute(llvm::Attribute::ReadOnly); - param5AttrBuilder.addAttribute(llvm::Attribute::NonNull); - param5AttrBuilder.addAttribute(llvm::Attribute::WriteOnly); - std::vector paramAttrs(argTypes.size()); - paramAttrs[0] = param0AttrBuilder.getAttributes(); - paramAttrs[5] = param5AttrBuilder.getAttributes(); - intel::AttributeList attrs = getAttrList(funcAttrBuilder, paramAttrs); - - createDeviceFunctionCall(rewriter, fnName, void_ty(context), argTypes, args, - attrs); + LLVM::CallOp call = + createDeviceFunctionCall(rewriter, fnName, void_ty(context), argTypes, + args, true /*convergent*/); + constexpr uint32_t ptrOperandIndex = 0; + if (std::optional optCacheControls = + loadCacheControlToCacheControls(rewriter, op.getCacheControl(), + ptrOperandIndex)) { + call->setAttr(TritonGEN::TritonGENDialect::getCacheControlsAttrName(), + *optCacheControls); + } return rewriter.create(loc, resType, dest); } From 08bc10b1f69d3fdab12a18f68cd53b6e5a15bd04 Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Wed, 5 Jun 2024 08:44:23 +0100 Subject: [PATCH 2/5] Update cache levels --- third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp index 74ae02a9bd..7c55af21ff 100644 --- a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp +++ b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp @@ -227,9 +227,9 @@ loadCacheControlToDecoration(Builder &builder, uint32_t operandNum, TritonGEN::LoadCacheControlDecorationEnum l3) -> SmallVector { return {builder.getAttr( - 1, l1, operandNum), + 0, l1, operandNum), builder.getAttr( - 3, l3, operandNum)}; + 1, l3, operandNum)}; }; switch (orig) { case TritonGEN::LoadCacheControl::DEFAULT: From 97d46373c6dad0fc0deb82e79b5d0eb02bf0772b Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Mon, 10 Jun 2024 10:20:46 +0100 Subject: [PATCH 3/5] Change variable name --- test/TritonGEN/tritongen-to-llvm.mlir | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/test/TritonGEN/tritongen-to-llvm.mlir b/test/TritonGEN/tritongen-to-llvm.mlir index 67afe56279..be4e770da5 100644 --- a/test/TritonGEN/tritongen-to-llvm.mlir +++ b/test/TritonGEN/tritongen-to-llvm.mlir @@ -313,8 +313,8 @@ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK: [[SIXTY:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[SIXTY]] x i16 : (i32) -> !llvm.ptr + // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> @@ -332,8 +332,8 @@ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK: [[SIXTY:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[SIXTY]] x i16 : (i32) -> !llvm.ptr + // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> @@ -351,8 +351,8 @@ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK: [[SIXTY:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[SIXTY]] x i16 : (i32) -> !llvm.ptr + // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> @@ -370,8 +370,8 @@ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK: [[SIXTY:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[SIXTY]] x i16 : (i32) -> !llvm.ptr + // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> @@ -389,8 +389,8 @@ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK: [[SIXTY:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[SIXTY]] x i16 : (i32) -> !llvm.ptr + // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> @@ -408,8 +408,8 @@ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK: [[SIXTY:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[SIXTY]] x i16 : (i32) -> !llvm.ptr + // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> @@ -427,8 +427,8 @@ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK: [[SIXTY:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[SIXTY]] x i16 : (i32) -> !llvm.ptr + // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 + // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> From 5755dc4de1c24aca107a11b79736b9dcc9749089 Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Mon, 10 Jun 2024 11:00:36 +0100 Subject: [PATCH 4/5] Undo unwanted changes Signed-off-by: Victor Perez --- .../TritonGENToLLVM/TritonGENToLLVMPass.cpp | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp index 7c55af21ff..8d59124c2b 100644 --- a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp +++ b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp @@ -302,9 +302,23 @@ static Value createGenISA2DBlockRead(TritonGEN::Matrix2DBlockLoadOp op, SmallVector args{op.getPtr(), op.getBaseWidth(), op.getBaseHeight(), op.getBasePitch(), byteCoord, dest}; - LLVM::CallOp call = - createDeviceFunctionCall(rewriter, fnName, void_ty(context), argTypes, - args, true /*convergent*/); + + MLIRContext *ctx = rewriter.getContext(); + intel::AttrBuilder funcAttrBuilder(*ctx); + intel::AttrBuilder param0AttrBuilder(*ctx); + intel::AttrBuilder param5AttrBuilder(*ctx); + funcAttrBuilder.addPassthroughAttribute(llvm::Attribute::NoUnwind); + param0AttrBuilder.addAttribute(llvm::Attribute::NonNull); + param0AttrBuilder.addAttribute(llvm::Attribute::ReadOnly); + param5AttrBuilder.addAttribute(llvm::Attribute::NonNull); + param5AttrBuilder.addAttribute(llvm::Attribute::WriteOnly); + std::vector paramAttrs(argTypes.size()); + paramAttrs[0] = param0AttrBuilder.getAttributes(); + paramAttrs[5] = param5AttrBuilder.getAttributes(); + intel::AttributeList attrs = getAttrList(funcAttrBuilder, paramAttrs); + + LLVM::CallOp call = createDeviceFunctionCall( + rewriter, fnName, void_ty(context), argTypes, args, attrs); constexpr uint32_t ptrOperandIndex = 0; if (std::optional optCacheControls = loadCacheControlToCacheControls(rewriter, op.getCacheControl(), From 76bf92a971a28f63569c4643b1652fad62ea351d Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Tue, 11 Jun 2024 08:45:24 +0100 Subject: [PATCH 5/5] Move tests --- .../tritongen-2Dblockload-to-llvm.mlir | 80 ++++++++ test/TritonGEN/tritongen-to-llvm.mlir | 175 ------------------ 2 files changed, 80 insertions(+), 175 deletions(-) diff --git a/test/TritonGEN/tritongen-2Dblockload-to-llvm.mlir b/test/TritonGEN/tritongen-2Dblockload-to-llvm.mlir index 5b855218b4..1a0a727492 100644 --- a/test/TritonGEN/tritongen-2Dblockload-to-llvm.mlir +++ b/test/TritonGEN/tritongen-2Dblockload-to-llvm.mlir @@ -269,3 +269,83 @@ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_ %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=16, v_blocks=1, transpose=true, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> llvm.return } + +// ----- + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload( + // CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt( + // CHECK-SAME: triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<0, Uncached, 0>, #triton_gen.load_cache_control<1, Uncached, 0>> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1UC_L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} + +// ----- + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload( + // CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt( + // CHECK-SAME: triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<0, Uncached, 0>, #triton_gen.load_cache_control<1, Cached, 0>> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1UC_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} + +// ----- + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload( + // CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt( + // CHECK-SAME: triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<0, Cached, 0>, #triton_gen.load_cache_control<1, Uncached, 0>> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1C_L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} + +// ----- + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload( + // CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt( + // CHECK-SAME: triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<0, Cached, 0>, #triton_gen.load_cache_control<1, Cached, 0>> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1C_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} + +// ----- + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload( + // CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt( + // CHECK-SAME: triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<0, Streaming, 0>, #triton_gen.load_cache_control<1, Uncached, 0>> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1S_L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} + +// ----- + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload( + // CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt( + // CHECK-SAME: triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<0, Streaming, 0>, #triton_gen.load_cache_control<1, Cached, 0>> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1S_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} + +// ----- + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload( + // CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt( + // CHECK-SAME: triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<0, InvalidateAfterRead, 0>, #triton_gen.load_cache_control<1, Cached, 0>> + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1IAR_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} + +// ----- + +llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { + // CHECK: llvm.func @triton_gen.2Dblockload( + // CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt( + // CHECK-NOT: triton_gen.DecorationCacheControlINTEL + %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + llvm.return +} diff --git a/test/TritonGEN/tritongen-to-llvm.mlir b/test/TritonGEN/tritongen-to-llvm.mlir index be4e770da5..1530143f1e 100644 --- a/test/TritonGEN/tritongen-to-llvm.mlir +++ b/test/TritonGEN/tritongen-to-llvm.mlir @@ -290,181 +290,6 @@ llvm.func @triton_gen.dpas.f32(%c : vector<8xf32>, %a : vector<4xf32>, %b : vect // ----- -// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {passthrough = ["nounwind"]} - -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr - // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> - // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> - // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> - // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"], triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<1, Uncached, 0>, #triton_gen.load_cache_control<3, Uncached, 0>>} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1UC_L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} - -// ----- - -// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) attributes {passthrough = ["convergent"]} - -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr - // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> - // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> - // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> - // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"], triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<1, Uncached, 0>, #triton_gen.load_cache_control<3, Cached, 0>>} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1UC_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} - -// ----- - -// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) attributes {passthrough = ["convergent"]} - -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr - // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> - // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> - // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> - // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"], triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<1, Cached, 0>, #triton_gen.load_cache_control<3, Uncached, 0>>} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1C_L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} - -// ----- - -// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) attributes {passthrough = ["convergent"]} - -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr - // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> - // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> - // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> - // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"], triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<1, Cached, 0>, #triton_gen.load_cache_control<3, Cached, 0>>} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1C_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} - -// ----- - -// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) attributes {passthrough = ["convergent"]} - -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr - // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> - // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> - // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> - // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"], triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<1, Streaming, 0>, #triton_gen.load_cache_control<3, Uncached, 0>>} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1S_L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} - -// ----- - -// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) attributes {passthrough = ["convergent"]} - -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr - // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> - // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> - // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> - // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"], triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<1, Streaming, 0>, #triton_gen.load_cache_control<3, Cached, 0>>} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1S_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} - -// ----- - -// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) attributes {passthrough = ["convergent"]} - -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr - // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> - // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> - // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> - // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"], triton_gen.DecorationCacheControlINTEL = #triton_gen.decoration_cache_control<#triton_gen.load_cache_control<1, InvalidateAfterRead, 0>, #triton_gen.load_cache_control<3, Cached, 0>>} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=L1IAR_L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} - -// ----- - -// CHECK: llvm.func spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) attributes {passthrough = ["convergent"]} - -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr<1>, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK: [[C16:%.*]] = llvm.mlir.constant(16 : i32) : i32 - // CHECK-NEXT: [[DEST:%.*]] = llvm.alloca [[C16]] x i16 : (i32) -> !llvm.ptr - // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: [[UNDEF:%.*]] = llvm.mlir.undef : vector<2xi32> - // CHECK-NEXT: [[COORD0:%.*]] = llvm.insertelement %arg4, [[UNDEF]][[[ZERO]] : i32] : vector<2xi32> - // CHECK-NEXT: [[COORD1:%.*]] = llvm.insertelement %arg5, [[COORD0]][[[ONE]] : i32] : vector<2xi32> - // CHECK-NEXT: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[DEST]]) {passthrough = ["convergent"]} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () - // CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> - llvm.return -} - -// ----- - -// CHECK: llvm.func spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v4i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) -> vector<4xi32> - -llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) { - // CHECK: llvm.func @triton_gen.2Dblockload(%arg0: !llvm.ptr, %arg1: i32, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32) { - // CHECK-DAG: [[PTR:%.*]] = llvm.ptrtoint %arg0 : !llvm.ptr to i64 - // CHECK-DAG: [[CST_32:%.*]] = llvm.mlir.constant(32 : i32) : i32 - // CHECK-DAG: [[CST_8a:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: [[CST_8b:%.*]] = llvm.mlir.constant(8 : i32) : i32 - // CHECK-DAG: [[CST_1:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: [[CST_FALSE_1:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK-DAG: [[CST_FALSE_2:%.*]] = llvm.mlir.constant(false) : i1 - // CHECK-DAG: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK-DAG: [[ONE:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK-DAG: [[WIDTH:%.*]] = llvm.sub %arg1, [[ONE]] : i32 - // CHECK-DAG: [[HEIGHT:%.*]] = llvm.sub %arg2, [[ONE]] : i32 - // CHECK-DAG: [[PITCH:%.*]] = llvm.sub %arg3, [[ONE]] : i32 - // CHECK-NEXT: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v4i32([[PTR]], [[WIDTH]], [[HEIGHT]], [[PITCH]], %arg4, %arg5, [[CST_32]], [[CST_8a]], [[CST_8b]], [[CST_1]], [[CST_FALSE_1]], [[CST_FALSE_2]], [[ZERO]]) : (i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) -> vector<4xi32> - %0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<4xi32> - llvm.return -} - -// ----- - // CHECK: llvm.func spir_funccc @llvm.genx.GenISA.LSC2DBlockWrite.v8f32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32, vector<8xf32>) llvm.func @triton_gen.2Dblockstore(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32, %stored_val : vector<8xf32>) {