diff --git a/test/Conversion/intel/arith_to_llvm.mlir b/test/Conversion/intel/arith_to_llvm.mlir index d2f7d42181..4bb9df30ad 100644 --- a/test/Conversion/intel/arith_to_llvm.mlir +++ b/test/Conversion/intel/arith_to_llvm.mlir @@ -12,7 +12,7 @@ // CHECK-LABEL: llvm.func spir_kernelcc @float_to_bfloat_conversion( // CHECK-SCALAR: %[[VAL_0:.*]]: !llvm.struct<(f32, f32, f32, f32)>) -> !llvm.struct<(bf16, bf16, bf16, bf16)> // CHECK-VECTOR: %[[VAL_0:.*]]: vector<32xf32>) -> vector<32xbf16> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { +module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func @float_to_bfloat_conversion(%arg0 : tensor<512xf32, #blocked>) -> tensor<512xbf16, #blocked>{ // CHECK-SCALAR: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f32, f32, f32, f32)> // CHECK-SCALAR: %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_0]][1] : !llvm.struct<(f32, f32, f32, f32)> diff --git a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir index 44c0fc563f..985b247a94 100644 --- a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir +++ b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir @@ -1,6 +1,6 @@ // RUN: TRITON_INTEL_ENABLE_BLOCK_PTR=1 triton-opt %s --convert-triton-intel-gpu-to-llvm --split-input-file | FileCheck %s -module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 1 : i32} { +module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 1 : i32} { // CHECK-DAG: llvm.func spir_funccc @_Z38intel_sub_group_f16_f16_matrix_mad_k16Dv8_sDv8_iDv8_f(vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32> attributes {passthrough = ["convergent"]} // CHECK-DAG: llvm.func spir_funccc @_Z42intel_sub_group_2d_block_read_16b_32r16x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {passthrough = ["nounwind"]} // CHECK-DAG: llvm.func spir_funccc @_Z52intel_sub_group_2d_block_read_transform_16b_32r16x2cPU3AS1viiiDv2_iPj(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {passthrough = ["nounwind"]} @@ -112,7 +112,7 @@ module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-wa // COM: Checks the correct lowering of the A operand load for TF32, i.e. using 4xi32 and vnni=false. -module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @matmul_kernel_with_block_pointers_tf32( // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} { tt.func public @matmul_kernel_with_block_pointers_tf32(%arg0: !tt.ptr) { @@ -144,7 +144,7 @@ module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-wa // COM: Checks the correct lowering of a 16-bit 2D-block-store. -module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @matmul_kernel_with_block_pointers_f16accu( // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} { tt.func public @matmul_kernel_with_block_pointers_f16accu(%arg0: !tt.ptr) { diff --git a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir index dfecfda3f1..72ff792237 100644 --- a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir +++ b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir @@ -1,6 +1,6 @@ // RUN: TRITON_INTEL_ENABLE_BLOCK_PTR=1 triton-opt %s --convert-triton-intel-gpu-to-llvm --verify-diagnostics --split-input-file -module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: i64, %arg2: i32) { %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 @@ -14,7 +14,7 @@ module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-wa // ----- -module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: i64, %arg2: i32) { %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 @@ -28,7 +28,7 @@ module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-wa // ----- -module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: i64, %arg2: i32) { %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 diff --git a/test/TritonIntelGPU/accelerate-matmul-ats.mlir b/test/TritonIntelGPU/accelerate-matmul-ats.mlir index bd31aab76f..65327ab3fe 100644 --- a/test/TritonIntelGPU/accelerate-matmul-ats.mlir +++ b/test/TritonIntelGPU/accelerate-matmul-ats.mlir @@ -1,10 +1,10 @@ // RUN: triton-opt %s -split-input-file --tritonintelgpu-accelerate-matmul | FileCheck %s -// CHECK-NOT: dpas +// CHECK-NOT: triton_intel_gpu.dpas #blocked = #triton_gpu.blocked<{sizePerThread = [8, 4], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: mma_chain_loop tt.func public @mma_chain_loop( %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -39,11 +39,11 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num- // ----- -// CHECK-NOT: dpas +// CHECK-NOT: triton_intel_gpu.dpas #blocked = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [1, 16], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: chained_dot tt.func public @chained_dot( %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -67,7 +67,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num- #blocked = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [4, 2], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [1, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [2, 4], warpsPerCTA = [4, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: mma_chain_loop_ats tt.func public @mma_chain_loop_ats( %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -112,7 +112,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num- #blocked = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [1, 8], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: chained_dot tt.func public @chained_dot( %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, diff --git a/test/TritonIntelGPU/accelerate-matmul-pvc.mlir b/test/TritonIntelGPU/accelerate-matmul-pvc.mlir index 8fb069b2db..6c49636a96 100644 --- a/test/TritonIntelGPU/accelerate-matmul-pvc.mlir +++ b/test/TritonIntelGPU/accelerate-matmul-pvc.mlir @@ -4,7 +4,7 @@ #blocked = #triton_gpu.blocked<{sizePerThread = [8, 4], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: mma_chain_loop tt.func public @mma_chain_loop( %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -49,7 +49,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- #blocked = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [1, 16], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: chained_dot tt.func public @chained_dot( %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -71,11 +71,11 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- // ----- -// CHECK-NOT: dpas +// CHECK-NOT: triton_intel_gpu.dpas #blocked = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [4, 2], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [1, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [2, 4], warpsPerCTA = [4, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: mma_chain_loop_ats tt.func public @mma_chain_loop_ats( %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -110,11 +110,11 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- // ----- -// CHECK-NOT: dpas +// CHECK-NOT: triton_intel_gpu.dpas #blocked = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [1, 8], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: chained_dot tt.func public @chained_dot( %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, diff --git a/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir b/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir index d155007299..d6f43af96d 100644 --- a/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir +++ b/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir @@ -11,7 +11,7 @@ #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i64, %arg7: i32, %arg8: i64) { %c8_i32 = arith.constant 8 : i32 %c64_i32 = arith.constant 64 : i32 @@ -85,7 +85,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { %c8_i32 = arith.constant 8 : i32 %c64_i32 = arith.constant 64 : i32 @@ -153,7 +153,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg13: !tt.ptr, %arg14: !tt.ptr) { %c8_i32 = arith.constant 8 : i32 %c64_i32 = arith.constant 64 : i32 @@ -233,7 +233,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 diff --git a/test/TritonIntelGPU/loop-pipeline.mlir b/test/TritonIntelGPU/loop-pipeline.mlir index f4fbd24b1d..aaab478dbf 100644 --- a/test/TritonIntelGPU/loop-pipeline.mlir +++ b/test/TritonIntelGPU/loop-pipeline.mlir @@ -9,7 +9,7 @@ #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { // CHECK-LABEL: tt.func public @matmul_kernel %c8_i32 = arith.constant 8 : i32 @@ -127,7 +127,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}) { // CHECK-LABEL: tt.func public @matmul_kernel %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #dpas> diff --git a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir index 28b3c77e46..4892038fa2 100644 --- a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir +++ b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir @@ -9,7 +9,7 @@ #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) { // CHECK: @matmul_kernel_with_block_pointers %c4_i32 = arith.constant 4 : i32 @@ -81,7 +81,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, @@ -152,7 +152,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel_with_block_pointers_indivisible(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32, %arg7: i32, %arg8: i32 {tt.divisibility = 16 : i32}) { // CHECK: @matmul_kernel_with_block_pointers_indivisible %c4_i32 = arith.constant 4 : i32 @@ -214,7 +214,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- // COM: Case 3: // COM: Check that operations using block pointers without a layout attribute are rewritten to use a legacy pointer. -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC"} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { %c31_i32 = arith.constant 31 : i32 %c127_i32 = arith.constant 127 : i32 diff --git a/test/TritonIntelGPU/triton_annotate_module.mlir b/test/TritonIntelGPU/triton_annotate_module.mlir index 346f7db573..2c6758d516 100644 --- a/test/TritonIntelGPU/triton_annotate_module.mlir +++ b/test/TritonIntelGPU/triton_annotate_module.mlir @@ -1,8 +1,8 @@ -// RUN: triton-opt %s --split-input-file -triton-annotate-module='target=xpu:DEVICE_ARCH.PVC threads-per-warp=32' | FileCheck %s +// RUN: triton-opt %s --split-input-file -triton-annotate-module='target=xpu:DEVICE_ARCH.PVC support-sg-2d-block=true support-dpas=true threads-per-warp=32' | FileCheck %s module { // COM: Ensure that the 'threads-per-warp' attribute is set according to the option. - // CHECK: module attributes {triton_gpu.target = "xpu:DEVICE_ARCH.PVC", "triton_gpu.threads-per-warp" = 32 : i32} + // CHECK: module attributes {triton_gpu.target = "xpu:DEVICE_ARCH.PVC", "triton_gpu.threads-per-warp" = 32 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} tt.func @kernel() { tt.return } @@ -13,7 +13,7 @@ module { module { // COM: Ensure that the 'threads-per-warp' attribute is overwritten when the kernel contains a 'tt.dot' // operation that can be lowered to DPAS instructions. - // CHECK: module attributes {triton_gpu.target = "xpu:DEVICE_ARCH.PVC", "triton_gpu.threads-per-warp" = 16 : i32} + // CHECK: module attributes {triton_gpu.target = "xpu:DEVICE_ARCH.PVC", "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} tt.func @kernel() { %a = arith.constant dense<1.00e+00> : tensor<128x32xf16> %b = arith.constant dense<2.00e+00> : tensor<32x128xf16> diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 9b747208fc..46e3049e2c 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -98,7 +98,6 @@ def __init__(self, target: tuple) -> None: mod = compile_module_from_src(Path(os.path.join(dirname, "arch_parser.c")).read_text(), "arch_utils") self.parse_device_arch = mod.parse_device_arch self.properties = self.parse_target(target.arch) - self.device_arch = self.properties["device_arch"] self.binary_ext = "spv" def parse_target(self, tgt_prop) -> dict: @@ -106,7 +105,6 @@ def parse_target(self, tgt_prop) -> dict: dev_prop['name'] = tgt_prop.get('name', 'xpu') dev_prop['platform_name'] = tgt_prop.get('platform_name', None) dev_prop['vendor'] = tgt_prop.get('vendor', None) - dev_prop['driver_version'] = tgt_prop.get('driver_version', None) dev_prop['version'] = tgt_prop.get('version', None) dev_prop['gpu_eu_count'] = tgt_prop.get('gpu_eu_count', None) dev_prop['gpu_subslice_count'] = tgt_prop.get('gpu_subslice_count', None) @@ -152,19 +150,20 @@ def make_ttir(mod, metadata, opt): return mod @staticmethod - def make_ttgir(mod, metadata, opt, device_arch): + def make_ttgir(mod, metadata, opt, properties): cluster_info = intel.ClusterInfo() if opt.cluster_dims is not None: cluster_info.clusterDimX = opt.cluster_dims[0] cluster_info.clusterDimY = opt.cluster_dims[1] cluster_info.clusterDimZ = opt.cluster_dims[2] - is_lts = Version(metadata["target"].arch["driver_version"]) == Version("1.3.27642") - # Annotate module with information required by subsequent transformations. pm = ir.pass_manager(mod.context) pm.enable_debug() - intel.passes.ttgpuir.add_triton_annotate_module(pm, f"xpu:{device_arch}", is_lts, opt.threads_per_warp) + device_arch = properties["device_arch"] + intel.passes.ttgpuir.add_triton_annotate_module(pm, f"xpu:{device_arch}", + properties["support_cl_sg_2d_block_io"], + properties["support_cl_sg_matmul_acc"], opt.threads_per_warp) pm.run(mod) # Overwrite the threads_per_warp option with the module annotation. @@ -174,7 +173,8 @@ def make_ttgir(mod, metadata, opt, device_arch): pm = ir.pass_manager(mod.context) pm.enable_debug() - if (not is_lts and os.getenv("TRITON_INTEL_ENABLE_BLOCK_PTR", "0") == "1"): + if (properties["support_cl_sg_2d_block_io"] and properties["support_cl_sg_matmul_acc"] + and os.getenv("TRITON_INTEL_ENABLE_BLOCK_PTR", "0") == "1"): return XPUBackend.AdvancedPath.make_ttgir(mod, metadata, opt) passes.ttir.add_convert_to_ttgpuir(pm, f"xpu:{device_arch}", opt.num_warps, opt.threads_per_warp, opt.num_ctas) @@ -250,7 +250,7 @@ def make_spv(src, metadata): def add_stages(self, stages, options): stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options) - stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options, self.device_arch) + stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options, self.properties) stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options) stages["spv"] = lambda src, metadata: self.make_spv(src, metadata) diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td b/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td index 75079f88b6..4f26265afe 100644 --- a/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td +++ b/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td @@ -24,9 +24,16 @@ def TritonIntelGPU_Dialect : Dialect { return ::llvm::StringLiteral("triton_gpu.target"); } - /// Get the name of the attribute used to indicate the LTS driver. - static ::llvm::StringLiteral getLTSAttrName() { - return ::llvm::StringLiteral("triton_gpu.is_lts"); + /// Get the name of the attribute used to indicate whether subgroup 2D block + /// operations (e.g., 2D block read/write) are available. + static llvm::StringRef getSupportSG2DBlockAttrName() { + return "triton_intel_gpu.support_sg_2d_block"; + } + + /// Get the name of the attribute used to indicate whether the DPAS + /// instruction is available. + static llvm::StringRef getSupportDPASAttrName() { + return "triton_intel_gpu.support_dpas"; } }]; diff --git a/third_party/intel/include/TritonAnnotateModule/Passes.td b/third_party/intel/include/TritonAnnotateModule/Passes.td index 0bf5a98493..f5df31b4b4 100644 --- a/third_party/intel/include/TritonAnnotateModule/Passes.td +++ b/third_party/intel/include/TritonAnnotateModule/Passes.td @@ -28,8 +28,10 @@ def TritonAnnotateModule: Pass<"triton-annotate-module", "mlir::ModuleOp"> { Option<"target", "target", "std::string", /*default*/"\"\"", "the GPU target, e.g., PVC">, - Option<"isLTS", "is-lts", "bool", /*default*/"false", - "whether we are using an LTS driver">, + Option<"supportSG2DBlock", "support-sg-2d-block", "bool", /*default*/"false", + "whether subgroup 2D block operations (e.g., 2D block read/write) are available">, + Option<"supportDPAS", "support-dpas", "bool", /*default*/"false", + "whether DPAS instruction is available">, Option<"threadsPerWarp", "threads-per-warp", "unsigned", /*default*/"32", "number of threads per warp (aka subgroup size)">, diff --git a/third_party/intel/lib/Analysis/DPAS.cpp b/third_party/intel/lib/Analysis/DPAS.cpp index 22f5a95b8a..b844ee3076 100644 --- a/third_party/intel/lib/Analysis/DPAS.cpp +++ b/third_party/intel/lib/Analysis/DPAS.cpp @@ -10,7 +10,8 @@ DPASAnalysis::DPASAnalysis(Operation *root) { mod = root->getParentOfType(); DeviceArch arch = getDeviceArch(mod); - bool isLTS = mod->hasAttr(TritonIntelGPUDialect::getLTSAttrName()); + bool supportDPAS = + mod->hasAttr(TritonIntelGPUDialect::getSupportDPASAttrName()); // Populate the maps. mod.walk([&](FunctionOpInterface funcOp) { @@ -22,9 +23,10 @@ DPASAnalysis::DPASAnalysis(Operation *root) { else funcToDotMap[funcOp] = {dotOp}; - DPASEngineType dpasEngineType = (isLTS || arch == DeviceArch::UNKNOWN) - ? DPASEngineType::NOT_APPLICABLE - : DPASAnalysis::getDPASType(dotOp); + DPASEngineType dpasEngineType = + (!supportDPAS || arch == DeviceArch::UNKNOWN) + ? DPASEngineType::NOT_APPLICABLE + : DPASAnalysis::getDPASType(dotOp); dotToDPASEngineMap[dotOp] = dpasEngineType; // Only PVC supports TF32. diff --git a/third_party/intel/lib/TritonAnnotateModule/TritonAnnotateModule.cpp b/third_party/intel/lib/TritonAnnotateModule/TritonAnnotateModule.cpp index acb70e882c..93a11fa769 100644 --- a/third_party/intel/lib/TritonAnnotateModule/TritonAnnotateModule.cpp +++ b/third_party/intel/lib/TritonAnnotateModule/TritonAnnotateModule.cpp @@ -28,10 +28,11 @@ struct TritonAnnotateModule mod->setAttr(intel::TritonIntelGPUDialect::getTargetAttrName(), builder.getStringAttr(target.getValue())); - // FIXME: Use SYCL runtime to query supported OpenCL extensions, instead - // of checking driver version. - if (isLTS) - mod->setAttr(intel::TritonIntelGPUDialect::getLTSAttrName(), + if (supportSG2DBlock) + mod->setAttr(intel::TritonIntelGPUDialect::getSupportSG2DBlockAttrName(), + builder.getUnitAttr()); + if (supportDPAS) + mod->setAttr(intel::TritonIntelGPUDialect::getSupportDPASAttrName(), builder.getUnitAttr()); DPASAnalysis &dpasAnalysis = getAnalysis(); diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h index b2cd02a2d6..1f0dcc0915 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h @@ -181,7 +181,10 @@ class TritonGPUToLLVMPipelineManager { TritonGPUToLLVMPipelineManager(ModuleOp &mod, MLIRContext *ctx) : mod(mod), ctx(ctx), isAdvancedPathEnabled( - !mod->hasAttr("triton_gpu.is_lts") && + mod->hasAttr(gpu::intel::TritonIntelGPUDialect:: + getSupportSG2DBlockAttrName()) && + mod->hasAttr( + gpu::intel::TritonIntelGPUDialect::getSupportDPASAttrName()) && mlir::triton::tools::getBoolEnv("TRITON_INTEL_ENABLE_BLOCK_PTR")) {} /// FIXME: remove once the block ptr conversion path is capable of handling diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp index 6257bf1442..d22ad3ec51 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp @@ -80,7 +80,10 @@ struct ConvertTritonGPUToLLVM mlir::LowerToLLVMOptions option(context); option.overrideIndexBitwidth(32); bool isAdvancedPathEnabled = - !mod->hasAttr("triton_gpu.is_lts") && + mod->hasAttr(triton::gpu::intel::TritonIntelGPUDialect:: + getSupportSG2DBlockAttrName()) && + mod->hasAttr(triton::gpu::intel::TritonIntelGPUDialect:: + getSupportDPASAttrName()) && mlir::triton::tools::getBoolEnv("TRITON_INTEL_ENABLE_BLOCK_PTR"); TritonIntelGPUToLLVMTypeConverter typeConverter(context, option, isAdvancedPathEnabled); diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp index 46580796b1..27694844e0 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp @@ -66,7 +66,7 @@ struct IntelGPUPipelinePass void runOnOperation() override { ModuleOp m = getOperation(); - if (m->hasAttr("triton_gpu.is_lts")) + if (!m->hasAttr(ttgi::TritonIntelGPUDialect::getSupportSG2DBlockAttrName())) return; auto deviceArch = ttgi::getDeviceArch(m); diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp index 856650c9a5..2a781589c4 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp @@ -795,7 +795,8 @@ void LayoutPropagation::rewriteAssertOp(AssertOp assertOp) { bool LayoutPropagation::rewriteStoreOp(StoreOp storeOp) { // Disable 2D block store on LTS. - if (storeOp->getParentOfType()->hasAttr("triton_gpu.is_lts")) + if (!storeOp->getParentOfType()->hasAttr( + ttgi::TritonIntelGPUDialect::getSupportSG2DBlockAttrName())) return false; // If storeOp is a pointer to a tensor, we try to find out if the diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp index 7b6e6f3e58..9e7cefdf4e 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp @@ -2,6 +2,7 @@ #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h" #include "triton/Dialect/Triton/IR/Dialect.h" +#include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h" #include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h" #include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h" @@ -59,7 +60,8 @@ bool isDivisible(Value value, unsigned divisor) { /// - the tensor pointer is not contiguous on memory bool shouldRemove(tt::MakeTensorPtrOp &op, ttgi::DeviceArch deviceArch, bool isUsedByStoreOp) { - if (op->getParentOfType()->hasAttr("triton_gpu.is_lts")) + if (!op->getParentOfType()->hasAttr( + ttgi::TritonIntelGPUDialect::getSupportSG2DBlockAttrName())) return true; // Non-PVC device should always remove the tensor pointer diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc index 6a0ac47f6c..6aaa1b2221 100644 --- a/third_party/intel/triton_xpu.cc +++ b/third_party/intel/triton_xpu.cc @@ -33,10 +33,11 @@ using ret = py::return_value_policy; m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1) { \ pm.addPass(builder({val0, val1})); \ }) -#define ADD_PASS_WRAPPER_OPT_3(name, builder, ty0, ty1, ty2) \ - m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2) { \ - pm.addPass(builder({val0, val1, val2})); \ - }) +#define ADD_PASS_WRAPPER_OPT_4(name, builder, ty0, ty1, ty2, ty3) \ + m.def(name, \ + [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2, ty3 val3) { \ + pm.addPass(builder({val0, val1, val2, val3})); \ + }) static uint32_t findKernels(llvm::Module &M, std::set &functions) { @@ -83,9 +84,9 @@ void init_triton_intel_passes_ttgpuir(py::module &&m) { gpu::intel::createTritonIntelGPUDistributeToWarps); ADD_PASS_WRAPPER_0("add_match_target_size", gpu::intel::createTritonIntelGPUMatchTargetSize); - ADD_PASS_WRAPPER_OPT_3("add_triton_annotate_module", + ADD_PASS_WRAPPER_OPT_4("add_triton_annotate_module", gpu::intel::createTritonAnnotateModule, - const std::string &, bool, unsigned); + const std::string &, bool, bool, unsigned); } void init_triton_intel(py::module &&m) {