From e66c7c441185429a2a7ef09a49eb5f672b3896d6 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Thu, 4 Jul 2024 15:49:26 -0700 Subject: [PATCH 1/5] Check if driver support dpas and 2d block operations Signed-off-by: Whitney Tsang --- test/Conversion/intel/arith_to_llvm.mlir | 2 +- .../intel/tritongpu_to_llvm_intel_block_ptr.mlir | 6 +++--- .../tritongpu_to_llvm_intel_block_ptr_invalid.mlir | 6 +++--- test/TritonIntelGPU/accelerate-matmul-ats.mlir | 12 ++++++------ test/TritonIntelGPU/accelerate-matmul-pvc.mlir | 12 ++++++------ .../backward_combine_dpas_dot_layout.mlir | 8 ++++---- test/TritonIntelGPU/loop-pipeline.mlir | 4 ++-- test/TritonIntelGPU/rewrite-tensor-pointer.mlir | 8 ++++---- third_party/intel/backend/compiler.py | 10 ++++++---- .../include/TritonIntelGPUToLLVM/TypeConverter.h | 2 +- .../intel/lib/TritonIntelGPUToLLVM/PipelineManager.h | 3 ++- .../lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp | 9 ++++++--- .../intel/lib/TritonIntelGPUToLLVM/TypeConverter.cpp | 5 ++--- .../Pipeliner/SoftwarePipeliner.cpp | 2 +- .../RemoveLayoutConversions.cpp | 3 ++- .../RewriteTensorPointer.cpp | 3 ++- .../intel/lib/TritonIntelGPUTransforms/Utility.cpp | 2 +- third_party/intel/triton_xpu.cc | 12 +++++++++--- 18 files changed, 61 insertions(+), 48 deletions(-) diff --git a/test/Conversion/intel/arith_to_llvm.mlir b/test/Conversion/intel/arith_to_llvm.mlir index d2f7d42181..0d7ca7ab5d 100644 --- a/test/Conversion/intel/arith_to_llvm.mlir +++ b/test/Conversion/intel/arith_to_llvm.mlir @@ -12,7 +12,7 @@ // CHECK-LABEL: llvm.func spir_kernelcc @float_to_bfloat_conversion( // CHECK-SCALAR: %[[VAL_0:.*]]: !llvm.struct<(f32, f32, f32, f32)>) -> !llvm.struct<(bf16, bf16, bf16, bf16)> // CHECK-VECTOR: %[[VAL_0:.*]]: vector<32xf32>) -> vector<32xbf16> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { +module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.support_dpas" = 1 : i1, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func @float_to_bfloat_conversion(%arg0 : tensor<512xf32, #blocked>) -> tensor<512xbf16, #blocked>{ // CHECK-SCALAR: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f32, f32, f32, f32)> // CHECK-SCALAR: %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_0]][1] : !llvm.struct<(f32, f32, f32, f32)> diff --git a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir index 44c0fc563f..d5ff8bb226 100644 --- a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir +++ b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir @@ -1,6 +1,6 @@ // RUN: TRITON_INTEL_ENABLE_BLOCK_PTR=1 triton-opt %s --convert-triton-intel-gpu-to-llvm --split-input-file | FileCheck %s -module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 1 : i32} { +module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.support_dpas" = 1 : i1, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 1 : i32} { // CHECK-DAG: llvm.func spir_funccc @_Z38intel_sub_group_f16_f16_matrix_mad_k16Dv8_sDv8_iDv8_f(vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32> attributes {passthrough = ["convergent"]} // CHECK-DAG: llvm.func spir_funccc @_Z42intel_sub_group_2d_block_read_16b_32r16x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {passthrough = ["nounwind"]} // CHECK-DAG: llvm.func spir_funccc @_Z52intel_sub_group_2d_block_read_transform_16b_32r16x2cPU3AS1viiiDv2_iPj(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {passthrough = ["nounwind"]} @@ -112,7 +112,7 @@ module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-wa // COM: Checks the correct lowering of the A operand load for TF32, i.e. using 4xi32 and vnni=false. -module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.support_dpas" = 1 : i1, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @matmul_kernel_with_block_pointers_tf32( // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} { tt.func public @matmul_kernel_with_block_pointers_tf32(%arg0: !tt.ptr) { @@ -144,7 +144,7 @@ module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-wa // COM: Checks the correct lowering of a 16-bit 2D-block-store. -module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.support_dpas" = 1 : i1, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @matmul_kernel_with_block_pointers_f16accu( // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} { tt.func public @matmul_kernel_with_block_pointers_f16accu(%arg0: !tt.ptr) { diff --git a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir index dfecfda3f1..0c9ee8e5ea 100644 --- a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir +++ b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir @@ -1,6 +1,6 @@ // RUN: TRITON_INTEL_ENABLE_BLOCK_PTR=1 triton-opt %s --convert-triton-intel-gpu-to-llvm --verify-diagnostics --split-input-file -module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.support_dpas" = 1 : i1, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: i64, %arg2: i32) { %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 @@ -14,7 +14,7 @@ module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-wa // ----- -module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.support_dpas" = 1 : i1, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: i64, %arg2: i32) { %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 @@ -28,7 +28,7 @@ module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-wa // ----- -module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.support_dpas" = 1 : i1, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: i64, %arg2: i32) { %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 diff --git a/test/TritonIntelGPU/accelerate-matmul-ats.mlir b/test/TritonIntelGPU/accelerate-matmul-ats.mlir index bd31aab76f..fc08ce2836 100644 --- a/test/TritonIntelGPU/accelerate-matmul-ats.mlir +++ b/test/TritonIntelGPU/accelerate-matmul-ats.mlir @@ -1,10 +1,10 @@ // RUN: triton-opt %s -split-input-file --tritonintelgpu-accelerate-matmul | FileCheck %s -// CHECK-NOT: dpas +// CHECK-NOT: triton_intel_gpu.dpas #blocked = #triton_gpu.blocked<{sizePerThread = [8, 4], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_dpas" = 1 : i1} { // CHECK: mma_chain_loop tt.func public @mma_chain_loop( %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -39,11 +39,11 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num- // ----- -// CHECK-NOT: dpas +// CHECK-NOT: triton_intel_gpu.dpas #blocked = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [1, 16], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_dpas" = 1 : i1} { // CHECK: chained_dot tt.func public @chained_dot( %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -67,7 +67,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num- #blocked = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [4, 2], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [1, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [2, 4], warpsPerCTA = [4, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_gpu.support_dpas" = 1 : i1} { // CHECK: mma_chain_loop_ats tt.func public @mma_chain_loop_ats( %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -112,7 +112,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num- #blocked = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [1, 8], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_gpu.support_dpas" = 1 : i1} { // CHECK: chained_dot tt.func public @chained_dot( %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, diff --git a/test/TritonIntelGPU/accelerate-matmul-pvc.mlir b/test/TritonIntelGPU/accelerate-matmul-pvc.mlir index 8fb069b2db..5685e5a8f6 100644 --- a/test/TritonIntelGPU/accelerate-matmul-pvc.mlir +++ b/test/TritonIntelGPU/accelerate-matmul-pvc.mlir @@ -4,7 +4,7 @@ #blocked = #triton_gpu.blocked<{sizePerThread = [8, 4], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_dpas" = 1 : i1} { // CHECK: mma_chain_loop tt.func public @mma_chain_loop( %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -49,7 +49,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- #blocked = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [1, 16], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_dpas" = 1 : i1} { // CHECK: chained_dot tt.func public @chained_dot( %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -71,11 +71,11 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- // ----- -// CHECK-NOT: dpas +// CHECK-NOT: triton_intel_gpu.dpas #blocked = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [4, 2], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [1, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [2, 4], warpsPerCTA = [4, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_gpu.support_dpas" = 1 : i1} { // CHECK: mma_chain_loop_ats tt.func public @mma_chain_loop_ats( %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -110,11 +110,11 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- // ----- -// CHECK-NOT: dpas +// CHECK-NOT: triton_intel_gpu.dpas #blocked = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [1, 8], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_gpu.support_dpas" = 1 : i1} { // CHECK: chained_dot tt.func public @chained_dot( %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, diff --git a/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir b/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir index d155007299..4ff6e762ca 100644 --- a/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir +++ b/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir @@ -11,7 +11,7 @@ #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i64, %arg7: i32, %arg8: i64) { %c8_i32 = arith.constant 8 : i32 %c64_i32 = arith.constant 64 : i32 @@ -85,7 +85,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { %c8_i32 = arith.constant 8 : i32 %c64_i32 = arith.constant 64 : i32 @@ -153,7 +153,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg13: !tt.ptr, %arg14: !tt.ptr) { %c8_i32 = arith.constant 8 : i32 %c64_i32 = arith.constant 64 : i32 @@ -233,7 +233,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 diff --git a/test/TritonIntelGPU/loop-pipeline.mlir b/test/TritonIntelGPU/loop-pipeline.mlir index f4fbd24b1d..11ad7c2ec0 100644 --- a/test/TritonIntelGPU/loop-pipeline.mlir +++ b/test/TritonIntelGPU/loop-pipeline.mlir @@ -9,7 +9,7 @@ #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { tt.func public @matmul_kernel(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { // CHECK-LABEL: tt.func public @matmul_kernel %c8_i32 = arith.constant 8 : i32 @@ -127,7 +127,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}) { // CHECK-LABEL: tt.func public @matmul_kernel %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #dpas> diff --git a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir index 28b3c77e46..e5ad70598e 100644 --- a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir +++ b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir @@ -9,7 +9,7 @@ #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) { // CHECK: @matmul_kernel_with_block_pointers %c4_i32 = arith.constant 4 : i32 @@ -81,7 +81,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, @@ -152,7 +152,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { tt.func public @matmul_kernel_with_block_pointers_indivisible(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32, %arg7: i32, %arg8: i32 {tt.divisibility = 16 : i32}) { // CHECK: @matmul_kernel_with_block_pointers_indivisible %c4_i32 = arith.constant 4 : i32 @@ -214,7 +214,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- // COM: Case 3: // COM: Check that operations using block pointers without a layout attribute are rewritten to use a legacy pointer. -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC"} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.support_sg_2d_block" = 1 : i1} { tt.func public @matmul_kernel(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { %c31_i32 = arith.constant 31 : i32 %c127_i32 = arith.constant 127 : i32 diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 373882b97c..7dab9e9162 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -153,16 +153,16 @@ def make_ttir(mod, metadata, opt): return mod @staticmethod - def make_ttgir(mod, metadata, opt, device_arch): + def make_ttgir(mod, metadata, opt, device_arch, support_sg_2d_block, support_dpas): is_lts = Version(metadata["target"].arch["driver_version"]) == Version("1.3.27642") - if (not is_lts and os.getenv("TRITON_INTEL_ENABLE_BLOCK_PTR", "0") == "1"): + intel.set_device_properties(mod, is_lts, support_sg_2d_block, support_dpas) + if (support_sg_2d_block and support_dpas and os.getenv("TRITON_INTEL_ENABLE_BLOCK_PTR", "0") == "1"): return XPUBackend.Experimental.make_ttgir(mod, metadata, opt) # TTIR -> TTGIR pm = ir.pass_manager(mod.context) pm.enable_debug() passes.ttir.add_convert_to_ttgpuir(pm, f"xpu:{device_arch}", opt.num_warps, opt.threads_per_warp, opt.num_ctas) - intel.set_device_properties(mod, is_lts) # optimize TTGIR intel.passes.ttgpuir.add_accelerate_matmul(pm) @@ -236,7 +236,9 @@ def make_spv(src, metadata): def add_stages(self, stages, options): stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options) - stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options, self.device_arch) + stages["ttgir"] = lambda src, metadata: self.make_ttgir( + src, metadata, options, self.device_arch, self.properties["support_cl_sg_2d_block_io"], self.properties[ + "support_cl_sg_matmul_acc"]) stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options) stages["spv"] = lambda src, metadata: self.make_spv(src, metadata) diff --git a/third_party/intel/include/TritonIntelGPUToLLVM/TypeConverter.h b/third_party/intel/include/TritonIntelGPUToLLVM/TypeConverter.h index 92df7a3f3e..8cd4c5b2c6 100644 --- a/third_party/intel/include/TritonIntelGPUToLLVM/TypeConverter.h +++ b/third_party/intel/include/TritonIntelGPUToLLVM/TypeConverter.h @@ -18,7 +18,7 @@ class TritonIntelGPUToLLVMTypeConverter : public TritonGPUToLLVMTypeConverter { using TypeConverter::convertType; TritonIntelGPUToLLVMTypeConverter( - MLIRContext *ctx, LowerToLLVMOptions &option, bool isLTSDriver, + MLIRContext *ctx, LowerToLLVMOptions &option, bool isBlockPtrPathEnabled, const DataLayoutAnalysis *analysis = nullptr); }; diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h index 9b34ed6fdb..ed61a20327 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h @@ -181,7 +181,8 @@ class TritonGPUToLLVMPipelineManager { TritonGPUToLLVMPipelineManager(ModuleOp &mod, MLIRContext *ctx) : mod(mod), ctx(ctx), blockPtrPathIsEnabled( - !mod->hasAttr("triton_gpu.is_lts") && + mod->hasAttr("triton_gpu.support_sg_2d_block") && + mod->hasAttr("triton_gpu.support_dpas") && mlir::triton::tools::getBoolEnv("TRITON_INTEL_ENABLE_BLOCK_PTR")) {} /// FIXME: remove once the block ptr conversion path is capable of handling diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp index 31cad6365a..79acba3225 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp @@ -79,9 +79,12 @@ struct ConvertTritonGPUToLLVM intel::TritonGPUToLLVMPipelineManager pipelineManager(mod, context); mlir::LowerToLLVMOptions option(context); option.overrideIndexBitwidth(32); - bool isLTSDriver = mod->hasAttr("triton_gpu.is_lts"); + bool isBlockPtrPathEnabled = + mod->hasAttr("triton_gpu.support_sg_2d_block") && + mod->hasAttr("triton_gpu.support_dpas") && + mlir::triton::tools::getBoolEnv("TRITON_INTEL_ENABLE_BLOCK_PTR"); TritonIntelGPUToLLVMTypeConverter typeConverter(context, option, - isLTSDriver); + isBlockPtrPathEnabled); TritonLLVMConversionTarget convTarget(*context); int numWarps = triton::gpu::TritonGPUDialect::getNumWarps(mod); int numCTAs = triton::gpu::TritonGPUDialect::getNumCTAs(mod); @@ -98,7 +101,7 @@ struct ConvertTritonGPUToLLVM { mlir::LowerToLLVMOptions option(context); TritonIntelGPUToLLVMTypeConverter typeConverter(context, option, - isLTSDriver); + isBlockPtrPathEnabled); TritonLLVMFunctionConversionTarget funcTarget(*context); RewritePatternSet funcPatterns(context); pipelineManager.populateFunctionConversionPatterns( diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TypeConverter.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TypeConverter.cpp index 68bc0aac77..e11e3579d4 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/TypeConverter.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TypeConverter.cpp @@ -10,13 +10,12 @@ #include "triton/Tools/Sys/GetEnv.hpp" TritonIntelGPUToLLVMTypeConverter::TritonIntelGPUToLLVMTypeConverter( - MLIRContext *ctx, LowerToLLVMOptions &option, bool isLTSDriver, + MLIRContext *ctx, LowerToLLVMOptions &option, bool isBlockPtrPathEnabled, const DataLayoutAnalysis *analysis) : TritonGPUToLLVMTypeConverter(ctx, option, analysis) { // Augment/overwrite type conversions required for the Intel conversion // passes. - if (!isLTSDriver && - mlir::triton::tools::getBoolEnv("TRITON_INTEL_ENABLE_BLOCK_PTR")) { + if (isBlockPtrPathEnabled) { // tt::pointer to v2i32. addConversion([&](PointerType type) -> std::optional { if (isa(type.getPointeeType())) { diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp index 46580796b1..e645cad113 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp @@ -66,7 +66,7 @@ struct IntelGPUPipelinePass void runOnOperation() override { ModuleOp m = getOperation(); - if (m->hasAttr("triton_gpu.is_lts")) + if (!m->hasAttr("triton_gpu.support_sg_2d_block")) return; auto deviceArch = ttgi::getDeviceArch(m); diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp index 0251ef3e0e..a49d4b1ea9 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp @@ -795,7 +795,8 @@ void LayoutPropagation::rewriteAssertOp(AssertOp assertOp) { bool LayoutPropagation::rewriteStoreOp(StoreOp storeOp) { // Disable 2D block store on LTS. - if (storeOp->getParentOfType()->hasAttr("triton_gpu.is_lts")) + if (!storeOp->getParentOfType()->hasAttr( + "triton_gpu.support_sg_2d_block")) return false; // If storeOp is a pointer to a tensor, we try to find out if the diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp index 7b6e6f3e58..27d5c1b0ac 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp @@ -59,7 +59,8 @@ bool isDivisible(Value value, unsigned divisor) { /// - the tensor pointer is not contiguous on memory bool shouldRemove(tt::MakeTensorPtrOp &op, ttgi::DeviceArch deviceArch, bool isUsedByStoreOp) { - if (op->getParentOfType()->hasAttr("triton_gpu.is_lts")) + if (!op->getParentOfType()->hasAttr( + "triton_gpu.support_sg_2d_block")) return true; // Non-PVC device should always remove the tensor pointer diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp index 30c425eb73..90866cee70 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp @@ -25,7 +25,7 @@ namespace ttgi = mlir::triton::gpu::intel; namespace mlir::triton::gpu::intel { bool supportDPAS(DotOp op, DeviceArch arch) { - if (op->getParentOfType()->hasAttr("triton_gpu.is_lts")) + if (!op->getParentOfType()->hasAttr("triton_gpu.support_dpas")) return false; if (arch == DeviceArch::UNKNOWN) diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc index 655c48adf3..5a4c558330 100644 --- a/third_party/intel/triton_xpu.cc +++ b/third_party/intel/triton_xpu.cc @@ -94,12 +94,18 @@ void init_triton_intel(py::module &&m) { context.loadAllAvailableDialects(); }); - // FIXME: Use SYCL runtime to query supported OpenCL extensions, instead of - // checking driver version. - m.def("set_device_properties", [](mlir::ModuleOp mod, bool isLTS) { + m.def("set_device_properties", [](mlir::ModuleOp mod, bool isLTS, + bool supportSG2DBlock, bool supportDPAS) { auto i1_ty = mlir::IntegerType::get(mod->getContext(), 1); + // FIXME: Use SYCL runtime to query supported OpenCL extensions, instead of + // checking driver version. if (isLTS) mod->setAttr("triton_gpu.is_lts", mlir::IntegerAttr::get(i1_ty, 1)); + if (supportSG2DBlock) + mod->setAttr("triton_gpu.support_sg_2d_block", + mlir::IntegerAttr::get(i1_ty, 1)); + if (supportDPAS) + mod->setAttr("triton_gpu.support_dpas", mlir::IntegerAttr::get(i1_ty, 1)); }); m.def("set_spv_target_triple", [](llvm::Module *mod) { From 5db5c26157bc92a46ce3cc5530a212fd361d1d35 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Fri, 5 Jul 2024 15:51:37 +0000 Subject: [PATCH 2/5] address review comments Signed-off-by: Whitney Tsang --- test/Conversion/intel/arith_to_llvm.mlir | 2 +- .../intel/tritongpu_to_llvm_intel_block_ptr.mlir | 6 +++--- .../intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir | 6 +++--- test/TritonIntelGPU/accelerate-matmul-ats.mlir | 8 ++++---- test/TritonIntelGPU/accelerate-matmul-pvc.mlir | 8 ++++---- .../TritonIntelGPU/backward_combine_dpas_dot_layout.mlir | 8 ++++---- test/TritonIntelGPU/loop-pipeline.mlir | 4 ++-- test/TritonIntelGPU/rewrite-tensor-pointer.mlir | 8 ++++---- .../intel/lib/TritonIntelGPUToLLVM/PipelineManager.h | 4 ++-- .../intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp | 2 +- .../intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp | 4 ++-- .../Pipeliner/SoftwarePipeliner.cpp | 2 +- .../TritonIntelGPUTransforms/RemoveLayoutConversions.cpp | 6 +++--- .../TritonIntelGPUTransforms/RewriteTensorPointer.cpp | 2 +- .../intel/lib/TritonIntelGPUTransforms/Utility.cpp | 3 ++- third_party/intel/triton_xpu.cc | 9 ++++----- 16 files changed, 41 insertions(+), 41 deletions(-) diff --git a/test/Conversion/intel/arith_to_llvm.mlir b/test/Conversion/intel/arith_to_llvm.mlir index 0d7ca7ab5d..4bb9df30ad 100644 --- a/test/Conversion/intel/arith_to_llvm.mlir +++ b/test/Conversion/intel/arith_to_llvm.mlir @@ -12,7 +12,7 @@ // CHECK-LABEL: llvm.func spir_kernelcc @float_to_bfloat_conversion( // CHECK-SCALAR: %[[VAL_0:.*]]: !llvm.struct<(f32, f32, f32, f32)>) -> !llvm.struct<(bf16, bf16, bf16, bf16)> // CHECK-VECTOR: %[[VAL_0:.*]]: vector<32xf32>) -> vector<32xbf16> -module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.support_dpas" = 1 : i1, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { +module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func @float_to_bfloat_conversion(%arg0 : tensor<512xf32, #blocked>) -> tensor<512xbf16, #blocked>{ // CHECK-SCALAR: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f32, f32, f32, f32)> // CHECK-SCALAR: %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_0]][1] : !llvm.struct<(f32, f32, f32, f32)> diff --git a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir index d5ff8bb226..985b247a94 100644 --- a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir +++ b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir @@ -1,6 +1,6 @@ // RUN: TRITON_INTEL_ENABLE_BLOCK_PTR=1 triton-opt %s --convert-triton-intel-gpu-to-llvm --split-input-file | FileCheck %s -module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.support_dpas" = 1 : i1, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 1 : i32} { +module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 1 : i32} { // CHECK-DAG: llvm.func spir_funccc @_Z38intel_sub_group_f16_f16_matrix_mad_k16Dv8_sDv8_iDv8_f(vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32> attributes {passthrough = ["convergent"]} // CHECK-DAG: llvm.func spir_funccc @_Z42intel_sub_group_2d_block_read_16b_32r16x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {passthrough = ["nounwind"]} // CHECK-DAG: llvm.func spir_funccc @_Z52intel_sub_group_2d_block_read_transform_16b_32r16x2cPU3AS1viiiDv2_iPj(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {passthrough = ["nounwind"]} @@ -112,7 +112,7 @@ module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.suppor // COM: Checks the correct lowering of the A operand load for TF32, i.e. using 4xi32 and vnni=false. -module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.support_dpas" = 1 : i1, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @matmul_kernel_with_block_pointers_tf32( // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} { tt.func public @matmul_kernel_with_block_pointers_tf32(%arg0: !tt.ptr) { @@ -144,7 +144,7 @@ module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.suppor // COM: Checks the correct lowering of a 16-bit 2D-block-store. -module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.support_dpas" = 1 : i1, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @matmul_kernel_with_block_pointers_f16accu( // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} { tt.func public @matmul_kernel_with_block_pointers_f16accu(%arg0: !tt.ptr) { diff --git a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir index 0c9ee8e5ea..72ff792237 100644 --- a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir +++ b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir @@ -1,6 +1,6 @@ // RUN: TRITON_INTEL_ENABLE_BLOCK_PTR=1 triton-opt %s --convert-triton-intel-gpu-to-llvm --verify-diagnostics --split-input-file -module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.support_dpas" = 1 : i1, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: i64, %arg2: i32) { %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 @@ -14,7 +14,7 @@ module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.suppor // ----- -module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.support_dpas" = 1 : i1, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: i64, %arg2: i32) { %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 @@ -28,7 +28,7 @@ module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.suppor // ----- -module attributes {"triton_gpu.support_sg_2d_block" = 1 : i1, "triton_gpu.support_dpas" = 1 : i1, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: i64, %arg2: i32) { %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 diff --git a/test/TritonIntelGPU/accelerate-matmul-ats.mlir b/test/TritonIntelGPU/accelerate-matmul-ats.mlir index fc08ce2836..65327ab3fe 100644 --- a/test/TritonIntelGPU/accelerate-matmul-ats.mlir +++ b/test/TritonIntelGPU/accelerate-matmul-ats.mlir @@ -4,7 +4,7 @@ #blocked = #triton_gpu.blocked<{sizePerThread = [8, 4], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_dpas" = 1 : i1} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: mma_chain_loop tt.func public @mma_chain_loop( %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -43,7 +43,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num- #blocked = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [1, 16], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_dpas" = 1 : i1} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: chained_dot tt.func public @chained_dot( %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -67,7 +67,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num- #blocked = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [4, 2], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [1, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [2, 4], warpsPerCTA = [4, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_gpu.support_dpas" = 1 : i1} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: mma_chain_loop_ats tt.func public @mma_chain_loop_ats( %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -112,7 +112,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num- #blocked = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [1, 8], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_gpu.support_dpas" = 1 : i1} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: chained_dot tt.func public @chained_dot( %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, diff --git a/test/TritonIntelGPU/accelerate-matmul-pvc.mlir b/test/TritonIntelGPU/accelerate-matmul-pvc.mlir index 5685e5a8f6..6c49636a96 100644 --- a/test/TritonIntelGPU/accelerate-matmul-pvc.mlir +++ b/test/TritonIntelGPU/accelerate-matmul-pvc.mlir @@ -4,7 +4,7 @@ #blocked = #triton_gpu.blocked<{sizePerThread = [8, 4], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_dpas" = 1 : i1} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: mma_chain_loop tt.func public @mma_chain_loop( %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -49,7 +49,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- #blocked = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [1, 16], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_dpas" = 1 : i1} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: chained_dot tt.func public @chained_dot( %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -75,7 +75,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- #blocked = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [4, 2], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [1, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [2, 4], warpsPerCTA = [4, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_gpu.support_dpas" = 1 : i1} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: mma_chain_loop_ats tt.func public @mma_chain_loop_ats( %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, @@ -114,7 +114,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- #blocked = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [1, 8], warpsPerCTA = [8, 1], order = [1, 0]}> #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_gpu.support_dpas" = 1 : i1} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_intel_gpu.support_dpas"} { // CHECK: chained_dot tt.func public @chained_dot( %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>, diff --git a/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir b/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir index 4ff6e762ca..d6f43af96d 100644 --- a/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir +++ b/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir @@ -11,7 +11,7 @@ #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { +module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i64, %arg7: i32, %arg8: i64) { %c8_i32 = arith.constant 8 : i32 %c64_i32 = arith.constant 64 : i32 @@ -85,7 +85,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { %c8_i32 = arith.constant 8 : i32 %c64_i32 = arith.constant 64 : i32 @@ -153,7 +153,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg13: !tt.ptr, %arg14: !tt.ptr) { %c8_i32 = arith.constant 8 : i32 %c64_i32 = arith.constant 64 : i32 @@ -233,7 +233,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { %c1_i64 = arith.constant 1 : i64 %c0_i32 = arith.constant 0 : i32 diff --git a/test/TritonIntelGPU/loop-pipeline.mlir b/test/TritonIntelGPU/loop-pipeline.mlir index 11ad7c2ec0..aaab478dbf 100644 --- a/test/TritonIntelGPU/loop-pipeline.mlir +++ b/test/TritonIntelGPU/loop-pipeline.mlir @@ -9,7 +9,7 @@ #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { // CHECK-LABEL: tt.func public @matmul_kernel %c8_i32 = arith.constant 8 : i32 @@ -127,7 +127,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}) { // CHECK-LABEL: tt.func public @matmul_kernel %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #dpas> diff --git a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir index e5ad70598e..4892038fa2 100644 --- a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir +++ b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir @@ -9,7 +9,7 @@ #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) { // CHECK: @matmul_kernel_with_block_pointers %c4_i32 = arith.constant 4 : i32 @@ -81,7 +81,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, @@ -152,7 +152,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}> #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}> #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}> -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_gpu.support_sg_2d_block" = 1 : i1} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel_with_block_pointers_indivisible(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32, %arg7: i32, %arg8: i32 {tt.divisibility = 16 : i32}) { // CHECK: @matmul_kernel_with_block_pointers_indivisible %c4_i32 = arith.constant 4 : i32 @@ -214,7 +214,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num- // COM: Case 3: // COM: Check that operations using block pointers without a layout attribute are rewritten to use a legacy pointer. -module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.support_sg_2d_block" = 1 : i1} { +module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_intel_gpu.support_sg_2d_block"} { tt.func public @matmul_kernel(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { %c31_i32 = arith.constant 31 : i32 %c127_i32 = arith.constant 127 : i32 diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h index ed61a20327..95b64dae3d 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h @@ -181,8 +181,8 @@ class TritonGPUToLLVMPipelineManager { TritonGPUToLLVMPipelineManager(ModuleOp &mod, MLIRContext *ctx) : mod(mod), ctx(ctx), blockPtrPathIsEnabled( - mod->hasAttr("triton_gpu.support_sg_2d_block") && - mod->hasAttr("triton_gpu.support_dpas") && + mod->hasAttr("triton_intel_gpu.support_sg_2d_block") && + mod->hasAttr("triton_intel_gpu.support_dpas") && mlir::triton::tools::getBoolEnv("TRITON_INTEL_ENABLE_BLOCK_PTR")) {} /// FIXME: remove once the block ptr conversion path is capable of handling diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp index 0c294e2503..e425f9dcff 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp @@ -89,7 +89,7 @@ bool TargetInfo::warpReduce(RewriterBase &rewriter, Location loc, unsigned numLaneToReduce, unsigned interleave) const { const bool isLTS = - op->getParentOfType()->hasAttr("triton_gpu.is_lts"); + op->getParentOfType()->hasAttr("triton_intel_gpu.is_lts"); if (isLTS) return false; // No horizontal reduce required. diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp index 79acba3225..ba4d719a1a 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp @@ -80,8 +80,8 @@ struct ConvertTritonGPUToLLVM mlir::LowerToLLVMOptions option(context); option.overrideIndexBitwidth(32); bool isBlockPtrPathEnabled = - mod->hasAttr("triton_gpu.support_sg_2d_block") && - mod->hasAttr("triton_gpu.support_dpas") && + mod->hasAttr("triton_intel_gpu.support_sg_2d_block") && + mod->hasAttr("triton_intel_gpu.support_dpas") && mlir::triton::tools::getBoolEnv("TRITON_INTEL_ENABLE_BLOCK_PTR"); TritonIntelGPUToLLVMTypeConverter typeConverter(context, option, isBlockPtrPathEnabled); diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp index e645cad113..67a74c5ee2 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp @@ -66,7 +66,7 @@ struct IntelGPUPipelinePass void runOnOperation() override { ModuleOp m = getOperation(); - if (!m->hasAttr("triton_gpu.support_sg_2d_block")) + if (!m->hasAttr("triton_intel_gpu.support_sg_2d_block")) return; auto deviceArch = ttgi::getDeviceArch(m); diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp index a49d4b1ea9..063d43f1c6 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp @@ -796,7 +796,7 @@ void LayoutPropagation::rewriteAssertOp(AssertOp assertOp) { bool LayoutPropagation::rewriteStoreOp(StoreOp storeOp) { // Disable 2D block store on LTS. if (!storeOp->getParentOfType()->hasAttr( - "triton_gpu.support_sg_2d_block")) + "triton_intel_gpu.support_sg_2d_block")) return false; // If storeOp is a pointer to a tensor, we try to find out if the @@ -979,8 +979,8 @@ void LayoutRematerialization::rewriteSlice(SetVector &slice, SetVector opsToRewrite; // Keep track of yield operands that need to be duplicated. DenseMap> yieldOperandsMap; - bool isLTS = - convertOp->getParentOfType()->hasAttr("triton_gpu.is_lts"); + bool isLTS = convertOp->getParentOfType()->hasAttr( + "triton_intel_gpu.is_lts"); for (Value v : slice) { auto layoutIt = layout.find(v); assert(layoutIt != layout.end()); diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp index 27d5c1b0ac..40c1dd6ced 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp @@ -60,7 +60,7 @@ bool isDivisible(Value value, unsigned divisor) { bool shouldRemove(tt::MakeTensorPtrOp &op, ttgi::DeviceArch deviceArch, bool isUsedByStoreOp) { if (!op->getParentOfType()->hasAttr( - "triton_gpu.support_sg_2d_block")) + "triton_intel_gpu.support_sg_2d_block")) return true; // Non-PVC device should always remove the tensor pointer diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp index 90866cee70..a560077399 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp @@ -25,7 +25,8 @@ namespace ttgi = mlir::triton::gpu::intel; namespace mlir::triton::gpu::intel { bool supportDPAS(DotOp op, DeviceArch arch) { - if (!op->getParentOfType()->hasAttr("triton_gpu.support_dpas")) + if (!op->getParentOfType()->hasAttr( + "triton_intel_gpu.support_dpas")) return false; if (arch == DeviceArch::UNKNOWN) diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc index 5a4c558330..b9009a224a 100644 --- a/third_party/intel/triton_xpu.cc +++ b/third_party/intel/triton_xpu.cc @@ -96,16 +96,15 @@ void init_triton_intel(py::module &&m) { m.def("set_device_properties", [](mlir::ModuleOp mod, bool isLTS, bool supportSG2DBlock, bool supportDPAS) { - auto i1_ty = mlir::IntegerType::get(mod->getContext(), 1); + mlir::Builder b(mod); // FIXME: Use SYCL runtime to query supported OpenCL extensions, instead of // checking driver version. if (isLTS) - mod->setAttr("triton_gpu.is_lts", mlir::IntegerAttr::get(i1_ty, 1)); + mod->setAttr("triton_intel_gpu.is_lts", b.getUnitAttr()); if (supportSG2DBlock) - mod->setAttr("triton_gpu.support_sg_2d_block", - mlir::IntegerAttr::get(i1_ty, 1)); + mod->setAttr("triton_intel_gpu.support_sg_2d_block", b.getUnitAttr()); if (supportDPAS) - mod->setAttr("triton_gpu.support_dpas", mlir::IntegerAttr::get(i1_ty, 1)); + mod->setAttr("triton_intel_gpu.support_dpas", b.getUnitAttr()); }); m.def("set_spv_target_triple", [](llvm::Module *mod) { From b0aab30e8345c0988f018ddcc85337679b5b3137 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Sat, 6 Jul 2024 02:55:49 +0000 Subject: [PATCH 3/5] address review comment Signed-off-by: Whitney Tsang --- .../Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td | 2 ++ .../intel/lib/TritonIntelGPUToLLVM/PipelineManager.h | 6 ++++-- .../intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp | 6 ++++-- .../Pipeliner/SoftwarePipeliner.cpp | 2 +- .../TritonIntelGPUTransforms/RemoveLayoutConversions.cpp | 2 +- .../lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp | 3 ++- .../intel/lib/TritonIntelGPUTransforms/Utility.cpp | 3 ++- third_party/intel/triton_xpu.cc | 8 ++++++-- 8 files changed, 22 insertions(+), 10 deletions(-) diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td b/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td index caaa82a3a8..5bdf6249fa 100644 --- a/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td +++ b/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td @@ -19,6 +19,8 @@ def TritonIntelGPU_Dialect : Dialect { ]; let extraClassDeclaration = [{ + static std::string getSupportSG2DBlockAttrName() { return "triton_intel_gpu.support_sg_2d_block"; } + static std::string getSupportDPASAttrName() { return "triton_intel_gpu.support_dpas"; } }]; let useDefaultAttributePrinterParser = 1; diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h index 95b64dae3d..59b004d1e7 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h @@ -181,8 +181,10 @@ class TritonGPUToLLVMPipelineManager { TritonGPUToLLVMPipelineManager(ModuleOp &mod, MLIRContext *ctx) : mod(mod), ctx(ctx), blockPtrPathIsEnabled( - mod->hasAttr("triton_intel_gpu.support_sg_2d_block") && - mod->hasAttr("triton_intel_gpu.support_dpas") && + mod->hasAttr(gpu::intel::TritonIntelGPUDialect:: + getSupportSG2DBlockAttrName()) && + mod->hasAttr( + gpu::intel::TritonIntelGPUDialect::getSupportDPASAttrName()) && mlir::triton::tools::getBoolEnv("TRITON_INTEL_ENABLE_BLOCK_PTR")) {} /// FIXME: remove once the block ptr conversion path is capable of handling diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp index ba4d719a1a..a7658f08e3 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp @@ -80,8 +80,10 @@ struct ConvertTritonGPUToLLVM mlir::LowerToLLVMOptions option(context); option.overrideIndexBitwidth(32); bool isBlockPtrPathEnabled = - mod->hasAttr("triton_intel_gpu.support_sg_2d_block") && - mod->hasAttr("triton_intel_gpu.support_dpas") && + mod->hasAttr(triton::gpu::intel::TritonIntelGPUDialect:: + getSupportSG2DBlockAttrName()) && + mod->hasAttr(triton::gpu::intel::TritonIntelGPUDialect:: + getSupportDPASAttrName()) && mlir::triton::tools::getBoolEnv("TRITON_INTEL_ENABLE_BLOCK_PTR"); TritonIntelGPUToLLVMTypeConverter typeConverter(context, option, isBlockPtrPathEnabled); diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp index 67a74c5ee2..27694844e0 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp @@ -66,7 +66,7 @@ struct IntelGPUPipelinePass void runOnOperation() override { ModuleOp m = getOperation(); - if (!m->hasAttr("triton_intel_gpu.support_sg_2d_block")) + if (!m->hasAttr(ttgi::TritonIntelGPUDialect::getSupportSG2DBlockAttrName())) return; auto deviceArch = ttgi::getDeviceArch(m); diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp index 063d43f1c6..aeb7cd511d 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp @@ -796,7 +796,7 @@ void LayoutPropagation::rewriteAssertOp(AssertOp assertOp) { bool LayoutPropagation::rewriteStoreOp(StoreOp storeOp) { // Disable 2D block store on LTS. if (!storeOp->getParentOfType()->hasAttr( - "triton_intel_gpu.support_sg_2d_block")) + ttgi::TritonIntelGPUDialect::getSupportSG2DBlockAttrName())) return false; // If storeOp is a pointer to a tensor, we try to find out if the diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp index 40c1dd6ced..9e7cefdf4e 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp @@ -2,6 +2,7 @@ #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h" #include "triton/Dialect/Triton/IR/Dialect.h" +#include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h" #include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h" #include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h" @@ -60,7 +61,7 @@ bool isDivisible(Value value, unsigned divisor) { bool shouldRemove(tt::MakeTensorPtrOp &op, ttgi::DeviceArch deviceArch, bool isUsedByStoreOp) { if (!op->getParentOfType()->hasAttr( - "triton_intel_gpu.support_sg_2d_block")) + ttgi::TritonIntelGPUDialect::getSupportSG2DBlockAttrName())) return true; // Non-PVC device should always remove the tensor pointer diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp index a560077399..2f3b8a1c9f 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp @@ -11,6 +11,7 @@ #include "mlir/Transforms/DialectConversion.h" #include "intel/include/Dialect/TritonIntelGPU/IR/Attributes.h" +#include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h" #include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h" #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" @@ -26,7 +27,7 @@ namespace mlir::triton::gpu::intel { bool supportDPAS(DotOp op, DeviceArch arch) { if (!op->getParentOfType()->hasAttr( - "triton_intel_gpu.support_dpas")) + TritonIntelGPUDialect::getSupportDPASAttrName())) return false; if (arch == DeviceArch::UNKNOWN) diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc index b9009a224a..60665cfd29 100644 --- a/third_party/intel/triton_xpu.cc +++ b/third_party/intel/triton_xpu.cc @@ -102,9 +102,13 @@ void init_triton_intel(py::module &&m) { if (isLTS) mod->setAttr("triton_intel_gpu.is_lts", b.getUnitAttr()); if (supportSG2DBlock) - mod->setAttr("triton_intel_gpu.support_sg_2d_block", b.getUnitAttr()); + mod->setAttr(mlir::triton::gpu::intel::TritonIntelGPUDialect:: + getSupportSG2DBlockAttrName(), + b.getUnitAttr()); if (supportDPAS) - mod->setAttr("triton_intel_gpu.support_dpas", b.getUnitAttr()); + mod->setAttr(mlir::triton::gpu::intel::TritonIntelGPUDialect:: + getSupportDPASAttrName(), + b.getUnitAttr()); }); m.def("set_spv_target_triple", [](llvm::Module *mod) { From 6cff5f6d3a27f029e264d420f1ddbe9d829d15fa Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Sun, 7 Jul 2024 20:13:16 +0000 Subject: [PATCH 4/5] address review comment Signed-off-by: Whitney Tsang --- third_party/intel/backend/compiler.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 7dab9e9162..e38498ef8e 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -98,7 +98,6 @@ def __init__(self, target: tuple) -> None: mod = compile_module_from_src(Path(os.path.join(dirname, "arch_parser.c")).read_text(), "arch_utils") self.parse_device_arch = mod.parse_device_arch self.properties = self.parse_target(target.arch) - self.device_arch = self.properties["device_arch"] self.binary_ext = "spv" def parse_target(self, tgt_prop) -> dict: @@ -153,15 +152,18 @@ def make_ttir(mod, metadata, opt): return mod @staticmethod - def make_ttgir(mod, metadata, opt, device_arch, support_sg_2d_block, support_dpas): + def make_ttgir(mod, metadata, opt, properties): is_lts = Version(metadata["target"].arch["driver_version"]) == Version("1.3.27642") - intel.set_device_properties(mod, is_lts, support_sg_2d_block, support_dpas) - if (support_sg_2d_block and support_dpas and os.getenv("TRITON_INTEL_ENABLE_BLOCK_PTR", "0") == "1"): + intel.set_device_properties(mod, is_lts, properties["support_cl_sg_2d_block_io"], + properties["support_cl_sg_matmul_acc"]) + if (properties["support_cl_sg_2d_block_io"] and properties["support_cl_sg_matmul_acc"] + and os.getenv("TRITON_INTEL_ENABLE_BLOCK_PTR", "0") == "1"): return XPUBackend.Experimental.make_ttgir(mod, metadata, opt) # TTIR -> TTGIR pm = ir.pass_manager(mod.context) pm.enable_debug() + device_arch = properties["device_arch"] passes.ttir.add_convert_to_ttgpuir(pm, f"xpu:{device_arch}", opt.num_warps, opt.threads_per_warp, opt.num_ctas) # optimize TTGIR @@ -236,9 +238,7 @@ def make_spv(src, metadata): def add_stages(self, stages, options): stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options) - stages["ttgir"] = lambda src, metadata: self.make_ttgir( - src, metadata, options, self.device_arch, self.properties["support_cl_sg_2d_block_io"], self.properties[ - "support_cl_sg_matmul_acc"]) + stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options, self.properties) stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options) stages["spv"] = lambda src, metadata: self.make_spv(src, metadata) From 7aab3de5d36271eec9f43da3f31a192d35d52bb0 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Mon, 8 Jul 2024 20:27:55 +0000 Subject: [PATCH 5/5] address review comments Signed-off-by: Whitney Tsang --- .../Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td | 4 ++-- .../intel/include/TritonIntelGPUToLLVM/TypeConverter.h | 3 ++- .../intel/lib/TritonIntelGPUToLLVM/PipelineManager.h | 10 +++++----- .../intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp | 8 ++++---- .../intel/lib/TritonIntelGPUToLLVM/TypeConverter.cpp | 6 +++--- 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td b/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td index 5bdf6249fa..0ffdfb81ed 100644 --- a/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td +++ b/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td @@ -19,8 +19,8 @@ def TritonIntelGPU_Dialect : Dialect { ]; let extraClassDeclaration = [{ - static std::string getSupportSG2DBlockAttrName() { return "triton_intel_gpu.support_sg_2d_block"; } - static std::string getSupportDPASAttrName() { return "triton_intel_gpu.support_dpas"; } + static llvm::StringRef getSupportSG2DBlockAttrName() { return "triton_intel_gpu.support_sg_2d_block"; } + static llvm::StringRef getSupportDPASAttrName() { return "triton_intel_gpu.support_dpas"; } }]; let useDefaultAttributePrinterParser = 1; diff --git a/third_party/intel/include/TritonIntelGPUToLLVM/TypeConverter.h b/third_party/intel/include/TritonIntelGPUToLLVM/TypeConverter.h index 8cd4c5b2c6..a19dfd0671 100644 --- a/third_party/intel/include/TritonIntelGPUToLLVM/TypeConverter.h +++ b/third_party/intel/include/TritonIntelGPUToLLVM/TypeConverter.h @@ -18,7 +18,8 @@ class TritonIntelGPUToLLVMTypeConverter : public TritonGPUToLLVMTypeConverter { using TypeConverter::convertType; TritonIntelGPUToLLVMTypeConverter( - MLIRContext *ctx, LowerToLLVMOptions &option, bool isBlockPtrPathEnabled, + MLIRContext *ctx, LowerToLLVMOptions &option, + bool isExperimentalPathEnabled, const DataLayoutAnalysis *analysis = nullptr); }; diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h index 59b004d1e7..df7f4b1054 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h @@ -180,7 +180,7 @@ class TritonGPUToLLVMPipelineManager { public: TritonGPUToLLVMPipelineManager(ModuleOp &mod, MLIRContext *ctx) : mod(mod), ctx(ctx), - blockPtrPathIsEnabled( + isExperimentalPathEnabled( mod->hasAttr(gpu::intel::TritonIntelGPUDialect:: getSupportSG2DBlockAttrName()) && mod->hasAttr( @@ -189,7 +189,7 @@ class TritonGPUToLLVMPipelineManager { /// FIXME: remove once the block ptr conversion path is capable of handling /// shared memory. - bool skipSharedMemoryAllocation() const { return blockPtrPathIsEnabled; } + bool skipSharedMemoryAllocation() const { return isExperimentalPathEnabled; } /// Populate the conversion pipeline for function operations. void populateFunctionConversionPatterns( @@ -197,7 +197,7 @@ class TritonGPUToLLVMPipelineManager { TritonIntelGPUToLLVMTypeConverter &typeConverter, int numWarps) const { funcPatterns.add(typeConverter, numWarps, /*benefit=*/1); - if (!blockPtrPathIsEnabled) + if (!isExperimentalPathEnabled) mlir::cf::populateControlFlowToLLVMConversionPatterns(typeConverter, funcPatterns); } @@ -216,7 +216,7 @@ class TritonGPUToLLVMPipelineManager { patterns.add(&typeConverter.getContext(), patternBenefitAddSPIRVEnv); - if (blockPtrPathIsEnabled) { + if (isExperimentalPathEnabled) { intel::populateTritonOpsToLLVMPatterns(typeConverter, patterns, benefit); intel::populateControlFlowOpToLLVMPattern(typeConverter, patterns, benefit); @@ -270,7 +270,7 @@ class TritonGPUToLLVMPipelineManager { /// Selects which conversion pipeline to use. /// FIXME: this is temporary and should be removed once we have an analysis to /// determine whether a kernel uses block pointers. - bool blockPtrPathIsEnabled = false; + bool isExperimentalPathEnabled = false; }; } // namespace mlir::triton::intel diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp index a7658f08e3..2e8866740e 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp @@ -79,14 +79,14 @@ struct ConvertTritonGPUToLLVM intel::TritonGPUToLLVMPipelineManager pipelineManager(mod, context); mlir::LowerToLLVMOptions option(context); option.overrideIndexBitwidth(32); - bool isBlockPtrPathEnabled = + bool isExperimentalPathEnabled = mod->hasAttr(triton::gpu::intel::TritonIntelGPUDialect:: getSupportSG2DBlockAttrName()) && mod->hasAttr(triton::gpu::intel::TritonIntelGPUDialect:: getSupportDPASAttrName()) && mlir::triton::tools::getBoolEnv("TRITON_INTEL_ENABLE_BLOCK_PTR"); TritonIntelGPUToLLVMTypeConverter typeConverter(context, option, - isBlockPtrPathEnabled); + isExperimentalPathEnabled); TritonLLVMConversionTarget convTarget(*context); int numWarps = triton::gpu::TritonGPUDialect::getNumWarps(mod); int numCTAs = triton::gpu::TritonGPUDialect::getNumCTAs(mod); @@ -102,8 +102,8 @@ struct ConvertTritonGPUToLLVM // Lower functions { mlir::LowerToLLVMOptions option(context); - TritonIntelGPUToLLVMTypeConverter typeConverter(context, option, - isBlockPtrPathEnabled); + TritonIntelGPUToLLVMTypeConverter typeConverter( + context, option, isExperimentalPathEnabled); TritonLLVMFunctionConversionTarget funcTarget(*context); RewritePatternSet funcPatterns(context); pipelineManager.populateFunctionConversionPatterns( diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TypeConverter.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TypeConverter.cpp index e11e3579d4..9d898d20be 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/TypeConverter.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TypeConverter.cpp @@ -10,12 +10,12 @@ #include "triton/Tools/Sys/GetEnv.hpp" TritonIntelGPUToLLVMTypeConverter::TritonIntelGPUToLLVMTypeConverter( - MLIRContext *ctx, LowerToLLVMOptions &option, bool isBlockPtrPathEnabled, - const DataLayoutAnalysis *analysis) + MLIRContext *ctx, LowerToLLVMOptions &option, + bool isExperimentalPathEnabled, const DataLayoutAnalysis *analysis) : TritonGPUToLLVMTypeConverter(ctx, option, analysis) { // Augment/overwrite type conversions required for the Intel conversion // passes. - if (isBlockPtrPathEnabled) { + if (isExperimentalPathEnabled) { // tt::pointer to v2i32. addConversion([&](PointerType type) -> std::optional { if (isa(type.getPointeeType())) {