diff --git a/test/Conversion/intel/arith_to_llvm.mlir b/test/Conversion/intel/arith_to_llvm.mlir
index d2f7d42181..4bb9df30ad 100644
--- a/test/Conversion/intel/arith_to_llvm.mlir
+++ b/test/Conversion/intel/arith_to_llvm.mlir
@@ -12,7 +12,7 @@
 // CHECK-LABEL:   llvm.func spir_kernelcc @float_to_bfloat_conversion(
 // CHECK-SCALAR:                                             %[[VAL_0:.*]]: !llvm.struct<(f32, f32, f32, f32)>) -> !llvm.struct<(bf16, bf16, bf16, bf16)>
 // CHECK-VECTOR:                                             %[[VAL_0:.*]]: vector<32xf32>) -> vector<32xbf16>
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
+module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
   tt.func @float_to_bfloat_conversion(%arg0 : tensor<512xf32, #blocked>) ->  tensor<512xbf16, #blocked>{
 // CHECK-SCALAR:    %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f32, f32, f32, f32)>
 // CHECK-SCALAR:    %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_0]][1] : !llvm.struct<(f32, f32, f32, f32)>
diff --git a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir
index 44c0fc563f..985b247a94 100644
--- a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir
+++ b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr.mlir
@@ -1,6 +1,6 @@
 // RUN: TRITON_INTEL_ENABLE_BLOCK_PTR=1 triton-opt %s --convert-triton-intel-gpu-to-llvm --split-input-file | FileCheck %s
 
-module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 1 : i32} {
+module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 1 : i32} {
   // CHECK-DAG: llvm.func spir_funccc @_Z38intel_sub_group_f16_f16_matrix_mad_k16Dv8_sDv8_iDv8_f(vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32> attributes {passthrough = ["convergent"]}
   // CHECK-DAG: llvm.func spir_funccc @_Z42intel_sub_group_2d_block_read_16b_32r16x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {passthrough = ["nounwind"]}
   // CHECK-DAG: llvm.func spir_funccc @_Z52intel_sub_group_2d_block_read_transform_16b_32r16x2cPU3AS1viiiDv2_iPj(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {passthrough = ["nounwind"]}
@@ -112,7 +112,7 @@ module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-wa
 
 // COM: Checks the correct lowering of the A operand load for TF32, i.e. using 4xi32 and vnni=false.
 
-module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL: llvm.func spir_kernelcc @matmul_kernel_with_block_pointers_tf32(
   // CHECK-SAME:                                                                  [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} {
   tt.func public @matmul_kernel_with_block_pointers_tf32(%arg0: !tt.ptr<f32>) {
@@ -144,7 +144,7 @@ module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-wa
 
 // COM: Checks the correct lowering of a 16-bit 2D-block-store.
 
-module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL: llvm.func spir_kernelcc @matmul_kernel_with_block_pointers_f16accu(
   // CHECK-SAME:                                                                     [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} {
   tt.func public @matmul_kernel_with_block_pointers_f16accu(%arg0: !tt.ptr<f16>) {
diff --git a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir
index dfecfda3f1..72ff792237 100644
--- a/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir
+++ b/test/Conversion/intel/tritongpu_to_llvm_intel_block_ptr_invalid.mlir
@@ -1,6 +1,6 @@
 // RUN: TRITON_INTEL_ENABLE_BLOCK_PTR=1 triton-opt %s --convert-triton-intel-gpu-to-llvm --verify-diagnostics --split-input-file
 
-module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
   tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f32>, %arg1: i64, %arg2: i32) {
     %c1_i64 = arith.constant 1 : i64
     %c0_i32 = arith.constant 0 : i32
@@ -14,7 +14,7 @@ module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-wa
 
 // -----
 
-module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
   tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f32>, %arg1: i64, %arg2: i32) {
     %c1_i64 = arith.constant 1 : i64
     %c0_i32 = arith.constant 0 : i32
@@ -28,7 +28,7 @@ module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-wa
 
 // -----
 
-module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
   tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f32>, %arg1: i64, %arg2: i32) {
     %c1_i64 = arith.constant 1 : i64
     %c0_i32 = arith.constant 0 : i32
diff --git a/test/TritonIntelGPU/accelerate-matmul-ats.mlir b/test/TritonIntelGPU/accelerate-matmul-ats.mlir
index bd31aab76f..65327ab3fe 100644
--- a/test/TritonIntelGPU/accelerate-matmul-ats.mlir
+++ b/test/TritonIntelGPU/accelerate-matmul-ats.mlir
@@ -1,10 +1,10 @@
 // RUN: triton-opt %s -split-input-file --tritonintelgpu-accelerate-matmul | FileCheck %s
 
-// CHECK-NOT: dpas
+// CHECK-NOT: triton_intel_gpu.dpas
 #blocked = #triton_gpu.blocked<{sizePerThread = [8, 4], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
 #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
 #blocked2 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
-module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_dpas"} {
   // CHECK: mma_chain_loop
   tt.func public @mma_chain_loop(
    %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>,
@@ -39,11 +39,11 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-
 
 // -----
 
-// CHECK-NOT: dpas
+// CHECK-NOT: triton_intel_gpu.dpas
 #blocked = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
 #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [1, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
 #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
-module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_dpas"} {
   // CHECK: chained_dot
   tt.func public @chained_dot(
     %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>,
@@ -67,7 +67,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-
 #blocked = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [4, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
 #blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [1, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
 #blocked2 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [2, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
-module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32} {
+module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_intel_gpu.support_dpas"} {
   // CHECK: mma_chain_loop_ats
   tt.func public @mma_chain_loop_ats(
    %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>,
@@ -112,7 +112,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-
 #blocked = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}>
 #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [1, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
 #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}>
-module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32} {
+module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.ATS", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_intel_gpu.support_dpas"} {
   // CHECK: chained_dot
   tt.func public @chained_dot(
     %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>,
diff --git a/test/TritonIntelGPU/accelerate-matmul-pvc.mlir b/test/TritonIntelGPU/accelerate-matmul-pvc.mlir
index 8fb069b2db..6c49636a96 100644
--- a/test/TritonIntelGPU/accelerate-matmul-pvc.mlir
+++ b/test/TritonIntelGPU/accelerate-matmul-pvc.mlir
@@ -4,7 +4,7 @@
 #blocked = #triton_gpu.blocked<{sizePerThread = [8, 4], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
 #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
 #blocked2 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
-module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_dpas"} {
   // CHECK: mma_chain_loop
   tt.func public @mma_chain_loop(
    %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>,
@@ -49,7 +49,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-
 #blocked = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
 #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 8], threadsPerWarp = [1, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
 #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [2, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
-module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_dpas"} {
   // CHECK: chained_dot
   tt.func public @chained_dot(
     %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>,
@@ -71,11 +71,11 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-
 
 // -----
 
-// CHECK-NOT: dpas
+// CHECK-NOT: triton_intel_gpu.dpas
 #blocked = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [4, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
 #blocked1 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [1, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
 #blocked2 = #triton_gpu.blocked<{sizePerThread = [8, 8], threadsPerWarp = [2, 4], warpsPerCTA = [4, 1], order = [1, 0]}>
-module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32} {
+module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_intel_gpu.support_dpas"} {
   // CHECK: mma_chain_loop_ats
   tt.func public @mma_chain_loop_ats(
    %170: tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>,
@@ -110,11 +110,11 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-
 
 // -----
 
-// CHECK-NOT: dpas
+// CHECK-NOT: triton_intel_gpu.dpas
 #blocked = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}>
 #blocked1 = #triton_gpu.blocked<{sizePerThread = [4, 16], threadsPerWarp = [1, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
 #blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 32], threadsPerWarp = [2, 4], warpsPerCTA = [8, 1], order = [1, 0]}>
-module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32} {
+module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 8 : i32, "triton_intel_gpu.support_dpas"} {
   // CHECK: chained_dot
   tt.func public @chained_dot(
     %arg0: tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #blocked}>>,
diff --git a/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir b/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir
index d155007299..d6f43af96d 100644
--- a/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir
+++ b/test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir
@@ -11,7 +11,7 @@
 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
 #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}>
 #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
-module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
   tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i64, %arg7: i32, %arg8: i64) {
     %c8_i32 = arith.constant 8 : i32
     %c64_i32 = arith.constant 64 : i32
@@ -85,7 +85,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war
 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
 #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}>
 #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
   tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
     %c8_i32 = arith.constant 8 : i32
     %c64_i32 = arith.constant 64 : i32
@@ -153,7 +153,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
 #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}>
 #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
   tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg13: !tt.ptr<f16>, %arg14: !tt.ptr<f32>) {
     %c8_i32 = arith.constant 8 : i32
     %c64_i32 = arith.constant 64 : i32
@@ -233,7 +233,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
 #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}>
 #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
-module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
   tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
     %c1_i64 = arith.constant 1 : i64
     %c0_i32 = arith.constant 0 : i32
diff --git a/test/TritonIntelGPU/loop-pipeline.mlir b/test/TritonIntelGPU/loop-pipeline.mlir
index f4fbd24b1d..aaab478dbf 100644
--- a/test/TritonIntelGPU/loop-pipeline.mlir
+++ b/test/TritonIntelGPU/loop-pipeline.mlir
@@ -9,7 +9,7 @@
 #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}>
 #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
 
-module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
   tt.func public @matmul_kernel(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
     // CHECK-LABEL:   tt.func public @matmul_kernel
     %c8_i32 = arith.constant 8 : i32
@@ -127,7 +127,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-
 #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}>
 #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
 
-module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
   tt.func public @matmul_kernel(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}) {
     // CHECK-LABEL:   tt.func public @matmul_kernel
     %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #dpas>
diff --git a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
index 28b3c77e46..4892038fa2 100644
--- a/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
+++ b/test/TritonIntelGPU/rewrite-tensor-pointer.mlir
@@ -9,7 +9,7 @@
 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
 #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}>
 #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
-module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
   tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) {
     // CHECK:  @matmul_kernel_with_block_pointers
     %c4_i32 = arith.constant 4 : i32
@@ -81,7 +81,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-
 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
 #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}>
 #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
-module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
   tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32},
                                                     %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32},
                                                     %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32},
@@ -152,7 +152,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-
 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [16, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
 #dot0 = #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}>
 #dot1 = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth=2}>
-module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-warps" = 64 : i32, "triton_gpu.threads-per-warp" = 16 : i32, "triton_intel_gpu.support_sg_2d_block"} {
   tt.func public @matmul_kernel_with_block_pointers_indivisible(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32, %arg7: i32, %arg8: i32 {tt.divisibility = 16 : i32}) {
     // CHECK:  @matmul_kernel_with_block_pointers_indivisible
     %c4_i32 = arith.constant 4 : i32
@@ -214,7 +214,7 @@ module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_gpu.num-
 
 // COM: Case 3:
 // COM: Check that operations using block pointers without a layout attribute are rewritten to use a legacy pointer.
-module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC"} {
+module attributes {"triton_gpu.target" = "xpu:DEVICE_ARCH.PVC", "triton_intel_gpu.support_sg_2d_block"} {
   tt.func public @matmul_kernel(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
     %c31_i32 = arith.constant 31 : i32
     %c127_i32 = arith.constant 127 : i32
diff --git a/test/TritonIntelGPU/triton_annotate_module.mlir b/test/TritonIntelGPU/triton_annotate_module.mlir
index 346f7db573..2c6758d516 100644
--- a/test/TritonIntelGPU/triton_annotate_module.mlir
+++ b/test/TritonIntelGPU/triton_annotate_module.mlir
@@ -1,8 +1,8 @@
-// RUN: triton-opt %s --split-input-file -triton-annotate-module='target=xpu:DEVICE_ARCH.PVC threads-per-warp=32' | FileCheck %s
+// RUN: triton-opt %s --split-input-file -triton-annotate-module='target=xpu:DEVICE_ARCH.PVC support-sg-2d-block=true support-dpas=true threads-per-warp=32' | FileCheck %s
 
 module {
   // COM: Ensure that the 'threads-per-warp' attribute is set according to the option.
-  // CHECK: module attributes {triton_gpu.target = "xpu:DEVICE_ARCH.PVC", "triton_gpu.threads-per-warp" = 32 : i32}
+  // CHECK: module attributes {triton_gpu.target = "xpu:DEVICE_ARCH.PVC", "triton_gpu.threads-per-warp" = 32 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block}
   tt.func @kernel() {
     tt.return
   }
@@ -13,7 +13,7 @@ module {
 module {
   // COM: Ensure that the 'threads-per-warp' attribute is overwritten when the kernel contains a 'tt.dot'
   //      operation that can be lowered to DPAS instructions.
-  // CHECK: module attributes {triton_gpu.target = "xpu:DEVICE_ARCH.PVC", "triton_gpu.threads-per-warp" = 16 : i32}
+  // CHECK: module attributes {triton_gpu.target = "xpu:DEVICE_ARCH.PVC", "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block}
   tt.func @kernel() {
     %a = arith.constant dense<1.00e+00> : tensor<128x32xf16>
     %b = arith.constant dense<2.00e+00> : tensor<32x128xf16>
diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
index 9b747208fc..46e3049e2c 100644
--- a/third_party/intel/backend/compiler.py
+++ b/third_party/intel/backend/compiler.py
@@ -98,7 +98,6 @@ def __init__(self, target: tuple) -> None:
         mod = compile_module_from_src(Path(os.path.join(dirname, "arch_parser.c")).read_text(), "arch_utils")
         self.parse_device_arch = mod.parse_device_arch
         self.properties = self.parse_target(target.arch)
-        self.device_arch = self.properties["device_arch"]
         self.binary_ext = "spv"
 
     def parse_target(self, tgt_prop) -> dict:
@@ -106,7 +105,6 @@ def parse_target(self, tgt_prop) -> dict:
         dev_prop['name'] = tgt_prop.get('name', 'xpu')
         dev_prop['platform_name'] = tgt_prop.get('platform_name', None)
         dev_prop['vendor'] = tgt_prop.get('vendor', None)
-        dev_prop['driver_version'] = tgt_prop.get('driver_version', None)
         dev_prop['version'] = tgt_prop.get('version', None)
         dev_prop['gpu_eu_count'] = tgt_prop.get('gpu_eu_count', None)
         dev_prop['gpu_subslice_count'] = tgt_prop.get('gpu_subslice_count', None)
@@ -152,19 +150,20 @@ def make_ttir(mod, metadata, opt):
         return mod
 
     @staticmethod
-    def make_ttgir(mod, metadata, opt, device_arch):
+    def make_ttgir(mod, metadata, opt, properties):
         cluster_info = intel.ClusterInfo()
         if opt.cluster_dims is not None:
             cluster_info.clusterDimX = opt.cluster_dims[0]
             cluster_info.clusterDimY = opt.cluster_dims[1]
             cluster_info.clusterDimZ = opt.cluster_dims[2]
 
-        is_lts = Version(metadata["target"].arch["driver_version"]) == Version("1.3.27642")
-
         # Annotate module with information required by subsequent transformations.
         pm = ir.pass_manager(mod.context)
         pm.enable_debug()
-        intel.passes.ttgpuir.add_triton_annotate_module(pm, f"xpu:{device_arch}", is_lts, opt.threads_per_warp)
+        device_arch = properties["device_arch"]
+        intel.passes.ttgpuir.add_triton_annotate_module(pm, f"xpu:{device_arch}",
+                                                        properties["support_cl_sg_2d_block_io"],
+                                                        properties["support_cl_sg_matmul_acc"], opt.threads_per_warp)
         pm.run(mod)
 
         # Overwrite the threads_per_warp option with the module annotation.
@@ -174,7 +173,8 @@ def make_ttgir(mod, metadata, opt, device_arch):
         pm = ir.pass_manager(mod.context)
         pm.enable_debug()
 
-        if (not is_lts and os.getenv("TRITON_INTEL_ENABLE_BLOCK_PTR", "0") == "1"):
+        if (properties["support_cl_sg_2d_block_io"] and properties["support_cl_sg_matmul_acc"]
+                and os.getenv("TRITON_INTEL_ENABLE_BLOCK_PTR", "0") == "1"):
             return XPUBackend.AdvancedPath.make_ttgir(mod, metadata, opt)
 
         passes.ttir.add_convert_to_ttgpuir(pm, f"xpu:{device_arch}", opt.num_warps, opt.threads_per_warp, opt.num_ctas)
@@ -250,7 +250,7 @@ def make_spv(src, metadata):
 
     def add_stages(self, stages, options):
         stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options)
-        stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options, self.device_arch)
+        stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options, self.properties)
         stages["llir"] = lambda src, metadata: self.make_llir(src, metadata, options)
         stages["spv"] = lambda src, metadata: self.make_spv(src, metadata)
 
diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td b/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td
index 75079f88b6..4f26265afe 100644
--- a/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td
+++ b/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td
@@ -24,9 +24,16 @@ def TritonIntelGPU_Dialect : Dialect {
       return ::llvm::StringLiteral("triton_gpu.target");
     }
 
-    /// Get the name of the attribute used to indicate the LTS driver.
-    static ::llvm::StringLiteral getLTSAttrName() {
-      return ::llvm::StringLiteral("triton_gpu.is_lts");
+    /// Get the name of the module attribute whose presence indicates that
+    /// subgroup 2D block operations (e.g., 2D block read/write) are available.
+    static ::llvm::StringLiteral getSupportSG2DBlockAttrName() {
+      return ::llvm::StringLiteral("triton_intel_gpu.support_sg_2d_block");
+    }
+
+    /// Get the name of the module attribute whose presence indicates that
+    /// the DPAS instruction is available.
+    static ::llvm::StringLiteral getSupportDPASAttrName() {
+      return ::llvm::StringLiteral("triton_intel_gpu.support_dpas");
     }
   }];
 
diff --git a/third_party/intel/include/TritonAnnotateModule/Passes.td b/third_party/intel/include/TritonAnnotateModule/Passes.td
index 0bf5a98493..f5df31b4b4 100644
--- a/third_party/intel/include/TritonAnnotateModule/Passes.td
+++ b/third_party/intel/include/TritonAnnotateModule/Passes.td
@@ -28,8 +28,10 @@ def TritonAnnotateModule: Pass<"triton-annotate-module", "mlir::ModuleOp"> {
     Option<"target", "target",
            "std::string", /*default*/"\"\"",
            "the GPU target, e.g., PVC">,
-    Option<"isLTS", "is-lts", "bool", /*default*/"false",
-           "whether we are using an LTS driver">,
+    Option<"supportSG2DBlock", "support-sg-2d-block", "bool", /*default*/"false",
+           "whether subgroup 2D block operations (e.g., 2D block read/write) are available">,
+    Option<"supportDPAS", "support-dpas", "bool", /*default*/"false",
+           "whether DPAS instruction is available">,
     Option<"threadsPerWarp", "threads-per-warp",
            "unsigned", /*default*/"32",
            "number of threads per warp (aka subgroup size)">,
diff --git a/third_party/intel/lib/Analysis/DPAS.cpp b/third_party/intel/lib/Analysis/DPAS.cpp
index 22f5a95b8a..b844ee3076 100644
--- a/third_party/intel/lib/Analysis/DPAS.cpp
+++ b/third_party/intel/lib/Analysis/DPAS.cpp
@@ -10,7 +10,8 @@ DPASAnalysis::DPASAnalysis(Operation *root) {
     mod = root->getParentOfType<ModuleOp>();
 
   DeviceArch arch = getDeviceArch(mod);
-  bool isLTS = mod->hasAttr(TritonIntelGPUDialect::getLTSAttrName());
+  bool supportDPAS =
+      mod->hasAttr(TritonIntelGPUDialect::getSupportDPASAttrName());
 
   // Populate the maps.
   mod.walk([&](FunctionOpInterface funcOp) {
@@ -22,9 +23,10 @@ DPASAnalysis::DPASAnalysis(Operation *root) {
       else
         funcToDotMap[funcOp] = {dotOp};
 
-      DPASEngineType dpasEngineType = (isLTS || arch == DeviceArch::UNKNOWN)
-                                          ? DPASEngineType::NOT_APPLICABLE
-                                          : DPASAnalysis::getDPASType(dotOp);
+      DPASEngineType dpasEngineType =
+          (!supportDPAS || arch == DeviceArch::UNKNOWN)
+              ? DPASEngineType::NOT_APPLICABLE
+              : DPASAnalysis::getDPASType(dotOp);
       dotToDPASEngineMap[dotOp] = dpasEngineType;
 
       // Only PVC supports TF32.
diff --git a/third_party/intel/lib/TritonAnnotateModule/TritonAnnotateModule.cpp b/third_party/intel/lib/TritonAnnotateModule/TritonAnnotateModule.cpp
index acb70e882c..93a11fa769 100644
--- a/third_party/intel/lib/TritonAnnotateModule/TritonAnnotateModule.cpp
+++ b/third_party/intel/lib/TritonAnnotateModule/TritonAnnotateModule.cpp
@@ -28,10 +28,11 @@ struct TritonAnnotateModule
     mod->setAttr(intel::TritonIntelGPUDialect::getTargetAttrName(),
                  builder.getStringAttr(target.getValue()));
 
-    // FIXME: Use SYCL runtime to query supported OpenCL extensions, instead
-    // of checking driver version.
-    if (isLTS)
-      mod->setAttr(intel::TritonIntelGPUDialect::getLTSAttrName(),
+    if (supportSG2DBlock)
+      mod->setAttr(intel::TritonIntelGPUDialect::getSupportSG2DBlockAttrName(),
+                   builder.getUnitAttr());
+    if (supportDPAS)
+      mod->setAttr(intel::TritonIntelGPUDialect::getSupportDPASAttrName(),
                    builder.getUnitAttr());
 
     DPASAnalysis &dpasAnalysis = getAnalysis<DPASAnalysis>();
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h
index b2cd02a2d6..1f0dcc0915 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h
@@ -181,7 +181,10 @@ class TritonGPUToLLVMPipelineManager {
   TritonGPUToLLVMPipelineManager(ModuleOp &mod, MLIRContext *ctx)
       : mod(mod), ctx(ctx),
         isAdvancedPathEnabled(
-            !mod->hasAttr("triton_gpu.is_lts") &&
+            mod->hasAttr(gpu::intel::TritonIntelGPUDialect::
+                             getSupportSG2DBlockAttrName()) &&
+            mod->hasAttr(
+                gpu::intel::TritonIntelGPUDialect::getSupportDPASAttrName()) &&
             mlir::triton::tools::getBoolEnv("TRITON_INTEL_ENABLE_BLOCK_PTR")) {}
 
   /// FIXME: remove once the block ptr conversion path is capable of handling
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp
index 6257bf1442..d22ad3ec51 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp
@@ -80,7 +80,10 @@ struct ConvertTritonGPUToLLVM
     mlir::LowerToLLVMOptions option(context);
     option.overrideIndexBitwidth(32);
     bool isAdvancedPathEnabled =
-        !mod->hasAttr("triton_gpu.is_lts") &&
+        mod->hasAttr(triton::gpu::intel::TritonIntelGPUDialect::
+                         getSupportSG2DBlockAttrName()) &&
+        mod->hasAttr(triton::gpu::intel::TritonIntelGPUDialect::
+                         getSupportDPASAttrName()) &&
         mlir::triton::tools::getBoolEnv("TRITON_INTEL_ENABLE_BLOCK_PTR");
     TritonIntelGPUToLLVMTypeConverter typeConverter(context, option,
                                                     isAdvancedPathEnabled);
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp
index 46580796b1..27694844e0 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp
@@ -66,7 +66,7 @@ struct IntelGPUPipelinePass
   void runOnOperation() override {
     ModuleOp m = getOperation();
 
-    if (m->hasAttr("triton_gpu.is_lts"))
+    if (!m->hasAttr(ttgi::TritonIntelGPUDialect::getSupportSG2DBlockAttrName()))
       return;
 
     auto deviceArch = ttgi::getDeviceArch(m);
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp
index 856650c9a5..2a781589c4 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp
@@ -795,7 +795,8 @@ void LayoutPropagation::rewriteAssertOp(AssertOp assertOp) {
 
 bool LayoutPropagation::rewriteStoreOp(StoreOp storeOp) {
-  // Disable 2D block store on LTS.
+  // Disable 2D block store if subgroup 2D block operations are unavailable.
-  if (storeOp->getParentOfType<ModuleOp>()->hasAttr("triton_gpu.is_lts"))
+  if (!storeOp->getParentOfType<ModuleOp>()->hasAttr(
+          ttgi::TritonIntelGPUDialect::getSupportSG2DBlockAttrName()))
     return false;
 
   // If storeOp is a pointer to a tensor, we try to find out if the
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
index 7b6e6f3e58..9e7cefdf4e 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteTensorPointer.cpp
@@ -2,6 +2,7 @@
 #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPUPass.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 
+#include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
 #include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h"
 #include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h"
 
@@ -59,7 +60,8 @@ bool isDivisible(Value value, unsigned divisor) {
 ///   - the tensor pointer is not contiguous on memory
 bool shouldRemove(tt::MakeTensorPtrOp &op, ttgi::DeviceArch deviceArch,
                   bool isUsedByStoreOp) {
-  if (op->getParentOfType<ModuleOp>()->hasAttr("triton_gpu.is_lts"))
+  if (!op->getParentOfType<ModuleOp>()->hasAttr(
+          ttgi::TritonIntelGPUDialect::getSupportSG2DBlockAttrName()))
     return true;
 
   // Non-PVC device should always remove the tensor pointer
diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc
index 6a0ac47f6c..6aaa1b2221 100644
--- a/third_party/intel/triton_xpu.cc
+++ b/third_party/intel/triton_xpu.cc
@@ -33,10 +33,11 @@ using ret = py::return_value_policy;
   m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1) {                  \
     pm.addPass(builder({val0, val1}));                                         \
   })
-#define ADD_PASS_WRAPPER_OPT_3(name, builder, ty0, ty1, ty2)                   \
-  m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2) {        \
-    pm.addPass(builder({val0, val1, val2}));                                   \
-  })
+#define ADD_PASS_WRAPPER_OPT_4(name, builder, ty0, ty1, ty2, ty3)              \
+  m.def(name,                                                                  \
+        [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2, ty3 val3) {    \
+          pm.addPass(builder({val0, val1, val2, val3}));                       \
+        })
 
 static uint32_t findKernels(llvm::Module &M,
                             std::set<llvm::Function *> &functions) {
@@ -83,9 +84,9 @@ void init_triton_intel_passes_ttgpuir(py::module &&m) {
                      gpu::intel::createTritonIntelGPUDistributeToWarps);
   ADD_PASS_WRAPPER_0("add_match_target_size",
                      gpu::intel::createTritonIntelGPUMatchTargetSize);
-  ADD_PASS_WRAPPER_OPT_3("add_triton_annotate_module",
+  ADD_PASS_WRAPPER_OPT_4("add_triton_annotate_module",
                          gpu::intel::createTritonAnnotateModule,
-                         const std::string &, bool, unsigned);
+                         const std::string &, bool, bool, unsigned);
 }
 
 void init_triton_intel(py::module &&m) {