# Patch summary (leading commentary; everything from the first "diff --git" header on is the unmodified patch text).
#
# Purpose: give the `triton_intel_gpu.prefetch` op (TTIG_PrefetchOp) an optional `mask` operand.
#
# Files touched:
#   * third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUOps.td
#       - adds `Optional...:$mask` after `$ptr` in the op arguments;
#       - adds a TypesMatchWith trait ("mask type matches ptr type") that derives the expected mask type
#         via `getI1SameShape(getPointeeType($_self))`; the comparison is short-circuited to true when
#         the op has at most one operand, i.e. when no mask was supplied;
#       - changes the assembly format from `operands attr-dict : type($ptr)` to
#         `$ptr (, $mask^)? attr-dict : type($ptr)`, so the mask is printed/parsed only when present and
#         its type is not spelled out in the textual form;
#       - declares an extra builder taking (ptr, cache, evict, isVolatile) without a mask.
#   * third_party/intel/lib/Dialect/TritonIntelGPU/IR/Ops.cpp
#       - defines that builder; it forwards to the mask-taking builder with an empty `/*mask=*/{}` value.
#   * test/TritonIntelGPU/tritonintelgpu.mlir
#       - round-trip test for prefetch both without and with a `tensor<2x32xi1>` mask.
#   * test/TritonIntelGPU/tritonintelgpu-invalid.mlir
#       - negative test: a `tensor<4x32xi1>` mask is rejected because `tensor<2x32xi1>` is expected.
#
# NOTE(review): in this copy the patch's line breaks are collapsed, and some angle-bracketed type
# arguments look like they were stripped (e.g. `!tt.ptr>`, `Optional:$mask`, `getParentOfType()`),
# presumably by text extraction -- confirm against the original patch before trying to apply it.
# NOTE(review): the last hunk ends inside `SubGroupTransposeOp::verify()`; that is trailing hunk
# context only, the function itself is not changed by this patch.
#
diff --git a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir index 8f209712ff..477d403c9d 100644 --- a/test/TritonIntelGPU/tritonintelgpu-invalid.mlir +++ b/test/TritonIntelGPU/tritonintelgpu-invalid.mlir @@ -130,6 +130,15 @@ tt.func @triton_intel_gpu.extract(%ptr : !tt.ptr>) { // ----- +tt.func @triton_intel_gpu.prefetch(%arg0: !tt.ptr>, %arg1: tensor<4x32xi1>) { + // expected-note@-1 {{prior use here}} + // expected-error@+1 {{use of value '%arg1' expects different type than prior uses: 'tensor<2x32xi1>' vs 'tensor<4x32xi1>'}} + triton_intel_gpu.prefetch %arg0, %arg1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr> + tt.return +} + +// ----- + #warp = #triton_intel_gpu.warp<{sizePerThread = [16, 64], threadsPerWarp = [1, 1], order = [1, 0]}> module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32, triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} { diff --git a/test/TritonIntelGPU/tritonintelgpu.mlir b/test/TritonIntelGPU/tritonintelgpu.mlir index 3d486780b2..6311789e49 100644 --- a/test/TritonIntelGPU/tritonintelgpu.mlir +++ b/test/TritonIntelGPU/tritonintelgpu.mlir @@ -50,6 +50,17 @@ tt.func @simplify_scf_for(%arg0: tensor<16x8xf16>, %arg1: tensor<16x8xf16>, %arg // ----- +tt.func @triton_intel_gpu.prefetch(%arg0: !tt.ptr>, %arg1: tensor<2x32xi1>) { + // CHECK-LABEL: @triton_intel_gpu.prefetch + // CHECK: triton_intel_gpu.prefetch %arg0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr> + triton_intel_gpu.prefetch %arg0 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr> + // CHECK: triton_intel_gpu.prefetch %arg0, %arg1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr> + triton_intel_gpu.prefetch %arg0, %arg1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : !tt.ptr> + tt.return +} + +// ----- + module attributes {"ttg.num-ctas" = 1 : 
i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32, triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} { tt.func @triton_intel_gpu.sub_group_transpose(%local_buffer : !tt.ptr, %src : tensor<16x16xf16>) -> tensor<16x16xf16> { // CHECK-LABEL: @triton_intel_gpu.sub_group_transpose diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUOps.td b/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUOps.td index 85d33ed5c5..cbc3e63d13 100644 --- a/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUOps.td +++ b/third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUOps.td @@ -107,7 +107,10 @@ def TTIG_ExtractOp : TTIG_Op<"extract", [Pure]> { let hasFolder = 1; } -def TTIG_PrefetchOp : TTIG_Op<"prefetch"> { +def TTIG_PrefetchOp : TTIG_Op<"prefetch", [ + TypesMatchWith<"mask type matches ptr type", "ptr", "mask", "getI1SameShape(getPointeeType($_self))", + "($_op.getOperands().size() <= 1) || std::equal_to<>()">, +]> { let summary = "Tensor prefetch operation"; let description = [{ The `prefetch` operation prefetches an input tensor. @@ -117,11 +120,20 @@ def TTIG_PrefetchOp : TTIG_Op<"prefetch"> { : !tt.ptr ``` }]; - let arguments = (ins AnyTypeOf<[TT_PtrLike, TT_TensorPtr]>:$ptr, TT_CacheModifierAttr:$cache, - TT_EvictionPolicyAttr:$evict, BoolAttr:$isVolatile); + let arguments = ( + ins AnyTypeOf<[TT_PtrLike, TT_TensorPtr]>:$ptr, + Optional:$mask, + TT_CacheModifierAttr:$cache, + TT_EvictionPolicyAttr:$evict, + BoolAttr:$isVolatile + ); let results = (outs); + let builders = [ + OpBuilder<(ins "Value":$ptr, "triton::CacheModifier":$cache, + "triton::EvictionPolicy":$evict, "bool":$isVolatile)> + ]; let assemblyFormat = [{ - operands attr-dict `:` type($ptr) + $ptr (`,` $mask^)? 
attr-dict `:` type($ptr) }]; } diff --git a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Ops.cpp b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Ops.cpp index c89b76a491..b025fdc253 100644 --- a/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Ops.cpp +++ b/third_party/intel/lib/Dialect/TritonIntelGPU/IR/Ops.cpp @@ -197,6 +197,12 @@ OpFoldResult ExtractOp::fold(FoldAdaptor adaptor) { return {}; } +void PrefetchOp::build(OpBuilder &builder, OperationState &state, Value ptr, + CacheModifier cache, EvictionPolicy evict, + bool isVolatile) { + PrefetchOp::build(builder, state, ptr, /*mask=*/{}, cache, evict, isVolatile); +} + LogicalResult SubGroupTransposeOp::verify() { RankedTensorType srcType = getSrc().getType(); auto mod = getOperation()->getParentOfType();