From 4ad66757e3bb77485a89673d6e063a64aac9be71 Mon Sep 17 00:00:00 2001 From: SJW Date: Mon, 17 Jun 2024 14:51:41 +0000 Subject: [PATCH 01/36] [AMD-Pipeliner] Transition stream-pipeline to new SW pipelining infrastructure - Copied scheduler from MatmulLoopPipeline (much could be consolidated) - Enable register buffering (even though it may increase register pressure) - Enable num_stages=2+, including multi-buffering, and make `2` the default - Updated tutorial for new tuning default - Added lit tests --- python/tutorials/03-matrix-multiplication.py | 10 +- test/TritonGPU/amd/amd-loop-pipeline.mlir | 1636 +++++++++++++++++ third_party/amd/backend/compiler.py | 9 +- .../include/TritonAMDGPUTransforms/Passes.h | 2 +- .../include/TritonAMDGPUTransforms/Passes.td | 6 + .../TritonAMDGPUTransforms/StreamPipeline.cpp | 1505 +++++++-------- third_party/amd/python/triton_amd.cc | 4 +- 7 files changed, 2418 insertions(+), 754 deletions(-) create mode 100644 test/TritonGPU/amd/amd-loop-pipeline.mlir diff --git a/python/tutorials/03-matrix-multiplication.py b/python/tutorials/03-matrix-multiplication.py index 91f751207b8e..8153509055f9 100644 --- a/python/tutorials/03-matrix-multiplication.py +++ b/python/tutorials/03-matrix-multiplication.py @@ -206,19 +206,19 @@ def get_hip_autotune_config(): return [ triton.Config( {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 1, 'waves_per_eu': 2}, - num_warps=4, num_stages=0), + num_warps=4, num_stages=2), triton.Config( {'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 4, 'waves_per_eu': 2}, - num_warps=8, num_stages=0), + num_warps=8, num_stages=2), triton.Config( {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 1, 'waves_per_eu': 2}, - num_warps=8, num_stages=0), + num_warps=8, num_stages=2), triton.Config( {'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'waves_per_eu': 3}, - num_warps=4, num_stages=0), + 
num_warps=4, num_stages=2), triton.Config( {'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 1, 'waves_per_eu': 8}, - num_warps=4, num_stages=0), + num_warps=4, num_stages=2), ] diff --git a/test/TritonGPU/amd/amd-loop-pipeline.mlir b/test/TritonGPU/amd/amd-loop-pipeline.mlir new file mode 100644 index 000000000000..fbad2dc50ce5 --- /dev/null +++ b/test/TritonGPU/amd/amd-loop-pipeline.mlir @@ -0,0 +1,1636 @@ +// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline=num_stages=2 | FileCheck %s + +// 4 warps +// matmul: 128x32 @ 32x128 -> 128x128 +#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> +#BLs0 = #triton_gpu.slice<{parent=#BL, dim=0}> +#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> +#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> +#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> +#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> + +// CHECK-LABEL: tt.func @matmul_loop +// CHECK: %[[LOCAL_ALLOC_10:.*]] = triton_gpu.local_alloc +// CHECK: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc +// CHECK: %[[CMPI_12:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_12]] +// CHECK: %[[LOAD_14:.*]] = tt.load %{{.*}}, %[[SPLAT_13]] +// CHECK: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_12]] +// CHECK: %[[LOAD_16:.*]] = tt.load %{{.*}}, %[[SPLAT_15]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_17]] +// CHECK: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_16]], %[[MEMDESC_SUBVIEW_18]] +// CHECK: 
%{{.*}}:7 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_18]]) + +// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_20]] +// CHECK: %[[ADDI_22:.*]] = arith.addi %[[ARG9]], %{{.*}} +// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_22]], %{{.*}} +// CHECK: %[[SELECT_24:.*]] = arith.select %[[CMPI_23]], %[[ADDI_22]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_25:.*]] = triton_gpu.local_load %[[ARG11]] +// CHECK: %[[CONVERT_LAYOUT_26:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_25]] +// CHECK: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] +// CHECK: %[[CONVERT_LAYOUT_28:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_27]] +// CHECK: %[[MULF_29:.*]] = arith.mulf %[[CONVERT_LAYOUT_28]], %{{.*}} +// CHECK: %[[DOT_30:.*]] = tt.dot %[[CONVERT_LAYOUT_26]], %[[MULF_29]], %[[ARG8]] +// CHECK: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG6]], %{{.*}} +// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG7]], %{{.*}} +// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_21]] +// CHECK: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]] +// CHECK: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_21]] +// CHECK: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} +// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] +// CHECK: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// CHECK: 
triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] +// CHECK: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_24]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// CHECK: } + +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_10]] +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] + +module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.target" = "cuda:80"} { +tt.func @matmul_loop(%lb : index, %ub : index, %step : index, + %A : !tt.ptr {tt.divisibility = 16 : i32}, + %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { + // A ptrs + %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> + %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> + %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> + %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> + %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + // B ptrs + %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> + %b_tmp0 = tt.make_range {end = 128: i32, start = 0: i32} : tensor<128xi32, #BLs0> + %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> + %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> + %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> + + + %a_mask = arith.constant dense : tensor<128x32xi1, #AL> + %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> + %b_mask = arith.constant dense : tensor<32x128xi1, #BL> + %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> + %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> + + %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> + %b_off = arith.constant dense<4> : tensor<32x128xi32, 
#BL> + + %b_scale = arith.constant dense<4.> : tensor<32x128xf16, #B> + + %loop:3 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { + %a_ = tt.load %a_ptr : tensor<128x32x!tt.ptr, #AL> + %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> + %b__ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> + %b_ = triton_gpu.convert_layout %b__ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> + %b = arith.mulf %b_, %b_scale: tensor<32x128xf16, #B> + + %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> + + %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> + scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> + } + tt.return %loop#2: tensor<128x128xf32, #C> +} + +// CHECK-LABEL: tt.func @matmul_loop_nested +// CHECK: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc +// CHECK: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc +// CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] +// CHECK: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} +// CHECK: %[[SPLAT_16:.*]] = tt.splat %[[CMPI_13]] +// CHECK: %[[LOAD_17:.*]] = tt.load %{{.*}}, %[[SPLAT_16]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_18]] +// CHECK: %[[MEMDESC_SUBVIEW_19:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_17]], %[[MEMDESC_SUBVIEW_19]] +// CHECK: %{{.*}}:7 = scf.for 
%[[ARG7:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_19]]) + +// CHECK: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG7]], %[[SUBI_21]] +// CHECK: %[[ADDI_23:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_24:.*]] = arith.cmpi slt, %[[ADDI_23]], %{{.*}} +// CHECK: %[[SELECT_25:.*]] = arith.select %[[CMPI_24]], %[[ADDI_23]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_26:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[CONVERT_LAYOUT_27:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_26]] +// CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG14]] +// CHECK: %[[CONVERT_LAYOUT_29:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_28]] +// CHECK: %[[DOT_30:.*]] = tt.dot %[[CONVERT_LAYOUT_27]], %[[CONVERT_LAYOUT_29]], %[[ARG10]] +// CHECK: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_22]] +// CHECK: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]], %{{.*}} +// CHECK: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_22]] +// CHECK: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} +// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} +// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] +// CHECK: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] +// CHECK: scf.yield 
%[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_25]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// CHECK: } +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] +// CHECK: scf.yield %{{.*}}#2 +// CHECK: } +tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index, + %A : !tt.ptr {tt.divisibility = 16 : i32}, + %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C>{ + + %c_start = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> + %loop1:1 = scf.for %iv0 = %lb to %ub step %step iter_args(%c_init = %c_start) -> (tensor<128x128xf32, #C>) { + // A ptrs + %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> + %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> + %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> + %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> + %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + // B ptrs + %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> + %b_tmp0 = tt.make_range {end = 128: i32, start = 0: i32} : tensor<128xi32, #BLs0> + %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> + %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> + %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> + + %a_mask = arith.constant dense : tensor<128x32xi1, #AL> + %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> + %b_mask = arith.constant dense : tensor<32x128xi1, #BL> + %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> + + %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> + %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> + + %loop2:3 = scf.for %iv = %lb to %ub 
step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { + %a_ = tt.load %a_ptr, %a_mask, %a_other : tensor<128x32x!tt.ptr, #AL> + %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> + %b_ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> + %b = triton_gpu.convert_layout %b_ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> + + %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> + + %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> + scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> + } + + scf.yield %loop2#2 : tensor<128x128xf32, #C> + } + tt.return %loop1#0 : tensor<128x128xf32, #C> +} + +// CHECK-LABEL: tt.func @matmul_loop_single_pipeline +// CHECK: %[[LOAD_10:.*]] = tt.load %{{.*}}, %{{.*}}, %{{.*}} +// CHECK: %[[CONVERT_LAYOUT_11:.*]] = triton_gpu.convert_layout %[[LOAD_10]] +// CHECK: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc +// CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] +// CHECK: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_16]] +// CHECK: %{{.*}}:5 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %[[MEMDESC_SUBVIEW_16]]) +// CHECK: %[[SUBI_18:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_19:.*]] = arith.cmpi slt, %[[ARG5]], 
%[[SUBI_18]] +// CHECK: %[[ADDI_20:.*]] = arith.addi %[[ARG8]], %{{.*}} +// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ADDI_20]], %{{.*}} +// CHECK: %[[SELECT_22:.*]] = arith.select %[[CMPI_21]], %[[ADDI_20]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG10]] +// CHECK: %[[CONVERT_LAYOUT_24:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_23]] +// CHECK: %[[DOT_25:.*]] = tt.dot %[[CONVERT_LAYOUT_11]], %[[CONVERT_LAYOUT_24]], %[[ARG7]] +// CHECK: %[[ADDPTR_26:.*]] = tt.addptr %[[ARG6]], %{{.*}} +// CHECK: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_19]] +// CHECK: %[[LOAD_28:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_27]], %{{.*}} +// CHECK: %[[ADDI_29:.*]] = arith.addi %[[ARG9]], %{{.*}} +// CHECK: %[[CMPI_30:.*]] = arith.cmpi slt, %[[ADDI_29]], %{{.*}} +// CHECK: %[[SELECT_31:.*]] = arith.select %[[CMPI_30]], %[[ADDI_29]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_32:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_31]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_28]], %[[MEMDESC_SUBVIEW_32]] +// CHECK: scf.yield %[[ADDPTR_26]], %[[DOT_25]], %[[SELECT_22]], %[[SELECT_31]], %[[MEMDESC_SUBVIEW_32]] +// CHECK: } +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] +tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, + %A : !tt.ptr {tt.divisibility = 16 : i32}, + %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { + // A ptrs + %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> + %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> + %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> + %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> + %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + // B ptrs + %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> + %b_tmp0 = tt.make_range {end = 128: 
i32, start = 0: i32} : tensor<128xi32, #BLs0> + %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> + %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> + %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> + + %a_mask = arith.constant dense : tensor<128x32xi1, #AL> + %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> + + %a_ = tt.load %a_ptr_init, %a_mask, %a_other : tensor<128x32x!tt.ptr, #AL> + %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> + + %b_mask = arith.constant dense : tensor<32x128xi1, #BL> + %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> + %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> + + %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> + + %loop:2 = scf.for %iv = %lb to %ub step %step iter_args(%b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { + %b_ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> + %b = triton_gpu.convert_layout %b_ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> + %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> + %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> + scf.yield %next_b_ptr, %c : tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> + } + tt.return %loop#1 : tensor<128x128xf32, #C> +} + +// CHECK-LABEL: tt.func @indirect_bmm_scalar +// CHECK: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc +// CHECK: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc +// CHECK: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] +// CHECK: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] +// CHECK: %[[LOAD_5:.*]] = tt.load %{{.*}}, %[[CMPI_2]] +// CHECK: %[[MULI_6:.*]] = arith.muli %{{.*}}, 
%[[LOAD_5]] +// CHECK: %[[SPLAT_7:.*]] = tt.splat %[[MULI_6]] +// CHECK: %[[ADDPTR_8:.*]] = tt.addptr %{{.*}}, %[[SPLAT_7]] +// CHECK: %[[SPLAT_9:.*]] = tt.splat %[[CMPI_2]] +// CHECK: %[[LOAD_10:.*]] = tt.load %[[ADDPTR_8]], %[[SPLAT_9]] +// CHECK: %[[CMPI_11:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// CHECK: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} +// CHECK: %[[ADDPTR_13:.*]] = tt.addptr %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_11]] +// CHECK: %[[LOAD_15:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_14]] +// CHECK: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_13]], %[[CMPI_11]] +// CHECK: %[[MULI_17:.*]] = arith.muli %{{.*}}, %[[LOAD_16]] +// CHECK: %[[SPLAT_18:.*]] = tt.splat %[[MULI_17]] +// CHECK: %[[ADDPTR_19:.*]] = tt.addptr %{{.*}}, %[[SPLAT_18]] +// CHECK: %[[SPLAT_20:.*]] = tt.splat %[[CMPI_11]] +// CHECK: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_19]], %[[SPLAT_20]] +// CHECK: %[[MEMDESC_SUBVIEW_22:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_4]], %[[MEMDESC_SUBVIEW_22]] +// CHECK: %[[MEMDESC_SUBVIEW_23:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_10]], %[[MEMDESC_SUBVIEW_23]] +// CHECK: %{{.*}}:9 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[ADDPTR_12]], %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_22]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_23]], %[[ARG14:.*]] = %[[LOAD_15]], %[[ARG15:.*]] = %[[LOAD_21]]) + +// CHECK: %[[SUBI_25:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] +// CHECK: %[[ADDI_27:.*]] = arith.addi %[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} +// CHECK: %[[SELECT_29:.*]] = arith.select %[[CMPI_28]], %[[ADDI_27]], %{{.*}} +// CHECK: 
%[[LOCAL_LOAD_30:.*]] = triton_gpu.local_load %[[ARG12]] +// CHECK: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[CONVERT_LAYOUT_32:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_30]] +// CHECK: %[[CONVERT_LAYOUT_33:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_31]] +// CHECK: %[[DOT_34:.*]] = tt.dot %[[CONVERT_LAYOUT_32]], %[[CONVERT_LAYOUT_33]], %[[ARG7]] +// CHECK: %[[ADDPTR_35:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// CHECK: %[[SPLAT_37:.*]] = tt.splat %[[CMPI_26]] +// CHECK: %[[LOAD_38:.*]] = tt.load %[[ADDPTR_35]], %[[SPLAT_37]] +// CHECK: %[[LOAD_39:.*]] = tt.load %[[ADDPTR_36]], %[[CMPI_26]] +// CHECK: %[[MULI_40:.*]] = arith.muli %{{.*}}, %[[LOAD_39]] +// CHECK: %[[SPLAT_41:.*]] = tt.splat %[[MULI_40]] +// CHECK: %[[ADDPTR_42:.*]] = tt.addptr %{{.*}}, %[[SPLAT_41]] +// CHECK: %[[SPLAT_43:.*]] = tt.splat %[[CMPI_26]] +// CHECK: %[[LOAD_44:.*]] = tt.load %[[ADDPTR_42]], %[[SPLAT_43]] +// CHECK: %[[ADDI_45:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_46:.*]] = arith.cmpi slt, %[[ADDI_45]], %{{.*}} +// CHECK: %[[SELECT_47:.*]] = arith.select %[[CMPI_46]], %[[ADDI_45]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_47]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_48]] +// CHECK: %[[MEMDESC_SUBVIEW_49:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_47]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[ARG15]], %[[MEMDESC_SUBVIEW_49]] +// CHECK: scf.yield %[[DOT_34]], %[[ADDPTR_35]], %[[ADDPTR_36]], %[[SELECT_29]], %[[SELECT_47]], %[[MEMDESC_SUBVIEW_48]], %[[MEMDESC_SUBVIEW_49]], %[[LOAD_38]], %[[LOAD_44]] +// CHECK: } + +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] + +tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32}, + %76: index, + %49: tensor<16x16x!tt.ptr, #AL> 
{tt.divisibility=16: i32, tt.contiguity=2 : i32}, + %75: !tt.ptr, + %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, + %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> + %c4_i32 = arith.constant 4 : i32 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %c1_i32 = arith.constant 1 : i32 + %79:3 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, !tt.ptr) { + %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> + %83 = tt.load %arg21 : !tt.ptr + %84 = arith.muli %77, %83 : i64 + %85 = tt.splat %84 : i64 -> tensor<16x16xi64, #BL> + %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> + %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> + %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> + %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> + %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> + %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> + %92 = tt.addptr %arg21, %c1_i32 : !tt.ptr, i32 + scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, !tt.ptr + } {tt.num_stages = 3 : i32} + tt.return %79#0 : tensor<16x16xf32, #C> +} + +// CHECK-LABEL: tt.func @indirect_bmm_scalar_dist_one +// CHECK: %[[LOAD_0:.*]] = tt.load %{{.*}} +// CHECK: %[[ADDPTR_1:.*]] = tt.addptr %{{.*}}, %{{.*}} +// CHECK: %[[LOCAL_ALLOC_2:.*]] = triton_gpu.local_alloc +// CHECK: %[[LOCAL_ALLOC_3:.*]] = triton_gpu.local_alloc +// CHECK: %[[CMPI_4:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_5:.*]] = tt.splat %[[CMPI_4]] +// CHECK: %[[LOAD_6:.*]] = tt.load %{{.*}}, %[[SPLAT_5]] +// CHECK: 
%[[LOAD_7:.*]] = tt.load %[[ADDPTR_1]], %[[CMPI_4]] +// CHECK: %[[MULI_8:.*]] = arith.muli %{{.*}}, %[[LOAD_0]] +// CHECK: %[[SPLAT_9:.*]] = tt.splat %[[MULI_8]] +// CHECK: %[[ADDPTR_10:.*]] = tt.addptr %{{.*}}, %[[SPLAT_9]] +// CHECK: %[[SPLAT_11:.*]] = tt.splat %[[CMPI_4]] +// CHECK: %[[LOAD_12:.*]] = tt.load %[[ADDPTR_10]], %[[SPLAT_11]] +// CHECK: %[[ADDPTR_13:.*]] = tt.addptr %[[ADDPTR_1]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_14:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_6]], %[[MEMDESC_SUBVIEW_14]] +// CHECK: %[[MEMDESC_SUBVIEW_15:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_12]], %[[MEMDESC_SUBVIEW_15]] +// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %[[LOAD_7]], %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_14]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_15]]) + +// CHECK: %[[SUBI_17:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_17]] +// CHECK: %[[ADDI_19:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ADDI_19]], %{{.*}} +// CHECK: %[[SELECT_21:.*]] = arith.select %[[CMPI_20]], %[[ADDI_19]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_22:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG14]] +// CHECK: %[[CONVERT_LAYOUT_24:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_22]] +// CHECK: %[[CONVERT_LAYOUT_25:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_23]] +// CHECK: %[[DOT_26:.*]] = tt.dot %[[CONVERT_LAYOUT_24]], %[[CONVERT_LAYOUT_25]], %[[ARG7]] +// CHECK: %[[ADDPTR_27:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[SPLAT_28:.*]] = tt.splat %[[CMPI_18]] +// CHECK: %[[LOAD_29:.*]] = tt.load 
%[[ADDPTR_27]], %[[SPLAT_28]] +// CHECK: %[[LOAD_30:.*]] = tt.load %[[ARG9]], %[[CMPI_18]] +// CHECK: %[[MULI_31:.*]] = arith.muli %{{.*}}, %[[ARG10]] +// CHECK: %[[SPLAT_32:.*]] = tt.splat %[[MULI_31]] +// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %{{.*}}, %[[SPLAT_32]] +// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_18]] +// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_34]] +// CHECK: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} +// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_29]], %[[MEMDESC_SUBVIEW_40]] +// CHECK: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_41]] +// CHECK: scf.yield %[[DOT_26]], %[[ADDPTR_27]], %[[ADDPTR_36]], %[[LOAD_30]], %[[SELECT_21]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// CHECK: } +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_2]] +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_3]] + +tt.func @indirect_bmm_scalar_dist_one(%77: i64 {tt.divisibility=16: i32}, + %76: index, + %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, + %75: !tt.ptr, + %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, + %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> + %c4_i32 = arith.constant 4 : i32 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %c1_i32 = arith.constant 1 : i32 + %50 = tt.load %75 : !tt.ptr + %51 = tt.addptr %75, 
%c1_i32 : !tt.ptr, i32 + %79:4 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %51, %arg22 = %50) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, !tt.ptr, i64) { + %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> + %83 = tt.load %arg21 : !tt.ptr + %84 = arith.muli %77, %arg22 : i64 + %85 = tt.splat %84 : i64 -> tensor<16x16xi64, #BL> + %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> + %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> + %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> + %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> + %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> + %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> + %92 = tt.addptr %arg21, %c1_i32 : !tt.ptr, i32 + scf.yield %90, %91, %92, %83 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, !tt.ptr, i64 + } + tt.return %79#0 : tensor<16x16xf32, #C> +} + +// CHECK-LABEL: tt.func @indirect_bmm_vector +// CHECK: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc +// CHECK: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc +// CHECK: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] +// CHECK: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] +// CHECK: %[[CMPI_5:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// CHECK: %[[ADDPTR_6:.*]] = tt.addptr %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_7:.*]] = tt.splat %[[CMPI_2]] +// CHECK: %[[LOAD_8:.*]] = tt.load %{{.*}}, %[[SPLAT_7]] +// CHECK: %[[EXPAND_DIMS_9:.*]] = tt.expand_dims %[[LOAD_4]] {axis = 1 : i32} +// CHECK: %[[BROADCAST_10:.*]] = tt.broadcast %[[EXPAND_DIMS_9]] +// CHECK: %[[MULI_11:.*]] = arith.muli %{{.*}}, %[[BROADCAST_10]] +// CHECK: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %[[MULI_11]] +// CHECK: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_2]] +// CHECK: %[[LOAD_14:.*]] = tt.load 
%[[ADDPTR_12]], %[[SPLAT_13]] +// CHECK: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_5]] +// CHECK: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_6]], %[[SPLAT_15]] +// CHECK: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_8]], %[[MEMDESC_SUBVIEW_17]] +// CHECK: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_18]] +// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_6]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[LOAD_16]]) + +// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] +// CHECK: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] +// CHECK: %[[ADDI_24:.*]] = arith.addi %[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} +// CHECK: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] +// CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[CONVERT_LAYOUT_29:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_27]] +// CHECK: %[[CONVERT_LAYOUT_30:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_28]] +// CHECK: %[[DOT_31:.*]] = tt.dot %[[CONVERT_LAYOUT_29]], %[[CONVERT_LAYOUT_30]], %[[ARG7]] +// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] +// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] +// CHECK: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims 
%[[ARG14]] {axis = 1 : i32} +// CHECK: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] +// CHECK: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] +// CHECK: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] +// CHECK: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] +// CHECK: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] +// CHECK: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] +// CHECK: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] +// CHECK: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} +// CHECK: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] +// CHECK: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] +// CHECK: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] +// CHECK: } +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] + +tt.func @indirect_bmm_vector(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, + %76: index, + %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, + %75: tensor<16x!tt.ptr, #BLs1>, + %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, + %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> + %c4_i32 = arith.constant 4 : i32 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %c1_i32 = arith.constant 
1 : i32 + %c1_i32_splat = tt.splat %c1_i32 : i32 -> tensor<16xi32, #BLs1> + %79:3 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1>) { + %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> + %83 = tt.load %arg21 : tensor<16x!tt.ptr, #BLs1> + %84 = tt.expand_dims %83 {axis=1: i32}: tensor<16xi64, #BLs1> -> tensor<16x1xi64, #BL> + %850 = tt.broadcast %84 : tensor<16x1xi64, #BL> -> tensor<16x16xi64, #BL> + %85 = arith.muli %77, %850 : tensor<16x16xi64, #BL> + %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> + %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> + %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> + %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> + %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> + %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> + %92 = tt.addptr %arg21, %c1_i32_splat : tensor<16x!tt.ptr, #BLs1>, tensor<16xi32, #BLs1> + scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1> + } {tt.num_stages = 3 : i32} + tt.return %79#0 : tensor<16x16xf32, #C> +} + +// CHECK-LABEL: tt.func @post_load_inv +// CHECK: scf.for +// CHECK-DAG: %[[IV:.*]] = arith.index_cast +// CHECK: %[[NEXT_IV:.*]] = arith.addi %[[IV]], %c1_i32 : i32 +// CHECK: arith.index_cast +// CHECK-NOT: arith.addi %[[NEXT_IV]] +tt.func @post_load_inv(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, + %arg1: !tt.ptr {tt.divisibility = 16 : i32}, + %arg2: !tt.ptr {tt.divisibility = 16 : i32}, + %arg3: i32 {tt.divisibility = 16 : i32}, + %arg4: i32 {tt.divisibility = 16 : i32}, + %arg5: i32 {tt.divisibility = 16 : i32}, + %arg6: i32 {tt.divisibility = 16 : i32}, + %arg7: i32 {tt.divisibility = 16 : i32}, + %arg8: i32 {tt.divisibility = 16 : 
i32}) -> tensor<32x32xf32, #C> { + %c0_index = arith.constant 0 : index + %c1_index = arith.constant 1 : index + %c1_i32 = arith.constant 1 : i32 + %c32_i32 = arith.constant 32 : i32 + %84 = arith.constant 900 : index + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #C> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #AL> + %50 = tt.splat %arg3 : i32 -> tensor<1x32xi32, #AL> + %59 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %81 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %66 = tt.splat %arg4 : i32 -> tensor<32x1xi32, #AL> + %60 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %82 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %85:3 = scf.for %arg9 = %c0_index to %84 step %c1_index iter_args(%arg10 = %cst, %arg11 = %59, %arg12 = %81) -> (tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>) { + %130 = arith.index_cast %arg9 : index to i32 + %107 = arith.muli %130, %c32_i32 : i32 + %108 = arith.subi %arg5, %107 : i32 + %109 = tt.splat %108 : i32 -> tensor<1x32xi32, #AL> + %110 = arith.cmpi "slt", %50, %109 : tensor<1x32xi32, #AL> + %111 = tt.broadcast %110 : tensor<1x32xi1, #AL> -> tensor<32x32xi1, #AL> + %112 = tt.load %arg11, %111, %cst_0 : tensor<32x32x!tt.ptr, #AL> + %113 = tt.splat %108 : i32 -> tensor<32x1xi32, #AL> + %114 = arith.cmpi "slt", %66, %113 : tensor<32x1xi32, #AL> + %115 = tt.broadcast %114 : tensor<32x1xi1, #AL> -> tensor<32x32xi1, #AL> + %116 = tt.load %arg12, %115, %cst_0 : tensor<32x32x!tt.ptr, #AL> + %117 = triton_gpu.convert_layout %112 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> + %118 = triton_gpu.convert_layout %116 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 1}>> + %119 = tt.dot %117, %118, %arg10 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx 
= 1, parent = #C, kWidth = 1}>> -> tensor<32x32xf32, #C> + %131 = arith.index_cast %arg9 : index to i32 + %120 = arith.addi %131, %c1_i32 : i32 + %121 = arith.muli %120, %c32_i32 : i32 + %122 = tt.splat %121 : i32 -> tensor<32x32xi32, #AL> + %123 = tt.addptr %60, %122 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> + %124 = arith.muli %121, %arg7 : i32 + %125 = tt.splat %124 : i32 -> tensor<32x32xi32, #AL> + %126 = tt.addptr %82, %125 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> + scf.yield %119, %123, %126 : tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL> + } + tt.return %85#0 : tensor<32x32xf32, #C> +} + +// CHECK-LABEL: tt.func @cross_iter_dep +// TODO: enable pipelining with distance of 2 +// CHECK-NOT: triton_gpu.local_load +// CHECK: scf.for +// CHECK: scf.yield +tt.func @cross_iter_dep(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, + %arg1: !tt.ptr {tt.divisibility = 16 : i32}, + %arg2: !tt.ptr {tt.divisibility = 16 : i32}, + %arg3: i32 {tt.divisibility = 16 : i32}, + %arg4: i32 {tt.divisibility = 16 : i32}, + %arg5: i32 {tt.divisibility = 16 : i32}, + %arg6: i32 {tt.divisibility = 16 : i32}, + %arg7: i32 {tt.divisibility = 16 : i32}, + %arg8: i32 {tt.divisibility = 16 : i32}) -> tensor<32x32xf32, #C> { + %c0_i32 = arith.constant 0 : index + %118 = arith.constant 32 : index + %c1_i32 = arith.constant 1 : index + %c2_i32 = arith.constant 2 : i32 + %c32_i32 = arith.constant 32 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #C> + %cst_1 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #AL> + %78 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %110 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %112 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %113 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %116 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %65 = tt.splat %arg3 : i32 -> tensor<1x32xi32, #AL> + %88 = tt.splat %arg4 : i32 -> 
tensor<32x1xi32, #AL> + %80 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %119:5 = scf.for %arg9 = %c0_i32 to %118 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %78, %arg12 = %110, %arg13 = %113, %arg14 = %116) -> (tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>) { + %161 = arith.index_cast %arg9 : index to i32 + %141 = arith.muli %161, %c32_i32 : i32 + %142 = arith.subi %arg5, %141 : i32 + %143 = tt.splat %142 : i32 -> tensor<1x32xi32, #AL> + %144 = arith.cmpi "slt", %65, %143 : tensor<1x32xi32, #AL> + %145 = tt.broadcast %144 : tensor<1x32xi1, #AL> -> tensor<32x32xi1, #AL> + %146 = tt.load %arg11, %145, %cst_1 : tensor<32x32x!tt.ptr, #AL> + %147 = tt.splat %142 : i32 -> tensor<32x1xi32, #AL> + %148 = arith.cmpi "slt", %88, %147 : tensor<32x1xi32, #AL> + %149 = tt.broadcast %148 : tensor<32x1xi1, #AL> -> tensor<32x32xi1, #AL> + %150 = tt.load %arg12, %149, %cst_1 : tensor<32x32x!tt.ptr, #AL> + %151 = triton_gpu.convert_layout %146 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> + %152 = triton_gpu.convert_layout %150 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 1}>> + %153 = tt.dot %151, %152, %arg10 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 1}>> -> tensor<32x32xf32, #C> + %162 = arith.index_cast %arg9 : index to i32 + %154 = arith.addi %162, %c2_i32 : i32 + %155 = arith.muli %154, %c32_i32 : i32 + %156 = tt.splat %155 : i32 -> tensor<32x32xi32, #AL> + %157 = tt.addptr %80, %156 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> + %158 = arith.muli %155, %arg7 : i32 + %159 = tt.splat %158 : i32 -> tensor<32x32xi32, #AL> + %160 = tt.addptr %112, %159 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> + scf.yield %153, %arg13, %arg14, %157, %160 : 
tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL> + } + tt.return %119#0 : tensor<32x32xf32, #C> +} + +// CHECK-LABEL: tt.func @dep_arg_two_uses +// CHECK: tt.expand_dims +// CHECK: tt.expand_dims +// CHECK: tt.expand_dims %arg5 +// CHECK-NEXT: tt.expand_dims %arg5 +// CHECK: %[[PTR0:.*]] = tt.splat %arg6 +// CHECK: %[[PTR1:.*]] = tt.addptr %[[PTR0]] +// CHECK-NEXT: tt.load %[[PTR1]] +tt.func @dep_arg_two_uses(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, + %arg1: !tt.ptr {tt.divisibility = 16 : i32}, + %arg2: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { + %23 = arith.constant 100 : index + %c64 = arith.constant 64 : i64 + %56 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> + %57 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> + %58 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #BL}>> + %83 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> + %85 = tt.splat %c64 : i64 -> tensor<1x32xi64, #AL> + %86 = tt.splat %c64 : i64 -> tensor<1x32xi64, #AL> + %68 = tt.splat %arg0 : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> + %c32_index = arith.constant 32 : index + %c32_i32 = arith.index_cast %c32_index : index to i32 + %80 = tt.splat %arg2 : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #BL> + %88 = arith.truncf %cst_6 : tensor<32x128xf32, #BL> to tensor<32x128xf16, #BL> + %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #C> + %90 = tt.splat %c64 : i64 -> tensor<32x128xi64, #BL> + %92 = tt.addptr %arg1, %c32_i32 : !tt.ptr, i32 + %c0_index = arith.constant 0 : index + %91:5 = scf.for %arg19 = %c0_index to %23 step %c32_index iter_args(%arg20 = %68, %arg21 = %83, 
%arg22 = %92, %arg23 = %cst, %arg24 = %80) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>>, !tt.ptr, tensor<128x128xf32, #C>, tensor<32x128x!tt.ptr, #BL>) { + %1750 = arith.subi %23, %arg19 : index + %175 = arith.index_cast %1750 : index to i32 + %176 = tt.splat %175 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> + %177 = tt.splat %175 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #BL}>> + %178 = arith.cmpi "slt", %57, %176 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> + %179 = arith.cmpi "slt", %58, %177 : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #BL}>> + %180 = tt.expand_dims %178 {axis = 0 : i32} : tensor<32xi1, #triton_gpu.slice<{dim = 0, parent = #AL}>> -> tensor<1x32xi1, #AL> + %181 = tt.expand_dims %179 {axis = 1 : i32} : tensor<32xi1, #triton_gpu.slice<{dim = 1, parent = #BL}>> -> tensor<32x1xi1, #BL> + %182 = tt.expand_dims %arg21 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> -> tensor<1x32xi32, #AL> + %183 = tt.expand_dims %arg21 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> -> tensor<1x32xi32, #AL> + %184 = arith.extsi %182 : tensor<1x32xi32, #AL> to tensor<1x32xi64, #AL> + %185 = arith.extsi %183 : tensor<1x32xi32, #AL> to tensor<1x32xi64, #AL> + %186 = arith.muli %184, %85 : tensor<1x32xi64, #AL> + %187 = arith.muli %185, %86 : tensor<1x32xi64, #AL> + %188 = tt.broadcast %186 : tensor<1x32xi64, #AL> -> tensor<128x32xi64, #AL> + %189 = tt.broadcast %187 : tensor<1x32xi64, #AL> -> tensor<128x32xi64, #AL> + %190 = tt.addptr %arg20, %188 : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi64, #AL> + %191 = tt.addptr %arg20, %189 : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi64, #AL> + %192 = tt.broadcast %180 : tensor<1x32xi1, #AL> -> tensor<128x32xi1, #AL> + %193 = tt.load %191, %192 : tensor<128x32x!tt.ptr, #AL> + %194 = tt.splat %arg22 : !tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 
0, parent = #AL}>> + %195 = tt.addptr %194, %56 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #AL}>>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> + %196 = tt.load %195 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #AL}>> + %197 = tt.addptr %arg22, %c32_i32 : !tt.ptr, i32 + %198 = tt.broadcast %181 : tensor<32x1xi1, #BL> -> tensor<32x128xi1, #BL> + %199 = tt.load %arg24, %198, %88 : tensor<32x128x!tt.ptr, #BL> + %200 = triton_gpu.convert_layout %193 : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 2}>> + %201 = triton_gpu.convert_layout %199 : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 2}>> + %202 = tt.dot %200, %201, %arg23 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 2}>> -> tensor<128x128xf32, #C> + %203 = tt.addptr %arg24, %90 : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi64, #BL> + scf.yield %190, %196, %197, %202, %203 : tensor<128x32x!tt.ptr, #AL>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>>, !tt.ptr, tensor<128x128xf32, #C>, tensor<32x128x!tt.ptr, #BL> + } + tt.return %91#3 : tensor<128x128xf32, #C> +} +} // end module + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, 
"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +// CHECK-LABEL: tt.func @load_two_users + tt.func @load_two_users(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { + %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> + %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> + %c0_i64 = arith.constant 0 : i64 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = arith.constant 8 : i32 + %0 = tt.addptr %arg0, %c0_i64 : !tt.ptr, i64 + %1 = tt.addptr %arg1, %c0_i64 : !tt.ptr, i64 + %2 = tt.splat %1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> + %3 = tt.addptr %2, %cst_0 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %6 = tt.broadcast %3 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> + %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> + %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %9 = tt.load %8 : tensor<128x64x!tt.ptr, #blocked1> + %10 = tt.splat %0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> + %11 = tt.addptr %10, %cst : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> + %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %14 = tt.broadcast %11 : 
tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> + %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> + %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + // CHECK: triton_gpu.local_store + // CHECK: scf.for + // CHECK: tt.dot + // CHECK: tt.dot + // CHECK: tt.load + // CHECK: triton_gpu.local_store + // CHECK: scf.yield + + %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 { + %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> + %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %20 = triton_gpu.convert_layout %18 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> + %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> + %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> + %25 = tt.trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> + %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, 
#triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> + scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> + } + tt.return %17#0, %17#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> + } +} + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [0, 1], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +// CHECK-LABEL: tt.func @load_two_users_incompatible_layouts + tt.func @load_two_users_incompatible_layouts(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { + %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> + %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> + %c0_i64 = arith.constant 0 : i64 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = arith.constant 8 : i32 + %0 = tt.addptr %arg0, %c0_i64 : !tt.ptr, i64 + %1 = tt.addptr %arg1, %c0_i64 : !tt.ptr, i64 + %2 = tt.splat %1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> + %3 = tt.addptr %2, %cst_0 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : 
tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %6 = tt.broadcast %3 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> + %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> + %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %9 = tt.load %8 : tensor<128x64x!tt.ptr, #blocked1> + %10 = tt.splat %0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> + %11 = tt.addptr %10, %cst : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> + %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %14 = tt.broadcast %11 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> + %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> + %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + // CHECK-NOT: triton_gpu.local_store + // CHECK: scf.for + %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 { + %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> + %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %20 = triton_gpu.convert_layout %18 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> 
tensor<128x16xf32, #mma> + %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> + %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> + %25 = tt.trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> + %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> + scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> + } + tt.return %17#0, %17#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> + } +} + +// ----- + +// CHECK-LABEL: tt.func public @nested_loops +// CHECK: scf.for +// CHECK: triton_gpu.local_alloc +// CHECK-NOT: triton_gpu.local_alloc +// CHECK: scf.for +// CHECK: scf.yield +// CHECK-DIS: scf.yield +// +// The following code has the structure: +// +// ``` +// for { +// %a = load() +// for { +// %b = load() +// dot(%a, %b) +// } +// } +// ``` +// +// Only the outer for should be pipelined. The regression this tests +// causes an assertion to fail while pipelining the outer `for`, in +// particular while predicating the operations scheduled to be emitted +// in the prologue. +// +// We check that there is no allocation before the first occurrence of +// scf.for because that would mean that the first load `%a = load()` +// would be pipelined. 
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @nested_loops(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> + %cst_0 = arith.constant dense<320> : tensor<32x1xi32, #blocked> + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %c32_i32 = arith.constant 32 : i32 + %c10_i32 = arith.constant 10 : i32 + %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %1 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %2 = tt.expand_dims %1 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> + %3 = arith.muli %2, %cst_0 : tensor<32x1xi32, #blocked> + %4 = tt.splat %arg1 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> + %5 = tt.addptr %4, %3 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> + %6 = tt.broadcast %5 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> + %7 = tt.splat %arg0 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> + %8 = tt.splat %arg3 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> + scf.for %arg4 = %c0_i32 to %c10_i32 step %c1_i32 : i32 { + %9 = arith.muli %arg4, %c32_i32 : i32 + %10 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 1, 
parent = #blocked}>> + %12 = arith.addi %10, %0 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %13 = arith.addi %11, %1 : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %14 = tt.expand_dims %12 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %15 = tt.broadcast %14 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> + %16 = tt.addptr %6, %15 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %17 = tt.load %16 : tensor<32x32x!tt.ptr, #blocked> + %18 = tt.expand_dims %13 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> + %19 = arith.muli %18, %cst_0 : tensor<32x1xi32, #blocked> + %20 = tt.addptr %7, %19 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> + %21 = tt.broadcast %20 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> + %22 = tt.addptr %8, %19 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> + %23 = tt.broadcast %22 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> + scf.for %arg5 = %c0_i32 to %c10_i32 step %c1_i32 : i32 { + %24 = arith.muli %arg5, %c32_i32 : i32 + %25 = tt.splat %24 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %26 = arith.addi %25, %0 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %27 = tt.expand_dims %26 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %28 = tt.broadcast %27 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> + %29 = tt.addptr %21, %28 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %30 = tt.load %29 : tensor<32x32x!tt.ptr, #blocked> + %31 = triton_gpu.convert_layout %30 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %32 = triton_gpu.convert_layout %17 : tensor<32x32xf32, 
#blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %33 = tt.dot %31, %32, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> + %34 = tt.addptr %23, %28 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %35 = triton_gpu.convert_layout %33 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> + tt.store %34, %35 : tensor<32x32x!tt.ptr, #blocked> + } + } + tt.return + } +} // end module + +// ----- + +// CHECK-LABEL: tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de +// CHECK-NOT: triton_gpu.convert_layout {{.*}} : tensor<32x64xf32, #shared> -> tensor<32x64xf32, #shared1> + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma> + %c64_i32 = 
arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c32_i32 = arith.constant 32 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.get_program_id y : i32 + %3 = tt.load %arg3 : !tt.ptr + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %5 = tt.splat %1 : i32 -> tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %6 = arith.addi %5, %4 : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %8 = tt.splat %3 : i64 -> tensor<64x1xi64, #blocked> + %9 = arith.extsi %7 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> + %10 = arith.addi %8, %9 : tensor<64x1xi64, #blocked> + %11 = arith.extsi %arg5 : i32 to i64 + %12 = tt.splat %11 : i64 -> tensor<64x1xi64, #blocked> + %13 = arith.muli %10, %12 : tensor<64x1xi64, #blocked> + %14 = arith.muli %2, %arg5 : i32 + %15 = arith.extsi %14 : i32 to i64 + %16 = tt.splat %15 : i64 -> tensor<64x1xi64, #blocked> + %17 = arith.addi %13, %16 : tensor<64x1xi64, #blocked> + %18 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %19 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %20 = tt.expand_dims %18 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> + %21 = tt.expand_dims %19 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %22 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked> + %23 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked1> + %24 = arith.muli %20, %22 : tensor<1x64xi32, #blocked> + %25 = arith.muli %21, %23 : tensor<1x64xi32, #blocked1> + %26 = tt.broadcast %17 : 
tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> + %27 = arith.extsi %24 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> + %28 = arith.extsi %25 : tensor<1x64xi32, #blocked1> to tensor<1x64xi64, #blocked1> + %29 = tt.broadcast %27 : tensor<1x64xi64, #blocked> -> tensor<64x64xi64, #blocked> + %30 = arith.addi %26, %29 : tensor<64x64xi64, #blocked> + %31 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %32 = tt.expand_dims %31 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> + %33 = tt.splat %3 : i64 -> tensor<32x1xi64, #blocked1> + %34 = arith.extsi %32 : tensor<32x1xi32, #blocked1> to tensor<32x1xi64, #blocked1> + %35 = arith.addi %33, %34 : tensor<32x1xi64, #blocked1> + %36 = tt.splat %11 : i64 -> tensor<32x1xi64, #blocked1> + %37 = arith.muli %35, %36 : tensor<32x1xi64, #blocked1> + %38 = tt.splat %15 : i64 -> tensor<32x1xi64, #blocked1> + %39 = arith.addi %37, %38 : tensor<32x1xi64, #blocked1> + %40 = tt.broadcast %39 : tensor<32x1xi64, #blocked1> -> tensor<32x64xi64, #blocked1> + %41 = tt.broadcast %28 : tensor<1x64xi64, #blocked1> -> tensor<32x64xi64, #blocked1> + %42 = arith.addi %40, %41 : tensor<32x64xi64, #blocked1> + %43 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %44 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %45 = tt.expand_dims %43 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> + %46 = tt.expand_dims %44 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %47 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked1> + %48 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked> + %49 = arith.muli %45, %47 : tensor<1x32xi32, 
#blocked1> + %50 = arith.muli %46, %48 : tensor<1x32xi32, #blocked> + %51 = tt.broadcast %39 : tensor<32x1xi64, #blocked1> -> tensor<32x32xi64, #blocked1> + %52 = arith.extsi %49 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> + %53 = arith.extsi %50 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> + %54 = tt.broadcast %52 : tensor<1x32xi64, #blocked1> -> tensor<32x32xi64, #blocked1> + %55 = arith.addi %51, %54 : tensor<32x32xi64, #blocked1> + %56 = tt.splat %arg0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> + %57 = tt.addptr %56, %30 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi64, #blocked> + %58 = tt.splat %arg1 : !tt.ptr -> tensor<32x64x!tt.ptr, #blocked1> + %59 = tt.addptr %58, %42 : tensor<32x64x!tt.ptr, #blocked1>, tensor<32x64xi64, #blocked1> + %60 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> + %61 = tt.addptr %60, %55 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi64, #blocked1> + %62 = tt.load %57 : tensor<64x64x!tt.ptr, #blocked> + %63 = scf.for %arg6 = %c0_i32 to %c64_i32 step %c32_i32 iter_args(%arg7 = %cst) -> (tensor<64x32xf32, #mma>) : i32 { + %70 = tt.load %59 : tensor<32x64x!tt.ptr, #blocked1> + %71 = triton_gpu.convert_layout %62 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> + %73 = tt.trans %72 {order=array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> + %74 = triton_gpu.local_load %73 : !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %75 = tt.dot %71, %74, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> 
tensor<64x32xf32, #mma> + %76 = tt.load %61 : tensor<32x32x!tt.ptr, #blocked1> + %77 = triton_gpu.convert_layout %75 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %78 = triton_gpu.convert_layout %76 : tensor<32x32xf32, #blocked1> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %79 = tt.dot %77, %78, %arg7 : tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> + scf.yield %79 : tensor<64x32xf32, #mma> + } + %64 = tt.broadcast %17 : tensor<64x1xi64, #blocked> -> tensor<64x32xi64, #blocked> + %65 = tt.broadcast %53 : tensor<1x32xi64, #blocked> -> tensor<64x32xi64, #blocked> + %66 = arith.addi %64, %65 : tensor<64x32xi64, #blocked> + %67 = tt.splat %arg4 : !tt.ptr -> tensor<64x32x!tt.ptr, #blocked> + %68 = tt.addptr %67, %66 : tensor<64x32x!tt.ptr, #blocked>, tensor<64x32xi64, #blocked> + %69 = triton_gpu.convert_layout %63 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #blocked> + tt.store %68, %69 : tensor<64x32x!tt.ptr, #blocked> + tt.return + } +} // end module + +// ----- +// CHECK-DIS: #[[$SHARED_LAYOUT:shared.*]] = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}> +// CHECK-LABEL: tt.func @indirect_load_shared_layout +// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_6]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[LOAD_16]]) + +// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] +// CHECK: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], 
%[[SUBI_22]] +// CHECK: %[[ADDI_24:.*]] = arith.addi %[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} +// CHECK: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] +// CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[CONVERT_LAYOUT_29:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_27]] +// CHECK: %[[CONVERT_LAYOUT_30:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_28]] +// CHECK: %[[DOT_31:.*]] = tt.dot %[[CONVERT_LAYOUT_29]], %[[CONVERT_LAYOUT_30]], %[[ARG7]] +// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] +// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] +// CHECK: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} +// CHECK: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] +// CHECK: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] +// CHECK: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] +// CHECK: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] +// CHECK: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] +// CHECK: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] +// CHECK: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] +// CHECK: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} +// CHECK: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] +// CHECK: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] +// CHECK: scf.yield %[[DOT_31]], 
%[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] +// CHECK: } + +#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> +#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> +#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> +#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> +module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +tt.func @indirect_load_shared_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, + %76: index, + %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, + %75: tensor<16x!tt.ptr, #BLs1>, + %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, + %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> + %c4_i32 = arith.constant 4 : i32 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %c1_i32 = arith.constant 1 : i32 + %c1_i32_splat = tt.splat %c1_i32 : i32 -> tensor<16xi32, #BLs1> + %79:3 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1>) { + %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> + %83 = tt.load %arg21 : tensor<16x!tt.ptr, #BLs1> + %84 = tt.expand_dims %83 {axis=1: i32}: tensor<16xi64, #BLs1> -> tensor<16x1xi64, #BL> + %850 = tt.broadcast %84 : tensor<16x1xi64, #BL> -> tensor<16x16xi64, #BL> + %85 = arith.muli %77, 
%850 : tensor<16x16xi64, #BL> + %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> + %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> + %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> + %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> + %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> + %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> + %92 = tt.addptr %arg21, %c1_i32_splat : tensor<16x!tt.ptr, #BLs1>, tensor<16xi32, #BLs1> + scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1> + } {tt.num_stages = 3 : i32} + tt.return %79#0 : tensor<16x16xf32, #C> +} +} + + +// ----- + +// CHECK-LABEL: @kernel_yield_constant +// CHECK: tt.load +// CHECK: triton_gpu.memdesc_subview +// CHECK: triton_gpu.local_store +// CHECK: scf.for +// CHECK: tt.load +// CHECK: triton_gpu.memdesc_subview +// CHECK: triton_gpu.local_store +// CHECK: tt.return +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> +module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @kernel_yield_constant(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { 
+ %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> + %cst1 = arith.constant dense<1.000000e+00> : tensor<32x32xf32, #mma> + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked> + %c32_i32 = arith.constant 32 : i32 + %c31_i32 = arith.constant 31 : i32 + %cst_1 = arith.constant dense<2.000000e+00> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %0 = tt.get_program_id x : i32 + %7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %12 = arith.addi %arg4, %c31_i32 : i32 + %13 = arith.divsi %12, %c32_i32 : i32 + %14 = tt.expand_dims %7 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> + %22 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> + %34 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> + %42 = scf.for %arg7 = %c0_i32 to %13 step %c1_i32 iter_args(%arg8 = %cst) -> (tensor<32x32xf32, #mma>) : i32 { + %43 = arith.muli %arg7, %c32_i32 : i32 + %44 = arith.muli %43, %arg5 : i32 + %45 = tt.splat %44 : i32 -> tensor<32x32xi32, #blocked> + %46 = tt.addptr %22, %45 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %47 = arith.subi %arg4, %43 : i32 + %48 = tt.splat %47 : i32 -> tensor<32x1xi32, #blocked> + %49 = arith.cmpi slt, %14, %48 : tensor<32x1xi32, #blocked> + %50 = tt.broadcast %49 : tensor<32x1xi1, #blocked> -> tensor<32x32xi1, #blocked> + %51 = tt.load %46, %50, %cst_0 : tensor<32x32x!tt.ptr, #blocked> + %52 = triton_gpu.convert_layout %51 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %53 = tt.dot %cst_1, %52, %arg8 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> 
tensor<32x32xf32, #mma> + %54 = triton_gpu.convert_layout %53 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> + tt.store %34, %54 : tensor<32x32x!tt.ptr, #blocked> + scf.yield %cst1 : tensor<32x32xf32, #mma> + } + tt.return + } +} + + +// ----- + +// CHECK-LABEL: tt.func public @add_kernel +// CHECK: %[[LOAD_11:.*]] = tt.load %{{.*}}, %{{.*}} +// CHECK: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} +// CHECK: %[[LOAD_13:.*]] = tt.load %[[ADDPTR_12]], %{{.*}} +// CHECK: %[[ADDI_14:.*]] = arith.addi %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_15:.*]] = tt.splat %[[ADDI_14]] +// CHECK: %[[ADDI_16:.*]] = arith.addi %[[SPLAT_15]], %{{.*}} +// CHECK: %[[CMPI_17:.*]] = arith.cmpi slt, %[[ADDI_16]], %{{.*}} +// CHECK: %[[ADDPTR_18:.*]] = tt.addptr %{{.*}}, %[[ADDI_16]] +// CHECK: %[[LOAD_19:.*]] = tt.load %[[ADDPTR_18]], %[[CMPI_17]] +// CHECK: %[[ADDPTR_20:.*]] = tt.addptr %{{.*}}, %[[ADDI_16]] +// CHECK: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_20]], %[[CMPI_17]] +// CHECK: scf.for +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + %c0_i32 = arith.constant 0 : i32 + %c1016800_i32 = arith.constant 1016800 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1016800_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> + %4 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %5 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, 
#blocked> + %6 = tt.splat %arg2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + scf.for %arg4 = %c0_i32 to %c1016800_i32 step %c1024_i32 : i32 { + %7 = arith.addi %1, %arg4 : i32 + %8 = tt.splat %7 : i32 -> tensor<1024xi32, #blocked> + %9 = arith.addi %8, %2 : tensor<1024xi32, #blocked> + %10 = arith.cmpi slt, %9, %3 : tensor<1024xi32, #blocked> + %11 = tt.addptr %4, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %12 = tt.load %11, %10 : tensor<1024x!tt.ptr, #blocked> + %13 = tt.addptr %5, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %14 = tt.load %13, %10 : tensor<1024x!tt.ptr, #blocked> + %15 = arith.addf %12, %14 : tensor<1024xf32, #blocked> + %16 = tt.addptr %6, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + tt.store %16, %15, %10 : tensor<1024x!tt.ptr, #blocked> + } {tt.num_stages = 3 : i32} + tt.return + } +} + + +// ----- + +// CHECK-LABEL: tt.func public @nested_loops +// CHECK: %[[LOAD_10:.*]] = tt.load %{{.*}} +// CHECK: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc %[[LOAD_10]] +// CHECK: %[[TRANS_12:.*]] = tt.trans %[[LOCAL_ALLOC_11]] {order = array} +// CHECK: %[[LOCAL_LOAD_13:.*]] = triton_gpu.local_load %[[TRANS_12]] +// CHECK: %[[LOCAL_ALLOC_14:.*]] = triton_gpu.local_alloc +// CHECK: %[[LOAD_15:.*]] = tt.load %{{.*}}, %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_14]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_16]] +// CHECK: %{{.*}}:3 = scf.for %[[ARG2:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG3:.*]] = %{{.*}}-1_i32, %[[ARG4:.*]] = %{{.*}}, %[[ARG5:.*]] = %[[MEMDESC_SUBVIEW_16]]) + +// CHECK: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG2]], %{{.*}} +// CHECK: %[[ADDI_19:.*]] = arith.addi %[[ARG3]], %{{.*}} +// CHECK: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ADDI_19]], %{{.*}} +// CHECK: %[[SELECT_21:.*]] = arith.select %[[CMPI_20]], %[[ADDI_19]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_22:.*]] = 
triton_gpu.local_load %[[ARG5]] +// CHECK: %[[CONVERT_LAYOUT_23:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_22]] +// CHECK: %[[DOT_24:.*]] = tt.dot %[[CONVERT_LAYOUT_23]], %[[LOCAL_LOAD_13]], %{{.*}} +// CHECK: %[[CONVERT_LAYOUT_25:.*]] = triton_gpu.convert_layout %[[DOT_24]] +// CHECK: tt.store %{{.*}}, %[[CONVERT_LAYOUT_25]] +// CHECK: %[[SPLAT_26:.*]] = tt.splat %[[CMPI_18]] +// CHECK: %[[LOAD_27:.*]] = tt.load %{{.*}}, %[[SPLAT_26]] +// CHECK: %[[ADDI_28:.*]] = arith.addi %[[ARG4]], %{{.*}} +// CHECK: %[[CMPI_29:.*]] = arith.cmpi slt, %[[ADDI_28]], %{{.*}} +// CHECK: %[[SELECT_30:.*]] = arith.select %[[CMPI_29]], %[[ADDI_28]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_31:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_14]][%[[SELECT_30]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_27]], %[[MEMDESC_SUBVIEW_31]] +// CHECK: scf.yield %[[SELECT_21]], %[[SELECT_30]], %[[MEMDESC_SUBVIEW_31]] +// CHECK: } +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_14]] + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [2, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 2], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @nested_loops(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c2_i32 = arith.constant 2 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_0 = arith.constant dense<16> : tensor<16x1xi32, #blocked> + %0 = tt.make_range {end = 16 : i32, 
start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %1 = tt.expand_dims %0 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> + %2 = arith.muli %1, %cst_0 : tensor<16x1xi32, #blocked> + %3 = tt.splat %arg0 : !tt.ptr -> tensor<16x1x!tt.ptr, #blocked> + %4 = tt.addptr %3, %2 : tensor<16x1x!tt.ptr, #blocked>, tensor<16x1xi32, #blocked> + %5 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %6 = tt.expand_dims %5 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> + %7 = tt.broadcast %4 : tensor<16x1x!tt.ptr, #blocked> -> tensor<16x16x!tt.ptr, #blocked> + %8 = tt.broadcast %6 : tensor<1x16xi32, #blocked> -> tensor<16x16xi32, #blocked> + %9 = tt.addptr %7, %8 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> + scf.for %arg1 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { + %10 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> + %11 = triton_gpu.local_alloc %10 : (tensor<16x16xf32, #blocked>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> + %12 = tt.trans %11 {order = array} : !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> + %13 = triton_gpu.local_load %12 : !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + scf.for %arg2 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { + %14 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> + %15 = triton_gpu.convert_layout %14 : tensor<16x16xf32, #blocked> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %16 = tt.dot %15, %13, %cst : tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<16x16xf32, 
#mma> + %17 = triton_gpu.convert_layout %16 : tensor<16x16xf32, #mma> -> tensor<16x16xf32, #blocked> + tt.store %9, %17 : tensor<16x16x!tt.ptr, #blocked> + } + } + tt.return + } +} + +// ----- + +// This test triggered some failure in the verifier, so we only +// included a simple check for the kernel name. +// CHECK-LABEL: @load_convert_layout +#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> +#BLs0 = #triton_gpu.slice<{parent=#BL, dim=0}> +#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> +#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> +#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> +#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> + +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +tt.func @load_convert_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, + %76: index, + %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, + %75: tensor<16x!tt.ptr, #BLs1>, + %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, + %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ + %1 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #BLs1> + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> + %cst_0 = arith.constant dense<2> : tensor<16xi32, #BLs1> + %c4_i32 = arith.constant 4 : i32 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %c1_i32 = arith.constant 1 : i32 + %c1_i32_splat = tt.splat %c1_i32 : i32 -> tensor<16xi32, #BLs1> + %15 = arith.cmpi slt, %1, %cst_0 : 
tensor<16xi32, #BLs1> + %79:3 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1>) { + %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> + %83 = tt.load %arg21, %15 : tensor<16x!tt.ptr, #BLs1> + %84 = tt.expand_dims %83 {axis=1: i32}: tensor<16xi64, #BLs1> -> tensor<16x1xi64, #BL> + %850 = tt.broadcast %84 : tensor<16x1xi64, #BL> -> tensor<16x16xi64, #BL> + %85 = arith.muli %77, %850 : tensor<16x16xi64, #BL> + %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> + %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> + %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> + %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> + %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> + %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> + %92 = tt.addptr %arg21, %c1_i32_splat : tensor<16x!tt.ptr, #BLs1>, tensor<16xi32, #BLs1> + scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1> + } {tt.num_stages = 3 : i32} + tt.return %79#0 : tensor<16x16xf32, #C> +} +} + + +// ----- + +// This test captured some ICE in MatmulLoopPipeline pass, so we only +// included a simple check for the kernel name. 
+// CHECK-LABEL: @matmul_indirect_pipeline +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 2], order = [0, 1]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 1], instrShape = [16, 8]}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @matmul_indirect_pipeline(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c2_i32 = arith.constant 2 : i32 + %c0_i32 = arith.constant 0 : i32 + %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %1 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %2 = tt.expand_dims %1 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> + %3 = tt.expand_dims %0 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %4 = tt.broadcast %2 : tensor<32x1xi32, #blocked> -> tensor<32x32xi32, #blocked> + %5 = tt.broadcast %3 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> + %6 = arith.addi %4, %5 : tensor<32x32xi32, #blocked> + %7 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> + %8 = tt.addptr %7, %6 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %9 = tt.load %8 : tensor<32x32x!tt.ptr, #blocked> + %10 = tt.splat %arg3 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> + %11 = tt.addptr %10, %6 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %12 = tt.splat %arg1 : 
!tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %13 = tt.addptr %12, %0 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %14 = tt.splat %arg2 : !tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + scf.for %arg4 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { + %15 = tt.load %13 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %16 = tt.addptr %14, %15 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi64, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %17 = tt.load %16 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %18 = tt.expand_dims %17 {axis = 0 : i32} : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xf32, #blocked> + %19 = tt.broadcast %18 : tensor<1x32xf32, #blocked> -> tensor<32x32xf32, #blocked> + %20 = arith.addf %9, %19 : tensor<32x32xf32, #blocked> + %21 = triton_gpu.convert_layout %9 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %22 = triton_gpu.convert_layout %20 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %23 = tt.dot %21, %22, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> + %24 = triton_gpu.convert_layout %23 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> + tt.store %11, %24 : tensor<32x32x!tt.ptr, #blocked> + } {tt.num_stages = 3 : i32} + tt.return + } +} + +// ----- + +// CHECK-LABEL: @dont_pipeline_128x1 +// CHECK-NOT: local_load{{.*}}128x1 +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, 
versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @dont_pipeline_128x1(%arg6: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> + %c128_i32 = arith.constant 128 : i32 + %c0_i32 = arith.constant 0 : i32 + %c64_i32 = arith.constant 64 : i32 + %cst_4 = arith.constant dense<-1.000000e+30> : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> + + %99:1 = scf.for %arg25 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg31 = %cst_4) -> (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) : i32 { + %94 = tt.splat %arg6 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> + %151 = tt.load %94 : tensor<128x1x!tt.ptr, #blocked> + %161 = triton_gpu.convert_layout %151 : tensor<128x1xi32, #blocked> -> tensor<128x1xi32, #mma> + %162 = tt.broadcast %161 : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> + %170 = arith.sitofp %162 : tensor<128x64xi32, #mma> to tensor<128x64xf32, #mma> + + %173 = "tt.reduce"(%170) <{axis = 1 : i32}> ({ + ^bb0(%arg33: f32, %arg34: f32): + %207 = arith.maxnumf %arg33, %arg34 : f32 + tt.reduce.return %207 : f32 + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> + %175 = arith.maxnumf %arg31, %173 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> + + %201 = arith.truncf %170 : tensor<128x64xf32, #mma> to tensor<128x64xf16, #mma> + %202 = triton_gpu.convert_layout %201 : tensor<128x64xf16, #mma> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + + %192 = arith.constant dense<0.> : tensor<128x64xf32, #mma> + %203 = arith.constant dense<0.> : tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %204 = tt.dot %202, %203, %192 : 
tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> + + scf.yield %175 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> + } + tt.return + } +} + +// ----- + +// Check that the dependencies across ops of different nesting does not cause crash or +// incorrect schedule that fails to pipeline. +// CHECK-LABEL: @matmul_nested_ops +// CHECK: triton_gpu.local_load + +#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> +#BLs0 = #triton_gpu.slice<{parent=#BL, dim=0}> +#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> +#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> +#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> +#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> + +module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.target" = "cuda:80"} { +tt.func @matmul_nested_ops(%lb : index, %ub : index, %step : index, + %A : !tt.ptr {tt.divisibility = 16 : i32}, + %B : !tt.ptr {tt.divisibility = 16 : i32}, + %ext : index) -> tensor<128x128xf32, #C> { + // A ptrs + %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> + %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> + %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> + %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> + %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + // B ptrs + %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> + %b_tmp0 = tt.make_range {end = 128: i32, start = 0: 
i32} : tensor<128xi32, #BLs0> + %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> + %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> + %b_ptr = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> + + %a_mask = arith.constant dense : tensor<128x32xi1, #AL> + %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> + %b_mask = arith.constant dense : tensor<32x128xi1, #BL> + %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> + %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> + + %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> + + %b_ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> + %b = triton_gpu.convert_layout %b_ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> + + %loop:2 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<128x128xf32, #C>) { + %cnd = arith.cmpi slt, %iv, %ext : index + %inc_a_ptr = scf.if %cnd -> (tensor<128x32x!tt.ptr, #AL>) { + %a_ptr_ = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + scf.yield %a_ptr_ : tensor<128x32x!tt.ptr, #AL> + } else { + scf.yield %a_ptr : tensor<128x32x!tt.ptr, #AL> + } + %a_ = tt.load %inc_a_ptr : tensor<128x32x!tt.ptr, #AL> + %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> + + %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> + + %next_a_ptr = tt.addptr %inc_a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + scf.yield %next_a_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<128x128xf32, #C> + } + tt.return %loop#1: tensor<128x128xf32, #C> +} +} + +// ----- + +// Pipeline the if ops at the beginning and the end of the loop +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], 
warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> +#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + // CHECK-LABEL: dot_prologue_epilogue + // CHECK: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} + tt.func @dot_prologue_epilogue(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + %cst = arith.constant dense<0> : tensor<64x16xi32, #blocked> + %cst2 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %cst_0 = arith.constant dense<0> : tensor<1x16xi32, #blocked> + %cst_1 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> + %c0_i64 = arith.constant 0 : i64 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = arith.constant 8 : i32 + %2 = tt.splat %arg1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %6 = tt.broadcast %2 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, 
#blocked1> + %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> + %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %10 = tt.splat %arg0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> + %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %14 = tt.broadcast %10 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> + %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> + %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + // CHECK: %[[C0:.*]] = arith.constant 0 : i32 + // CHECK: scf.for %[[IND_VAR:.*]] = %[[C0]] + // CHECK-NOT load + // CHECK: %[[CND:.*]] = arith.cmpi slt, %[[IND_VAR]], %[[EXT]] + // CHECK: scf.if %[[CND]] + // CHECK: dot + // CHECK: scf.if %[[CND]] + // CHECK: arith.mulf + // CHECK: scf.yield + // CHECK-NOT: tt.addptr + // CHECK: scf.yield + %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %16, %arg6 = %8) -> (tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>) : i32 { + %9 = tt.load %arg6 : tensor<128x64x!tt.ptr, #blocked1> + %cnd = arith.cmpi slt, %arg3, %ext : i32 + %inc_ptr = scf.if %cnd -> tensor<64x16x!tt.ptr, #blocked> { + %ptr = tt.addptr %arg5, %inc : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + scf.yield %ptr : tensor<64x16x!tt.ptr, #blocked> + } else { + scf.yield %arg5 : tensor<64x16x!tt.ptr, #blocked> + } + %18 = tt.load %inc_ptr : tensor<64x16x!tt.ptr, #blocked> + %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> 
!tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc_ = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { + %acc_zero = arith.mulf %acc, %cst_2 : tensor<128x16xf32, #mma1> + scf.yield %acc_zero : tensor<128x16xf32, #mma1> + } else { + scf.yield %acc : tensor<128x16xf32, #mma1> + } + %22 = tt.addptr %arg5, %cst : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + %23 = tt.addptr %arg6, %cst2 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + scf.yield %acc_, %22, %23 : tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1> + } + tt.return %17#0 : tensor<128x16xf32, #mma1> + } +} + +// ----- + +// Verify that uses of the ops scheduled in partucular place of the loop (like epilogue if) are correctly scheduled too. 
+#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> +#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + // CHECK-LABEL: pipeline_downstream_dependencies + // CHECK: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} + tt.func @pipeline_downstream_dependencies(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + %cst = arith.constant dense<0> : tensor<64x16xi32, #blocked> + %cst1 = arith.constant dense<1> : tensor<64x16xi32, #blocked> + %cst2 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %cst_0 = arith.constant dense<0> : tensor<1x16xi32, #blocked> + %cst_1 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> + %c0_i64 = arith.constant 0 : i64 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = arith.constant 8 : i32 + %2 = tt.splat %arg1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, 
#triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %6 = tt.broadcast %2 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> + %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> + %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %10 = tt.splat %arg0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> + %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %14 = tt.broadcast %10 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> + %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> + %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + // CHECK: %[[C0:.*]] = arith.constant 0 : i32 + // CHECK: scf.for %[[IND_VAR:.*]] = %[[C0]] + // CHECK-NOT load + // CHECK: dot + // CHECK: %[[CND:.*]] = arith.cmpi slt, %[[IND_VAR]], %[[EXT]] + // CHECK: %[[IFRET:.*]]:2 = scf.if %[[CND]] + // CHECK: arith.mulf + // CHECK: scf.yield + // CHECK: tt.addptr {{.*}}, %[[IFRET]]#1 + // CHECK: scf.yield + %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %16, %arg6 = %8) -> (tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>) : i32 { + %9 = tt.load %arg6 : tensor<128x64x!tt.ptr, #blocked1> + %18 = tt.load %arg5 : tensor<64x16x!tt.ptr, #blocked> + %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !tt.memdesc<128x64xf16, #shared, 
#triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %cnd = arith.cmpi slt, %arg3, %ext : i32 + %if_ret:2 = scf.if %cnd -> (tensor<128x16xf32, #mma1>, tensor<64x16xi32, #blocked>) { + %acc_zero = arith.mulf %acc, %cst_2 : tensor<128x16xf32, #mma1> + scf.yield %acc_zero, %cst : tensor<128x16xf32, #mma1>, tensor<64x16xi32, #blocked> + } else { + scf.yield %acc, %cst1 : tensor<128x16xf32, #mma1>, tensor<64x16xi32, #blocked> + } + %22 = tt.addptr %arg5, %if_ret#1 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + %23 = tt.addptr %arg6, %cst2 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + scf.yield %if_ret#0, %22, %23 : tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1> + } + tt.return %17#0 : tensor<128x16xf32, #mma1> + } +} + +// ----- + +// CHECK-LABEL: @masked_add_kernel +// CHECK: %[[CONSTANT:.*]] = arith.constant dense<0xFF800000> +// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// CHECK: scf.for +// CHECK: arith.select +// CHECK: arith.select +// CHECK: arith.addf +// CHECK: %[[A:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// CHECK: %[[B:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] + +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @masked_add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + 
%c0_i32 = arith.constant 0 : i32 + %c1016800_i32 = arith.constant 1016800 : i32 + %cst = arith.constant dense<0xFF800000> : tensor<1024xf32, #blocked> + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1016800_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> + %4 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %5 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %6 = tt.splat %arg2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + scf.for %arg4 = %c0_i32 to %c1016800_i32 step %c1024_i32 : i32 { + %7 = arith.addi %1, %arg4 : i32 + %8 = tt.splat %7 : i32 -> tensor<1024xi32, #blocked> + %9 = arith.addi %8, %2 : tensor<1024xi32, #blocked> + %10 = arith.cmpi slt, %9, %3 : tensor<1024xi32, #blocked> + %11 = tt.addptr %4, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %12 = tt.load %11, %10, %cst : tensor<1024x!tt.ptr, #blocked> + %13 = tt.addptr %5, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %14 = tt.load %13, %10, %cst : tensor<1024x!tt.ptr, #blocked> + %15 = arith.addf %12, %14 : tensor<1024xf32, #blocked> + %16 = tt.addptr %6, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + tt.store %16, %15, %10 : tensor<1024x!tt.ptr, #blocked> + } {tt.num_stages = 3 : i32} + tt.return + } +} diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py index 3227a5535842..640fdf3200ed 100644 --- a/third_party/amd/backend/compiler.py +++ b/third_party/amd/backend/compiler.py @@ -28,7 +28,7 @@ def min_dot_size(target: GPUTarget): class HIPOptions: num_warps: int = 4 waves_per_eu: int = 1 - num_stages: int = 0 + num_stages: int = 2 num_ctas: int = 1 extern_libs: dict = None cluster_dims: tuple = (1, 1, 1) @@ -149,14 +149,13 @@ def make_ttgir(mod, metadata, options): passes.ttgpuir.add_remove_layout_conversions(pm) amd.passes.ttgpuir.add_optimize_epilogue(pm) 
passes.ttgpuir.add_optimize_dot_operands(pm, True) - if options.num_stages == 0 and amd.has_matrix_core_feature(options.arch): - amd.passes.ttgpuir.add_stream_pipeline(pm) + if amd.has_matrix_core_feature(options.arch): + amd.passes.ttgpuir.add_stream_pipeline(pm, options.num_stages) passes.common.add_canonicalizer(pm) passes.ttgpuir.add_optimize_dot_operands(pm, True) passes.ttgpuir.add_remove_layout_conversions(pm) passes.ttgpuir.add_reduce_data_duplication(pm) - if options.num_stages != 0: - amd.passes.ttgpuir.add_reorder_instructions(pm) + amd.passes.ttgpuir.add_reorder_instructions(pm) passes.common.add_cse(pm) passes.common.add_symbol_dce(pm) pm.run(mod) diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.h b/third_party/amd/include/TritonAMDGPUTransforms/Passes.h index e7a9753b2145..914bce6fd644 100644 --- a/third_party/amd/include/TritonAMDGPUTransforms/Passes.h +++ b/third_party/amd/include/TritonAMDGPUTransforms/Passes.h @@ -6,7 +6,7 @@ namespace mlir { -std::unique_ptr createTritonAMDGPUStreamPipelinePass(); +std::unique_ptr createTritonAMDGPUStreamPipelinePass(int numStages = 2); std::unique_ptr createTritonAMDGPUAccelerateMatmulPass(std::string archGenName = std::string(), diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td index a818b1ac9da5..5f61e649bfdf 100644 --- a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td +++ b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td @@ -14,6 +14,12 @@ def TritonAMDGPUStreamPipeline : Pass<"tritonamdgpu-stream-pipeline", "mlir::Mod let constructor = "mlir::createTritonAMDGPUStreamPipelinePass()"; let dependentDialects = []; + + let options = [ + Option<"numStages", "num_stages", + "int32_t", /*default*/"2", + "Number of Pipeline stages"> + ]; } def TritonAMDGPUAccelerateMatmul : Pass<"tritonamdgpu-accelerate-matmul", "mlir::ModuleOp"> { diff --git 
a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp index 8bdf9d11751d..19f8eee829fe 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp @@ -3,860 +3,883 @@ #include "mlir/IR/IRMapping.h" #include "mlir/IR/TypeUtilities.h" #include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Support/LLVM.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "triton/Analysis/AxisInfo.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" +#include "triton/Dialect/TritonGPU/Transforms/Passes.h" +#include "triton/Dialect/TritonGPU/Transforms/PipelineExpander.h" +#include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h" +#include "triton/Dialect/TritonGPU/Transforms/Schedule.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" -#include "llvm/ADT/MapVector.h" +#include "triton/Tools/Sys/GetEnv.hpp" +#include "llvm/Support/Debug.h" + +#include //===----------------------------------------------------------------------===// -// This file implements stream software pipelining for loops. The implementation -// here is inspired by the pipeline pass in Triton and the rocMLIR pipeliner. -// -// We divide the loop body into the following phases: -// a. Pre-load operations: for instance, index computation. -// b. Load operations: loading from global memory to shared memory. -// c. Compute operations: for instance, Triton dot. -// d. Post-load operations: for instance, index computation. -// -// To pipeline the loop, we need to: -// - Find all the dependencies of the load operations. 
-// - Prologue: Hoist the pipelinable load operations and shared memory store -// for the ramp up stage -// - Pipelined Loop: Assemble the loop body minus last iteration -// - Prefetch next tile from global into regs (while computing from previous) -// - Non-load loop body -// - Store next tile into shared mem -// - Epilogue: Peeled non-load loop body for last iteration -// +// This file will create a schedule that will be handed over to the pipeline +// expander. +// Software pipeliners are usually separated into two pieces, one that create a +// modulo schedule and an expander that rewrites the loop and emits a prologue +// and epilogue. This pass first calls a helper that will pre-process the IR +// to create async operations and create a modulo schedule. Then we call the +// expander to generate the prologue and new loop. //===----------------------------------------------------------------------===// -using llvm::MapVector; -using namespace mlir; -namespace ttg = triton::gpu; - #define GEN_PASS_CLASSES #include "TritonAMDGPUTransforms/Passes.h.inc" -namespace { - -class LoopPipeliner { - /// Cache of ForOp and YieldOp related to this pipeliner. - scf::ForOp forOp; - scf::YieldOp yieldOp; - - bool peelLastIter = true; - - /// The new pipelined ForOp. - scf::ForOp pplForOp; - - /// Loads to be pipelined - SetVector validLoads; - /// The value that each load will be mapped to (after layout conversion) - DenseMap convertMapping; - /// load => buffer - DenseMap loadsBuffer; - /// load => buffer type (with shared layout after swizzling) - DenseMap loadsBufferType; - - /// Iterator values - Value nextLoopCond; - - /// Yield values - SmallVector yieldValues; - - /// The number of stages in the pipeline is fixed to '2' for - /// analysis since there will be a current buffer stored in - /// shared mem and a next buffer stored in regs. 
- int numStages = 2; - - /// Arg indicies - size_t depArgsBeginIdx; - DenseMap depArgsIdx; - - /// value (in loop) => value at stage N - DenseMap> valueMapping; - /// loop iter arg => value - DenseMap depArgsMapping; - - /// forOp value => pplForOp value - IRMapping curMapping; - /// forOp value => prefetch value - IRMapping nextMapping; - - /// Dependency ops by program order - SmallVector orderedDeps; - - SetVector currentDeps; - - /// block arguments that loads depend on - SetVector depArgs; - - /// operation => source operand defined stages - DenseMap> immediateOpStages; - - /// operations that loads depend on - SetVector depOps; - - /// Collect values that `v` depends on and are defined inside the loop - void collectValueDep(Value v, int stage, SetVector &deps, - SetVector &args); +#define DEBUG_TYPE "tritonamdgpu-stream-pipeline" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") - /// Collect all op dependencies - void collectDeps(SetVector &ops, - MapVector> &opDeps); +#define int_attr(num) builder.getI64IntegerAttr(num) - void collectDepChain(Operation *op, SetVector &ops); - - /// Check if none of the for-ops has valid uses - LogicalResult checkOpUses(); - - /// Check if ops have dependencies that are not pipelinable - LogicalResult checkOpDeps(); - - void createBufferTypes(); - - void createOrderedDeps(); - - void createCurrentDeps(); - - /// Return the stage at which `v` is defined prior to `stage` - int getValueDefStage(Value v, int stage); - - /// Map `origin` to `newValue` at `stage` - void setValueMapping(Value origin, Value newValue, int stage); - - /// Map `origin` to `newValue` at `stage` according to the association between - /// yieldOp and forOp - void setValueMappingYield(Value origin, Value newValue, int stage); - - /// Map `origin` to `newValue` at the next stage according to the association - /// between yieldOp and forOp - void setValueMappingYield(Value origin, Value newValue); +using 
namespace mlir; +namespace tt = mlir::triton; +namespace ttg = mlir::triton::gpu; - /// Return the value mapped to `origin` at `stage`, if it exists. - Value lookupOrDefault(Value origin, int stage); +// TODO: We can extra some helpers into common utilities once we add more +// schedules. - Value getLoadMask(triton::LoadOp loadOp, Value mappedMask, Value loopCond, - OpBuilder &builder); - /// Collect all args of the new loop - SmallVector collectNewLoopArgs(); +namespace { - /// Clone the forOp and return the new forOp - scf::ForOp cloneForOp(ArrayRef newLoopArgs, OpBuilder &builder); +struct LoadInfo { + // Layout of the data in the shared memory. + ttg::SharedEncodingAttr sharedEncoding = nullptr; + // Blocked encoding is used for loads not used by the dot. + ttg::BlockedEncodingAttr blockedEncoding = nullptr; + int distToUse = 0; + bool usedByDot = false; +}; - void updateLoadMask(triton::LoadOp loadOp, Value newMask); - /// Prefetch the next iteration for `pplForOp` - void prefetchNextBuffer(OpBuilder &builder); - void cloneCurrentBody(OpBuilder &builder); - void storeNextBuffer(OpBuilder &builder); +} // namespace - bool isLoadChain(Operation *op) const; +// Replace the ForOp's yield with a new one with the given operands appended. +static void appendToYield(scf::ForOp forOp, ArrayRef newOperands) { + // Fix up the yield op. 
+ Operation *yieldOp = forOp.getBody()->getTerminator(); + SmallVector operands(yieldOp->getOperands()); + operands.append(newOperands.begin(), newOperands.end()); - /// Assemble `pplForOp`'s yield op - void finalizeYield(OpBuilder &builder); + OpBuilder builder(yieldOp); + builder.create(yieldOp->getLoc(), operands); + yieldOp->erase(); +} -public: - LoopPipeliner(scf::ForOp forOp) : forOp(forOp) { - yieldOp = cast(forOp.getBody()->getTerminator()); +static void createAsyncCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc, + Value insertIdx, Value extractIdx, + tt::CoarseSchedule &schedule, + tt::CoarseSchedule::Cluster prefetchCluster, + llvm::MapVector &loadToInfo, + int numStages) { + OpBuilder builder(forOp); + Value zero = builder.create(forOp.getLoc(), 0, 32); + // Replace the load with insert/extract slice. + builder.setInsertionPoint(loadOp); + Location loc = loadOp.getLoc(); + Value src = loadOp.getPtr(); + Value mask = loadOp.getMask(); + Value other = loadOp.getOther(); + if (!isExpensiveLoadOrStore(loadOp) && loadToInfo[loadOp].blockedEncoding) { + // For inexpensive loads that do not directly feed into dot ops + // we want to use optimal layout for the data. + ttg::BlockedEncodingAttr encoding = loadToInfo[loadOp].blockedEncoding; + auto convertBlockLayout = [&](Value src) { + auto ty = cast(src.getType()); + auto newTy = + RankedTensorType::get(ty.getShape(), ty.getElementType(), encoding); + auto cvt = + builder.create(loadOp->getLoc(), newTy, src); + return cvt.getResult(); + }; + src = convertBlockLayout(src); + if (mask) + mask = convertBlockLayout(mask); + if (other) + other = convertBlockLayout(other); } - /// Collect loads to pipeline. 
Return success if we can pipeline this loop - LogicalResult initialize(); - - /// Emit pipelined loads (before loop body) - void emitPrologue(); - - /// emit pipelined loads (after loop body) - void emitEpilogue(DenseMap &newResults); - - /// create the new ForOp (add new args & insert prefetched ops) - scf::ForOp createNewForOp(); - - friend struct PipelinePass; -}; + tt::MemDescType allocTy = cast(alloc.getType()); + SmallVector copyOffsets(allocTy.getRank(), zero); + copyOffsets[0] = insertIdx; + Attribute sharedMemorySpace = + triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); + tt::MemDescType subviewTy = tt::MemDescType::get( + allocTy.getShape().drop_front(), allocTy.getElementType(), + allocTy.getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); + auto view = + builder.create(loc, subviewTy, alloc, copyOffsets); + Operation *copy = builder.clone(*loadOp); + + auto [stage, cluster] = schedule[loadOp]; + schedule.erase(loadOp); + schedule.insert(copy, stage, cluster); + + // Extract part. + SmallVector loadOffsets(allocTy.getRank(), zero); + loadOffsets[0] = extractIdx; + auto viewLoad = + builder.create(loc, subviewTy, alloc, loadOffsets); + Operation *lds_store = + builder.create(loc, copy->getResult(0), viewLoad); + { + SmallVector allocsToErase; + for (Operation *user : loadOp->getUsers()) { + if (auto alloc = dyn_cast(user)) { + alloc.replaceAllUsesWith(viewLoad.getResult()); + allocsToErase.push_back(alloc); + } + } + for (auto alloc : allocsToErase) { + alloc.erase(); + } -void LoopPipeliner::collectValueDep(Value v, int stage, - SetVector &deps, - SetVector &args) { - // Since we only need to peel the loop numStages-1 times, don't worry - // about depends that are too far away - if (stage < 0) - return; + auto sharedLoad = + builder.create(loc, loadOp.getType(), viewLoad); + auto result = sharedLoad->getResults(); + + // Create a select for non-zero other values as they are not handled by + // AsyncCopyGlobalToLocalOp for now. 
+ Value other = loadOp.getOther(); + if (other && !isZeroConst(other)) { + auto select = builder.create( + loc, loadOp.getType(), mask, sharedLoad.getResult(), other); + result = select->getResults(); + } - // Loop-invariant value, skip - if (v.getParentRegion() != &forOp.getRegion()) - return; + loadOp->replaceAllUsesWith(result); - if (Operation *op = v.getDefiningOp()) { - if (!deps.contains(op)) { - deps.insert(op); - for (Value opr : op->getOperands()) - collectValueDep(opr, stage, deps, args); - } - } else if (auto arg = dyn_cast(v)) { - if (arg.getArgNumber() > 0) { - args.insert(arg); - collectValueDep(yieldOp->getOperand(arg.getArgNumber() - 1), stage - 1, - deps, args); + // Prefetch load if is used by the dot. + if (loadToInfo[loadOp].usedByDot) { + schedule.insert(lds_store, numStages - 2, prefetchCluster); + schedule.insert(viewLoad, numStages - 2, prefetchCluster); } } + loadOp.erase(); } -void LoopPipeliner::collectDeps( - SetVector &ops, - MapVector> &valueDeps) { - for (auto op : ops) { - for (Value v : op->getOperands()) { - SetVector deps; - SetVector args; - collectValueDep(v, numStages - 1, deps, args); - valueDeps[op] = deps; +// If all the transitive uses of the given value have are used by a convert to +// the same dot operand encoding, return true and get the shared encoding that +// needs to be used to be compatible with users' layouts. +static std::optional +getSharedEncIfAllUsersAreDotEnc(Value val) { + ttg::SharedEncodingAttr attr; + for (Operation *user : val.getUsers()) { + ttg::SharedEncodingAttr tempAttr; + if (user->getNumResults() != 1) + return std::nullopt; + if (auto memDesc = + dyn_cast(user->getResult(0).getType())) { + // First time we find a shared encoding in the chain, save it and try to + // use it if it is compatible with the other users. 
+ tempAttr = cast(memDesc.getEncoding()); + if (!getSharedEncIfAllUsersAreDotEnc(user->getResult(0)).has_value()) + return std::nullopt; + } else { + if (!isa(user)) + return std::nullopt; + auto dotOpEnc = dyn_cast( + cast(user->getResult(0).getType()).getEncoding()); + if (!dotOpEnc) + return std::nullopt; + auto srcTy = cast(val.getType()); + auto CTALayout = ttg::getCTALayout(srcTy.getEncoding()); + auto order = ttg::getOrder(srcTy.getEncoding()); + unsigned bitWidth = srcTy.getElementType().getIntOrFloatBitWidth(); + tempAttr = ttg::SharedEncodingAttr::get( + val.getContext(), dotOpEnc, srcTy.getShape(), + ttg::getOrder(srcTy.getEncoding()), + ttg::getCTALayout(srcTy.getEncoding()), + srcTy.getElementType().getIntOrFloatBitWidth(), /*needTrans=*/false); } + // Check that the shared encodings needed by the users are compatible. + if (!tempAttr || (attr != nullptr && attr != tempAttr)) + return std::nullopt; + attr = tempAttr; } + return attr; } -LogicalResult LoopPipeliner::checkOpUses() { - SetVector ops; - // We cannot use forOp.walk(...) here because we only want to visit the - // operations in the loop body block. Nested blocks are handled separately. 
- for (Operation &op : forOp) { - if (auto loadOp = dyn_cast(&op)) - ops.insert(&op); - } +static ttg::BlockedEncodingAttr +getBlockedEncoding(tt::LoadOp loadOp, tt::ModuleAxisInfoAnalysis &axisInfo) { + Value src = loadOp.getPtr(); + auto ty = cast(src.getType()); + auto mod = loadOp->getParentOfType(); + int numWarps = ttg::TritonGPUDialect::getNumWarps(mod); + int threadsPerWarp = ttg::TritonGPUDialect::getThreadsPerWarp(mod); + tt::AxisInfo::DimVectorT contiguity = + axisInfo.getAxisInfo(src)->getContiguity(); + SmallVector order = argSort(contiguity); + unsigned currPerThread = getNumElementsPerThread(loadOp, order, axisInfo); + SmallVector sizePerThread(order.size(), 1); + sizePerThread[order[0]] = currPerThread; + ttg::CTALayoutAttr ctaLayout = ttg::getCTALayout(ty.getEncoding()); + return ttg::BlockedEncodingAttr::get(loadOp->getContext(), ty.getShape(), + sizePerThread, order, numWarps, + threadsPerWarp, ctaLayout); +} - // Collect all ops' dependencies - MapVector> opDeps; - collectDeps(ops, opDeps); - - for (Operation *op : ops) { - auto loadOp = dyn_cast(op); - // Don't pipeline valid loads that depend on other valid loads - // (Because if a valid load depends on another valid load, this load needs - // to wait on the other load in the prologue, which is against the point - // of the pipeline pass) - bool isCandidate = true; - for (Operation *other : ops) - if (isa(other)) - if (opDeps[op].contains(other)) { - isCandidate = false; - break; +// Create a map from load ops to their indirection level and the +// final use of the load op (another load op, or a dot op). +// Indirection level is "0" for the load op directly used by the dot op, +// "1" for the load op used by the load op used by the dot op, and so on. 
+static llvm::SmallVector> +loadOpsToIndirectionLevelAndUse(scf::ForOp forOp) { + llvm::SmallVector> + loadOpToIndLevelAndUse; + DenseSet seen; + + std::function dfs = + [&](Operation *op, int distance, Operation *use) { + if (!seen.insert(op).second) + return; + if (isa(op)) { + // TODO: What if there are multiple uses at different distances? + loadOpToIndLevelAndUse.push_back(std::make_tuple(op, distance, use)); + use = op; + distance++; } - // We only pipeline loads that have one covert_layout (to dot_op) use - // TODO: lift this constraint in the future - if (isCandidate && loadOp.getResult().hasOneUse()) { - isCandidate = false; - Operation *use = *loadOp.getResult().getUsers().begin(); - - // Advance to the first conversion as long as the use resides in shared - // memory and it has a single use itself - while (use) { - if (use->getNumResults() != 1 || !use->getResult(0).hasOneUse()) - break; - auto tensorType = - dyn_cast(use->getResult(0).getType()); - if (!tensorType || - !isa(tensorType.getEncoding())) - break; - use = *use->getResult(0).getUsers().begin(); - } - - // TODO: handle fp_to_fp conversions in between - if (auto convertLayout = llvm::dyn_cast(use)) - if (auto tensorType = - dyn_cast(convertLayout.getResult().getType())) - if (auto dotOpEnc = dyn_cast( - tensorType.getEncoding())) { - isCandidate = true; - convertMapping[loadOp] = convertLayout; + for (Value operand : op->getOperands()) { + Value v = operand; + Operation *defOp = v.getDefiningOp(); + if (defOp && defOp->getBlock() == op->getBlock()) { + dfs(defOp, distance, use); } - } else - isCandidate = false; + } + }; - if (isCandidate) - validLoads.insert(op); + for (Operation &op : forOp.getBody()->without_terminator()) { + if (!op.hasTrait()) + continue; + seen.clear(); + dfs(&op, 0, &op); } - return validLoads.empty() ? 
failure() : success(); -} - -LogicalResult LoopPipeliner::checkOpDeps() { - /// arg => source operand defined stages - DenseMap> immediateArgStages; - SetVector nonImmediateDepArgs; - SetVector nonImmediateOps; - for (Operation *op : validLoads) { - for (Value v : op->getOperands()) { - SetVector deps; - SetVector args; - collectValueDep(v, numStages - 1, deps, args); - int defStage = getValueDefStage(v, numStages - 1); - if (defStage < 0) { - // assert(defStage >= 0 && - // "newLoopArgs has null args without a define op. Consider - // either " "rewrite the loop to reduce cross iteration - // dependencies or " "increase the num_stages value."); - return failure(); - } - bool immediate = args.size() > 0; - for (auto *dep : deps) { - depOps.insert(dep); - if (immediate) - immediateOpStages[dep].insert(defStage); - else - nonImmediateOps.insert(dep); - } - for (auto arg : args) { - depArgs.insert(arg); - if (immediate) - immediateArgStages[arg].insert(defStage); - else - nonImmediateDepArgs.insert(arg); - } + // If the loop has numStages attribute, also consider pipelining other loads + // that are not directly used by dot ops. + if (forOp->hasAttr(tt::kNumStagesAttrName)) { + for (Operation &op : forOp.getBody()->without_terminator()) { + if (!isa(op)) + dfs(&op, 0, &op); } } - // XXX: We could remove the following constraints if we can rematerialize in - // the loop. - // Check if immediateDepArgs and nonImmediateDepArgs are disjoint. - for (auto &[arg, stages] : immediateArgStages) { - assert(stages.size() == 1 && - "Triton doesn't support an argument provides values for " - "immediate operands of loads from multiple stages. Consider " - "removing post load instructions dependency on this argument."); - assert(!(nonImmediateDepArgs.contains(arg) && - stages.contains(numStages - 2)) && - "Loop-carried arguments provide values for both immediate and " - "non-immediate operands of loads. 
Please consider removing " - "pre/post load instructions dependency on this argument."); - } - - // Check if immediateOps and nonImmediateOps are disjoint. - for (auto &[op, stages] : immediateOpStages) { - assert(stages.size() == 1 && - "Triton doesn't support an operation provides values for " - "immediate operands of loads from multiple stages. Consider " - "removing post load instructions dependency on this argument."); - assert(!(nonImmediateOps.contains(op) && stages.contains(numStages - 2)) && - "Operations provide values for both immediate and " - "non-immediate operands of loads. Please consider " - "removing pre/post load instructions dependency on this " - "operation."); - } - return success(); + return loadOpToIndLevelAndUse; } -// helpers -void LoopPipeliner::setValueMapping(Value origin, Value newValue, int stage) { - if (valueMapping.find(origin) == valueMapping.end()) - valueMapping[origin] = SmallVector(numStages); - valueMapping[origin][stage] = newValue; -} +static llvm::MapVector +assignMemoryLayouts(llvm::SmallVector> + &loadOpToIndLevelAndUse, + tt::ModuleAxisInfoAnalysis &axisInfoAnalysis) { + llvm::MapVector loadToInfo; + + for (auto &[op, dist, use] : loadOpToIndLevelAndUse) { + if (loadToInfo.count(op)) + // TODO pawel: err, we'd need to verify that the distance is the same + continue; + LoadInfo loadInfo; + + if (auto loadOp = dyn_cast(op)) { + assert(!isLoadFromTensorPtr(loadOp) && + "Block ptr should have been lowered before this pass."); + auto ptr = loadOp.getPtr(); + unsigned vec = axisInfoAnalysis.getPtrContiguity(ptr); + if (auto mask = loadOp.getMask()) + vec = std::min(vec, axisInfoAnalysis.getMaskAlignment(mask)); + + auto tensorTy = dyn_cast(ptr.getType()); + if (!tensorTy) + continue; + auto ty = + cast(tensorTy.getElementType()).getPointeeType(); + unsigned width = vec * ty.getIntOrFloatBitWidth(); + + // We do not pipeline all loads for the following reasons: + // 1. 
On nvidia GPUs, cp.async's cp-size can only be 4, 8, or 16. + // 2. It's likely that pipling small loads won't offer much performance + // improvement and may even hurt performance by increasing register + // pressure. + LDBG("Load " << *loadOp << " has width " << width); + if (width < 32) + continue; + } -void LoopPipeliner::setValueMappingYield(Value origin, Value newValue, - int stage) { - for (OpOperand &operand : origin.getUses()) { - if (operand.getOwner() == yieldOp) { - auto yieldIdx = operand.getOperandNumber(); - auto value = forOp.getRegionIterArgs()[yieldIdx]; - setValueMapping(value, newValue, stage); + if (use->hasTrait()) { + loadInfo.usedByDot = true; + loadInfo.sharedEncoding = + getSharedEncIfAllUsersAreDotEnc(op->getResult(0)).value_or(nullptr); + } else if (auto loadOp = dyn_cast(use)) { + // The use of this loadOp is another loadOp. If the use is not in the + // loadsToPipeline already, it means that the use is not valid for + // pipelining for some reason. We should skip this loadOp, too. Note that + // we have an assumption that distAndUse.second (i.e. the use of this + // loadOp) has already be processed in a previous loop iteration. This + // assumption is held by how loadOpsToIndirectionLevelAndUse recursively + // collects loadOpToIndLevelAndUse using DFS. + if (loadToInfo.count(loadOp) == 0) { + continue; + } } - } -} -void LoopPipeliner::setValueMappingYield(Value origin, Value newValue) { - for (OpOperand &operand : origin.getUses()) { - if (operand.getOwner() == yieldOp) { - auto yieldIdx = operand.getOperandNumber(); - auto depYieldIdx = depArgsIdx[forOp.getRegionIterArgs()[yieldIdx]]; - auto originArg = forOp.getRegionIterArgs()[yieldIdx]; - nextMapping.map(originArg, newValue); - auto newArg = pplForOp.getRegionIterArgs()[depYieldIdx]; - if (!depArgsMapping.contains(newArg)) - depArgsMapping[newArg] = newValue; + // If we still don't have a shared encoding, try a "generic" shared + // encoding. 
+ if (!loadInfo.sharedEncoding) { + // Also pipeline in-register buffers. + if (auto loadOp = dyn_cast(op)) { + loadInfo.blockedEncoding = getBlockedEncoding(loadOp, axisInfoAnalysis); + } } + + loadToInfo[op] = loadInfo; } -} -Value LoopPipeliner::lookupOrDefault(Value origin, int stage) { - if (valueMapping.find(origin) == valueMapping.end()) - return origin; - return valueMapping[origin][stage]; + return loadToInfo; } -void LoopPipeliner::createBufferTypes() { - for (auto loadCvt : convertMapping) { - auto loadOp = loadCvt.first; - Value cvt = loadCvt.second; - auto dotOpEnc = cast( - cast(cvt.getType()).getEncoding()); - auto ty = cast(loadOp.getType()); - SmallVector bufferShape(ty.getShape().begin(), - ty.getShape().end()); - Type eType = ty.getElementType(); - auto blockedEnc = cast(ty.getEncoding()); - auto CTALayout = ttg::getCTALayout(ty.getEncoding()); - // unsigned bitWidth = dotOpEnc.getMMAv2kWidth() - // ? 32 / dotOpEnc.getMMAv2kWidth() - // : ty.getElementType().getIntOrFloatBitWidth(); - auto sharedEnc = ttg::SharedEncodingAttr::get( - ty.getContext(), dotOpEnc, ty.getShape(), - ttg::getOrder(ty.getEncoding()), CTALayout, eType); - loadsBufferType[loadOp] = triton::MemDescType::get( - bufferShape, eType, sharedEnc, - triton::gpu::SharedMemorySpaceAttr::get(ty.getContext()), - /*mutableMemory=*/true); +static llvm::MapVector +scheduleLoads(scf::ForOp forOp, tt::CoarseSchedule &schedule, + DenseSet &rootUsers, int numStages) { + ModuleOp moduleOp = forOp->getParentOfType(); + tt::ModuleAxisInfoAnalysis axisInfoAnalysis(moduleOp); + + // Get all loads that are (transitively) used by dot ops and their distance + // to the dot op. 
+ llvm::SmallVector> + loadOpToIndLevelAndUse = loadOpsToIndirectionLevelAndUse(forOp); + LLVM_DEBUG({ + LDBG("Found " << loadOpToIndLevelAndUse.size() << " loads to pipeline:"); + for (const auto &[l, i, u] : loadOpToIndLevelAndUse) { + LDBG(" - load: " << *l); + LDBG(" at indirection level: " << i); + LDBG(" used by op: " << *u); + } + }); + if (loadOpToIndLevelAndUse.empty()) + return {}; + + // Check which loads are good for pipelining, and assign them + // memory layouts. + llvm::MapVector loadToInfo = + assignMemoryLayouts(loadOpToIndLevelAndUse, axisInfoAnalysis); + + if (loadToInfo.empty()) + return {}; + + // Calculate the stage distance between applicable loads. + int maxIndirectionLevel = -1; + for (auto [loadOp, dist, use] : loadOpToIndLevelAndUse) { + if (loadToInfo.count(loadOp) == 0) + continue; + maxIndirectionLevel = std::max(maxIndirectionLevel, dist); } -} - -void LoopPipeliner::createOrderedDeps() { - for (Operation &op : forOp.getBody()->without_terminator()) { - if (depOps.contains(&op)) - orderedDeps.push_back(&op); - else if (op.getNumResults() > 0 && validLoads.contains(&op)) - orderedDeps.push_back(&op); + unsigned stagesBetweenLoads = + ceil(numStages - 2, maxIndirectionLevel + 1); + + tt::CoarseSchedule::Cluster rootUsersCluster = schedule.clusters.newAtFront(); + // Put the root uses of the loads in the last stage. 
+ for (auto &[loadOp, dist, use] : loadOpToIndLevelAndUse) { + if (loadToInfo.count(loadOp) == 0) + continue; + // Non-LoadOp(s) are the root uses of all LoadOp(s) and should be + // always present in the opInfo + if (!isa(use)) { + schedule.insert(use, numStages - 1, rootUsersCluster); + rootUsers.insert(use); + } } - assert(depOps.size() + validLoads.size() == orderedDeps.size() && - "depOps contains invalid values"); -} -void LoopPipeliner::collectDepChain(Operation *op, - SetVector &ops) { - if (op->getNumResults() == 1 && validLoads.contains(op)) - return; - if (!ops.contains(op)) { - ops.insert(op); - for (Value opr : op->getOperands()) - if (Operation *oprOp = opr.getDefiningOp()) - collectDepChain(oprOp, ops); + SmallVector loadsClusters; + for (int i = 0; i < maxIndirectionLevel + 1; i++) { + loadsClusters.push_back(schedule.clusters.newAtBack()); } -} - -void LoopPipeliner::createCurrentDeps() { - for (Operation &op : forOp.getBody()->without_terminator()) { - if (!llvm::is_contained(orderedDeps, &op)) - collectDepChain(&op, currentDeps); + // Assign stages to the loads. + for (auto [loadOp, indLevel, _] : loadOpToIndLevelAndUse) { + if (loadToInfo.count(loadOp) == 0) + continue; + int stage = (maxIndirectionLevel - indLevel) * stagesBetweenLoads; + schedule.insert(loadOp, stage, loadsClusters[indLevel]); } -} - -int LoopPipeliner::getValueDefStage(Value v, int stage) { - if (stage < 0) - return -1; - if (auto arg = dyn_cast(v)) { - if (arg.getArgNumber() > 0) - return getValueDefStage(yieldOp->getOperand(arg.getArgNumber() - 1), - stage - 1); - llvm_unreachable("Loop induction variable should not be a dependency"); - } else - return stage; -} - -LogicalResult LoopPipeliner::initialize() { - if (checkOpUses().failed()) - return failure(); - - if (checkOpDeps().failed()) - return failure(); - createBufferTypes(); - - createOrderedDeps(); - - createCurrentDeps(); + // Distance from the load to the use. 
+ for (auto [loadOp, _, use] : loadOpToIndLevelAndUse) { + if (loadToInfo.count(loadOp) == 0) + continue; + loadToInfo[loadOp].distToUse = schedule[use].first - schedule[loadOp].first; + } - return success(); + return loadToInfo; } -Value LoopPipeliner::getLoadMask(triton::LoadOp loadOp, Value mappedMask, - Value loopCond, OpBuilder &builder) { - if (!peelLastIter) { - // add mask for last iteration when not peeled to epilogue - Value mask = loadOp.getMask(); - Type maskType = triton::getI1SameShape(loadOp.getType()); - Value newMask; - if (mask) { - Value cond = loopCond; - if (isa(maskType)) { - cond = - builder.create(mask.getLoc(), maskType, loopCond); - } - newMask = builder.create(mask.getLoc(), mappedMask, cond); - } else { - if (isa(maskType)) { - newMask = builder.create(loopCond.getLoc(), maskType, - loopCond); - } else { - newMask = loopCond; +// Schedule the prologue and epilogue `if` ops in the loop, pushing them as +// close to the loop boundaries as possible. Return the cluster after the +// prologue (or the beginning of the loop if there is no prologue). +static tt::CoarseSchedule::Cluster +schedulePrologueAndEpilogue(scf::ForOp forOp, tt::CoarseSchedule &schedule, + DenseSet &rootUsers, int numStages) { + tt::CoarseSchedule::Cluster afterPrologue = schedule.clusters.begin(); + + // Look for the IfOp that is in the backward slice any of the currently + // scheduled ops and put it at the beginning of the loop. + DenseMap ifsToStage; + // Go stage by stage. 
+ for (int stage = 0; stage < numStages; stage++) { + for (auto [op, stage_, cluster] : schedule.getOpsInOrder(forOp)) { + if (stage_ != stage) + continue; + SetVector backwardSlice; + BackwardSliceOptions opt; + opt.omitBlockArguments = true; + getBackwardSlice((Operation *)op, &backwardSlice, opt); + + for (auto op : backwardSlice) { + if (auto ifOp = dyn_cast(op)) { + ifsToStage.insert({ifOp, stage}); + } } } - return newMask; } - // use original mask when peeling last iteration bc the loop will not do - // extra loads for the tail of the pipeline - return mappedMask; -} + tt::CoarseSchedule::Cluster prologueCluster = schedule.clusters.newAtFront(); + for (auto [ifOp, stage] : ifsToStage) { + schedule.insert(ifOp, stage, prologueCluster); + } -bool LoopPipeliner::isLoadChain(Operation *op) const { - if (auto cvtOp = dyn_cast(op)) { - Value loadVal = cvtOp.getSrc(); - if (auto f2fOp = dyn_cast(op)) - loadVal = f2fOp.getSrc(); - if (validLoads.contains(loadVal.getDefiningOp())) { - if (isa(cvtOp.getType().getEncoding())) - return true; + // Look for the IfOp that is in the forward slice of the root users and put it + // at the end of the loop. 
+ tt::CoarseSchedule::Cluster epilogueCluster = schedule.clusters.newAtBack(); + for (auto rootUser : rootUsers) { + SetVector forwardSlice; + getForwardSlice(rootUser, &forwardSlice); + + int stage = schedule[rootUser].first; + for (auto op : forwardSlice) { + scf::IfOp ifOp = dyn_cast(op); + if (ifOp == nullptr) { + // check if the op is in the body of an if op that's part of the loop + auto parentOp = op->getParentOp(); + if (parentOp != nullptr && + parentOp->getParentOp() == forOp.getOperation()) { + ifOp = dyn_cast(parentOp); + } + } + if (ifOp) { + schedule.insertIfAbsent(ifOp, stage, + epilogueCluster); // after prefetch extracts + } } } - return false; + return afterPrologue; } -void LoopPipeliner::emitPrologue() { - /// forOp block args => forOp operands - /// forOp iterator => lower bound - IRMapping prologueMap; - OpBuilder builder(forOp); - // Get init operands for loop carried values - for (BlockArgument &arg : forOp.getRegionIterArgs()) { - OpOperand &operand = *forOp.getTiedLoopInit(arg); - prologueMap.map(arg, operand.get()); - } - - // Emit prologue - // Map IV to lower bound - prologueMap.map(forOp.getInductionVar(), forOp.getLowerBound()); - - // Emit Iteration 0 loads, etc - for (Operation *op : orderedDeps) { - Operation *newOp = nullptr; - if (validLoads.contains(op)) { - auto loadOp = cast(op); - // Load from global -> regs - auto newLoadOp = cloneWithInferType(builder, op, prologueMap); - Value loadVal = newLoadOp->getResult(0); - // Convert from regs to shared mem - newOp = builder.create( - loadOp.getLoc(), loadsBufferType[loadOp], loadVal); - Value cvtVal = newOp->getResult(0); - prologueMap.map(loadOp->getResult(0), cvtVal); - loadsBuffer[op] = cvtVal; - } else { - newOp = cloneWithInferType(builder, op, prologueMap); +// Add dependencies of anchor ops to the coarse schedule. Schedule them to +// the same stage and ordering cluster as the anchor op. 
+static void scheduleDependencies(scf::ForOp forOp, tt::CoarseSchedule &schedule, + int numStages) { + SmallVector> + opsInOrder = schedule.getOpsInOrder(forOp); + // Schedule dependencies stage by stage. + for (int stage = 0; stage < numStages; stage++) { + for (auto [op, stage_, cluster] : opsInOrder) { + if (stage_ != stage) + continue; + schedule.insertDepsOfOp(op, stage, cluster, false); } - // Capture loop carried results for pipelined for input - for (unsigned idx : llvm::seq(unsigned(0), op->getNumResults())) - setValueMappingYield(op->getResult(idx), newOp->getResult(idx), 1); - } // for (Operation *op : orderedDeps) + } } -void LoopPipeliner::emitEpilogue(DenseMap &newResults) { - if (!peelLastIter) - return; - OpBuilder builder(pplForOp); - builder.setInsertionPointAfter(pplForOp); - - IRMapping epilogueMap; - // Map 'for' iteration args to pipelined-for results - auto args = forOp.getRegionIterArgs(); - for (uint32_t i = 0; i < args.size(); ++i) - epilogueMap.map(args[i], pplForOp.getResult(i)); - for (auto *loadOp : validLoads) - epilogueMap.map(loadOp->getResult(0), loadsBuffer[loadOp]); - - // This is computing the upper bound of the pipelined loop as: - // pplUpperBound = lb+((ub-1-lb)/step)*step - Location loc = forOp.getLoc(); - Value ub = forOp.getUpperBound(); - Value lb = forOp.getLowerBound(); - Value step = forOp.getStep(); - Value one = builder.create(loc, 1, 32); - - // pplRange = ub-1-lb - Value pplRange = builder.create( - loc, builder.create(loc, ub, one), lb); - - // pplIters = (pplrRange/step)*step - Value pplIters = builder.create( - loc, builder.create(loc, pplRange, step), step); - - // pplUpperBound = lb+pplIters - Value pplUpperBound = builder.create(loc, lb, pplIters); - epilogueMap.map(forOp.getInductionVar(), pplUpperBound); - - const auto &yieldOprs = yieldOp.getOperands(); - // Clone the loop body after the new ForOp - // , replace original args with results of the new ForOp. 
- for (Operation &op : forOp.getBody()->without_terminator()) { - if (currentDeps.contains(&op)) { - Operation *newOp = nullptr; - if (isLoadChain(&op)) { - if (auto cvt = dyn_cast(&op)) { - Value mappedValue = epilogueMap.lookup(cvt.getSrc()); - if (isa(mappedValue.getType())) { - auto newCvt = builder.create( - cvt.getLoc(), cvt.getType(), mappedValue); - epilogueMap.map(cvt.getResult(), newCvt); - newOp = newCvt; - } - } - if (!newOp) - newOp = builder.clone(op, epilogueMap); - } else { - newOp = cloneWithInferType(builder, &op, epilogueMap); +// Find dependencies with distance of 1. They will go to the next stage, +// but in the cluster before the current op. +static void scheduleDistanceOneDependencies(scf::ForOp forOp, + tt::CoarseSchedule &schedule, + int numStages) { + auto getNestedOperands = [](Operation *op) -> SmallVector { + SmallVector operands; + op->walk([&](Operation *nestedOp) { + for (Value operand : nestedOp->getOperands()) { + if (operand.getParentBlock()->getParentOp()->isAncestor(nestedOp)) + operands.push_back(operand); } - // substitute for these results for the results of the new for loop - for (const auto &pair : llvm::zip(op.getResults(), newOp->getResults())) { - auto val = std::get<0>(pair); - auto it = llvm::find(yieldOprs, val); - if (it != yieldOprs.end()) { - uint32_t idx = std::distance(yieldOprs.begin(), it); - newResults[forOp->getResult(idx)] = std::get<1>(pair); + }); + return operands; + }; + + // Mapping from the cluster to the cluster before it. + DenseMap + dist1Cluster; + for (auto &op : forOp.getBody()->without_terminator()) { + if (schedule.count(&op) == 0) + continue; + auto [stage, cluster] = schedule[&op]; + // Can't schedule past the last stage. 
+ if (stage == numStages - 1) + continue; + for (Value operand : getNestedOperands(&op)) { + if (auto arg = dyn_cast(operand)) { + if (arg.getArgNumber() > 0 && arg.getOwner() == op.getBlock()) { + auto yieldOp = op.getBlock()->getTerminator(); + Value v = yieldOp->getOperand(arg.getArgNumber() - 1); + Operation *defOp = v.getDefiningOp(); + if (defOp && schedule.count(defOp) == 0) { + if (isa(defOp)) { + // Exception: Schedule loads with a distance of 1 together + // with the current op. + schedule.insertIfAbsent(defOp, stage, cluster); + schedule.insertDepsOfOp(defOp, stage, cluster, true); + } else { + if (dist1Cluster.count(&cluster) == 0) { + dist1Cluster[&cluster] = schedule.clusters.newBefore(cluster); + } + schedule.insertIfAbsent(defOp, stage + 1, dist1Cluster[&cluster]); + schedule.insertDepsOfOp(defOp, stage + 1, dist1Cluster[&cluster], + true); + } + } } } } } } -SmallVector LoopPipeliner::collectNewLoopArgs() { - // Order of new args: - // (original args) - // (shared mem buffers for each load) - // (depArgs at stage numStages - 1) - - // We need this to update operands for yield - // original block arg => new arg's idx - SmallVector newLoopArgs; - for (auto v : forOp.getInitArgs()) { - newLoopArgs.push_back(lookupOrDefault(v, numStages - 1)); /*1*/ +static void +scheduleRemainingToLastStage(scf::ForOp forOp, tt::CoarseSchedule &schedule, + tt::CoarseSchedule::Cluster afterPrologue, + int numStages) { + // Assign the rest of the ops to the last stage. + // Take care of the ordering of the ops - uses cannot be scheduled to the + // cluster before the definition. 
+ DenseMap opToCluster; + for (auto &op : forOp.getBody()->without_terminator()) { + if (schedule.count(&op) == 0) { + opToCluster[&op] = afterPrologue; + } } - - // Loop carried vals - depArgsBeginIdx = newLoopArgs.size(); - for (auto depArg : depArgs) { - depArgsIdx[depArg] = newLoopArgs.size(); - newLoopArgs.push_back(valueMapping[depArg][numStages - 1]); /*1*/ + SmallVector queue; + for (auto [op, stage, cluster] : schedule.getOpsInOrder(forOp)) { + // We really only care about the producers from the last stage. + // Others will be scheduled before these ops anyway. + if (stage == numStages - 1) { + queue.push_back(op); + } } - - return newLoopArgs; -} - -scf::ForOp LoopPipeliner::cloneForOp(ArrayRef newLoopArgs, - OpBuilder &builder) { - auto loc = forOp.getLoc(); - // Peel off the last iteration - auto pplUpperBound = forOp.getUpperBound(); - if (peelLastIter) - pplUpperBound = - builder.create(loc, pplUpperBound, forOp.getStep()); - - // Clone the original ForOp - pplForOp = builder.create( - loc, forOp.getLowerBound(), pplUpperBound, forOp.getStep(), newLoopArgs); - - // Set mapping on body of the new ForOp - builder.setInsertionPointToStart(pplForOp.getBody()); - for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) - curMapping.map(arg.value(), pplForOp.getRegionIterArgs()[arg.index()]); - for (auto *loadOp : validLoads) - curMapping.map(loadOp->getResult(0), loadsBuffer[loadOp]); - curMapping.map(forOp.getInductionVar(), pplForOp.getInductionVar()); - - nextMapping = curMapping; - // Map the dep args of the next iteration to the dep args of the current - auto iterArgs = pplForOp.getRegionIterArgs(); - size_t argIdx = 0; - for (auto depArg : depArgs) { - BlockArgument nextArg = iterArgs[argIdx + depArgsBeginIdx]; - nextMapping.map(depArg, nextArg); - ++argIdx; + while (!queue.empty()) { + Operation *op = queue.pop_back_val(); + for (auto user : op->getUsers()) { + if (opToCluster.count(user)) { + tt::CoarseSchedule::Cluster userCluster = 
opToCluster[user]; + tt::CoarseSchedule::Cluster opCluster = schedule[op].second; + if (*userCluster < *opCluster) { + opToCluster[user] = opCluster; + queue.push_back(user); + } + } + } + } + for (auto [op, cluster] : opToCluster) { + schedule.insert(op, numStages - 1, cluster); } +} - // Compute next IV for pre-loads - Value iv = pplForOp.getInductionVar(); - curMapping.map(forOp.getInductionVar(), iv); - Value nextIV = - builder.create(iv.getLoc(), iv, pplForOp.getStep()); - nextMapping.map(forOp.getInductionVar(), nextIV); - nextLoopCond = - builder.create(nextIV.getLoc(), arith::CmpIPredicate::slt, - nextIV, pplForOp.getUpperBound()); - - return pplForOp; +// Create an allocation that can hold distance number of loadOp shapes. +static Value createAlloc(scf::ForOp &forOp, Operation *loadOp, + ttg::SharedEncodingAttr sharedEnc, unsigned distance) { + OpBuilder builder(forOp); + Attribute sharedMemorySpace = + triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); + auto ty = cast(loadOp->getResultTypes()[0]); + SmallVector bufferShape(ty.getShape().begin(), ty.getShape().end()); + bufferShape.insert(bufferShape.begin(), distance); + Type memdescType = mlir::triton::MemDescType::get( + bufferShape, ty.getElementType(), sharedEnc, sharedMemorySpace, + /*mutableMemory*/ true); + Value alloc = builder.create( + loadOp->getLoc(), memdescType, Value()); + return alloc; } -void LoopPipeliner::updateLoadMask(triton::LoadOp loadOp, Value newMask) { - if (newMask) { - if (loadOp->getNumOperands() > 1) - loadOp->setOperand(1, newMask); - else { - auto mask = loadOp.getMaskMutable(); - mask.assign(newMask); +// Convert load ops into their asyn version and apply multi-buffering based on +// the required number of buffers. +static SmallVector +createAsyncOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule, + llvm::MapVector &loadToInfo, + int numStages) { + // Calculate the number of buffers needed for each load. 
+ // TODO pawel: we could do more fine-grained allocation here and + // allocate only the number of buffers that specific loads need. + // Instead, we allocate the maximum number of buffers needed by any load. + int numBuffers = + llvm::max_element(llvm::make_second_range(loadToInfo), [](auto &lhs, + auto &rhs) { + return lhs.distToUse < rhs.distToUse; + })->distToUse; + + SmallVector> asyncLoads; + SmallVector allocs; + for (auto &[loadOp, info] : loadToInfo) { + // assert(info.sharedEncoding && "LoadOp shared encoding not defined."); + if (info.sharedEncoding) { + Value alloc = createAlloc(forOp, loadOp, info.sharedEncoding, numBuffers); + assert(alloc && "Failed to create alloc for the async load."); + allocs.push_back(alloc); + asyncLoads.emplace_back(loadOp, alloc); } } -} -void LoopPipeliner::prefetchNextBuffer(OpBuilder &builder) { - // Emit prefetch loads of next buffer before compute of current buffer - for (Operation *op : orderedDeps) { - Operation *nextOp = nullptr; - if (validLoads.contains(op)) { - // Update loading mask - auto loadOp = llvm::cast(op); - auto mask = loadOp.getMask(); - // pre-load global -> regs - Value newMask = getLoadMask(loadOp, nextMapping.lookupOrDefault(mask), - nextLoopCond, builder); - if (mask) { - // If mask is defined outside the loop, don't update the map more than - // once - if (!(forOp.isDefinedOutsideOfLoop(mask) && nextMapping.contains(mask))) - nextMapping.map(loadOp.getMask(), newMask); - newMask = nextMapping.lookupOrDefault(mask); - } - auto newOp = builder.clone(*op, nextMapping); - updateLoadMask(cast(newOp), newMask); - } else if (!immediateOpStages[op].contains(numStages - 2)) { - Operation *nextOp = builder.clone(*op, nextMapping); - if (auto loadOp = dyn_cast(op)) { - if (auto newMask = getLoadMask( - loadOp, nextMapping.lookupOrDefault(loadOp.getMask()), - nextLoopCond, builder)) { - updateLoadMask(cast(nextOp), newMask); - } - } + IRRewriter builder(forOp.getContext()); + builder.setInsertionPoint(forOp); 
- for (unsigned dstIdx : llvm::seq(unsigned(0), op->getNumResults())) - nextMapping.map(op->getResult(dstIdx), nextOp->getResult(dstIdx)); - for (unsigned dstIdx : llvm::seq(unsigned(0), op->getNumResults())) - setValueMappingYield(op->getResult(dstIdx), nextOp->getResult(dstIdx)); - } + Location loc = forOp.getLoc(); + // Create two new counters to index into the allocs. + Value minusOne = builder.create(loc, -1, 32); + Value zero = builder.create(loc, 0, 32); + Value one = builder.create(loc, 1, 32); + Value insertIdx = minusOne; + Value extractIdx = minusOne; + Value phase = Value(); + Value numBuffersVal = + builder.create(loc, numBuffers, 32); + SmallVector newOperands; + newOperands.push_back(insertIdx); + newOperands.push_back(extractIdx); + + unsigned newOperandIndex = forOp.getBody()->getNumArguments(); + // Patch the loop to add the new loop carried dependencies. + scf::ForOp newForOp = + replaceForOpWithNewSignature(builder, forOp, newOperands); + forOp.erase(); + forOp = newForOp; + insertIdx = newForOp.getBody()->getArgument(newOperandIndex); + extractIdx = newForOp.getBody()->getArgument(newOperandIndex + 1); + if (phase) { + phase = newForOp.getBody()->getArgument(newOperandIndex + 2); } -} -void LoopPipeliner::cloneCurrentBody(OpBuilder &builder) { - auto loc = forOp.getLoc(); - // only add instructions that are not part of the restructuring - for (Operation &op : forOp.getBody()->without_terminator()) { - if (currentDeps.contains(&op)) { - Operation *newOp = nullptr; - if (isLoadChain(&op)) { - if (auto cvt = dyn_cast(&op)) { - Value mappedValue = curMapping.lookup(cvt.getSrc()); - if (isa(mappedValue.getType())) { - auto newCvt = builder.create( - cvt.getLoc(), cvt.getType(), mappedValue); - curMapping.map(cvt.getResult(), newCvt); - newOp = newCvt; - } - } - if (!newOp) - newOp = builder.clone(op, curMapping); - } else { - newOp = cloneWithInferType(builder, &op, curMapping); - } - } + // Create two counters for the insert and extract indices to 
avoid creating + // long liverange. + builder.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin()); + insertIdx = builder.create(loc, insertIdx, one); + Value cndIns = builder.create(loc, arith::CmpIPredicate::slt, + insertIdx, numBuffersVal); + insertIdx = builder.create(loc, cndIns, insertIdx, zero); + + extractIdx = builder.create(loc, extractIdx, one); + Value cndExt = builder.create(loc, arith::CmpIPredicate::slt, + extractIdx, numBuffersVal); + extractIdx = builder.create(loc, cndExt, extractIdx, zero); + if (phase) { + Value nextPhase = builder.create(loc, phase, one); + phase = builder.create(loc, cndExt, phase, nextPhase); } -} -void LoopPipeliner::storeNextBuffer(OpBuilder &builder) { - // Store the next buffer at the end of the loop body for the next iteration - for (Operation *op : orderedDeps) { - if (!validLoads.contains(op)) { - if (immediateOpStages[op].contains(numStages - 2)) { - Operation *nextOp = builder.clone(*op, nextMapping); - if (auto loadOp = dyn_cast(op)) { - auto newMask = - getLoadMask(loadOp, nextMapping.lookupOrDefault(loadOp.getMask()), - nextLoopCond, builder); - updateLoadMask(cast(nextOp), newMask); - } + // Create a cluster for the prefetches. It may end up being empty, but this + // is OK. + tt::CoarseSchedule::Cluster prefetchCluster = schedule.clusters.newAtBack(); - for (unsigned dstIdx : llvm::seq(unsigned(0), op->getNumResults())) - setValueMappingYield(op->getResult(dstIdx), - nextOp->getResult(dstIdx)); - } + for (auto &pair : asyncLoads) { + if (auto loadOp = dyn_cast(pair.first)) { + createAsyncCopy(forOp, loadOp, pair.second, insertIdx, extractIdx, + schedule, prefetchCluster, loadToInfo, numStages); } } + SmallVector newYieldOperands = {insertIdx, extractIdx}; + if (phase) + newYieldOperands.push_back(phase); + // Patch the yield with the updated counters. 
+ appendToYield(forOp, newYieldOperands); - // PL loads -> store next to shared - for (auto *loadOp : validLoads) { - Value loadVal = nextMapping.lookup(loadOp->getResult(0)); - // then store regs -> shared - Value storeBuf = loadsBuffer[loadOp]; - builder.create(loadOp->getLoc(), loadVal, storeBuf); - } + return allocs; +} - // Some values have not been used by any ops in the loop body - for (BlockArgument arg : forOp.getRegionIterArgs()) - setValueMappingYield(arg, pplForOp.getRegionIterArgs()[depArgsIdx[arg]]); +static bool +preProcessLoopAndGetSchedule2(scf::ForOp &forOp, int numStages, + mlir::triton::PipeliningOption &options) { + // Schedule the loads and root ops (dot ops) in the loop. This will give us + // a scaffold for the final schedule. + DenseSet rootUsers; + tt::CoarseSchedule coarseSchedule(numStages); + llvm::MapVector loadToInfo = + scheduleLoads(forOp, coarseSchedule, rootUsers, numStages); + if (loadToInfo.empty()) + return false; + + LLVM_DEBUG({ + LDBG("Coarse schedule loads only:"); + coarseSchedule.dump(); + }); + + // Convert the loads into async loads and create the allocs. 
+ SmallVector allocs = + createAsyncOps(forOp, coarseSchedule, loadToInfo, numStages); + + LLVM_DEBUG({ + LDBG("Coarse schedule with async loads:"); + coarseSchedule.dump(); + }); + + tt::CoarseSchedule::Cluster afterPrologue = + schedulePrologueAndEpilogue(forOp, coarseSchedule, rootUsers, numStages); + LLVM_DEBUG({ + LDBG("Coarse schedule with prologue and epilogue:"); + coarseSchedule.dump(); + }); + + scheduleDependencies(forOp, coarseSchedule, numStages); + LLVM_DEBUG({ + LDBG("Coarse schedule with dependencies:"); + coarseSchedule.dump(); + }); + + scheduleDistanceOneDependencies(forOp, coarseSchedule, numStages); + LLVM_DEBUG({ + LDBG("Coarse schedule with dist 1:"); + coarseSchedule.dump(); + }); + + scheduleRemainingToLastStage(forOp, coarseSchedule, afterPrologue, numStages); + LLVM_DEBUG({ + LDBG("Final coarse schedule:"); + coarseSchedule.dump(); + }); + + // Create the final schedule for the kernel loop. This will dictate the + // stages and order of operations to the pipeline expander. + std::vector> schedule = + coarseSchedule.createFinalSchedule(forOp); + + // Fill out the pipeline options. 
+ options.getScheduleFn = + [schedule](scf::ForOp forOp, + std::vector> &s) { + s = std::move(schedule); + }; + options.peelEpilogue = false; + options.predicateFn = tt::predicateOp; + options.supportDynamicLoops = true; + options.annotateFn = [](Operation *op, + mlir::triton::PipeliningOption::PipelinerPart part, + unsigned iteration) {}; + // Insert a wait 0 after the loop + OpBuilder builder(forOp); + builder.setInsertionPointAfter(forOp); + // Explicitly deallocate allocated tensors after the wait op + for (auto alloc : allocs) + builder.create(forOp.getLoc(), alloc); + return true; } -void LoopPipeliner::finalizeYield(OpBuilder &builder) { - SmallVector yieldValues; - for (const auto &opr : llvm::enumerate(yieldOp->getOperands())) { - if (curMapping.contains(opr.value())) - yieldValues.push_back(curMapping.lookup(opr.value())); - else - yieldValues.push_back(pplForOp.getRegionIterArgs()[opr.index()]); - } - for (size_t i = 0; i < depArgsMapping.size(); ++i) { - auto arg = pplForOp.getRegionIterArgs()[depArgsBeginIdx + i]; - assert(depArgsMapping.count(arg) && "Missing loop-carried value"); - yieldValues.push_back(depArgsMapping[arg]); - } +// Return true if the preconditions for pipelining the loop are met. +static bool preCondition(scf::ForOp forOp) { + // Skip loop with distance > 1 for now. + // TODO: relax the constraint in the expander. + if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(), + [](Value operand) { + Operation *def = operand.getDefiningOp(); + return !def; + })) + return false; + // Don't pipeline outer loops. 
+ if (forOp + ->walk([&](Operation *op) { + if (forOp.getOperation() == op) + return WalkResult::advance(); + if (isa(op)) + return WalkResult::interrupt(); + return WalkResult::advance(); + }) + .wasInterrupted()) + return false; + return true; +} - builder.setInsertionPointToEnd(pplForOp.getBody()); - builder.create(yieldOp->getLoc(), yieldValues); +static void tryAndPipelineOuterLoop(scf::ForOp forOp) { + mlir::triton::PipeliningOption options; + bool foundSchedule = false; + // Limit 2 stages to not require extra shared memory. + foundSchedule = getOuterLoopSchedule(forOp, /*numStage=*/2, options); + if (!foundSchedule) + return; + IRRewriter rewriter(forOp->getContext()); + rewriter.setInsertionPoint(forOp); + FailureOr newForOp = + mlir::triton::pipelineForLoop(rewriter, forOp, options); } -scf::ForOp LoopPipeliner::createNewForOp() { - OpBuilder builder(forOp); - auto newLoopArgs = collectNewLoopArgs(); - cloneForOp(newLoopArgs, builder); - prefetchNextBuffer(builder); - cloneCurrentBody(builder); - storeNextBuffer(builder); - finalizeYield(builder); - return pplForOp; +static bool pipelineLoop(scf::ForOp forOp, int numStages) { + mlir::triton::PipeliningOption options; + if (!preCondition(forOp)) + return false; + + bool foundSchedule = false; + foundSchedule = preProcessLoopAndGetSchedule2(forOp, numStages, options); + + // TODO: add more pipelines strategy. + if (!foundSchedule) + return false; + + IRRewriter rewriter(forOp->getContext()); + rewriter.setInsertionPoint(forOp); + FailureOr newForOp = + mlir::triton::pipelineForLoop(rewriter, forOp, options); + + if (failed(newForOp)) + return false; + return true; } -// Stream Pipeline +namespace { struct PipelinePass : public TritonAMDGPUStreamPipelineBase { PipelinePass() = default; + PipelinePass(int32_t numStages) { this->numStages = numStages; } + + int getNumStagesOrDefault(scf::ForOp forOp) { + // Use the attribute attached to the loop if it exists otherwise use the + // global control. 
+ if (auto attr = + forOp->getAttrOfType(mlir::triton::kNumStagesAttrName)) + return attr.getInt(); + return numStages; + } void runOnOperation() override { - // Pre-processing - // we make sure element-wise ops are done *after* the conversion - // to dot operands - // we can achieve this with simple recursive pattern matching - // MLIRContext *context = &getContext(); - // mlir::RewritePatternSet patterns(context); - // patterns.add(context); - // auto didPreprocess = - // applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); - - // Do the pipelining - getOperation()->walk([&](scf::ForOp forOp) -> void { - LoopPipeliner pipeliner(forOp); - - if (pipeliner.initialize().failed()) - return; - - pipeliner.emitPrologue(); - scf::ForOp pplForOp = pipeliner.createNewForOp(); - DenseMap newResults; - for (unsigned i = 0; i < forOp->getNumResults(); ++i) - newResults[forOp->getResult(i)] = pplForOp->getResult(i); - pipeliner.emitEpilogue(newResults); - - // Replace the original loop - for (auto &pair : newResults) - std::get<0>(pair).replaceAllUsesWith(std::get<1>(pair)); - forOp->erase(); + SmallVector loops; + getOperation()->walk([&](scf::ForOp forOp) { + // Bail out for loops with num_stage <= 1. + if (getNumStagesOrDefault(forOp) > 1) + loops.push_back(forOp); }); + + if (loops.empty()) + return; + + llvm::SmallSetVector outerLoops; + for (scf::ForOp forOp : loops) { + auto outerLoop = dyn_cast(forOp->getParentOp()); + int loopNumStages = getNumStagesOrDefault(forOp); + bool pipelined = pipelineLoop(forOp, loopNumStages); + if (pipelined && outerLoop && getNumStagesOrDefault(outerLoop) > 1) + outerLoops.insert(outerLoop); + } + + // Clean up arithmetic before applying the next level of pipelining to + // simplify the IR. 
+ auto arithDialect = + getOperation().getContext()->getLoadedDialect(); + RewritePatternSet patterns(getOperation().getContext()); + arithDialect->getCanonicalizationPatterns(patterns); + if (applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)) + .failed()) + return signalPassFailure(); + + // Try to pipeline the outer loop to overlap the prologue and epilogue of + // the inner loop. + for (scf::ForOp outerLoop : outerLoops) + tryAndPipelineOuterLoop(outerLoop); } }; } // anonymous namespace -std::unique_ptr mlir::createTritonAMDGPUStreamPipelinePass() { - return std::make_unique(); +std::unique_ptr +mlir::createTritonAMDGPUStreamPipelinePass(int numStages) { + return std::make_unique(numStages); } diff --git a/third_party/amd/python/triton_amd.cc b/third_party/amd/python/triton_amd.cc index f6606fe8b24d..c91a2992e7b2 100644 --- a/third_party/amd/python/triton_amd.cc +++ b/third_party/amd/python/triton_amd.cc @@ -55,8 +55,8 @@ void init_triton_amd_passes_ttgpuir(py::module &&m) { mlir::createTritonAMDGPUOptimizeEpiloguePass); ADD_PASS_WRAPPER_0("add_reorder_instructions", mlir::createTritonAMDGPUReorderInstructionsPass); - ADD_PASS_WRAPPER_0("add_stream_pipeline", - mlir::createTritonAMDGPUStreamPipelinePass); + ADD_PASS_WRAPPER_1("add_stream_pipeline", + mlir::createTritonAMDGPUStreamPipelinePass, int); } void addControlConstant(llvm::Module *module, const char *name, From f06e622bcd266dabdb6e5f37c8e7a0ed6df16e3a Mon Sep 17 00:00:00 2001 From: SJW Date: Mon, 17 Jun 2024 14:58:22 +0000 Subject: [PATCH 02/36] [AMD-Reorder] Move `tt.load`s as early as possible - Also move independent(from loop-carried buffer) `triton_gpu.local_store` as early as possible --- .../amd/amd-reorder-instructions.mlir | 2314 +++++++++++++++++ .../ReorderInstructions.cpp | 96 +- 2 files changed, 2393 insertions(+), 17 deletions(-) diff --git a/test/TritonGPU/amd/amd-reorder-instructions.mlir b/test/TritonGPU/amd/amd-reorder-instructions.mlir index cb565d1f054d..3b332c8a4148 
100644 --- a/test/TritonGPU/amd/amd-reorder-instructions.mlir +++ b/test/TritonGPU/amd/amd-reorder-instructions.mlir @@ -23,3 +23,2317 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war tt.return } } + +// ----- +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = []}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> +#shared2 = #triton_gpu.shared<{vec = 8, perPhase = 4, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> +#shared3 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> +#shared4 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80"} { + +// CHECK-LABEL: tt.func @matmul_loop +// CHECK: %{{.*}}:7 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}) +// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_20]] +// CHECK: %[[SPLAT_22:.*]] = tt.splat %[[CMPI_21]] +// CHECK: %[[ADDPTR_23:.*]] = tt.addptr %[[ARG6]], %{{.*}} +// CHECK: %[[LOAD_24:.*]] = tt.load %[[ADDPTR_23]], %[[SPLAT_22]] +// CHECK: %[[SPLAT_25:.*]] = tt.splat %[[CMPI_21]] +// CHECK: %[[ADDPTR_26:.*]] = tt.addptr %[[ARG7]], %{{.*}} 
+// CHECK: %[[LOAD_27:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_25]], %{{.*}} +// CHECK: %[[ADDI_28:.*]] = arith.addi %[[ARG9]], %{{.*}} +// CHECK: %[[CMPI_29:.*]] = arith.cmpi slt, %[[ADDI_28]], %{{.*}} +// CHECK: %[[SELECT_30:.*]] = arith.select %[[CMPI_29]], %[[ADDI_28]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %[[ARG11]] +// CHECK: %[[LOCAL_LOAD_32:.*]] = triton_gpu.local_load %[[ARG12]] +// CHECK: %[[MULF_33:.*]] = arith.mulf %[[LOCAL_LOAD_32]], %{{.*}} +// CHECK: %[[DOT_34:.*]] = tt.dot %[[LOCAL_LOAD_31]], %[[MULF_33]], %[[ARG8]] +// CHECK: %[[ADDI_35:.*]] = arith.addi %[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_36:.*]] = arith.cmpi slt, %[[ADDI_35]], %{{.*}} +// CHECK: %[[SELECT_37:.*]] = arith.select %[[CMPI_36]], %[[ADDI_35]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_38:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_37]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_24]], %[[MEMDESC_SUBVIEW_38]] +// CHECK: %[[MEMDESC_SUBVIEW_39:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_37]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_27]], %[[MEMDESC_SUBVIEW_39]] +// CHECK: scf.yield %[[ADDPTR_23]], %[[ADDPTR_26]], %[[DOT_34]], %[[SELECT_30]], %[[SELECT_37]], %[[MEMDESC_SUBVIEW_38]], %[[MEMDESC_SUBVIEW_39]] +// CHECK: } + + tt.func @matmul_loop(%arg0: index, %arg1: index, %arg2: index, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #mma> { + %c1_i32 = arith.constant 1 : i32 + %0 = arith.cmpi slt, %arg0, %arg1 : index + %1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> + %3 = tt.broadcast %2 : tensor<1x128xi32, #blocked> -> tensor<32x128xi32, #blocked> + %4 = tt.splat %arg4 : !tt.ptr -> tensor<32x128x!tt.ptr, #blocked> + %cst = 
arith.constant dense<0.000000e+00> : tensor<32x128xf16, #blocked> + %5 = tt.splat %0 : i1 -> tensor<32x128xi1, #blocked> + %6 = tt.addptr %4, %3 : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> + %7 = tt.load %6, %5, %cst : tensor<32x128x!tt.ptr, #blocked> + %8 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %9 = tt.expand_dims %8 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> + %10 = tt.broadcast %9 : tensor<1x32xi32, #blocked1> -> tensor<128x32xi32, #blocked1> + %11 = tt.splat %arg3 : !tt.ptr -> tensor<128x32x!tt.ptr, #blocked1> + %12 = tt.splat %0 : i1 -> tensor<128x32xi1, #blocked1> + %13 = tt.addptr %11, %10 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi32, #blocked1> + %14 = tt.load %13, %12 : tensor<128x32x!tt.ptr, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %c-1_i32 = arith.constant -1 : i32 + %cst_0 = arith.constant dense<4.000000e+00> : tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %cst_1 = arith.constant dense<4> : tensor<32x128xi32, #blocked> + %cst_2 = arith.constant dense<4> : tensor<128x32xi32, #blocked1> + %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> + %15 = triton_gpu.local_alloc : () -> !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %16 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %17 = triton_gpu.memdesc_subview %15[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %14, %17 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %18 = triton_gpu.memdesc_subview %16[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x128xf16, #shared1, 
#triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %7, %18 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %19:7 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %13, %arg7 = %6, %arg8 = %cst_3, %arg9 = %c-1_i32, %arg10 = %c0_i32, %arg11 = %17, %arg12 = %18) -> (tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>) { + %20 = arith.subi %arg1, %arg2 : index + %21 = arith.cmpi slt, %arg5, %20 : index + %22 = tt.splat %21 : i1 -> tensor<32x128xi1, #blocked> + %23 = tt.addptr %arg7, %cst_1 : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> + %24 = tt.load %23, %22, %cst : tensor<32x128x!tt.ptr, #blocked> + %25 = tt.splat %21 : i1 -> tensor<128x32xi1, #blocked1> + %26 = tt.addptr %arg6, %cst_2 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi32, #blocked1> + %27 = tt.load %26, %25 : tensor<128x32x!tt.ptr, #blocked1> + %28 = arith.addi %arg9, %c1_i32 : i32 + %29 = arith.cmpi slt, %28, %c1_i32 : i32 + %30 = arith.select %29, %28, %c0_i32 : i32 + %31 = triton_gpu.local_load %arg11 : !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %32 = triton_gpu.local_load %arg12 : !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %33 = arith.mulf %32, %cst_0 : tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %34 = tt.dot %31, %33, %arg8 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 
2}>> -> tensor<128x128xf32, #mma> + %35 = arith.addi %arg10, %c1_i32 : i32 + %36 = arith.cmpi slt, %35, %c1_i32 : i32 + %37 = arith.select %36, %35, %c0_i32 : i32 + %38 = triton_gpu.memdesc_subview %15[%37, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %27, %38 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %39 = triton_gpu.memdesc_subview %16[%37, %c0_i32, %c0_i32] : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %24, %39 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + scf.yield %26, %23, %34, %30, %37, %38, %39 : tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + } + triton_gpu.local_dealloc %15 : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %16 : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + tt.return %19#2 : tensor<128x128xf32, #mma> + } + +// CHECK-LABEL: tt.func @matmul_loop_nested +// CHECK: %[[FOR_0:.*]] = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}) + +// CHECK: %[[SPLAT_1:.*]] = tt.splat %{{.*}} +// CHECK: %[[CMPI_2:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// CHECK: %[[MAKE_RANGE_3:.*]] = tt.make_range {end = 32 : i32, start = 0 : i32} +// CHECK: %[[EXPAND_DIMS_4:.*]] = tt.expand_dims %[[MAKE_RANGE_3]] {axis = 0 : i32} +// CHECK: %[[BROADCAST_5:.*]] = tt.broadcast %[[EXPAND_DIMS_4]] +// CHECK: %[[SPLAT_6:.*]] = tt.splat %[[CMPI_2]] +// CHECK: %[[ADDPTR_7:.*]] = 
tt.addptr %[[SPLAT_1]], %[[BROADCAST_5]] +// CHECK: %[[LOAD_8:.*]] = tt.load %[[ADDPTR_7]], %[[SPLAT_6]], %{{.*}} +// CHECK: %[[MAKE_RANGE_9:.*]] = tt.make_range {end = 128 : i32, start = 0 : i32} +// CHECK: %[[EXPAND_DIMS_10:.*]] = tt.expand_dims %[[MAKE_RANGE_9]] {axis = 0 : i32} +// CHECK: %[[BROADCAST_11:.*]] = tt.broadcast %[[EXPAND_DIMS_10]] +// CHECK: %[[SPLAT_12:.*]] = tt.splat %{{.*}} +// CHECK: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_2]] +// CHECK: %[[ADDPTR_14:.*]] = tt.addptr %[[SPLAT_12]], %[[BROADCAST_11]] +// CHECK: %[[LOAD_15:.*]] = tt.load %[[ADDPTR_14]], %[[SPLAT_13]], %{{.*}} +// CHECK: %[[LOCAL_ALLOC_16:.*]] = triton_gpu.local_alloc +// CHECK: %[[LOCAL_ALLOC_17:.*]] = triton_gpu.local_alloc +// CHECK: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_16]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_8]], %[[MEMDESC_SUBVIEW_18]] +// CHECK: %[[MEMDESC_SUBVIEW_19:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_17]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_19]] +// CHECK: %{{.*}}:7 = scf.for %[[ARG7:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG8:.*]] = %[[ADDPTR_7]], %[[ARG9:.*]] = %[[ADDPTR_14]], %[[ARG10:.*]] = %[[ARG6]], %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_19]]) + +// CHECK: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG7]], %[[SUBI_21]] +// CHECK: %[[SPLAT_23:.*]] = tt.splat %[[CMPI_22]] +// CHECK: %[[ADDPTR_24:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[LOAD_25:.*]] = tt.load %[[ADDPTR_24]], %[[SPLAT_23]], %{{.*}} +// CHECK: %[[SPLAT_26:.*]] = tt.splat %[[CMPI_22]] +// CHECK: %[[ADDPTR_27:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// CHECK: %[[LOAD_28:.*]] = tt.load %[[ADDPTR_27]], %[[SPLAT_26]], %{{.*}} +// CHECK: %[[ADDI_29:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_30:.*]] = 
arith.cmpi slt, %[[ADDI_29]], %{{.*}} +// CHECK: %[[SELECT_31:.*]] = arith.select %[[CMPI_30]], %[[ADDI_29]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_32:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[LOCAL_LOAD_33:.*]] = triton_gpu.local_load %[[ARG14]] +// CHECK: %[[DOT_34:.*]] = tt.dot %[[LOCAL_LOAD_32]], %[[LOCAL_LOAD_33]], %[[ARG10]] +// CHECK: %[[ADDI_35:.*]] = arith.addi %[[ARG12]], %{{.*}} +// CHECK: %[[CMPI_36:.*]] = arith.cmpi slt, %[[ADDI_35]], %{{.*}} +// CHECK: %[[SELECT_37:.*]] = arith.select %[[CMPI_36]], %[[ADDI_35]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_38:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_16]][%[[SELECT_37]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_25]], %[[MEMDESC_SUBVIEW_38]] +// CHECK: %[[MEMDESC_SUBVIEW_39:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_17]][%[[SELECT_37]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_28]], %[[MEMDESC_SUBVIEW_39]] +// CHECK: scf.yield %[[ADDPTR_24]], %[[ADDPTR_27]], %[[DOT_34]], %[[SELECT_31]], %[[SELECT_37]], %[[MEMDESC_SUBVIEW_38]], %[[MEMDESC_SUBVIEW_39]] +// CHECK: } +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_16]] +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_17]] +// CHECK: scf.yield %{{.*}}#2 +// CHECK: } + + tt.func @matmul_loop_nested(%arg0: index, %arg1: index, %arg2: index, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #mma> { + %c1_i32 = arith.constant 1 : i32 + %c0_i32 = arith.constant 0 : i32 + %c-1_i32 = arith.constant -1 : i32 + %cst = arith.constant dense<4> : tensor<32x128xi32, #blocked> + %cst_0 = arith.constant dense<4> : tensor<128x32xi32, #blocked1> + %cst_1 = arith.constant dense<0.000000e+00> : tensor<32x128xf16, #blocked> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #blocked1> + %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> + %0 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %cst_3) -> 
(tensor<128x128xf32, #mma>) { + %1 = tt.splat %arg3 : !tt.ptr -> tensor<128x32x!tt.ptr, #blocked1> + %2 = arith.cmpi slt, %arg0, %arg1 : index + %3 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %4 = tt.expand_dims %3 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> + %5 = tt.broadcast %4 : tensor<1x128xi32, #blocked> -> tensor<32x128xi32, #blocked> + %6 = tt.splat %arg4 : !tt.ptr -> tensor<32x128x!tt.ptr, #blocked> + %7 = tt.splat %2 : i1 -> tensor<32x128xi1, #blocked> + %8 = tt.addptr %6, %5 : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> + %9 = tt.load %8, %7, %cst_1 : tensor<32x128x!tt.ptr, #blocked> + %10 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %11 = tt.expand_dims %10 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> + %12 = tt.broadcast %11 : tensor<1x32xi32, #blocked1> -> tensor<128x32xi32, #blocked1> + %13 = tt.splat %2 : i1 -> tensor<128x32xi1, #blocked1> + %14 = tt.addptr %1, %12 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi32, #blocked1> + %15 = tt.load %14, %13, %cst_2 : tensor<128x32x!tt.ptr, #blocked1> + %16 = triton_gpu.local_alloc : () -> !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %17 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %18 = triton_gpu.memdesc_subview %16[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %15, %18 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %19 = triton_gpu.memdesc_subview %17[%c0_i32, %c0_i32, %c0_i32] : 
!tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %9, %19 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %20:7 = scf.for %arg7 = %arg0 to %arg1 step %arg2 iter_args(%arg8 = %14, %arg9 = %8, %arg10 = %arg6, %arg11 = %c-1_i32, %arg12 = %c0_i32, %arg13 = %18, %arg14 = %19) -> (tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>) { + %21 = arith.subi %arg1, %arg2 : index + %22 = arith.cmpi slt, %arg7, %21 : index + %23 = tt.splat %22 : i1 -> tensor<32x128xi1, #blocked> + %24 = tt.addptr %arg9, %cst : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> + %25 = tt.load %24, %23, %cst_1 : tensor<32x128x!tt.ptr, #blocked> + %26 = tt.splat %22 : i1 -> tensor<128x32xi1, #blocked1> + %27 = tt.addptr %arg8, %cst_0 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi32, #blocked1> + %28 = tt.load %27, %26, %cst_2 : tensor<128x32x!tt.ptr, #blocked1> + %29 = arith.addi %arg11, %c1_i32 : i32 + %30 = arith.cmpi slt, %29, %c1_i32 : i32 + %31 = arith.select %30, %29, %c0_i32 : i32 + %32 = triton_gpu.local_load %arg13 : !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %33 = triton_gpu.local_load %arg14 : !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %34 = tt.dot %32, %33, %arg10 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma> + %35 = arith.addi %arg12, %c1_i32 
: i32 + %36 = arith.cmpi slt, %35, %c1_i32 : i32 + %37 = arith.select %36, %35, %c0_i32 : i32 + %38 = triton_gpu.memdesc_subview %16[%37, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %28, %38 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %39 = triton_gpu.memdesc_subview %17[%37, %c0_i32, %c0_i32] : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %25, %39 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + scf.yield %27, %24, %34, %31, %37, %38, %39 : tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + } + triton_gpu.local_dealloc %16 : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %17 : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + scf.yield %20#2 : tensor<128x128xf32, #mma> + } + tt.return %0 : tensor<128x128xf32, #mma> + } + +// CHECK-LABEL: tt.func @matmul_loop_single_pipeline +// CHECK: %{{.*}}:5 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}) + +// CHECK: %[[SUBI_17:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_17]] +// CHECK: %[[SPLAT_19:.*]] = tt.splat %[[CMPI_18]] +// CHECK: %[[ADDPTR_20:.*]] = tt.addptr %[[ARG6]], %{{.*}} +// CHECK: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_20]], %[[SPLAT_19]], %{{.*}} +// CHECK: 
%[[ADDI_22:.*]] = arith.addi %[[ARG8]], %{{.*}} +// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_22]], %{{.*}} +// CHECK: %[[SELECT_24:.*]] = arith.select %[[CMPI_23]], %[[ADDI_22]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_25:.*]] = triton_gpu.local_load %[[ARG10]] +// CHECK: %[[CONVERT_LAYOUT_26:.*]] = triton_gpu.convert_layout %{{.*}} +// CHECK: %[[DOT_27:.*]] = tt.dot %[[CONVERT_LAYOUT_26]], %[[LOCAL_LOAD_25]], %[[ARG7]] +// CHECK: %[[ADDI_28:.*]] = arith.addi %[[ARG9]], %{{.*}} +// CHECK: %[[CMPI_29:.*]] = arith.cmpi slt, %[[ADDI_28]], %{{.*}} +// CHECK: %[[SELECT_30:.*]] = arith.select %[[CMPI_29]], %[[ADDI_28]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_31:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_30]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_21]], %[[MEMDESC_SUBVIEW_31]] +// CHECK: scf.yield %[[ADDPTR_20]], %[[DOT_27]], %[[SELECT_24]], %[[SELECT_30]], %[[MEMDESC_SUBVIEW_31]] +// CHECK: } + + tt.func @matmul_loop_single_pipeline(%arg0: index, %arg1: index, %arg2: index, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #mma> { + %c1_i32 = arith.constant 1 : i32 + %0 = arith.cmpi slt, %arg0, %arg1 : index + %1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> + %3 = tt.broadcast %2 : tensor<1x128xi32, #blocked> -> tensor<32x128xi32, #blocked> + %4 = tt.splat %arg4 : !tt.ptr -> tensor<32x128x!tt.ptr, #blocked> + %cst = arith.constant dense<0.000000e+00> : tensor<32x128xf16, #blocked> + %5 = tt.splat %0 : i1 -> tensor<32x128xi1, #blocked> + %6 = tt.addptr %4, %3 : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> + %7 = tt.load %6, %5, %cst : tensor<32x128x!tt.ptr, #blocked> + %8 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, 
#triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %9 = tt.expand_dims %8 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> + %10 = tt.broadcast %9 : tensor<1x32xi32, #blocked1> -> tensor<128x32xi32, #blocked1> + %11 = tt.splat %arg3 : !tt.ptr -> tensor<128x32x!tt.ptr, #blocked1> + %12 = tt.addptr %11, %10 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi32, #blocked1> + %13 = tt.load %12 : tensor<128x32x!tt.ptr, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %c-1_i32 = arith.constant -1 : i32 + %cst_0 = arith.constant dense<4> : tensor<32x128xi32, #blocked> + %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> + %14 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %15 = triton_gpu.memdesc_subview %14[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %7, %15 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + %16:5 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %6, %arg7 = %cst_1, %arg8 = %c-1_i32, %arg9 = %c0_i32, %arg10 = %15) -> (tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, i32, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>) { + %17 = arith.subi %arg1, %arg2 : index + %18 = arith.cmpi slt, %arg5, %17 : index + %19 = tt.splat %18 : i1 -> tensor<32x128xi1, #blocked> + %20 = tt.addptr %arg6, %cst_0 : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> + %21 = tt.load %20, %19, %cst : tensor<32x128x!tt.ptr, #blocked> + %22 = arith.addi %arg8, %c1_i32 : i32 + %23 = arith.cmpi slt, %22, %c1_i32 : i32 + %24 = arith.select %23, %22, %c0_i32 : i32 + %25 = triton_gpu.local_load %arg10 : !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> 
tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %26 = triton_gpu.convert_layout %13 : tensor<128x32xf16, #blocked1> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %27 = tt.dot %26, %25, %arg7 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma> + %28 = arith.addi %arg9, %c1_i32 : i32 + %29 = arith.cmpi slt, %28, %c1_i32 : i32 + %30 = arith.select %29, %28, %c0_i32 : i32 + %31 = triton_gpu.memdesc_subview %14[%30, %c0_i32, %c0_i32] : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %21, %31 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + scf.yield %20, %27, %24, %30, %31 : tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, i32, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + } + triton_gpu.local_dealloc %14 : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> + tt.return %16#1 : tensor<128x128xf32, #mma> + } + +// CHECK-LABEL: tt.func @indirect_bmm_scalar +// CHECK: %{{.*}}:9 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}, %[[ARG15:.*]] = %{{.*}}) + +// CHECK: %[[SUBI_25:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] +// CHECK: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_26]] +// CHECK: %[[ADDPTR_28:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[LOAD_29:.*]] = tt.load %[[ADDPTR_28]], %[[SPLAT_27]] +// CHECK: %[[ADDPTR_30:.*]] = tt.addptr %[[ARG9]], %{{.*}} 
+// CHECK: %[[LOAD_31:.*]] = tt.load %[[ADDPTR_30]], %[[CMPI_26]] +// CHECK: %[[MULI_32:.*]] = arith.muli %{{.*}}, %[[LOAD_31]] +// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[MULI_32]] +// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_26]] +// CHECK: %[[ADDPTR_35:.*]] = tt.addptr %{{.*}}, %[[SPLAT_33]] +// CHECK: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_35]], %[[SPLAT_34]] +// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_39]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_40]] +// CHECK: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_39]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[ARG15]], %[[MEMDESC_SUBVIEW_41]] +// CHECK: %[[ADDI_42:.*]] = arith.addi %[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_43:.*]] = arith.cmpi slt, %[[ADDI_42]], %{{.*}} +// CHECK: %[[SELECT_44:.*]] = arith.select %[[CMPI_43]], %[[ADDI_42]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_45:.*]] = triton_gpu.local_load %[[ARG12]] +// CHECK: %[[LOCAL_LOAD_46:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[DOT_47:.*]] = tt.dot %[[LOCAL_LOAD_45]], %[[LOCAL_LOAD_46]], %[[ARG7]] +// CHECK: scf.yield %[[DOT_47]], %[[ADDPTR_28]], %[[ADDPTR_30]], %[[SELECT_44]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]], %[[LOAD_29]], %[[LOAD_36]] +// CHECK: } + + tt.func @indirect_bmm_scalar(%arg0: i64 {tt.divisibility = 16 : i32}, %arg1: index, %arg2: tensor<16x16x!tt.ptr, #blocked1> {tt.contiguity = 2 : i32, tt.divisibility = 16 : i32}, %arg3: !tt.ptr, %arg4: tensor<16x16xi32, #blocked1> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg5: tensor<16x16x!tt.ptr, #blocked> {tt.contiguity = 16 : i32, tt.divisibility = 16 : i32}) -> tensor<16x16xf32, #mma> { + %c2 = arith.constant 2 : index + %c1 = 
arith.constant 1 : index + %0 = arith.cmpi sgt, %arg1, %c1 : index + %c1_i32 = arith.constant 1 : i32 + %1 = tt.addptr %arg3, %c1_i32 : !tt.ptr, i32 + %2 = tt.load %1, %0 : !tt.ptr + %3 = arith.muli %arg0, %2 : i64 + %4 = tt.splat %3 : i64 -> tensor<16x16xi64, #blocked> + %5 = tt.splat %0 : i1 -> tensor<16x16xi1, #blocked> + %6 = tt.addptr %arg5, %4 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> + %7 = tt.load %6, %5 : tensor<16x16x!tt.ptr, #blocked> + %8 = tt.splat %0 : i1 -> tensor<16x16xi1, #blocked1> + %9 = tt.addptr %arg2, %arg4 : tensor<16x16x!tt.ptr, #blocked1>, tensor<16x16xi32, #blocked1> + %10 = tt.load %9, %8 : tensor<16x16x!tt.ptr, #blocked1> + %c0 = arith.constant 0 : index + %11 = arith.cmpi sgt, %arg1, %c0 : index + %12 = tt.load %arg3, %11 : !tt.ptr + %13 = arith.muli %arg0, %12 : i64 + %14 = tt.splat %13 : i64 -> tensor<16x16xi64, #blocked> + %15 = tt.splat %11 : i1 -> tensor<16x16xi1, #blocked> + %16 = tt.addptr %arg5, %14 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> + %17 = tt.load %16, %15 : tensor<16x16x!tt.ptr, #blocked> + %18 = tt.splat %11 : i1 -> tensor<16x16xi1, #blocked1> + %19 = tt.load %arg2, %18 : tensor<16x16x!tt.ptr, #blocked1> + %c2_i32 = arith.constant 2 : i32 + %c0_i32 = arith.constant 0 : i32 + %c-1_i32 = arith.constant -1 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> + %20 = triton_gpu.local_alloc : () -> !tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %21 = triton_gpu.local_alloc : () -> !tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %22 = triton_gpu.memdesc_subview %20[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %19, %22 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %23 = triton_gpu.memdesc_subview 
%21[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %17, %23 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %24:9 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args(%arg7 = %cst, %arg8 = %9, %arg9 = %1, %arg10 = %c-1_i32, %arg11 = %c0_i32, %arg12 = %22, %arg13 = %23, %arg14 = %10, %arg15 = %7) -> (tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, !tt.ptr, i32, i32, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16x16xf16, #blocked1>, tensor<16x16xf16, #blocked>) { + %25 = arith.subi %arg1, %c2 : index + %26 = arith.cmpi slt, %arg6, %25 : index + %27 = tt.addptr %arg9, %c1_i32 : !tt.ptr, i32 + %28 = tt.load %27, %26 : !tt.ptr + %29 = arith.muli %arg0, %28 : i64 + %30 = tt.splat %29 : i64 -> tensor<16x16xi64, #blocked> + %31 = tt.splat %26 : i1 -> tensor<16x16xi1, #blocked> + %32 = tt.addptr %arg5, %30 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> + %33 = tt.load %32, %31 : tensor<16x16x!tt.ptr, #blocked> + %34 = tt.splat %26 : i1 -> tensor<16x16xi1, #blocked1> + %35 = tt.addptr %arg8, %arg4 : tensor<16x16x!tt.ptr, #blocked1>, tensor<16x16xi32, #blocked1> + %36 = tt.load %35, %34 : tensor<16x16x!tt.ptr, #blocked1> + %37 = arith.addi %arg11, %c1_i32 : i32 + %38 = arith.cmpi slt, %37, %c2_i32 : i32 + %39 = arith.select %38, %37, %c0_i32 : i32 + %40 = triton_gpu.memdesc_subview %21[%39, %c0_i32, %c0_i32] : !tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %arg15, %40 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %41 = triton_gpu.memdesc_subview %20[%39, %c0_i32, %c0_i32] : 
!tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %arg14, %41 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %42 = arith.addi %arg10, %c1_i32 : i32 + %43 = arith.cmpi slt, %42, %c2_i32 : i32 + %44 = arith.select %43, %42, %c0_i32 : i32 + %45 = triton_gpu.local_load %arg12 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %46 = triton_gpu.local_load %arg13 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %47 = tt.dot %45, %46, %arg7 : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma> + scf.yield %47, %35, %27, %44, %39, %41, %40, %36, %33 : tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, !tt.ptr, i32, i32, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16x16xf16, #blocked1>, tensor<16x16xf16, #blocked> + } + triton_gpu.local_dealloc %20 : !tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %21 : !tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + tt.return %24#0 : tensor<16x16xf32, #mma> + } + +// CHECK-LABEL: tt.func @indirect_bmm_scalar_dist_one +// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) + +// CHECK: %[[SUBI_17:.*]] = arith.subi 
%{{.*}}, %{{.*}} +// CHECK: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_17]] +// CHECK: %[[SPLAT_19:.*]] = tt.splat %[[CMPI_18]] +// CHECK: %[[ADDPTR_20:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_20]], %[[SPLAT_19]] +// CHECK: %[[LOAD_22:.*]] = tt.load %[[ARG9]], %[[CMPI_18]] +// CHECK: %[[MULI_23:.*]] = arith.muli %{{.*}}, %[[ARG10]] +// CHECK: %[[SPLAT_24:.*]] = tt.splat %[[MULI_23]] +// CHECK: %[[SPLAT_25:.*]] = tt.splat %[[CMPI_18]] +// CHECK: %[[ADDPTR_26:.*]] = tt.addptr %{{.*}}, %[[SPLAT_24]] +// CHECK: %[[LOAD_27:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_25]] +// CHECK: %[[ADDI_28:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_29:.*]] = arith.cmpi slt, %[[ADDI_28]], %{{.*}} +// CHECK: %[[SELECT_30:.*]] = arith.select %[[CMPI_29]], %[[ADDI_28]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[LOCAL_LOAD_32:.*]] = triton_gpu.local_load %[[ARG14]] +// CHECK: %[[DOT_33:.*]] = tt.dot %[[LOCAL_LOAD_31]], %[[LOCAL_LOAD_32]], %[[ARG7]] +// CHECK: %[[ADDPTR_34:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// CHECK: %[[ADDI_35:.*]] = arith.addi %[[ARG12]], %{{.*}} +// CHECK: %[[CMPI_36:.*]] = arith.cmpi slt, %[[ADDI_35]], %{{.*}} +// CHECK: %[[SELECT_37:.*]] = arith.select %[[CMPI_36]], %[[ADDI_35]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_38:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_37]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_21]], %[[MEMDESC_SUBVIEW_38]] +// CHECK: %[[MEMDESC_SUBVIEW_39:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_37]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_27]], %[[MEMDESC_SUBVIEW_39]] +// CHECK: scf.yield %[[DOT_33]], %[[ADDPTR_20]], %[[ADDPTR_34]], %[[LOAD_22]], %[[SELECT_30]], %[[SELECT_37]], %[[MEMDESC_SUBVIEW_38]], %[[MEMDESC_SUBVIEW_39]] +// CHECK: } + + tt.func @indirect_bmm_scalar_dist_one(%arg0: i64 {tt.divisibility = 16 : i32}, %arg1: index, %arg2: tensor<16x16x!tt.ptr, #blocked1> 
{tt.contiguity = 2 : i32, tt.divisibility = 16 : i32}, %arg3: !tt.ptr, %arg4: tensor<16x16xi32, #blocked1> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg5: tensor<16x16x!tt.ptr, #blocked> {tt.contiguity = 16 : i32, tt.divisibility = 16 : i32}) -> tensor<16x16xf32, #mma> { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %0 = arith.cmpi sgt, %arg1, %c0 : index + %1 = tt.load %arg3 : !tt.ptr + %2 = arith.muli %arg0, %1 : i64 + %3 = tt.splat %2 : i64 -> tensor<16x16xi64, #blocked> + %4 = tt.splat %0 : i1 -> tensor<16x16xi1, #blocked> + %5 = tt.addptr %arg5, %3 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> + %6 = tt.load %5, %4 : tensor<16x16x!tt.ptr, #blocked> + %c1_i32 = arith.constant 1 : i32 + %7 = tt.addptr %arg3, %c1_i32 : !tt.ptr, i32 + %8 = tt.load %7, %0 : !tt.ptr + %9 = tt.splat %0 : i1 -> tensor<16x16xi1, #blocked1> + %10 = tt.load %arg2, %9 : tensor<16x16x!tt.ptr, #blocked1> + %c-1_i32 = arith.constant -1 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> + %c1 = arith.constant 1 : index + %11 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %12 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %13 = tt.addptr %7, %c1_i32 : !tt.ptr, i32 + %14 = triton_gpu.memdesc_subview %11[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %10, %14 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %15 = triton_gpu.memdesc_subview %12[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %6, %15 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, 
#shared2, #triton_gpu.shared_memory, mutable> + %16:8 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args(%arg7 = %cst, %arg8 = %arg2, %arg9 = %13, %arg10 = %8, %arg11 = %c-1_i32, %arg12 = %c0_i32, %arg13 = %14, %arg14 = %15) -> (tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, !tt.ptr, i64, i32, i32, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>) { + %17 = arith.subi %arg1, %c1 : index + %18 = arith.cmpi slt, %arg6, %17 : index + %19 = arith.muli %arg0, %arg10 : i64 + %20 = tt.splat %19 : i64 -> tensor<16x16xi64, #blocked> + %21 = tt.splat %18 : i1 -> tensor<16x16xi1, #blocked> + %22 = tt.addptr %arg5, %20 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> + %23 = tt.load %22, %21 : tensor<16x16x!tt.ptr, #blocked> + %24 = tt.load %arg9, %18 : !tt.ptr + %25 = tt.splat %18 : i1 -> tensor<16x16xi1, #blocked1> + %26 = tt.addptr %arg8, %arg4 : tensor<16x16x!tt.ptr, #blocked1>, tensor<16x16xi32, #blocked1> + %27 = tt.load %26, %25 : tensor<16x16x!tt.ptr, #blocked1> + %28 = arith.addi %arg11, %c1_i32 : i32 + %29 = arith.cmpi slt, %28, %c1_i32 : i32 + %30 = arith.select %29, %28, %c0_i32 : i32 + %31 = triton_gpu.local_load %arg13 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %32 = triton_gpu.local_load %arg14 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %33 = tt.dot %31, %32, %arg7 : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma> + %34 = tt.addptr %arg9, %c1_i32 : !tt.ptr, i32 + %35 = arith.addi %arg12, %c1_i32 : i32 + %36 = arith.cmpi slt, %35, %c1_i32 : i32 + %37 = arith.select %36, %35, %c0_i32 : i32 + %38 = 
triton_gpu.memdesc_subview %11[%37, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %27, %38 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %39 = triton_gpu.memdesc_subview %12[%37, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %23, %39 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + scf.yield %33, %26, %34, %24, %30, %37, %38, %39 : tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, !tt.ptr, i64, i32, i32, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + } + triton_gpu.local_dealloc %11 : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %12 : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + tt.return %16#0 : tensor<16x16xf32, #mma> + } + +// CHECK-LABEL: tt.func @indirect_bmm_vector +// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) + +// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_21]] +// CHECK: %[[SPLAT_23:.*]] = tt.splat %[[CMPI_22]] +// CHECK: %[[ADDPTR_24:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[LOAD_25:.*]] = tt.load %[[ADDPTR_24]], %[[SPLAT_23]] +// CHECK: %[[EXPAND_DIMS_26:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} +// 
CHECK: %[[BROADCAST_27:.*]] = tt.broadcast %[[EXPAND_DIMS_26]] +// CHECK: %[[MULI_28:.*]] = arith.muli %{{.*}}, %[[BROADCAST_27]] +// CHECK: %[[SPLAT_29:.*]] = tt.splat %[[CMPI_22]] +// CHECK: %[[ADDPTR_30:.*]] = tt.addptr %{{.*}}, %[[MULI_28]] +// CHECK: %[[LOAD_31:.*]] = tt.load %[[ADDPTR_30]], %[[SPLAT_29]] +// CHECK: %[[CMPI_32:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] +// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_32]] +// CHECK: %[[ADDPTR_34:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_34]], %[[SPLAT_33]] +// CHECK: %[[ADDI_36:.*]] = arith.addi %[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_37:.*]] = arith.cmpi slt, %[[ADDI_36]], %{{.*}} +// CHECK: %[[SELECT_38:.*]] = arith.select %[[CMPI_37]], %[[ADDI_36]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_39:.*]] = triton_gpu.local_load %[[ARG12]] +// CHECK: %[[LOCAL_LOAD_40:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[DOT_41:.*]] = tt.dot %[[LOCAL_LOAD_39]], %[[LOCAL_LOAD_40]], %[[ARG7]] +// CHECK: %[[ADDI_42:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_43:.*]] = arith.cmpi slt, %[[ADDI_42]], %{{.*}} +// CHECK: %[[SELECT_44:.*]] = arith.select %[[CMPI_43]], %[[ADDI_42]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_45:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_44]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_25]], %[[MEMDESC_SUBVIEW_45]] +// CHECK: %[[MEMDESC_SUBVIEW_46:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_44]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_31]], %[[MEMDESC_SUBVIEW_46]] +// CHECK: scf.yield %[[DOT_41]], %[[ADDPTR_24]], %[[ADDPTR_34]], %[[SELECT_38]], %[[SELECT_44]], %[[MEMDESC_SUBVIEW_45]], %[[MEMDESC_SUBVIEW_46]], %[[LOAD_35]] +// CHECK: } + + tt.func @indirect_bmm_vector(%arg0: tensor<16x16xi64, #blocked> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg1: index, %arg2: tensor<16x16x!tt.ptr, #blocked1> {tt.contiguity = 2 : i32, tt.divisibility = 16 : i32}, %arg3: tensor<16x!tt.ptr, 
#triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg4: tensor<16x16xi32, #blocked1> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg5: tensor<16x16x!tt.ptr, #blocked> {tt.contiguity = 16 : i32, tt.divisibility = 16 : i32}) -> tensor<16x16xf32, #mma> { + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %0 = arith.cmpi sgt, %arg1, %c1 : index + %cst = arith.constant dense<1> : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %1 = tt.splat %0 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %2 = tt.addptr %arg3, %cst : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.load %2, %1 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %c0 = arith.constant 0 : index + %4 = arith.cmpi sgt, %arg1, %c0 : index + %5 = tt.splat %4 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %6 = tt.load %arg3, %5 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi64, #blocked> + %8 = tt.broadcast %7 : tensor<16x1xi64, #blocked> -> tensor<16x16xi64, #blocked> + %9 = arith.muli %arg0, %8 : tensor<16x16xi64, #blocked> + %10 = tt.splat %4 : i1 -> tensor<16x16xi1, #blocked> + %11 = tt.addptr %arg5, %9 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> + %12 = tt.load %11, %10 : tensor<16x16x!tt.ptr, #blocked> + %13 = tt.splat %4 : i1 -> tensor<16x16xi1, #blocked1> + %14 = tt.load %arg2, %13 : tensor<16x16x!tt.ptr, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %c-1_i32 = arith.constant -1 : i32 + %cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %15 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %16 = triton_gpu.local_alloc 
: () -> !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %17 = triton_gpu.memdesc_subview %15[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %14, %17 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %18 = triton_gpu.memdesc_subview %16[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %12, %18 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %19:8 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args(%arg7 = %cst_0, %arg8 = %arg2, %arg9 = %2, %arg10 = %c-1_i32, %arg11 = %c0_i32, %arg12 = %17, %arg13 = %18, %arg14 = %3) -> (tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, i32, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) { + %20 = arith.subi %arg1, %c2 : index + %21 = arith.cmpi slt, %arg6, %20 : index + %22 = tt.splat %21 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %23 = tt.addptr %arg9, %cst : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %24 = tt.load %23, %22 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %25 = arith.subi %arg1, %c1 : index + %26 = arith.cmpi slt, %arg6, %25 : index + %27 = tt.expand_dims %arg14 {axis = 1 : i32} : tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi64, #blocked> + %28 = tt.broadcast %27 : 
tensor<16x1xi64, #blocked> -> tensor<16x16xi64, #blocked> + %29 = arith.muli %arg0, %28 : tensor<16x16xi64, #blocked> + %30 = tt.splat %26 : i1 -> tensor<16x16xi1, #blocked> + %31 = tt.addptr %arg5, %29 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> + %32 = tt.load %31, %30 : tensor<16x16x!tt.ptr, #blocked> + %33 = tt.splat %26 : i1 -> tensor<16x16xi1, #blocked1> + %34 = tt.addptr %arg8, %arg4 : tensor<16x16x!tt.ptr, #blocked1>, tensor<16x16xi32, #blocked1> + %35 = tt.load %34, %33 : tensor<16x16x!tt.ptr, #blocked1> + %36 = arith.addi %arg10, %c1_i32 : i32 + %37 = arith.cmpi slt, %36, %c1_i32 : i32 + %38 = arith.select %37, %36, %c0_i32 : i32 + %39 = triton_gpu.local_load %arg12 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %40 = triton_gpu.local_load %arg13 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %41 = tt.dot %39, %40, %arg7 : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma> + %42 = arith.addi %arg11, %c1_i32 : i32 + %43 = arith.cmpi slt, %42, %c1_i32 : i32 + %44 = arith.select %43, %42, %c0_i32 : i32 + %45 = triton_gpu.memdesc_subview %15[%44, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %35, %45 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + %46 = triton_gpu.memdesc_subview %16[%44, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %32, %46 : tensor<16x16xf16, 
#blocked> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + scf.yield %41, %34, %23, %38, %44, %45, %46, %24 : tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, i32, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + } + triton_gpu.local_dealloc %15 : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %16 : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> + tt.return %19#0 : tensor<16x16xf32, #mma> + } + +// CHECK-LABEL: tt.func @post_load_inv +// CHECK: %{{.*}}:5 = scf.for %[[ARG9:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) + +// CHECK: %[[CMPI_19:.*]] = arith.cmpi slt, %[[ARG9]], %{{.*}} +// CHECK: %[[ADDI_20:.*]] = arith.addi %[[ARG9]], %{{.*}} +// CHECK: %[[INDEX_CAST_21:.*]] = arith.index_cast %[[ADDI_20]] +// CHECK: %[[MULI_22:.*]] = arith.muli %[[INDEX_CAST_21]], %{{.*}} +// CHECK: %[[SUBI_23:.*]] = arith.subi %{{.*}}, %[[MULI_22]] +// CHECK: %[[SPLAT_24:.*]] = tt.splat %[[SUBI_23]] +// CHECK: %[[CMPI_25:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_24]] +// CHECK: %[[BROADCAST_26:.*]] = tt.broadcast %[[CMPI_25]] +// CHECK: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_19]] +// CHECK: %[[INDEX_CAST_28:.*]] = arith.index_cast %[[ARG9]] +// CHECK: %[[ADDI_29:.*]] = arith.addi %[[INDEX_CAST_28]], %{{.*}} +// CHECK: %[[MULI_30:.*]] = arith.muli %[[ADDI_29]], %{{.*}} +// CHECK: %[[SPLAT_31:.*]] = tt.splat %[[MULI_30]] +// CHECK: %[[ANDI_32:.*]] = arith.andi %[[SPLAT_27]], %[[BROADCAST_26]] +// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %{{.*}}, %[[SPLAT_31]] +// CHECK: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_33]], %[[ANDI_32]], 
%{{.*}} +// CHECK: %[[SPLAT_35:.*]] = tt.splat %[[SUBI_23]] +// CHECK: %[[CMPI_36:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_35]] +// CHECK: %[[BROADCAST_37:.*]] = tt.broadcast %[[CMPI_36]] +// CHECK: %[[SPLAT_38:.*]] = tt.splat %[[CMPI_19]] +// CHECK: %[[MULI_39:.*]] = arith.muli %[[MULI_30]], %{{.*}} +// CHECK: %[[SPLAT_40:.*]] = tt.splat %[[MULI_39]] +// CHECK: %[[ANDI_41:.*]] = arith.andi %[[SPLAT_38]], %[[BROADCAST_37]] +// CHECK: %[[ADDPTR_42:.*]] = tt.addptr %{{.*}}, %[[SPLAT_40]] +// CHECK: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_42]], %[[ANDI_41]], %{{.*}} +// CHECK: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} +// CHECK: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_47:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[LOCAL_LOAD_48:.*]] = triton_gpu.local_load %[[ARG14]] +// CHECK: %[[DOT_49:.*]] = tt.dot %[[LOCAL_LOAD_47]], %[[LOCAL_LOAD_48]], %[[ARG10]] +// CHECK: %[[ADDI_50:.*]] = arith.addi %[[ARG12]], %{{.*}} +// CHECK: %[[CMPI_51:.*]] = arith.cmpi slt, %[[ADDI_50]], %{{.*}} +// CHECK: %[[SELECT_52:.*]] = arith.select %[[CMPI_51]], %[[ADDI_50]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_53:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_52]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_53]] +// CHECK: %[[MEMDESC_SUBVIEW_54:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_52]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_43]], %[[MEMDESC_SUBVIEW_54]] +// CHECK: scf.yield %[[DOT_49]], %[[SELECT_46]], %[[SELECT_52]], %[[MEMDESC_SUBVIEW_53]], %[[MEMDESC_SUBVIEW_54]] +// CHECK: } + + tt.func @post_load_inv(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 
{tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) -> tensor<32x32xf32, #mma> { + %c899 = arith.constant 899 : index + %0 = tt.splat %arg5 : i32 -> tensor<32x1xi32, #blocked1> + %1 = tt.splat %arg4 : i32 -> tensor<32x1xi32, #blocked1> + %2 = arith.cmpi slt, %1, %0 : tensor<32x1xi32, #blocked1> + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked1> + %3 = tt.broadcast %2 : tensor<32x1xi1, #blocked1> -> tensor<32x32xi1, #blocked1> + %4 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> + %5 = tt.load %4, %3, %cst : tensor<32x32x!tt.ptr, #blocked1> + %6 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked1> + %7 = tt.splat %arg3 : i32 -> tensor<1x32xi32, #blocked1> + %8 = arith.cmpi slt, %7, %6 : tensor<1x32xi32, #blocked1> + %9 = tt.broadcast %8 : tensor<1x32xi1, #blocked1> -> tensor<32x32xi1, #blocked1> + %10 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> + %11 = tt.load %10, %9, %cst : tensor<32x32x!tt.ptr, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %c-1_i32 = arith.constant -1 : i32 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c1_i32 = arith.constant 1 : i32 + %c32_i32 = arith.constant 32 : i32 + %c900 = arith.constant 900 : index + %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> + %12 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> + %13 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> + %14 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> + %15 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> + %16 = triton_gpu.memdesc_subview %14[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %11, %16 : tensor<32x32xf32, #blocked1> -> 
!tt.memdesc<32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> + %17 = triton_gpu.memdesc_subview %15[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %5, %17 : tensor<32x32xf32, #blocked1> -> !tt.memdesc<32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> + %18:5 = scf.for %arg9 = %c0 to %c900 step %c1 iter_args(%arg10 = %cst_0, %arg11 = %c-1_i32, %arg12 = %c0_i32, %arg13 = %16, %arg14 = %17) -> (tensor<32x32xf32, #mma>, i32, i32, !tt.memdesc<32x32xf32, #shared3, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x32xf32, #shared4, #triton_gpu.shared_memory, mutable>) { + %19 = arith.cmpi slt, %arg9, %c899 : index + %20 = arith.addi %arg9, %c1 : index + %21 = arith.index_cast %20 : index to i32 + %22 = arith.muli %21, %c32_i32 : i32 + %23 = arith.subi %arg5, %22 : i32 + %24 = tt.splat %23 : i32 -> tensor<32x1xi32, #blocked1> + %25 = arith.cmpi slt, %1, %24 : tensor<32x1xi32, #blocked1> + %26 = tt.broadcast %25 : tensor<32x1xi1, #blocked1> -> tensor<32x32xi1, #blocked1> + %27 = tt.splat %19 : i1 -> tensor<32x32xi1, #blocked1> + %28 = arith.index_cast %arg9 : index to i32 + %29 = arith.addi %28, %c1_i32 : i32 + %30 = arith.muli %29, %c32_i32 : i32 + %31 = arith.muli %30, %arg7 : i32 + %32 = tt.splat %31 : i32 -> tensor<32x32xi32, #blocked1> + %33 = arith.andi %27, %26 : tensor<32x32xi1, #blocked1> + %34 = tt.addptr %13, %32 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi32, #blocked1> + %35 = tt.load %34, %33, %cst : tensor<32x32x!tt.ptr, #blocked1> + %36 = tt.splat %23 : i32 -> tensor<1x32xi32, #blocked1> + %37 = arith.cmpi slt, %7, %36 : tensor<1x32xi32, #blocked1> + %38 = tt.broadcast %37 : tensor<1x32xi1, #blocked1> -> tensor<32x32xi1, #blocked1> + %39 = tt.splat %19 : i1 -> tensor<32x32xi1, #blocked1> + %40 = tt.splat %30 : i32 -> tensor<32x32xi32, #blocked1> + %41 = arith.andi %39, %38 : 
tensor<32x32xi1, #blocked1> + %42 = tt.addptr %12, %40 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi32, #blocked1> + %43 = tt.load %42, %41, %cst : tensor<32x32x!tt.ptr, #blocked1> + %44 = arith.addi %arg11, %c1_i32 : i32 + %45 = arith.cmpi slt, %44, %c1_i32 : i32 + %46 = arith.select %45, %44, %c0_i32 : i32 + %47 = triton_gpu.local_load %arg13 : !tt.memdesc<32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %48 = triton_gpu.local_load %arg14 : !tt.memdesc<32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %49 = tt.dot %47, %48, %arg10 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> + %50 = arith.addi %arg12, %c1_i32 : i32 + %51 = arith.cmpi slt, %50, %c1_i32 : i32 + %52 = arith.select %51, %50, %c0_i32 : i32 + %53 = triton_gpu.memdesc_subview %14[%52, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %43, %53 : tensor<32x32xf32, #blocked1> -> !tt.memdesc<32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> + %54 = triton_gpu.memdesc_subview %15[%52, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %35, %54 : tensor<32x32xf32, #blocked1> -> !tt.memdesc<32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> + scf.yield %49, %46, %52, %53, %54 : tensor<32x32xf32, #mma>, i32, i32, !tt.memdesc<32x32xf32, #shared3, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> + } + triton_gpu.local_dealloc %14 : 
!tt.memdesc<1x32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %15 : !tt.memdesc<1x32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> + tt.return %18#0 : tensor<32x32xf32, #mma> + } + +// CHECK-LABEL: tt.func @cross_iter_dep +// CHECK: %{{.*}}:5 = scf.for %[[ARG9:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) + +// CHECK: %[[INDEX_CAST_9:.*]] = arith.index_cast %[[ARG9]] +// CHECK: %[[MULI_10:.*]] = arith.muli %[[INDEX_CAST_9]], %{{.*}} +// CHECK: %[[SUBI_11:.*]] = arith.subi %{{.*}}, %[[MULI_10]] +// CHECK: %[[SPLAT_12:.*]] = tt.splat %[[SUBI_11]] +// CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_12]] +// CHECK: %[[BROADCAST_14:.*]] = tt.broadcast %[[CMPI_13]] +// CHECK: %[[LOAD_15:.*]] = tt.load %[[ARG11]], %[[BROADCAST_14]], %{{.*}} +// CHECK: %[[SPLAT_16:.*]] = tt.splat %[[SUBI_11]] +// CHECK: %[[CMPI_17:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_16]] +// CHECK: %[[BROADCAST_18:.*]] = tt.broadcast %[[CMPI_17]] +// CHECK: %[[LOAD_19:.*]] = tt.load %[[ARG12]], %[[BROADCAST_18]], %{{.*}} +// CHECK: %[[CONVERT_LAYOUT_20:.*]] = triton_gpu.convert_layout %[[LOAD_15]] +// CHECK: %[[CONVERT_LAYOUT_21:.*]] = triton_gpu.convert_layout %[[LOAD_19]] +// CHECK: %[[DOT_22:.*]] = tt.dot %[[CONVERT_LAYOUT_20]], %[[CONVERT_LAYOUT_21]], %[[ARG10]] +// CHECK: %[[INDEX_CAST_23:.*]] = arith.index_cast %[[ARG9]] +// CHECK: %[[ADDI_24:.*]] = arith.addi %[[INDEX_CAST_23]], %{{.*}} +// CHECK: %[[MULI_25:.*]] = arith.muli %[[ADDI_24]], %{{.*}} +// CHECK: %[[SPLAT_26:.*]] = tt.splat %[[MULI_25]] +// CHECK: %[[ADDPTR_27:.*]] = tt.addptr %{{.*}}, %[[SPLAT_26]] +// CHECK: %[[MULI_28:.*]] = arith.muli %[[MULI_25]], %{{.*}} +// CHECK: %[[SPLAT_29:.*]] = tt.splat %[[MULI_28]] +// CHECK: %[[ADDPTR_30:.*]] = tt.addptr %{{.*}}, %[[SPLAT_29]] +// CHECK: scf.yield %[[DOT_22]], %[[ARG13]], %[[ARG14]], %[[ADDPTR_27]], 
%[[ADDPTR_30]] +// CHECK: } + + tt.func @cross_iter_dep(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) -> tensor<32x32xf32, #mma> { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %c1 = arith.constant 1 : index + %c2_i32 = arith.constant 2 : i32 + %c32_i32 = arith.constant 32 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked1> + %0 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> + %1 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> + %2 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> + %3 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> + %4 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> + %5 = tt.splat %arg3 : i32 -> tensor<1x32xi32, #blocked1> + %6 = tt.splat %arg4 : i32 -> tensor<32x1xi32, #blocked1> + %7 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> + %8:5 = scf.for %arg9 = %c0 to %c32 step %c1 iter_args(%arg10 = %cst, %arg11 = %0, %arg12 = %1, %arg13 = %3, %arg14 = %4) -> (tensor<32x32xf32, #mma>, tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32x!tt.ptr, #blocked1>) { + %9 = arith.index_cast %arg9 : index to i32 + %10 = arith.muli %9, %c32_i32 : i32 + %11 = arith.subi %arg5, %10 : i32 + %12 = tt.splat %11 : i32 -> tensor<32x1xi32, #blocked1> + %13 = arith.cmpi slt, %6, %12 : tensor<32x1xi32, #blocked1> + %14 = tt.broadcast %13 : tensor<32x1xi1, #blocked1> -> tensor<32x32xi1, #blocked1> + %15 = tt.load %arg12, %14, %cst_0 : tensor<32x32x!tt.ptr, #blocked1> + 
%16 = tt.splat %11 : i32 -> tensor<1x32xi32, #blocked1> + %17 = arith.cmpi slt, %5, %16 : tensor<1x32xi32, #blocked1> + %18 = tt.broadcast %17 : tensor<1x32xi1, #blocked1> -> tensor<32x32xi1, #blocked1> + %19 = tt.load %arg11, %18, %cst_0 : tensor<32x32x!tt.ptr, #blocked1> + %20 = triton_gpu.convert_layout %19 : tensor<32x32xf32, #blocked1> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %21 = triton_gpu.convert_layout %15 : tensor<32x32xf32, #blocked1> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %22 = tt.dot %20, %21, %arg10 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> + %23 = arith.index_cast %arg9 : index to i32 + %24 = arith.addi %23, %c2_i32 : i32 + %25 = arith.muli %24, %c32_i32 : i32 + %26 = tt.splat %25 : i32 -> tensor<32x32xi32, #blocked1> + %27 = tt.addptr %7, %26 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi32, #blocked1> + %28 = arith.muli %25, %arg7 : i32 + %29 = tt.splat %28 : i32 -> tensor<32x32xi32, #blocked1> + %30 = tt.addptr %2, %29 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi32, #blocked1> + scf.yield %22, %arg13, %arg14, %27, %30 : tensor<32x32xf32, #mma>, tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32x!tt.ptr, #blocked1> + } + tt.return %8#0 : tensor<32x32xf32, #mma> + } + +// CHECK-LABEL: tt.func @dep_arg_two_uses +// CHECK: %{{.*}}:5 = scf.for %[[ARG3:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %{{.*}}, %[[ARG5:.*]] = %{{.*}}, %[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}) + +// CHECK: %[[SUBI_8:.*]] = arith.subi %{{.*}}, %[[ARG3]] +// CHECK: %[[INDEX_CAST_9:.*]] = arith.index_cast %[[SUBI_8]] +// CHECK: %[[SPLAT_10:.*]] = tt.splat %[[INDEX_CAST_9]] +// CHECK: %[[CMPI_11:.*]] = arith.cmpi slt, %{{.*}}, 
%[[SPLAT_10]] +// CHECK: %[[EXPAND_DIMS_12:.*]] = tt.expand_dims %[[CMPI_11]] {axis = 0 : i32} +// CHECK: %[[EXPAND_DIMS_13:.*]] = tt.expand_dims %[[ARG5]] {axis = 0 : i32} +// CHECK: %[[EXTSI_14:.*]] = arith.extsi %[[EXPAND_DIMS_13]] +// CHECK: %[[MULI_15:.*]] = arith.muli %[[EXTSI_14]], %{{.*}} +// CHECK: %[[BROADCAST_16:.*]] = tt.broadcast %[[MULI_15]] +// CHECK: %[[BROADCAST_17:.*]] = tt.broadcast %[[EXPAND_DIMS_12]] +// CHECK: %[[ADDPTR_18:.*]] = tt.addptr %[[ARG4]], %[[BROADCAST_16]] +// CHECK: %[[LOAD_19:.*]] = tt.load %[[ADDPTR_18]], %[[BROADCAST_17]] +// CHECK: %[[SPLAT_20:.*]] = tt.splat %[[ARG6]] +// CHECK: %[[ADDPTR_21:.*]] = tt.addptr %[[SPLAT_20]], %{{.*}} +// CHECK: %[[LOAD_22:.*]] = tt.load %[[ADDPTR_21]] +// CHECK: %[[SPLAT_23:.*]] = tt.splat %[[INDEX_CAST_9]] +// CHECK: %[[CMPI_24:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_23]] +// CHECK: %[[EXPAND_DIMS_25:.*]] = tt.expand_dims %[[CMPI_24]] {axis = 1 : i32} +// CHECK: %[[BROADCAST_26:.*]] = tt.broadcast %[[EXPAND_DIMS_25]] +// CHECK: %[[LOAD_27:.*]] = tt.load %[[ARG8]], %[[BROADCAST_26]], %{{.*}} +// CHECK: %[[EXPAND_DIMS_28:.*]] = tt.expand_dims %[[ARG5]] {axis = 0 : i32} +// CHECK: %[[EXTSI_29:.*]] = arith.extsi %[[EXPAND_DIMS_28]] +// CHECK: %[[MULI_30:.*]] = arith.muli %[[EXTSI_29]], %{{.*}} +// CHECK: %[[BROADCAST_31:.*]] = tt.broadcast %[[MULI_30]] +// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG4]], %[[BROADCAST_31]] +// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG6]], %{{.*}} +// CHECK: %[[CONVERT_LAYOUT_34:.*]] = triton_gpu.convert_layout %[[LOAD_19]] +// CHECK: %[[CONVERT_LAYOUT_35:.*]] = triton_gpu.convert_layout %[[LOAD_27]] +// CHECK: %[[DOT_36:.*]] = tt.dot %[[CONVERT_LAYOUT_34]], %[[CONVERT_LAYOUT_35]], %[[ARG7]] +// CHECK: %[[ADDPTR_37:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: scf.yield %[[ADDPTR_32]], %[[LOAD_22]], %[[ADDPTR_33]], %[[DOT_36]], %[[ADDPTR_37]] +// CHECK: } + + tt.func @dep_arg_two_uses(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 
16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #mma> { + %cst = arith.constant dense<64> : tensor<32x128xi64, #blocked> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x128xf16, #blocked> + %c32_i32 = arith.constant 32 : i32 + %cst_1 = arith.constant dense<64> : tensor<1x32xi64, #blocked1> + %c0 = arith.constant 0 : index + %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> + %c32 = arith.constant 32 : index + %c100 = arith.constant 100 : index + %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %1 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %4 = tt.splat %arg0 : !tt.ptr -> tensor<128x32x!tt.ptr, #blocked1> + %5 = tt.splat %arg2 : !tt.ptr -> tensor<32x128x!tt.ptr, #blocked> + %6 = tt.addptr %arg1, %c32_i32 : !tt.ptr, i32 + %7:5 = scf.for %arg3 = %c0 to %c100 step %c32 iter_args(%arg4 = %4, %arg5 = %3, %arg6 = %6, %arg7 = %cst_2, %arg8 = %5) -> (tensor<128x32x!tt.ptr, #blocked1>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>, !tt.ptr, tensor<128x128xf32, #mma>, tensor<32x128x!tt.ptr, #blocked>) { + %8 = arith.subi %c100, %arg3 : index + %9 = arith.index_cast %8 : index to i32 + %10 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %11 = arith.cmpi slt, %2, %10 : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %12 = tt.expand_dims %11 {axis = 1 : i32} : tensor<32xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi1, #blocked> + %13 = tt.broadcast %12 : tensor<32x1xi1, #blocked> -> tensor<32x128xi1, #blocked> + %14 = tt.load %arg8, 
%13, %cst_0 : tensor<32x128x!tt.ptr, #blocked> + %15 = tt.splat %arg6 : !tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %16 = tt.addptr %15, %0 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %17 = tt.load %16 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %18 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %19 = arith.cmpi slt, %1, %18 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %20 = tt.expand_dims %19 {axis = 0 : i32} : tensor<32xi1, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi1, #blocked1> + %21 = tt.expand_dims %arg5 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> + %22 = arith.extsi %21 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> + %23 = arith.muli %22, %cst_1 : tensor<1x32xi64, #blocked1> + %24 = tt.broadcast %23 : tensor<1x32xi64, #blocked1> -> tensor<128x32xi64, #blocked1> + %25 = tt.broadcast %20 : tensor<1x32xi1, #blocked1> -> tensor<128x32xi1, #blocked1> + %26 = tt.addptr %arg4, %24 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi64, #blocked1> + %27 = tt.load %26, %25 : tensor<128x32x!tt.ptr, #blocked1> + %28 = tt.expand_dims %arg5 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> + %29 = arith.extsi %28 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> + %30 = arith.muli %29, %cst_1 : tensor<1x32xi64, #blocked1> + %31 = tt.broadcast %30 : tensor<1x32xi64, #blocked1> -> tensor<128x32xi64, #blocked1> + %32 = tt.addptr %arg4, %31 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi64, #blocked1> + %33 = tt.addptr %arg6, %c32_i32 : !tt.ptr, i32 + %34 = triton_gpu.convert_layout %27 : tensor<128x32xf16, #blocked1> -> tensor<128x32xf16, 
#triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %35 = triton_gpu.convert_layout %14 : tensor<32x128xf16, #blocked> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %36 = tt.dot %34, %35, %arg7 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma> + %37 = tt.addptr %arg8, %cst : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi64, #blocked> + scf.yield %32, %17, %33, %36, %37 : tensor<128x32x!tt.ptr, #blocked1>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>, !tt.ptr, tensor<128x128xf32, #mma>, tensor<32x128x!tt.ptr, #blocked> + } + tt.return %7#3 : tensor<128x128xf32, #mma> + } +} + +// ----- +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { + +// CHECK-LABEL: tt.func @load_two_users +// CHECK: %{{.*}}:5 = scf.for %[[ARG2:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG3:.*]] = %{{.*}}, %[[ARG4:.*]] = %{{.*}}, %[[ARG5:.*]] = %{{.*}}-1_i32, %[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}) + +// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG2]], %{{.*}} +// CHECK: %[[SPLAT_22:.*]] = tt.splat %[[CMPI_21]] +// CHECK: %[[LOAD_23:.*]] = tt.load %{{.*}}, %[[SPLAT_22]] +// CHECK: 
%[[ADDI_24:.*]] = arith.addi %[[ARG5]], %{{.*}} +// CHECK: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} +// CHECK: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} +// CHECK: %[[CONVERT_LAYOUT_27:.*]] = triton_gpu.convert_layout %{{.*}} +// CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG7]] +// CHECK: %[[DOT_29:.*]] = tt.dot %[[CONVERT_LAYOUT_27]], %[[LOCAL_LOAD_28]], %{{.*}} +// CHECK: %[[TRUNCF_30:.*]] = arith.truncf %[[DOT_29]] +// CHECK: %[[CONVERT_LAYOUT_31:.*]] = triton_gpu.convert_layout %[[TRUNCF_30]] +// CHECK: %[[TRANS_32:.*]] = tt.trans %[[ARG7]] {order = array} +// CHECK: %[[LOCAL_LOAD_33:.*]] = triton_gpu.local_load %[[TRANS_32]] +// CHECK: %[[DOT_34:.*]] = tt.dot %[[CONVERT_LAYOUT_31]], %[[LOCAL_LOAD_33]], %[[ARG4]] +// CHECK: %[[ADDI_35:.*]] = arith.addi %[[ARG6]], %{{.*}} +// CHECK: %[[CMPI_36:.*]] = arith.cmpi slt, %[[ADDI_35]], %{{.*}} +// CHECK: %[[SELECT_37:.*]] = arith.select %[[CMPI_36]], %[[ADDI_35]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_38:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_37]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_23]], %[[MEMDESC_SUBVIEW_38]] +// CHECK: scf.yield %[[DOT_29]], %[[DOT_34]], %[[SELECT_26]], %[[SELECT_37]], %[[MEMDESC_SUBVIEW_38]] +// CHECK: } + + tt.func @load_two_users(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { + %c7_i32 = arith.constant 7 : i32 + %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %1 = tt.expand_dims %0 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %c0_i64 = arith.constant 0 : i64 + %2 = tt.addptr %arg0, %c0_i64 : !tt.ptr, i64 + %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> + %3 = tt.splat %2 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> + %4 = tt.addptr %3, %cst : 
tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> + %5 = tt.broadcast %1 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> + %6 = tt.broadcast %4 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + %8 = tt.load %7 : tensor<64x16x!tt.ptr, #blocked> + %9 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %10 = tt.expand_dims %9 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %11 = tt.addptr %arg1, %c0_i64 : !tt.ptr, i64 + %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> + %12 = tt.splat %11 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> + %13 = tt.addptr %12, %cst_0 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> + %14 = tt.broadcast %10 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> + %15 = tt.broadcast %13 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> + %16 = tt.addptr %15, %14 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %17 = tt.load %16 : tensor<128x64x!tt.ptr, #blocked1> + %c-1_i32 = arith.constant -1 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = arith.constant 8 : i32 + %18 = triton_gpu.local_alloc : () -> !tt.memdesc<1x64x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %19 = triton_gpu.memdesc_subview %18[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %8, %19 : tensor<64x16xf16, #blocked> -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %20:5 = 
scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2, %arg5 = %c-1_i32, %arg6 = %c0_i32, %arg7 = %19) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>, i32, i32, !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable>) : i32 { + %21 = arith.cmpi slt, %arg2, %c7_i32 : i32 + %22 = tt.splat %21 : i1 -> tensor<64x16xi1, #blocked> + %23 = tt.load %7, %22 : tensor<64x16x!tt.ptr, #blocked> + %24 = arith.addi %arg5, %c1_i32 : i32 + %25 = arith.cmpi slt, %24, %c1_i32 : i32 + %26 = arith.select %25, %24, %c0_i32 : i32 + %27 = triton_gpu.convert_layout %17 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %28 = triton_gpu.local_load %arg7 : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %29 = tt.dot %27, %28, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> + %30 = arith.truncf %29 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> + %31 = triton_gpu.convert_layout %30 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %32 = tt.trans %arg7 {order = array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> + %33 = triton_gpu.local_load %32 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %34 = tt.dot %31, %33, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> + %35 = arith.addi %arg6, %c1_i32 : i32 + %36 = arith.cmpi slt, %35, 
%c1_i32 : i32 + %37 = arith.select %36, %35, %c0_i32 : i32 + %38 = triton_gpu.memdesc_subview %18[%37, %c0_i32, %c0_i32] : !tt.memdesc<1x64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %23, %38 : tensor<64x16xf16, #blocked> -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> + scf.yield %29, %34, %26, %37, %38 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>, i32, i32, !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> + } + triton_gpu.local_dealloc %18 : !tt.memdesc<1x64x16xf16, #shared, #triton_gpu.shared_memory, mutable> + tt.return %20#0, %20#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> + } +} + +// ----- +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [0, 1], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { + +// CHECK-LABEL: tt.func @load_two_users_incompatible_layouts +// CHECK: %{{.*}}:5 = scf.for %[[ARG2:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG3:.*]] = %{{.*}}, %[[ARG4:.*]] = %{{.*}}, %[[ARG5:.*]] = %{{.*}}-1_i32, %[[ARG6:.*]] = %{{.*}}-1_i32, %[[ARG7:.*]] = %{{.*}}) + +// CHECK: %[[CMPI_19:.*]] = arith.cmpi slt, %[[ARG2]], %{{.*}} +// CHECK: %[[SPLAT_20:.*]] = tt.splat %[[CMPI_19]] +// CHECK: %[[LOAD_21:.*]] = tt.load %{{.*}}, %[[SPLAT_20]] +// CHECK: 
%[[ADDI_22:.*]] = arith.addi %[[ARG5]], %{{.*}} +// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_22]], %{{.*}} +// CHECK: %[[SELECT_24:.*]] = arith.select %[[CMPI_23]], %[[ADDI_22]], %{{.*}} +// CHECK: %[[ADDI_25:.*]] = arith.addi %[[ARG6]], %{{.*}} +// CHECK: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ADDI_25]], %{{.*}} +// CHECK: %[[SELECT_27:.*]] = arith.select %[[CMPI_26]], %[[ADDI_25]], %{{.*}} +// CHECK: %[[CONVERT_LAYOUT_28:.*]] = triton_gpu.convert_layout %{{.*}} +// CHECK: %[[CONVERT_LAYOUT_29:.*]] = triton_gpu.convert_layout %[[ARG7]] +// CHECK: %[[DOT_30:.*]] = tt.dot %[[CONVERT_LAYOUT_28]], %[[CONVERT_LAYOUT_29]], %{{.*}} +// CHECK: %[[TRUNCF_31:.*]] = arith.truncf %[[DOT_30]] +// CHECK: %[[CONVERT_LAYOUT_32:.*]] = triton_gpu.convert_layout %[[TRUNCF_31]] +// CHECK: %[[LOCAL_ALLOC_33:.*]] = triton_gpu.local_alloc %[[ARG7]] +// CHECK: %[[TRANS_34:.*]] = tt.trans %[[LOCAL_ALLOC_33]] {order = array} +// CHECK: %[[LOCAL_LOAD_35:.*]] = triton_gpu.local_load %[[TRANS_34]] +// CHECK: %[[DOT_36:.*]] = tt.dot %[[CONVERT_LAYOUT_32]], %[[LOCAL_LOAD_35]], %[[ARG4]] +// CHECK: scf.yield %[[DOT_30]], %[[DOT_36]], %[[SELECT_24]], %[[SELECT_27]], %[[LOAD_21]] +// CHECK: } + + tt.func @load_two_users_incompatible_layouts(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { + %c7_i32 = arith.constant 7 : i32 + %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %1 = tt.expand_dims %0 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %c0_i64 = arith.constant 0 : i64 + %2 = tt.addptr %arg0, %c0_i64 : !tt.ptr, i64 + %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> + %3 = tt.splat %2 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> + %4 = tt.addptr %3, %cst : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> + %5 = tt.broadcast %1 : 
tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> + %6 = tt.broadcast %4 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> + %7 = tt.addptr %6, %5 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + %8 = tt.load %7 : tensor<64x16x!tt.ptr, #blocked> + %9 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %10 = tt.expand_dims %9 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %11 = tt.addptr %arg1, %c0_i64 : !tt.ptr, i64 + %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> + %12 = tt.splat %11 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> + %13 = tt.addptr %12, %cst_0 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> + %14 = tt.broadcast %10 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> + %15 = tt.broadcast %13 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> + %16 = tt.addptr %15, %14 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %17 = tt.load %16 : tensor<128x64x!tt.ptr, #blocked1> + %c-1_i32 = arith.constant -1 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = arith.constant 8 : i32 + %18:5 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2, %arg5 = %c-1_i32, %arg6 = %c-1_i32, %arg7 = %8) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>, i32, i32, tensor<64x16xf16, #blocked>) : i32 { + %19 = arith.cmpi slt, %arg2, %c7_i32 : i32 + %20 = tt.splat %19 : i1 -> tensor<64x16xi1, #blocked> + %21 = tt.load %7, %20 : tensor<64x16x!tt.ptr, #blocked> + %22 = arith.addi %arg5, %c1_i32 : i32 + %23 = arith.cmpi slt, %22, %c1_i32 : i32 + %24 = arith.select %23, %22, %c0_i32 : i32 + %25 = 
arith.addi %arg6, %c1_i32 : i32 + %26 = arith.cmpi slt, %25, %c1_i32 : i32 + %27 = arith.select %26, %25, %c0_i32 : i32 + %28 = triton_gpu.convert_layout %17 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %29 = triton_gpu.convert_layout %arg7 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %30 = tt.dot %28, %29, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> + %31 = arith.truncf %30 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> + %32 = triton_gpu.convert_layout %31 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %33 = triton_gpu.local_alloc %arg7 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> + %34 = tt.trans %33 {order = array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> + %35 = triton_gpu.local_load %34 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %36 = tt.dot %32, %35, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> + scf.yield %30, %36, %24, %27, %21 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>, i32, i32, tensor<64x16xf16, #blocked> + } + tt.return %18#0, %18#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> + } +} + +// ----- +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], 
instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { + +// CHECK-LABEL: tt.func public @nested_loops +// CHECK: scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} : i32 { + +// CHECK: %[[MULI_9:.*]] = arith.muli %[[ARG4]], %{{.*}} +// CHECK: %[[SPLAT_10:.*]] = tt.splat %[[MULI_9]] +// CHECK: %[[ADDI_11:.*]] = arith.addi %[[SPLAT_10]], %{{.*}} +// CHECK: %[[EXPAND_DIMS_12:.*]] = tt.expand_dims %[[ADDI_11]] {axis = 0 : i32} +// CHECK: %[[BROADCAST_13:.*]] = tt.broadcast %[[EXPAND_DIMS_12]] +// CHECK: %[[ADDPTR_14:.*]] = tt.addptr %{{.*}}, %[[BROADCAST_13]] +// CHECK: %[[LOAD_15:.*]] = tt.load %[[ADDPTR_14]] +// CHECK: %[[EXPAND_DIMS_16:.*]] = tt.expand_dims %{{.*}} {axis = 0 : i32} +// CHECK: %[[SPLAT_17:.*]] = tt.splat %[[MULI_9]] +// CHECK: %[[ADDI_18:.*]] = arith.addi %[[SPLAT_17]], %{{.*}} +// CHECK: %[[EXPAND_DIMS_19:.*]] = tt.expand_dims %[[ADDI_18]] {axis = 1 : i32} +// CHECK: %[[MULI_20:.*]] = arith.muli %[[EXPAND_DIMS_19]], %{{.*}} +// CHECK: %[[ADDPTR_21:.*]] = tt.addptr %{{.*}}, %[[MULI_20]] +// CHECK: %[[BROADCAST_22:.*]] = tt.broadcast %[[EXPAND_DIMS_16]] +// CHECK: %[[BROADCAST_23:.*]] = tt.broadcast %[[ADDPTR_21]] +// CHECK: %[[ADDPTR_24:.*]] = tt.addptr %[[BROADCAST_23]], %[[BROADCAST_22]] +// CHECK: %[[LOAD_25:.*]] = tt.load %[[ADDPTR_24]] +// CHECK: %[[ADDPTR_26:.*]] = tt.addptr %{{.*}}, %[[MULI_20]] +// CHECK: %[[BROADCAST_27:.*]] = tt.broadcast %[[ADDPTR_26]] +// CHECK: %[[LOCAL_ALLOC_28:.*]] = triton_gpu.local_alloc +// CHECK: %[[MEMDESC_SUBVIEW_29:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_28]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_25]], %[[MEMDESC_SUBVIEW_29]] +// CHECK: %{{.*}}:4 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = 
%{{.*}}-1_i32, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[MEMDESC_SUBVIEW_29]], %[[ARG9:.*]] = %[[BROADCAST_22]]) + +// CHECK: %[[CMPI_31:.*]] = arith.cmpi slt, %[[ARG5]], %{{.*}} +// CHECK: %[[ADDI_32:.*]] = arith.addi %[[ARG5]], %{{.*}} +// CHECK: %[[MULI_33:.*]] = arith.muli %[[ADDI_32]], %{{.*}} +// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[MULI_33]] +// CHECK: %[[ADDI_35:.*]] = arith.addi %[[SPLAT_34]], %{{.*}} +// CHECK: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ADDI_35]] {axis = 0 : i32} +// CHECK: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] +// CHECK: %[[SPLAT_38:.*]] = tt.splat %[[CMPI_31]] +// CHECK: %[[ADDPTR_39:.*]] = tt.addptr %[[BROADCAST_23]], %[[BROADCAST_37]] +// CHECK: %[[LOAD_40:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_38]] +// CHECK: %[[ADDI_41:.*]] = arith.addi %[[ARG6]], %{{.*}} +// CHECK: %[[CMPI_42:.*]] = arith.cmpi slt, %[[ADDI_41]], %{{.*}} +// CHECK: %[[SELECT_43:.*]] = arith.select %[[CMPI_42]], %[[ADDI_41]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_44:.*]] = triton_gpu.local_load %[[ARG8]] +// CHECK: %[[CONVERT_LAYOUT_45:.*]] = triton_gpu.convert_layout %[[LOAD_15]] +// CHECK: %[[DOT_46:.*]] = tt.dot %[[LOCAL_LOAD_44]], %[[CONVERT_LAYOUT_45]], %{{.*}} +// CHECK: %[[ADDPTR_47:.*]] = tt.addptr %[[BROADCAST_27]], %[[ARG9]] +// CHECK: %[[CONVERT_LAYOUT_48:.*]] = triton_gpu.convert_layout %[[DOT_46]] +// CHECK: tt.store %[[ADDPTR_47]], %[[CONVERT_LAYOUT_48]] +// CHECK: %[[ADDI_49:.*]] = arith.addi %[[ARG7]], %{{.*}} +// CHECK: %[[CMPI_50:.*]] = arith.cmpi slt, %[[ADDI_49]], %{{.*}} +// CHECK: %[[SELECT_51:.*]] = arith.select %[[CMPI_50]], %[[ADDI_49]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_52:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_28]][%[[SELECT_51]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_40]], %[[MEMDESC_SUBVIEW_52]] +// CHECK: scf.yield %[[SELECT_43]], %[[SELECT_51]], %[[MEMDESC_SUBVIEW_52]], %[[BROADCAST_37]] +// CHECK: } + + tt.func public @nested_loops(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: 
!tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { + %c9_i32 = arith.constant 9 : i32 + %c-1_i32 = arith.constant -1 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> + %cst_0 = arith.constant dense<320> : tensor<32x1xi32, #blocked> + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %c32_i32 = arith.constant 32 : i32 + %c10_i32 = arith.constant 10 : i32 + %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %1 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %2 = tt.expand_dims %1 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> + %3 = arith.muli %2, %cst_0 : tensor<32x1xi32, #blocked> + %4 = tt.splat %arg1 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> + %5 = tt.addptr %4, %3 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> + %6 = tt.broadcast %5 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> + %7 = tt.splat %arg0 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> + %8 = tt.splat %arg3 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> + scf.for %arg4 = %c0_i32 to %c10_i32 step %c1_i32 : i32 { + %9 = arith.muli %arg4, %c32_i32 : i32 + %10 = tt.expand_dims %0 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %11 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %12 = arith.addi %11, %1 : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> + %14 = arith.muli %13, %cst_0 : tensor<32x1xi32, #blocked> + %15 = tt.addptr %7, %14 : tensor<32x1x!tt.ptr, 
#blocked>, tensor<32x1xi32, #blocked> + %16 = tt.broadcast %10 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> + %17 = tt.broadcast %15 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> + %18 = tt.addptr %17, %16 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %19 = tt.load %18 : tensor<32x32x!tt.ptr, #blocked> + %20 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %21 = arith.addi %20, %0 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %22 = tt.expand_dims %21 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %23 = tt.broadcast %22 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> + %24 = tt.addptr %6, %23 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %25 = tt.load %24 : tensor<32x32x!tt.ptr, #blocked> + %26 = tt.addptr %8, %14 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> + %27 = tt.broadcast %26 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> + %28 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> + %29 = triton_gpu.memdesc_subview %28[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %19, %29 : tensor<32x32xf32, #blocked> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> + %30:4 = scf.for %arg5 = %c0_i32 to %c10_i32 step %c1_i32 iter_args(%arg6 = %c-1_i32, %arg7 = %c0_i32, %arg8 = %29, %arg9 = %16) -> (i32, i32, !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable>, tensor<32x32xi32, #blocked>) : i32 { + %31 = arith.cmpi slt, %arg5, %c9_i32 : i32 + %32 = arith.addi %arg5, %c1_i32 : i32 + %33 = arith.muli %32, %c32_i32 : i32 + %34 = tt.splat %33 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, 
parent = #blocked}>> + %35 = arith.addi %34, %0 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %36 = tt.expand_dims %35 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %37 = tt.broadcast %36 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> + %38 = tt.splat %31 : i1 -> tensor<32x32xi1, #blocked> + %39 = tt.addptr %17, %37 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %40 = tt.load %39, %38 : tensor<32x32x!tt.ptr, #blocked> + %41 = arith.addi %arg6, %c1_i32 : i32 + %42 = arith.cmpi slt, %41, %c1_i32 : i32 + %43 = arith.select %42, %41, %c0_i32 : i32 + %44 = triton_gpu.local_load %arg8 : !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %45 = triton_gpu.convert_layout %25 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %46 = tt.dot %44, %45, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> + %47 = tt.addptr %27, %arg9 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %48 = triton_gpu.convert_layout %46 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> + tt.store %47, %48 : tensor<32x32x!tt.ptr, #blocked> + %49 = arith.addi %arg7, %c1_i32 : i32 + %50 = arith.cmpi slt, %49, %c1_i32 : i32 + %51 = arith.select %50, %49, %c0_i32 : i32 + %52 = triton_gpu.memdesc_subview %28[%51, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %40, %52 : tensor<32x32xf32, #blocked> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> + scf.yield %43, %51, %52, %37 : i32, i32, !tt.memdesc<32x32xf32, 
#shared, #triton_gpu.shared_memory, mutable>, tensor<32x32xi32, #blocked> + } + triton_gpu.local_dealloc %28 : !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> + } + tt.return + } +} + +// ----- +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> +#shared2 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { + +// CHECK-LABEL: tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de +// CHECK: %{{.*}}:5 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}) + +// CHECK: %[[CMPI_76:.*]] = arith.cmpi slt, %[[ARG6]], %{{.*}} +// CHECK: %[[SPLAT_77:.*]] = tt.splat %[[CMPI_76]] +// CHECK: %[[LOAD_78:.*]] = tt.load %{{.*}}, %[[SPLAT_77]] +// CHECK: %[[SPLAT_79:.*]] = tt.splat %[[CMPI_76]] +// CHECK: %[[LOAD_80:.*]] = tt.load %{{.*}}, %[[SPLAT_79]] +// CHECK: %[[ADDI_81:.*]] = arith.addi %[[ARG8]], %{{.*}} +// CHECK: %[[CMPI_82:.*]] = arith.cmpi slt, %[[ADDI_81]], %{{.*}} +// CHECK: %[[SELECT_83:.*]] = arith.select %[[CMPI_82]], %[[ADDI_81]], %{{.*}} +// CHECK: %[[CONVERT_LAYOUT_84:.*]] = triton_gpu.convert_layout %{{.*}} +// CHECK: %[[TRANS_85:.*]] = tt.trans %[[ARG10]] {order = array} +// 
CHECK: %[[LOCAL_LOAD_86:.*]] = triton_gpu.local_load %[[TRANS_85]] +// CHECK: %[[DOT_87:.*]] = tt.dot %[[CONVERT_LAYOUT_84]], %[[LOCAL_LOAD_86]], %{{.*}} +// CHECK: %[[CONVERT_LAYOUT_88:.*]] = triton_gpu.convert_layout %[[DOT_87]] +// CHECK: %[[LOCAL_LOAD_89:.*]] = triton_gpu.local_load %[[ARG11]] +// CHECK: %[[DOT_90:.*]] = tt.dot %[[CONVERT_LAYOUT_88]], %[[LOCAL_LOAD_89]], %[[ARG7]] +// CHECK: %[[ADDI_91:.*]] = arith.addi %[[ARG9]], %{{.*}} +// CHECK: %[[CMPI_92:.*]] = arith.cmpi slt, %[[ADDI_91]], %{{.*}} +// CHECK: %[[SELECT_93:.*]] = arith.select %[[CMPI_92]], %[[ADDI_91]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_94:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_93]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_78]], %[[MEMDESC_SUBVIEW_94]] +// CHECK: %[[MEMDESC_SUBVIEW_95:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_93]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_80]], %[[MEMDESC_SUBVIEW_95]] +// CHECK: scf.yield %[[DOT_90]], %[[SELECT_83]], %[[SELECT_93]], %[[MEMDESC_SUBVIEW_94]], %[[MEMDESC_SUBVIEW_95]] +// CHECK: } +// CHECK: triton_gpu.local_dealloc %{{.*}} +// CHECK: triton_gpu.local_dealloc %{{.*}} +// CHECK: %[[BROADCAST_70:.*]] = tt.broadcast %{{.*}} +// CHECK: %[[BROADCAST_71:.*]] = tt.broadcast %{{.*}} +// CHECK: %[[ADDI_72:.*]] = arith.addi %[[BROADCAST_70]], %[[BROADCAST_71]] +// CHECK: %[[SPLAT_73:.*]] = tt.splat %{{.*}} +// CHECK: %[[ADDPTR_74:.*]] = tt.addptr %[[SPLAT_73]], %[[ADDI_72]] +// CHECK: %[[CONVERT_LAYOUT_75:.*]] = triton_gpu.convert_layout %{{.*}}#0 +// CHECK: tt.store %[[ADDPTR_74]], %[[CONVERT_LAYOUT_75]] + + tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + 
%c1_i32 = arith.constant 1 : i32 + %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %1 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked> + %2 = tt.expand_dims %0 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %3 = arith.muli %2, %1 : tensor<1x32xi32, #blocked> + %4 = arith.extsi %3 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> + %5 = tt.get_program_id y : i32 + %6 = arith.muli %5, %arg5 : i32 + %7 = arith.extsi %6 : i32 to i64 + %8 = arith.extsi %arg5 : i32 to i64 + %9 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %10 = tt.expand_dims %9 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> + %11 = tt.load %arg3 : !tt.ptr + %12 = arith.extsi %10 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked> + %13 = tt.splat %11 : i64 -> tensor<32x1xi64, #blocked> + %14 = tt.splat %8 : i64 -> tensor<32x1xi64, #blocked> + %15 = arith.addi %13, %12 : tensor<32x1xi64, #blocked> + %16 = tt.splat %7 : i64 -> tensor<32x1xi64, #blocked> + %17 = arith.muli %15, %14 : tensor<32x1xi64, #blocked> + %18 = arith.addi %17, %16 : tensor<32x1xi64, #blocked> + %19 = tt.broadcast %4 : tensor<1x32xi64, #blocked> -> tensor<32x32xi64, #blocked> + %20 = tt.broadcast %18 : tensor<32x1xi64, #blocked> -> tensor<32x32xi64, #blocked> + %21 = arith.addi %20, %19 : tensor<32x32xi64, #blocked> + %22 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> + %23 = tt.addptr %22, %21 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi64, #blocked> + %24 = tt.load %23 : tensor<32x32x!tt.ptr, #blocked> + %25 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %26 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked> + %27 = tt.expand_dims %25 {axis = 0 : 
i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> + %28 = arith.muli %27, %26 : tensor<1x64xi32, #blocked> + %29 = arith.extsi %28 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> + %30 = tt.broadcast %29 : tensor<1x64xi64, #blocked> -> tensor<32x64xi64, #blocked> + %31 = tt.broadcast %18 : tensor<32x1xi64, #blocked> -> tensor<32x64xi64, #blocked> + %32 = arith.addi %31, %30 : tensor<32x64xi64, #blocked> + %33 = tt.splat %arg1 : !tt.ptr -> tensor<32x64x!tt.ptr, #blocked> + %34 = tt.addptr %33, %32 : tensor<32x64x!tt.ptr, #blocked>, tensor<32x64xi64, #blocked> + %35 = tt.load %34 : tensor<32x64x!tt.ptr, #blocked> + %36 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %37 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked1> + %38 = tt.expand_dims %36 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %39 = arith.muli %38, %37 : tensor<1x64xi32, #blocked1> + %40 = arith.extsi %39 : tensor<1x64xi32, #blocked1> to tensor<1x64xi64, #blocked1> + %c64_i32 = arith.constant 64 : i32 + %41 = tt.get_program_id x : i32 + %42 = arith.muli %41, %c64_i32 : i32 + %43 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %44 = tt.splat %42 : i32 -> tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %45 = arith.addi %44, %43 : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %46 = tt.expand_dims %45 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> + %47 = arith.extsi %46 : tensor<64x1xi32, #blocked1> to tensor<64x1xi64, #blocked1> + %48 = tt.splat %11 : i64 -> tensor<64x1xi64, #blocked1> + %49 = tt.splat %8 : i64 -> tensor<64x1xi64, #blocked1> + %50 = arith.addi %48, %47 : tensor<64x1xi64, #blocked1> + %51 = tt.splat %7 : i64 
-> tensor<64x1xi64, #blocked1> + %52 = arith.muli %50, %49 : tensor<64x1xi64, #blocked1> + %53 = arith.addi %52, %51 : tensor<64x1xi64, #blocked1> + %54 = tt.broadcast %40 : tensor<1x64xi64, #blocked1> -> tensor<64x64xi64, #blocked1> + %55 = tt.broadcast %53 : tensor<64x1xi64, #blocked1> -> tensor<64x64xi64, #blocked1> + %56 = arith.addi %55, %54 : tensor<64x64xi64, #blocked1> + %57 = tt.splat %arg0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked1> + %58 = tt.addptr %57, %56 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi64, #blocked1> + %59 = tt.load %58 : tensor<64x64x!tt.ptr, #blocked1> + %c-1_i32 = arith.constant -1 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma> + %c0_i32 = arith.constant 0 : i32 + %c32_i32 = arith.constant 32 : i32 + %60 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %61 = tt.expand_dims %60 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> + %62 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked1> + %63 = arith.muli %61, %62 : tensor<1x32xi32, #blocked1> + %64 = arith.extsi %63 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> + %65 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x64xf32, #shared, #triton_gpu.shared_memory, mutable> + %66 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> + %67 = triton_gpu.memdesc_subview %65[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %35, %67 : tensor<32x64xf32, #blocked> -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> + %68 = triton_gpu.memdesc_subview %66[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared1, 
#triton_gpu.shared_memory, mutable> + triton_gpu.local_store %24, %68 : tensor<32x32xf32, #blocked> -> !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> + %69:5 = scf.for %arg6 = %c0_i32 to %c64_i32 step %c32_i32 iter_args(%arg7 = %cst, %arg8 = %c-1_i32, %arg9 = %c0_i32, %arg10 = %67, %arg11 = %68) -> (tensor<64x32xf32, #mma>, i32, i32, !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory, mutable>) : i32 { + %76 = arith.cmpi slt, %arg6, %c32_i32 : i32 + %77 = tt.splat %76 : i1 -> tensor<32x32xi1, #blocked> + %78 = tt.load %23, %77 : tensor<32x32x!tt.ptr, #blocked> + %79 = tt.splat %76 : i1 -> tensor<32x64xi1, #blocked> + %80 = tt.load %34, %79 : tensor<32x64x!tt.ptr, #blocked> + %81 = arith.addi %arg8, %c1_i32 : i32 + %82 = arith.cmpi slt, %81, %c1_i32 : i32 + %83 = arith.select %82, %81, %c0_i32 : i32 + %84 = triton_gpu.convert_layout %59 : tensor<64x64xf32, #blocked1> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %85 = tt.trans %arg10 {order = array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<64x32xf32, #shared2, #triton_gpu.shared_memory> + %86 = triton_gpu.local_load %85 : !tt.memdesc<64x32xf32, #shared2, #triton_gpu.shared_memory> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %87 = tt.dot %84, %86, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> + %88 = triton_gpu.convert_layout %87 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %89 = triton_gpu.local_load %arg11 : !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %90 = tt.dot %88, %89, %arg7 : 
tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> + %91 = arith.addi %arg9, %c1_i32 : i32 + %92 = arith.cmpi slt, %91, %c1_i32 : i32 + %93 = arith.select %92, %91, %c0_i32 : i32 + %94 = triton_gpu.memdesc_subview %65[%93, %c0_i32, %c0_i32] : !tt.memdesc<1x32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %80, %94 : tensor<32x64xf32, #blocked> -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> + %95 = triton_gpu.memdesc_subview %66[%93, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %78, %95 : tensor<32x32xf32, #blocked> -> !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> + scf.yield %90, %83, %93, %94, %95 : tensor<64x32xf32, #mma>, i32, i32, !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> + } + triton_gpu.local_dealloc %65 : !tt.memdesc<1x32x64xf32, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %66 : !tt.memdesc<1x32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> + %70 = tt.broadcast %53 : tensor<64x1xi64, #blocked1> -> tensor<64x32xi64, #blocked1> + %71 = tt.broadcast %64 : tensor<1x32xi64, #blocked1> -> tensor<64x32xi64, #blocked1> + %72 = arith.addi %70, %71 : tensor<64x32xi64, #blocked1> + %73 = tt.splat %arg4 : !tt.ptr -> tensor<64x32x!tt.ptr, #blocked1> + %74 = tt.addptr %73, %72 : tensor<64x32x!tt.ptr, #blocked1>, tensor<64x32xi64, #blocked1> + %75 = triton_gpu.convert_layout %69#0 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #blocked1> + tt.store %74, %75 : tensor<64x32x!tt.ptr, #blocked1> + tt.return + } +} + +// ----- 
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = []}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 4, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:86", "triton_gpu.threads-per-warp" = 32 : i32} { + +// CHECK-LABEL: tt.func @indirect_load_shared_layout +// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) + +// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_21]] +// CHECK: %[[SPLAT_23:.*]] = tt.splat %[[CMPI_22]] +// CHECK: %[[ADDPTR_24:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[LOAD_25:.*]] = tt.load %[[ADDPTR_24]], %[[SPLAT_23]] +// CHECK: %[[EXPAND_DIMS_26:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} +// CHECK: %[[BROADCAST_27:.*]] = tt.broadcast %[[EXPAND_DIMS_26]] +// CHECK: %[[MULI_28:.*]] = arith.muli %{{.*}}, %[[BROADCAST_27]] +// CHECK: %[[SPLAT_29:.*]] = tt.splat %[[CMPI_22]] +// CHECK: %[[ADDPTR_30:.*]] = tt.addptr %{{.*}}, %[[MULI_28]] +// CHECK: %[[LOAD_31:.*]] = tt.load %[[ADDPTR_30]], %[[SPLAT_29]] +// CHECK: %[[CMPI_32:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] +// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_32]] +// CHECK: %[[ADDPTR_34:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_34]], %[[SPLAT_33]] +// CHECK: 
%[[ADDI_36:.*]] = arith.addi %[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_37:.*]] = arith.cmpi slt, %[[ADDI_36]], %{{.*}} +// CHECK: %[[SELECT_38:.*]] = arith.select %[[CMPI_37]], %[[ADDI_36]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_39:.*]] = triton_gpu.local_load %[[ARG12]] +// CHECK: %[[LOCAL_LOAD_40:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[DOT_41:.*]] = tt.dot %[[LOCAL_LOAD_39]], %[[LOCAL_LOAD_40]], %[[ARG7]] +// CHECK: %[[ADDI_42:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_43:.*]] = arith.cmpi slt, %[[ADDI_42]], %{{.*}} +// CHECK: %[[SELECT_44:.*]] = arith.select %[[CMPI_43]], %[[ADDI_42]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_45:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_44]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_25]], %[[MEMDESC_SUBVIEW_45]] +// CHECK: %[[MEMDESC_SUBVIEW_46:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_44]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_31]], %[[MEMDESC_SUBVIEW_46]] +// CHECK: scf.yield %[[DOT_41]], %[[ADDPTR_24]], %[[ADDPTR_34]], %[[SELECT_38]], %[[SELECT_44]], %[[MEMDESC_SUBVIEW_45]], %[[MEMDESC_SUBVIEW_46]], %[[LOAD_35]] +// CHECK: } + + tt.func @indirect_load_shared_layout(%arg0: tensor<16x16xi64, #blocked> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg1: index, %arg2: tensor<16x16x!tt.ptr, #blocked1> {tt.contiguity = 2 : i32, tt.divisibility = 16 : i32}, %arg3: tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg4: tensor<16x16xi32, #blocked1> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg5: tensor<16x16x!tt.ptr, #blocked> {tt.contiguity = 16 : i32, tt.divisibility = 16 : i32}) -> tensor<16x16xf32, #mma> { + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %0 = arith.cmpi sgt, %arg1, %c1 : index + %cst = arith.constant dense<1> : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %1 = tt.splat %0 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %2 = 
tt.addptr %arg3, %cst : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.load %2, %1 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %c0 = arith.constant 0 : index + %4 = arith.cmpi sgt, %arg1, %c0 : index + %5 = tt.splat %4 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %6 = tt.load %arg3, %5 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi64, #blocked> + %8 = tt.broadcast %7 : tensor<16x1xi64, #blocked> -> tensor<16x16xi64, #blocked> + %9 = arith.muli %arg0, %8 : tensor<16x16xi64, #blocked> + %10 = tt.splat %4 : i1 -> tensor<16x16xi1, #blocked> + %11 = tt.addptr %arg5, %9 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> + %12 = tt.load %11, %10 : tensor<16x16x!tt.ptr, #blocked> + %13 = tt.splat %4 : i1 -> tensor<16x16xi1, #blocked1> + %14 = tt.load %arg2, %13 : tensor<16x16x!tt.ptr, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %c-1_i32 = arith.constant -1 : i32 + %cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %15 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %16 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %17 = triton_gpu.memdesc_subview %15[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %14, %17 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %18 = triton_gpu.memdesc_subview %16[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> 
!tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %12, %18 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %19:8 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args(%arg7 = %cst_0, %arg8 = %arg2, %arg9 = %2, %arg10 = %c-1_i32, %arg11 = %c0_i32, %arg12 = %17, %arg13 = %18, %arg14 = %3) -> (tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, i32, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) { + %20 = arith.subi %arg1, %c2 : index + %21 = arith.cmpi slt, %arg6, %20 : index + %22 = tt.splat %21 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %23 = tt.addptr %arg9, %cst : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %24 = tt.load %23, %22 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %25 = arith.subi %arg1, %c1 : index + %26 = arith.cmpi slt, %arg6, %25 : index + %27 = tt.expand_dims %arg14 {axis = 1 : i32} : tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi64, #blocked> + %28 = tt.broadcast %27 : tensor<16x1xi64, #blocked> -> tensor<16x16xi64, #blocked> + %29 = arith.muli %arg0, %28 : tensor<16x16xi64, #blocked> + %30 = tt.splat %26 : i1 -> tensor<16x16xi1, #blocked> + %31 = tt.addptr %arg5, %29 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> + %32 = tt.load %31, %30 : tensor<16x16x!tt.ptr, #blocked> + %33 = tt.splat %26 : i1 -> tensor<16x16xi1, #blocked1> + %34 = tt.addptr %arg8, %arg4 : tensor<16x16x!tt.ptr, #blocked1>, tensor<16x16xi32, #blocked1> + %35 = tt.load %34, %33 : tensor<16x16x!tt.ptr, #blocked1> + %36 = arith.addi %arg10, %c1_i32 : i32 + %37 = 
arith.cmpi slt, %36, %c1_i32 : i32 + %38 = arith.select %37, %36, %c0_i32 : i32 + %39 = triton_gpu.local_load %arg12 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %40 = triton_gpu.local_load %arg13 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %41 = tt.dot %39, %40, %arg7 : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma> + %42 = arith.addi %arg11, %c1_i32 : i32 + %43 = arith.cmpi slt, %42, %c1_i32 : i32 + %44 = arith.select %43, %42, %c0_i32 : i32 + %45 = triton_gpu.memdesc_subview %15[%44, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %35, %45 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %46 = triton_gpu.memdesc_subview %16[%44, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %32, %46 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + scf.yield %41, %34, %23, %38, %44, %45, %46, %24 : tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, i32, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + } + triton_gpu.local_dealloc %15 : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + 
triton_gpu.local_dealloc %16 : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + tt.return %19#0 : tensor<16x16xf32, #mma> + } +} + +// ----- +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:86", "triton_gpu.threads-per-warp" = 32 : i32} { + +// CHECK-LABEL: tt.func public @kernel_yield_constant +// CHECK: %{{.*}}:4 = scf.for %[[ARG7:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}) + +// CHECK: %[[SUBI_17:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[ADDI_18:.*]] = arith.addi %[[ARG7]], %{{.*}} +// CHECK: %[[MULI_19:.*]] = arith.muli %[[ADDI_18]], %{{.*}} +// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %[[MULI_19]] +// CHECK: %[[SPLAT_21:.*]] = tt.splat %[[SUBI_20]] +// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_21]] +// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG7]], %[[SUBI_17]] +// CHECK: %[[BROADCAST_24:.*]] = tt.broadcast %[[CMPI_22]] +// CHECK: %[[SPLAT_25:.*]] = tt.splat %[[CMPI_23]] +// CHECK: %[[MULI_26:.*]] = arith.muli %[[MULI_19]], %{{.*}} +// CHECK: %[[SPLAT_27:.*]] = tt.splat %[[MULI_26]] +// CHECK: %[[ANDI_28:.*]] = arith.andi %[[SPLAT_25]], %[[BROADCAST_24]] +// CHECK: %[[ADDPTR_29:.*]] = tt.addptr %{{.*}}, %[[SPLAT_27]] +// CHECK: %[[LOAD_30:.*]] = tt.load %[[ADDPTR_29]], %[[ANDI_28]], %{{.*}} +// CHECK: %[[ADDI_31:.*]] = arith.addi %[[ARG9]], %{{.*}} +// CHECK: %[[CMPI_32:.*]] = arith.cmpi slt, %[[ADDI_31]], %{{.*}} +// CHECK: %[[SELECT_33:.*]] = arith.select %[[CMPI_32]], %[[ADDI_31]], %{{.*}} +// 
CHECK: %[[LOCAL_LOAD_34:.*]] = triton_gpu.local_load %[[ARG11]] +// CHECK: %[[DOT_35:.*]] = tt.dot %{{.*}}, %[[LOCAL_LOAD_34]], %[[ARG8]] +// CHECK: %[[CONVERT_LAYOUT_36:.*]] = triton_gpu.convert_layout %[[DOT_35]] +// CHECK: tt.store %{{.*}}, %[[CONVERT_LAYOUT_36]] +// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_39]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_30]], %[[MEMDESC_SUBVIEW_40]] +// CHECK: scf.yield %{{.*}}, %[[SELECT_33]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]] +// CHECK: } + + tt.func public @kernel_yield_constant(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0> : tensor<32x32xi32, #blocked> + %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %1 = tt.splat %arg4 : i32 -> tensor<32x1xi32, #blocked> + %2 = tt.expand_dims %0 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> + %3 = arith.cmpi slt, %2, %1 : tensor<32x1xi32, #blocked> + %c31_i32 = arith.constant 31 : i32 + %c32_i32 = arith.constant 32 : i32 + %4 = arith.addi %arg4, %c31_i32 : i32 + %c0_i32 = arith.constant 0 : i32 + %5 = arith.divsi %4, %c32_i32 : i32 + %6 = arith.cmpi sgt, %5, %c0_i32 : i32 + %7 = tt.broadcast %3 : tensor<32x1xi1, 
#blocked> -> tensor<32x32xi1, #blocked> + %8 = tt.splat %6 : i1 -> tensor<32x32xi1, #blocked> + %9 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked> + %10 = arith.andi %8, %7 : tensor<32x32xi1, #blocked> + %11 = tt.addptr %9, %cst : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %12 = tt.load %11, %10, %cst_0 : tensor<32x32x!tt.ptr, #blocked> + %c-1_i32 = arith.constant -1 : i32 + %cst_1 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> + %cst_2 = arith.constant dense<1.000000e+00> : tensor<32x32xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %cst_3 = arith.constant dense<2.000000e+00> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %13 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> + %14 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> + %15 = triton_gpu.memdesc_subview %14[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %12, %15 : tensor<32x32xf32, #blocked> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> + %16:4 = scf.for %arg7 = %c0_i32 to %5 step %c1_i32 iter_args(%arg8 = %cst_1, %arg9 = %c-1_i32, %arg10 = %c0_i32, %arg11 = %15) -> (tensor<32x32xf32, #mma>, i32, i32, !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable>) : i32 { + %17 = arith.subi %5, %c1_i32 : i32 + %18 = arith.addi %arg7, %c1_i32 : i32 + %19 = arith.muli %18, %c32_i32 : i32 + %20 = arith.subi %arg4, %19 : i32 + %21 = tt.splat %20 : i32 -> tensor<32x1xi32, #blocked> + %22 = arith.cmpi slt, %2, %21 : tensor<32x1xi32, #blocked> + %23 = arith.cmpi slt, %arg7, %17 : i32 + %24 = tt.broadcast %22 : tensor<32x1xi1, #blocked> -> tensor<32x32xi1, #blocked> + %25 = tt.splat %23 : i1 -> tensor<32x32xi1, 
#blocked> + %26 = arith.muli %19, %arg5 : i32 + %27 = tt.splat %26 : i32 -> tensor<32x32xi32, #blocked> + %28 = arith.andi %25, %24 : tensor<32x32xi1, #blocked> + %29 = tt.addptr %9, %27 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %30 = tt.load %29, %28, %cst_0 : tensor<32x32x!tt.ptr, #blocked> + %31 = arith.addi %arg9, %c1_i32 : i32 + %32 = arith.cmpi slt, %31, %c1_i32 : i32 + %33 = arith.select %32, %31, %c0_i32 : i32 + %34 = triton_gpu.local_load %arg11 : !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %35 = tt.dot %cst_3, %34, %arg8 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> + %36 = triton_gpu.convert_layout %35 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> + tt.store %13, %36 : tensor<32x32x!tt.ptr, #blocked> + %37 = arith.addi %arg10, %c1_i32 : i32 + %38 = arith.cmpi slt, %37, %c1_i32 : i32 + %39 = arith.select %38, %37, %c0_i32 : i32 + %40 = triton_gpu.memdesc_subview %14[%39, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %30, %40 : tensor<32x32xf32, #blocked> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> + scf.yield %cst_2, %33, %39, %40 : tensor<32x32xf32, #mma>, i32, i32, !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> + } + triton_gpu.local_dealloc %14 : !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> + tt.return + } +} + +// ----- +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:90", 
"triton_gpu.threads-per-warp" = 32 : i32} { + +// CHECK-LABEL: tt.func public @add_kernel +// CHECK: %{{.*}}:10 = scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG5:.*]] = %{{.*}}-1_i32, %[[ARG6:.*]] = %{{.*}}-1_i32, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) + +// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} +// CHECK: %[[ADDI_24:.*]] = arith.addi %[[ARG4]], %{{.*}} +// CHECK: %[[ADDI_25:.*]] = arith.addi %{{.*}}, %[[ADDI_24]] +// CHECK: %[[SPLAT_26:.*]] = tt.splat %[[ADDI_25]] +// CHECK: %[[ADDI_27:.*]] = arith.addi %[[SPLAT_26]], %{{.*}} +// CHECK: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} +// CHECK: %[[SPLAT_29:.*]] = tt.splat %[[CMPI_23]] +// CHECK: %[[ANDI_30:.*]] = arith.andi %[[SPLAT_29]], %[[CMPI_28]] +// CHECK: %[[ADDPTR_31:.*]] = tt.addptr %{{.*}}, %[[ADDI_27]] +// CHECK: %[[LOAD_32:.*]] = tt.load %[[ADDPTR_31]], %[[ANDI_30]] +// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_23]] +// CHECK: %[[ANDI_34:.*]] = arith.andi %[[SPLAT_33]], %[[CMPI_28]] +// CHECK: %[[ADDPTR_35:.*]] = tt.addptr %{{.*}}, %[[ADDI_27]] +// CHECK: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_35]], %[[ANDI_34]] +// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG5]], %{{.*}} +// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// CHECK: %[[ADDI_40:.*]] = arith.addi %[[ARG6]], %{{.*}} +// CHECK: %[[CMPI_41:.*]] = arith.cmpi slt, %[[ADDI_40]], %{{.*}} +// CHECK: %[[SELECT_42:.*]] = arith.select %[[CMPI_41]], %[[ADDI_40]], %{{.*}} +// CHECK: %[[ADDF_43:.*]] = arith.addf %[[ARG7]], %[[ARG9]] +// CHECK: %[[ADDPTR_44:.*]] = tt.addptr %{{.*}}, %[[ARG11]] +// CHECK: tt.store %[[ADDPTR_44]], %[[ADDF_43]], %[[ARG13]] +// CHECK: scf.yield %[[SELECT_39]], %[[SELECT_42]], %[[ARG8]], %[[LOAD_32]], %[[ARG10]], 
%[[LOAD_36]], %[[ARG12]], %[[ADDI_27]], %[[ARG14]], %[[CMPI_28]] +// CHECK: } + + tt.func public @add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c2048_i32 = arith.constant 2048 : i32 + %c1016800_i32 = arith.constant 1016800 : i32 + %0 = tt.get_program_id x : i32 + %c1024_i32 = arith.constant 1024 : i32 + %1 = arith.muli %0, %c1016800_i32 : i32 + %2 = arith.addi %1, %c1024_i32 : i32 + %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %4 = tt.splat %2 : i32 -> tensor<1024xi32, #blocked> + %5 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> + %6 = arith.addi %4, %3 : tensor<1024xi32, #blocked> + %7 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %8 = arith.cmpi slt, %6, %5 : tensor<1024xi32, #blocked> + %9 = tt.addptr %7, %6 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %10 = tt.load %9, %8 : tensor<1024x!tt.ptr, #blocked> + %11 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %12 = tt.addptr %11, %6 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %13 = tt.load %12, %8 : tensor<1024x!tt.ptr, #blocked> + %14 = tt.splat %1 : i32 -> tensor<1024xi32, #blocked> + %15 = arith.addi %14, %3 : tensor<1024xi32, #blocked> + %16 = arith.cmpi slt, %15, %5 : tensor<1024xi32, #blocked> + %17 = tt.addptr %7, %15 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %18 = tt.load %17, %16 : tensor<1024x!tt.ptr, #blocked> + %19 = tt.addptr %11, %15 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %20 = tt.load %19, %16 : tensor<1024x!tt.ptr, #blocked> + %c1014752_i32 = arith.constant 1014752 : i32 + %c2_i32 = arith.constant 2 : i32 + %c1_i32 = arith.constant 1 : i32 + %c-1_i32 = arith.constant -1 : i32 + %c0_i32 = arith.constant 0 : i32 + %21 = tt.splat %arg2 : 
!tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %22:10 = scf.for %arg4 = %c0_i32 to %c1016800_i32 step %c1024_i32 iter_args(%arg5 = %c-1_i32, %arg6 = %c-1_i32, %arg7 = %20, %arg8 = %13, %arg9 = %18, %arg10 = %10, %arg11 = %15, %arg12 = %6, %arg13 = %16, %arg14 = %8) -> (i32, i32, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xi32, #blocked>, tensor<1024xi32, #blocked>, tensor<1024xi1, #blocked>, tensor<1024xi1, #blocked>) : i32 { + %23 = arith.cmpi slt, %arg4, %c1014752_i32 : i32 + %24 = arith.addi %arg4, %c2048_i32 : i32 + %25 = arith.addi %1, %24 : i32 + %26 = tt.splat %25 : i32 -> tensor<1024xi32, #blocked> + %27 = arith.addi %26, %3 : tensor<1024xi32, #blocked> + %28 = arith.cmpi slt, %27, %5 : tensor<1024xi32, #blocked> + %29 = tt.splat %23 : i1 -> tensor<1024xi1, #blocked> + %30 = arith.andi %29, %28 : tensor<1024xi1, #blocked> + %31 = tt.addptr %7, %27 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %32 = tt.load %31, %30 : tensor<1024x!tt.ptr, #blocked> + %33 = tt.splat %23 : i1 -> tensor<1024xi1, #blocked> + %34 = arith.andi %33, %28 : tensor<1024xi1, #blocked> + %35 = tt.addptr %11, %27 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %36 = tt.load %35, %34 : tensor<1024x!tt.ptr, #blocked> + %37 = arith.addi %arg5, %c1_i32 : i32 + %38 = arith.cmpi slt, %37, %c2_i32 : i32 + %39 = arith.select %38, %37, %c0_i32 : i32 + %40 = arith.addi %arg6, %c1_i32 : i32 + %41 = arith.cmpi slt, %40, %c2_i32 : i32 + %42 = arith.select %41, %40, %c0_i32 : i32 + %43 = arith.addf %arg7, %arg9 : tensor<1024xf32, #blocked> + %44 = tt.addptr %21, %arg11 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + tt.store %44, %43, %arg13 : tensor<1024x!tt.ptr, #blocked> + scf.yield %39, %42, %arg8, %36, %arg10, %32, %arg12, %27, %arg14, %28 : i32, i32, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, 
tensor<1024xi32, #blocked>, tensor<1024xi32, #blocked>, tensor<1024xi1, #blocked>, tensor<1024xi1, #blocked> + } + tt.return + } +} + +// ----- +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [2, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 2], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { + +// CHECK-LABEL: tt.func public @nested_loops +// CHECK: scf.for %[[ARG1:.*]] = %{{.*}} to %{{.*}} step %{{.*}} : i32 { + +// CHECK: %[[LOAD_10:.*]] = tt.load %{{.*}} +// CHECK: %[[LOAD_11:.*]] = tt.load %{{.*}} +// CHECK: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc %[[LOAD_10]] +// CHECK: %[[TRANS_13:.*]] = tt.trans %[[LOCAL_ALLOC_12]] {order = array} +// CHECK: %[[LOCAL_LOAD_14:.*]] = triton_gpu.local_load %[[TRANS_13]] +// CHECK: %[[LOCAL_ALLOC_15:.*]] = triton_gpu.local_alloc +// CHECK: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_15]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_11]], %[[MEMDESC_SUBVIEW_16]] +// CHECK: %{{.*}}:3 = scf.for %[[ARG2:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG3:.*]] = %{{.*}}-1_i32, %[[ARG4:.*]] = %{{.*}}, %[[ARG5:.*]] = %[[MEMDESC_SUBVIEW_16]]) + +// CHECK: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG2]], %{{.*}} +// CHECK: %[[SPLAT_19:.*]] = tt.splat %[[CMPI_18]] +// CHECK: %[[LOAD_20:.*]] = tt.load %{{.*}}, %[[SPLAT_19]] +// CHECK: %[[ADDI_21:.*]] = arith.addi %[[ARG3]], %{{.*}} +// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ADDI_21]], %{{.*}} +// CHECK: %[[SELECT_23:.*]] = arith.select %[[CMPI_22]], %[[ADDI_21]], 
%{{.*}} +// CHECK: %[[LOCAL_LOAD_24:.*]] = triton_gpu.local_load %[[ARG5]] +// CHECK: %[[DOT_25:.*]] = tt.dot %[[LOCAL_LOAD_24]], %[[LOCAL_LOAD_14]], %{{.*}} +// CHECK: %[[CONVERT_LAYOUT_26:.*]] = triton_gpu.convert_layout %[[DOT_25]] +// CHECK: tt.store %{{.*}}, %[[CONVERT_LAYOUT_26]] +// CHECK: %[[ADDI_27:.*]] = arith.addi %[[ARG4]], %{{.*}} +// CHECK: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} +// CHECK: %[[SELECT_29:.*]] = arith.select %[[CMPI_28]], %[[ADDI_27]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_30:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_15]][%[[SELECT_29]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_20]], %[[MEMDESC_SUBVIEW_30]] +// CHECK: scf.yield %[[SELECT_23]], %[[SELECT_29]], %[[MEMDESC_SUBVIEW_30]] +// CHECK: } + + tt.func public @nested_loops(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { + %c-1_i32 = arith.constant -1 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c2_i32 = arith.constant 2 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_0 = arith.constant dense<16> : tensor<16x1xi32, #blocked> + %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %1 = tt.expand_dims %0 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> + %2 = arith.muli %1, %cst_0 : tensor<16x1xi32, #blocked> + %3 = tt.splat %arg0 : !tt.ptr -> tensor<16x1x!tt.ptr, #blocked> + %4 = tt.addptr %3, %2 : tensor<16x1x!tt.ptr, #blocked>, tensor<16x1xi32, #blocked> + %5 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %6 = tt.expand_dims %5 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> + %7 = tt.broadcast %4 : tensor<16x1x!tt.ptr, #blocked> -> tensor<16x16x!tt.ptr, #blocked> + %8 = 
tt.broadcast %6 : tensor<1x16xi32, #blocked> -> tensor<16x16xi32, #blocked> + %9 = tt.addptr %7, %8 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> + scf.for %arg1 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { + %10 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> + %11 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> + %12 = triton_gpu.local_alloc %10 : (tensor<16x16xf32, #blocked>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> + %13 = tt.trans %12 {order = array} : !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> + %14 = triton_gpu.local_load %13 : !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %15 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf32, #shared, #triton_gpu.shared_memory, mutable> + %16 = triton_gpu.memdesc_subview %15[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %11, %16 : tensor<16x16xf32, #blocked> -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> + %17:3 = scf.for %arg2 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg3 = %c-1_i32, %arg4 = %c0_i32, %arg5 = %16) -> (i32, i32, !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable>) : i32 { + %18 = arith.cmpi slt, %arg2, %c1_i32 : i32 + %19 = tt.splat %18 : i1 -> tensor<16x16xi1, #blocked> + %20 = tt.load %9, %19 : tensor<16x16x!tt.ptr, #blocked> + %21 = arith.addi %arg3, %c1_i32 : i32 + %22 = arith.cmpi slt, %21, %c1_i32 : i32 + %23 = arith.select %22, %21, %c0_i32 : i32 + %24 = triton_gpu.local_load %arg5 : !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %25 = tt.dot %24, %14, %cst : tensor<16x16xf32, 
#triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<16x16xf32, #mma> + %26 = triton_gpu.convert_layout %25 : tensor<16x16xf32, #mma> -> tensor<16x16xf32, #blocked> + tt.store %9, %26 : tensor<16x16x!tt.ptr, #blocked> + %27 = arith.addi %arg4, %c1_i32 : i32 + %28 = arith.cmpi slt, %27, %c1_i32 : i32 + %29 = arith.select %28, %27, %c0_i32 : i32 + %30 = triton_gpu.memdesc_subview %15[%29, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %20, %30 : tensor<16x16xf32, #blocked> -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> + scf.yield %23, %29, %30 : i32, i32, !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> + } + triton_gpu.local_dealloc %15 : !tt.memdesc<1x16x16xf32, #shared, #triton_gpu.shared_memory, mutable> + } + tt.return + } +} + +// ----- +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = []}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 4, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { + +// CHECK-LABEL: tt.func @load_convert_layout +// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) + 
+// CHECK: %[[SUBI_24:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[SUBI_25:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] +// CHECK: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_26]] +// CHECK: %[[ADDPTR_28:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[LOAD_29:.*]] = tt.load %[[ADDPTR_28]], %[[SPLAT_27]] +// CHECK: %[[EXPAND_DIMS_30:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} +// CHECK: %[[BROADCAST_31:.*]] = tt.broadcast %[[EXPAND_DIMS_30]] +// CHECK: %[[MULI_32:.*]] = arith.muli %{{.*}}, %[[BROADCAST_31]] +// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_26]] +// CHECK: %[[ADDPTR_34:.*]] = tt.addptr %{{.*}}, %[[MULI_32]] +// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_34]], %[[SPLAT_33]] +// CHECK: %[[CMPI_36:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_24]] +// CHECK: %[[SPLAT_37:.*]] = tt.splat %[[CMPI_36]] +// CHECK: %[[ANDI_38:.*]] = arith.andi %[[SPLAT_37]], %{{.*}} +// CHECK: %[[ADDPTR_39:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// CHECK: %[[LOAD_40:.*]] = tt.load %[[ADDPTR_39]], %[[ANDI_38]] +// CHECK: %[[ADDI_41:.*]] = arith.addi %[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_42:.*]] = arith.cmpi slt, %[[ADDI_41]], %{{.*}} +// CHECK: %[[SELECT_43:.*]] = arith.select %[[CMPI_42]], %[[ADDI_41]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_44:.*]] = triton_gpu.local_load %[[ARG12]] +// CHECK: %[[LOCAL_LOAD_45:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[DOT_46:.*]] = tt.dot %[[LOCAL_LOAD_44]], %[[LOCAL_LOAD_45]], %[[ARG7]] +// CHECK: %[[ADDI_47:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_48:.*]] = arith.cmpi slt, %[[ADDI_47]], %{{.*}} +// CHECK: %[[SELECT_49:.*]] = arith.select %[[CMPI_48]], %[[ADDI_47]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_50:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_49]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_29]], %[[MEMDESC_SUBVIEW_50]] +// CHECK: %[[MEMDESC_SUBVIEW_51:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_49]], %{{.*}}, %{{.*}}] +// 
CHECK: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_51]] +// CHECK: scf.yield %[[DOT_46]], %[[ADDPTR_28]], %[[ADDPTR_39]], %[[SELECT_43]], %[[SELECT_49]], %[[MEMDESC_SUBVIEW_50]], %[[MEMDESC_SUBVIEW_51]], %[[LOAD_40]] +// CHECK: } + + tt.func @load_convert_layout(%arg0: tensor<16x16xi64, #blocked> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg1: index, %arg2: tensor<16x16x!tt.ptr, #blocked1> {tt.contiguity = 2 : i32, tt.divisibility = 16 : i32}, %arg3: tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg4: tensor<16x16xi32, #blocked1> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg5: tensor<16x16x!tt.ptr, #blocked> {tt.contiguity = 16 : i32, tt.divisibility = 16 : i32}) -> tensor<16x16xf32, #mma> { + %c2 = arith.constant 2 : index + %cst = arith.constant dense<2> : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %c1 = arith.constant 1 : index + %1 = arith.cmpi sgt, %arg1, %c1 : index + %2 = arith.cmpi slt, %0, %cst : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %3 = tt.splat %1 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %cst_0 = arith.constant dense<1> : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %4 = arith.andi %3, %2 : tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %5 = tt.addptr %arg3, %cst_0 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %6 = tt.load %5, %4 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %c0 = arith.constant 0 : index + %7 = arith.cmpi sgt, %arg1, %c0 : index + %8 = tt.splat %7 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %9 = arith.andi %8, %2 : tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %10 = 
tt.load %arg3, %9 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %11 = tt.expand_dims %10 {axis = 1 : i32} : tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi64, #blocked> + %12 = tt.broadcast %11 : tensor<16x1xi64, #blocked> -> tensor<16x16xi64, #blocked> + %13 = arith.muli %arg0, %12 : tensor<16x16xi64, #blocked> + %14 = tt.splat %7 : i1 -> tensor<16x16xi1, #blocked> + %15 = tt.addptr %arg5, %13 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> + %16 = tt.load %15, %14 : tensor<16x16x!tt.ptr, #blocked> + %17 = tt.splat %7 : i1 -> tensor<16x16xi1, #blocked1> + %18 = tt.load %arg2, %17 : tensor<16x16x!tt.ptr, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %c-1_i32 = arith.constant -1 : i32 + %c1_i32 = arith.constant 1 : i32 + %cst_1 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> + %19 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %20 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %21 = triton_gpu.memdesc_subview %19[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %18, %21 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %22 = triton_gpu.memdesc_subview %20[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %16, %22 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %23:8 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args(%arg7 = %cst_1, %arg8 = %arg2, %arg9 = %5, %arg10 = %c-1_i32, %arg11 = %c0_i32, %arg12 = %21, %arg13 = %22, %arg14 = %6) -> (tensor<16x16xf32, #mma>, 
tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, i32, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) { + %24 = arith.subi %arg1, %c2 : index + %25 = arith.cmpi slt, %arg6, %24 : index + %26 = tt.splat %25 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %27 = arith.andi %26, %2 : tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %28 = tt.addptr %arg9, %cst_0 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %29 = tt.load %28, %27 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %30 = arith.subi %arg1, %c1 : index + %31 = arith.cmpi slt, %arg6, %30 : index + %32 = tt.expand_dims %arg14 {axis = 1 : i32} : tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi64, #blocked> + %33 = tt.broadcast %32 : tensor<16x1xi64, #blocked> -> tensor<16x16xi64, #blocked> + %34 = arith.muli %arg0, %33 : tensor<16x16xi64, #blocked> + %35 = tt.splat %31 : i1 -> tensor<16x16xi1, #blocked> + %36 = tt.addptr %arg5, %34 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> + %37 = tt.load %36, %35 : tensor<16x16x!tt.ptr, #blocked> + %38 = tt.splat %31 : i1 -> tensor<16x16xi1, #blocked1> + %39 = tt.addptr %arg8, %arg4 : tensor<16x16x!tt.ptr, #blocked1>, tensor<16x16xi32, #blocked1> + %40 = tt.load %39, %38 : tensor<16x16x!tt.ptr, #blocked1> + %41 = arith.addi %arg10, %c1_i32 : i32 + %42 = arith.cmpi slt, %41, %c1_i32 : i32 + %43 = arith.select %42, %41, %c0_i32 : i32 + %44 = triton_gpu.local_load %arg12 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %45 = triton_gpu.local_load %arg13 : 
!tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %46 = tt.dot %44, %45, %arg7 : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma> + %47 = arith.addi %arg11, %c1_i32 : i32 + %48 = arith.cmpi slt, %47, %c1_i32 : i32 + %49 = arith.select %48, %47, %c0_i32 : i32 + %50 = triton_gpu.memdesc_subview %19[%49, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %40, %50 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %51 = triton_gpu.memdesc_subview %20[%49, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %37, %51 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + scf.yield %46, %39, %28, %43, %49, %50, %51, %29 : tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, i32, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + } + triton_gpu.local_dealloc %19 : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_dealloc %20 : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> + tt.return %23#0 : tensor<16x16xf32, #mma> + } +} + +// ----- +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 2], order = [0, 1]}> +#mma = 
#triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 1], instrShape = [16, 8]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { + +// CHECK-LABEL: tt.func public @matmul_indirect_pipeline +// CHECK: %{{.*}}:4 = scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG5:.*]] = %{{.*}}-1_i32, %[[ARG6:.*]] = %{{.*}}-1_i32, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}) + +// CHECK: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} +// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} +// CHECK: %[[SPLAT_22:.*]] = tt.splat %[[CMPI_21]] +// CHECK: %[[ADDPTR_23:.*]] = tt.addptr %{{.*}}, %[[ARG8]] +// CHECK: %[[LOAD_24:.*]] = tt.load %[[ADDPTR_23]], %[[SPLAT_22]] +// CHECK: %[[SPLAT_25:.*]] = tt.splat %[[CMPI_20]] +// CHECK: %[[LOAD_26:.*]] = tt.load %{{.*}}, %[[SPLAT_25]] +// CHECK: %[[ADDI_27:.*]] = arith.addi %[[ARG5]], %{{.*}} +// CHECK: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} +// CHECK: %[[SELECT_29:.*]] = arith.select %[[CMPI_28]], %[[ADDI_27]], %{{.*}} +// CHECK: %[[ADDI_30:.*]] = arith.addi %[[ARG6]], %{{.*}} +// CHECK: %[[CMPI_31:.*]] = arith.cmpi slt, %[[ADDI_30]], %{{.*}} +// CHECK: %[[SELECT_32:.*]] = arith.select %[[CMPI_31]], %[[ADDI_30]], %{{.*}} +// CHECK: %[[EXPAND_DIMS_33:.*]] = tt.expand_dims %[[ARG7]] {axis = 0 : i32} +// CHECK: %[[BROADCAST_34:.*]] = tt.broadcast %[[EXPAND_DIMS_33]] +// CHECK: %[[ADDF_35:.*]] = arith.addf %{{.*}}, %[[BROADCAST_34]] +// CHECK: %[[CONVERT_LAYOUT_36:.*]] = triton_gpu.convert_layout %{{.*}} +// CHECK: %[[CONVERT_LAYOUT_37:.*]] = triton_gpu.convert_layout %[[ADDF_35]] +// CHECK: %[[DOT_38:.*]] = tt.dot %[[CONVERT_LAYOUT_36]], %[[CONVERT_LAYOUT_37]], %{{.*}} +// CHECK: %[[CONVERT_LAYOUT_39:.*]] = triton_gpu.convert_layout %[[DOT_38]] +// CHECK: tt.store %{{.*}}, %[[CONVERT_LAYOUT_39]] +// CHECK: scf.yield %[[SELECT_29]], %[[SELECT_32]], 
%[[LOAD_24]], %[[LOAD_26]] +// CHECK: } + + tt.func public @matmul_indirect_pipeline(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { + %c-1_i32 = arith.constant -1 : i32 + %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %1 = tt.splat %arg1 : !tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %2 = tt.addptr %1, %0 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %3 = tt.load %2 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %4 = tt.load %2 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %5 = tt.splat %arg2 : !tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %6 = tt.addptr %5, %4 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi64, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %7 = tt.load %6 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %8 = tt.expand_dims %0 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %9 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %10 = tt.expand_dims %9 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> + %11 = tt.broadcast %8 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> + %12 = tt.broadcast %10 : tensor<32x1xi32, #blocked> -> tensor<32x32xi32, #blocked> + %13 = arith.addi %12, %11 : tensor<32x32xi32, #blocked> + %14 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> + %15 = tt.addptr %14, %13 : tensor<32x32x!tt.ptr, #blocked>, 
tensor<32x32xi32, #blocked> + %16 = tt.load %15 : tensor<32x32x!tt.ptr, #blocked> + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c2_i32 = arith.constant 2 : i32 + %c0_i32 = arith.constant 0 : i32 + %17 = tt.splat %arg3 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> + %18 = tt.addptr %17, %13 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %19:4 = scf.for %arg4 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg5 = %c-1_i32, %arg6 = %c-1_i32, %arg7 = %7, %arg8 = %3) -> (i32, i32, tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi64, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) : i32 { + %20 = arith.cmpi slt, %arg4, %c0_i32 : i32 + %21 = tt.splat %20 : i1 -> tensor<32xi1, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %22 = tt.load %2, %21 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %23 = arith.cmpi slt, %arg4, %c1_i32 : i32 + %24 = tt.splat %23 : i1 -> tensor<32xi1, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %25 = tt.addptr %5, %arg8 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi64, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %26 = tt.load %25, %24 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %27 = arith.addi %arg5, %c1_i32 : i32 + %28 = arith.cmpi slt, %27, %c1_i32 : i32 + %29 = arith.select %28, %27, %c0_i32 : i32 + %30 = arith.addi %arg6, %c1_i32 : i32 + %31 = arith.cmpi slt, %30, %c1_i32 : i32 + %32 = arith.select %31, %30, %c0_i32 : i32 + %33 = tt.expand_dims %arg7 {axis = 0 : i32} : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xf32, #blocked> + %34 = tt.broadcast %33 : tensor<1x32xf32, #blocked> -> tensor<32x32xf32, #blocked> + %35 = arith.addf %16, %34 : tensor<32x32xf32, #blocked> + %36 = triton_gpu.convert_layout %16 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, 
kWidth = 1}>> + %37 = triton_gpu.convert_layout %35 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %38 = tt.dot %36, %37, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> + %39 = triton_gpu.convert_layout %38 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> + tt.store %18, %39 : tensor<32x32x!tt.ptr, #blocked> + scf.yield %29, %32, %26, %22 : i32, i32, tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi64, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + } + tt.return + } +} + +// ----- +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = []}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80"} { + +// CHECK-LABEL: tt.func @matmul_nested_ops +// CHECK: %{{.*}}:5 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}) + +// CHECK: %[[SUBI_19:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_19]] +// CHECK: %[[ADDI_21:.*]] = arith.addi %[[ARG6]], %{{.*}} +// CHECK: %[[ADDPTR_22:.*]] = tt.addptr %[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_21]], %{{.*}} +// CHECK: %[[SPLAT_24:.*]] = tt.splat %[[CMPI_20]] +// CHECK: %[[IF_25:.*]] = scf.if %[[CMPI_23]] -> 
(tensor<128x32x!tt.ptr, #blocked1>) { + +// CHECK: %[[ADDPTR_37:.*]] = tt.addptr %[[ADDPTR_22]], %{{.*}} +// CHECK: scf.yield %[[ADDPTR_37]] +// CHECK: } else { + +// CHECK: scf.yield %[[ADDPTR_22]] +// CHECK: } + +// CHECK: %[[LOAD_26:.*]] = tt.load %[[IF_25]], %[[SPLAT_24]] +// CHECK: %[[ADDI_27:.*]] = arith.addi %[[ARG8]], %{{.*}} +// CHECK: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} +// CHECK: %[[SELECT_29:.*]] = arith.select %[[CMPI_28]], %[[ADDI_27]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_30:.*]] = triton_gpu.local_load %[[ARG11]] +// CHECK: %[[CONVERT_LAYOUT_31:.*]] = triton_gpu.convert_layout %{{.*}} +// CHECK: %[[DOT_32:.*]] = tt.dot %[[LOCAL_LOAD_30]], %[[CONVERT_LAYOUT_31]], %[[ARG7]] +// CHECK: %[[ADDI_33:.*]] = arith.addi %[[ARG9]], %{{.*}} +// CHECK: %[[CMPI_34:.*]] = arith.cmpi slt, %[[ADDI_33]], %{{.*}} +// CHECK: %[[SELECT_35:.*]] = arith.select %[[CMPI_34]], %[[ADDI_33]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_36:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_35]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_26]], %[[MEMDESC_SUBVIEW_36]] +// CHECK: scf.yield %[[DOT_32]], %[[SELECT_29]], %[[SELECT_35]], %[[IF_25]], %[[MEMDESC_SUBVIEW_36]] +// CHECK: } + + tt.func @matmul_nested_ops(%arg0: index, %arg1: index, %arg2: index, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: index) -> tensor<128x128xf32, #mma> { + %c1_i32 = arith.constant 1 : i32 + %0 = arith.cmpi slt, %arg0, %arg1 : index + %1 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %3 = tt.broadcast %2 : tensor<1x32xi32, #blocked> -> tensor<128x32xi32, #blocked> + %4 = tt.splat %arg3 : !tt.ptr -> tensor<128x32x!tt.ptr, #blocked> + %5 = tt.addptr %4, %3 : tensor<128x32x!tt.ptr, #blocked>, tensor<128x32xi32, 
#blocked> + %cst = arith.constant dense<4> : tensor<128x32xi32, #blocked> + %6 = arith.cmpi slt, %arg0, %arg5 : index + %7 = tt.splat %0 : i1 -> tensor<128x32xi1, #blocked> + %8 = scf.if %6 -> (tensor<128x32x!tt.ptr, #blocked>) { + %19 = tt.addptr %5, %cst : tensor<128x32x!tt.ptr, #blocked>, tensor<128x32xi32, #blocked> + scf.yield %19 : tensor<128x32x!tt.ptr, #blocked> + } else { + scf.yield %5 : tensor<128x32x!tt.ptr, #blocked> + } + %9 = tt.load %8, %7 : tensor<128x32x!tt.ptr, #blocked> + %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %11 = tt.expand_dims %10 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> + %12 = tt.broadcast %11 : tensor<1x128xi32, #blocked1> -> tensor<32x128xi32, #blocked1> + %13 = tt.splat %arg4 : !tt.ptr -> tensor<32x128x!tt.ptr, #blocked1> + %14 = tt.addptr %13, %12 : tensor<32x128x!tt.ptr, #blocked1>, tensor<32x128xi32, #blocked1> + %15 = tt.load %14 : tensor<32x128x!tt.ptr, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %c-1_i32 = arith.constant -1 : i32 + %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> + %16 = triton_gpu.local_alloc : () -> !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %17 = triton_gpu.memdesc_subview %16[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %9, %17 : tensor<128x32xf16, #blocked> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + %18:5 = scf.for %arg6 = %arg0 to %arg1 step %arg2 iter_args(%arg7 = %cst_0, %arg8 = %c-1_i32, %arg9 = %c0_i32, %arg10 = %8, %arg11 = %17) -> (tensor<128x128xf32, #mma>, i32, i32, tensor<128x32x!tt.ptr, #blocked>, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>) { + %19 = arith.subi %arg1, %arg2 : 
index + %20 = arith.cmpi slt, %arg6, %19 : index + %21 = arith.addi %arg6, %arg2 : index + %22 = tt.addptr %arg10, %cst : tensor<128x32x!tt.ptr, #blocked>, tensor<128x32xi32, #blocked> + %23 = arith.cmpi slt, %21, %arg5 : index + %24 = tt.splat %20 : i1 -> tensor<128x32xi1, #blocked> + %25 = scf.if %23 -> (tensor<128x32x!tt.ptr, #blocked>) { + %37 = tt.addptr %22, %cst : tensor<128x32x!tt.ptr, #blocked>, tensor<128x32xi32, #blocked> + scf.yield %37 : tensor<128x32x!tt.ptr, #blocked> + } else { + scf.yield %22 : tensor<128x32x!tt.ptr, #blocked> + } + %26 = tt.load %25, %24 : tensor<128x32x!tt.ptr, #blocked> + %27 = arith.addi %arg8, %c1_i32 : i32 + %28 = arith.cmpi slt, %27, %c1_i32 : i32 + %29 = arith.select %28, %27, %c0_i32 : i32 + %30 = triton_gpu.local_load %arg11 : !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %31 = triton_gpu.convert_layout %15 : tensor<32x128xf16, #blocked1> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %32 = tt.dot %30, %31, %arg7 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma> + %33 = arith.addi %arg9, %c1_i32 : i32 + %34 = arith.cmpi slt, %33, %c1_i32 : i32 + %35 = arith.select %34, %33, %c0_i32 : i32 + %36 = triton_gpu.memdesc_subview %16[%35, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + triton_gpu.local_store %26, %36 : tensor<128x32xf16, #blocked> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + scf.yield %32, %29, %35, %25, %36 : tensor<128x128xf32, #mma>, i32, i32, tensor<128x32x!tt.ptr, #blocked>, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + } + triton_gpu.local_dealloc %16 : 
!tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> + tt.return %18#0 : tensor<128x128xf32, #mma> + } +} + +// ----- +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { + +// CHECK-LABEL: tt.func @dot_prologue_epilogue +// CHECK: %{{.*}}:6 = scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG5:.*]] = %{{.*}}, %[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}) + +// CHECK: %[[CMPI_12:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} +// CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} +// CHECK: %[[IF_14:.*]] = scf.if %[[CMPI_13]] -> (tensor<64x16x!tt.ptr, #blocked>) { + +// CHECK: %[[ADDPTR_30:.*]] = tt.addptr %[[ARG6]], %{{.*}} +// CHECK: scf.yield %[[ADDPTR_30]] +// CHECK: } else { + +// CHECK: scf.yield %[[ARG6]] +// CHECK: } + +// CHECK: %[[LOAD_15:.*]] = tt.load %[[IF_14]] +// CHECK: %[[SPLAT_16:.*]] = tt.splat %[[CMPI_12]] +// CHECK: %[[ADDPTR_17:.*]] = tt.addptr %[[ARG7]], %{{.*}} +// CHECK: %[[LOAD_18:.*]] = tt.load %[[ADDPTR_17]], %[[SPLAT_16]] +// CHECK: %[[LOCAL_ALLOC_19:.*]] = triton_gpu.local_alloc %[[LOAD_15]] +// CHECK: %[[ADDI_20:.*]] = arith.addi %[[ARG8]], %{{.*}} +// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ADDI_20]], %{{.*}} +// CHECK: 
%[[SELECT_22:.*]] = arith.select %[[CMPI_21]], %[[ADDI_20]], %{{.*}} +// CHECK: %[[ADDI_23:.*]] = arith.addi %[[ARG9]], %{{.*}} +// CHECK: %[[CMPI_24:.*]] = arith.cmpi slt, %[[ADDI_23]], %{{.*}} +// CHECK: %[[SELECT_25:.*]] = arith.select %[[CMPI_24]], %[[ADDI_23]], %{{.*}} +// CHECK: %[[LOCAL_ALLOC_26:.*]] = triton_gpu.local_alloc %[[ARG10]] +// CHECK: %[[WARP_GROUP_DOT_27:.*]] = triton_nvidia_gpu.warp_group_dot %[[LOCAL_ALLOC_26]], %[[LOCAL_ALLOC_19]], %[[ARG5]] +// CHECK: %[[ADDPTR_28:.*]] = tt.addptr %[[ARG6]], %{{.*}} +// CHECK: %[[IF_29:.*]] = scf.if %[[CMPI_13]] -> (tensor<128x16xf32, #mma>) { + +// CHECK: %[[MULF_30:.*]] = arith.mulf %[[WARP_GROUP_DOT_27]], %{{.*}} +// CHECK: scf.yield %[[MULF_30]] +// CHECK: } else { + +// CHECK: scf.yield %[[WARP_GROUP_DOT_27]] +// CHECK: } + +// CHECK: scf.yield %[[IF_29]], %[[ADDPTR_28]], %[[ADDPTR_17]], %[[SELECT_22]], %[[SELECT_25]], %[[LOAD_18]] +// CHECK: } + + tt.func @dot_prologue_epilogue(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma> { + %c7_i32 = arith.constant 7 : i32 + %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %1 = tt.expand_dims %0 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %2 = tt.broadcast %1 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> + %3 = tt.splat %arg1 : !tt.ptr -> tensor<128x64x!tt.ptr, #blocked1> + %4 = tt.addptr %3, %2 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %5 = tt.load %4 : tensor<128x64x!tt.ptr, #blocked1> + %c-1_i32 = arith.constant -1 : i32 + %cst = arith.constant dense<0> : tensor<64x16xi32, #blocked> + %cst_0 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<0.000000e+00> : 
tensor<128x16xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = arith.constant 8 : i32 + %6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %8 = tt.splat %arg0 : !tt.ptr -> tensor<64x16x!tt.ptr, #blocked> + %9 = tt.broadcast %7 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> + %10 = tt.addptr %8, %9 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + %11:6 = scf.for %arg4 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg5 = %cst_1, %arg6 = %10, %arg7 = %4, %arg8 = %c-1_i32, %arg9 = %c-1_i32, %arg10 = %5) -> (tensor<128x16xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>, i32, i32, tensor<128x64xf16, #blocked1>) : i32 { + %12 = arith.cmpi slt, %arg4, %c7_i32 : i32 + %13 = tt.splat %12 : i1 -> tensor<128x64xi1, #blocked1> + %14 = tt.addptr %arg7, %cst_0 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %15 = tt.load %14, %13 : tensor<128x64x!tt.ptr, #blocked1> + %16 = arith.cmpi slt, %arg4, %arg2 : i32 + %17 = scf.if %16 -> (tensor<64x16x!tt.ptr, #blocked>) { + %30 = tt.addptr %arg6, %arg3 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + scf.yield %30 : tensor<64x16x!tt.ptr, #blocked> + } else { + scf.yield %arg6 : tensor<64x16x!tt.ptr, #blocked> + } + %18 = tt.load %17 : tensor<64x16x!tt.ptr, #blocked> + %19 = arith.addi %arg8, %c1_i32 : i32 + %20 = arith.cmpi slt, %19, %c1_i32 : i32 + %21 = arith.select %20, %19, %c0_i32 : i32 + %22 = arith.addi %arg9, %c1_i32 : i32 + %23 = arith.cmpi slt, %22, %c1_i32 : i32 + %24 = arith.select %23, %22, %c0_i32 : i32 + %25 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> + %26 = triton_gpu.local_alloc %arg10 : (tensor<128x64xf16, #blocked1>) -> 
!tt.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory> + %27 = triton_nvidia_gpu.warp_group_dot %26, %25, %arg5 : !tt.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma> + %28 = tt.addptr %arg6, %cst : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + %29 = scf.if %16 -> (tensor<128x16xf32, #mma>) { + %30 = arith.mulf %27, %cst_1 : tensor<128x16xf32, #mma> + scf.yield %30 : tensor<128x16xf32, #mma> + } else { + scf.yield %27 : tensor<128x16xf32, #mma> + } + scf.yield %29, %28, %14, %21, %24, %15 : tensor<128x16xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>, i32, i32, tensor<128x64xf16, #blocked1> + } + tt.return %11#0 : tensor<128x16xf32, #mma> + } +} + +// ----- +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { + +// CHECK-LABEL: tt.func @pipeline_downstream_dependencies +// CHECK: %{{.*}}:6 = scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG5:.*]] = %{{.*}}, %[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}) + +// CHECK: %[[CMPI_12:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} +// CHECK: %[[LOAD_13:.*]] = tt.load %[[ARG6]] +// CHECK: 
%[[SPLAT_14:.*]] = tt.splat %[[CMPI_12]] +// CHECK: %[[ADDPTR_15:.*]] = tt.addptr %[[ARG7]], %{{.*}} +// CHECK: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_15]], %[[SPLAT_14]] +// CHECK: %[[LOCAL_ALLOC_17:.*]] = triton_gpu.local_alloc %[[LOAD_13]] +// CHECK: %[[ADDI_18:.*]] = arith.addi %[[ARG8]], %{{.*}} +// CHECK: %[[CMPI_19:.*]] = arith.cmpi slt, %[[ADDI_18]], %{{.*}} +// CHECK: %[[SELECT_20:.*]] = arith.select %[[CMPI_19]], %[[ADDI_18]], %{{.*}} +// CHECK: %[[ADDI_21:.*]] = arith.addi %[[ARG9]], %{{.*}} +// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ADDI_21]], %{{.*}} +// CHECK: %[[SELECT_23:.*]] = arith.select %[[CMPI_22]], %[[ADDI_21]], %{{.*}} +// CHECK: %[[LOCAL_ALLOC_24:.*]] = triton_gpu.local_alloc %[[ARG10]] +// CHECK: %[[WARP_GROUP_DOT_25:.*]] = triton_nvidia_gpu.warp_group_dot %[[LOCAL_ALLOC_24]], %[[LOCAL_ALLOC_17]], %[[ARG5]] +// CHECK: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} +// CHECK: %[[SELECT_27:.*]] = arith.select %[[CMPI_26]], %{{.*}}, %{{.*}} +// CHECK: %[[IF_28:.*]] = scf.if %[[CMPI_26]] -> (tensor<128x16xf32, #mma>) { + +// CHECK: %[[MULF_30:.*]] = arith.mulf %[[WARP_GROUP_DOT_25]], %{{.*}} +// CHECK: scf.yield %[[MULF_30]] +// CHECK: } else { + +// CHECK: scf.yield %[[WARP_GROUP_DOT_25]] +// CHECK: } + +// CHECK: %[[ADDPTR_29:.*]] = tt.addptr %[[ARG6]], %[[SELECT_27]] +// CHECK: scf.yield %[[IF_28]], %[[ADDPTR_29]], %[[ADDPTR_15]], %[[SELECT_20]], %[[SELECT_23]], %[[LOAD_16]] +// CHECK: } + + tt.func @pipeline_downstream_dependencies(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma> { + %c7_i32 = arith.constant 7 : i32 + %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %1 = tt.expand_dims %0 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %2 = 
tt.broadcast %1 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> + %3 = tt.splat %arg1 : !tt.ptr -> tensor<128x64x!tt.ptr, #blocked1> + %4 = tt.addptr %3, %2 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %5 = tt.load %4 : tensor<128x64x!tt.ptr, #blocked1> + %c-1_i32 = arith.constant -1 : i32 + %cst = arith.constant dense<0> : tensor<64x16xi32, #blocked> + %cst_0 = arith.constant dense<1> : tensor<64x16xi32, #blocked> + %cst_1 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = arith.constant 8 : i32 + %6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %8 = tt.splat %arg0 : !tt.ptr -> tensor<64x16x!tt.ptr, #blocked> + %9 = tt.broadcast %7 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> + %10 = tt.addptr %8, %9 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + %11:6 = scf.for %arg4 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg5 = %cst_2, %arg6 = %10, %arg7 = %4, %arg8 = %c-1_i32, %arg9 = %c-1_i32, %arg10 = %5) -> (tensor<128x16xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>, i32, i32, tensor<128x64xf16, #blocked1>) : i32 { + %12 = arith.cmpi slt, %arg4, %c7_i32 : i32 + %13 = tt.splat %12 : i1 -> tensor<128x64xi1, #blocked1> + %14 = tt.addptr %arg7, %cst_1 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %15 = tt.load %14, %13 : tensor<128x64x!tt.ptr, #blocked1> + %16 = tt.load %arg6 : tensor<64x16x!tt.ptr, #blocked> + %17 = arith.addi %arg8, %c1_i32 : i32 + %18 = arith.cmpi slt, %17, %c1_i32 : i32 + %19 = arith.select %18, %17, %c0_i32 : i32 + %20 = arith.addi %arg9, %c1_i32 : i32 + %21 = 
arith.cmpi slt, %20, %c1_i32 : i32 + %22 = arith.select %21, %20, %c0_i32 : i32 + %23 = triton_gpu.local_alloc %16 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> + %24 = triton_gpu.local_alloc %arg10 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory> + %25 = triton_nvidia_gpu.warp_group_dot %24, %23, %arg5 : !tt.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma> + %26 = arith.cmpi slt, %arg4, %arg2 : i32 + %27 = arith.select %26, %cst, %cst_0 : tensor<64x16xi32, #blocked> + %28 = scf.if %26 -> (tensor<128x16xf32, #mma>) { + %30 = arith.mulf %25, %cst_2 : tensor<128x16xf32, #mma> + scf.yield %30 : tensor<128x16xf32, #mma> + } else { + scf.yield %25 : tensor<128x16xf32, #mma> + } + %29 = tt.addptr %arg6, %27 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + scf.yield %28, %29, %14, %19, %22, %15 : tensor<128x16xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>, i32, i32, tensor<128x64xf16, #blocked1> + } + tt.return %11#0 : tensor<128x16xf32, #mma> + } +} + +// ----- +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:90", "triton_gpu.threads-per-warp" = 32 : i32} { + +// CHECK-LABEL: tt.func public @masked_add_kernel +// CHECK: %{{.*}}:10 = scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG5:.*]] = %{{.*}}-1_i32, %[[ARG6:.*]] = %{{.*}}-1_i32, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) + +// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} +// CHECK: %[[ADDI_24:.*]] = arith.addi 
%[[ARG4]], %{{.*}} +// CHECK: %[[ADDI_25:.*]] = arith.addi %{{.*}}, %[[ADDI_24]] +// CHECK: %[[SPLAT_26:.*]] = tt.splat %[[ADDI_25]] +// CHECK: %[[ADDI_27:.*]] = arith.addi %[[SPLAT_26]], %{{.*}} +// CHECK: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} +// CHECK: %[[SPLAT_29:.*]] = tt.splat %[[CMPI_23]] +// CHECK: %[[ANDI_30:.*]] = arith.andi %[[SPLAT_29]], %[[CMPI_28]] +// CHECK: %[[ADDPTR_31:.*]] = tt.addptr %{{.*}}, %[[ADDI_27]] +// CHECK: %[[LOAD_32:.*]] = tt.load %[[ADDPTR_31]], %[[ANDI_30]], %{{.*}} +// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_23]] +// CHECK: %[[ANDI_34:.*]] = arith.andi %[[SPLAT_33]], %[[CMPI_28]] +// CHECK: %[[ADDPTR_35:.*]] = tt.addptr %{{.*}}, %[[ADDI_27]] +// CHECK: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_35]], %[[ANDI_34]], %{{.*}} +// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG5]], %{{.*}} +// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// CHECK: %[[ADDI_40:.*]] = arith.addi %[[ARG6]], %{{.*}} +// CHECK: %[[CMPI_41:.*]] = arith.cmpi slt, %[[ADDI_40]], %{{.*}} +// CHECK: %[[SELECT_42:.*]] = arith.select %[[CMPI_41]], %[[ADDI_40]], %{{.*}} +// CHECK: %[[ADDF_43:.*]] = arith.addf %[[ARG7]], %[[ARG9]] +// CHECK: %[[ADDPTR_44:.*]] = tt.addptr %{{.*}}, %[[ARG11]] +// CHECK: tt.store %[[ADDPTR_44]], %[[ADDF_43]], %[[ARG13]] +// CHECK: scf.yield %[[SELECT_39]], %[[SELECT_42]], %[[ARG8]], %[[LOAD_32]], %[[ARG10]], %[[LOAD_36]], %[[ARG12]], %[[ADDI_27]], %[[ARG14]], %[[CMPI_28]] +// CHECK: } + + tt.func public @masked_add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c2048_i32 = arith.constant 2048 : i32 + %c1016800_i32 = arith.constant 1016800 : i32 + %0 = tt.get_program_id x : i32 + %c1024_i32 = arith.constant 1024 : i32 + %1 = arith.muli 
%0, %c1016800_i32 : i32 + %2 = arith.addi %1, %c1024_i32 : i32 + %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %4 = tt.splat %2 : i32 -> tensor<1024xi32, #blocked> + %5 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> + %6 = arith.addi %4, %3 : tensor<1024xi32, #blocked> + %7 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %cst = arith.constant dense<0xFF800000> : tensor<1024xf32, #blocked> + %8 = arith.cmpi slt, %6, %5 : tensor<1024xi32, #blocked> + %9 = tt.addptr %7, %6 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %10 = tt.load %9, %8, %cst : tensor<1024x!tt.ptr, #blocked> + %11 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %12 = tt.addptr %11, %6 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %13 = tt.load %12, %8, %cst : tensor<1024x!tt.ptr, #blocked> + %14 = tt.splat %1 : i32 -> tensor<1024xi32, #blocked> + %15 = arith.addi %14, %3 : tensor<1024xi32, #blocked> + %16 = arith.cmpi slt, %15, %5 : tensor<1024xi32, #blocked> + %17 = tt.addptr %7, %15 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %18 = tt.load %17, %16, %cst : tensor<1024x!tt.ptr, #blocked> + %19 = tt.addptr %11, %15 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %20 = tt.load %19, %16, %cst : tensor<1024x!tt.ptr, #blocked> + %c1014752_i32 = arith.constant 1014752 : i32 + %c2_i32 = arith.constant 2 : i32 + %c1_i32 = arith.constant 1 : i32 + %c-1_i32 = arith.constant -1 : i32 + %c0_i32 = arith.constant 0 : i32 + %21 = tt.splat %arg2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %22:10 = scf.for %arg4 = %c0_i32 to %c1016800_i32 step %c1024_i32 iter_args(%arg5 = %c-1_i32, %arg6 = %c-1_i32, %arg7 = %20, %arg8 = %13, %arg9 = %18, %arg10 = %10, %arg11 = %15, %arg12 = %6, %arg13 = %16, %arg14 = %8) -> (i32, i32, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xi32, #blocked>, tensor<1024xi32, 
#blocked>, tensor<1024xi1, #blocked>, tensor<1024xi1, #blocked>) : i32 { + %23 = arith.cmpi slt, %arg4, %c1014752_i32 : i32 + %24 = arith.addi %arg4, %c2048_i32 : i32 + %25 = arith.addi %1, %24 : i32 + %26 = tt.splat %25 : i32 -> tensor<1024xi32, #blocked> + %27 = arith.addi %26, %3 : tensor<1024xi32, #blocked> + %28 = arith.cmpi slt, %27, %5 : tensor<1024xi32, #blocked> + %29 = tt.splat %23 : i1 -> tensor<1024xi1, #blocked> + %30 = arith.andi %29, %28 : tensor<1024xi1, #blocked> + %31 = tt.addptr %7, %27 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %32 = tt.load %31, %30, %cst : tensor<1024x!tt.ptr, #blocked> + %33 = tt.splat %23 : i1 -> tensor<1024xi1, #blocked> + %34 = arith.andi %33, %28 : tensor<1024xi1, #blocked> + %35 = tt.addptr %11, %27 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %36 = tt.load %35, %34, %cst : tensor<1024x!tt.ptr, #blocked> + %37 = arith.addi %arg5, %c1_i32 : i32 + %38 = arith.cmpi slt, %37, %c2_i32 : i32 + %39 = arith.select %38, %37, %c0_i32 : i32 + %40 = arith.addi %arg6, %c1_i32 : i32 + %41 = arith.cmpi slt, %40, %c2_i32 : i32 + %42 = arith.select %41, %40, %c0_i32 : i32 + %43 = arith.addf %arg7, %arg9 : tensor<1024xf32, #blocked> + %44 = tt.addptr %21, %arg11 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + tt.store %44, %43, %arg13 : tensor<1024x!tt.ptr, #blocked> + scf.yield %39, %42, %arg8, %36, %arg10, %32, %arg12, %27, %arg14, %28 : i32, i32, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xi32, #blocked>, tensor<1024xi32, #blocked>, tensor<1024xi1, #blocked>, tensor<1024xi1, #blocked> + } + tt.return + } +} diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp index f9fac1bf5b0d..a7d7a9783c7b 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp +++ 
b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp @@ -26,14 +26,59 @@ using namespace mlir; static bool willIncreaseRegisterPressure(Operation *op) { if (isa(op)) return true; - auto cvt = dyn_cast(op); - if (!cvt) - return false; - if (isa(cvt.getType().getEncoding())) - return true; + if (auto cvt = dyn_cast(op)) + return isa( + cvt.getType().getEncoding()); + return false; +} + +static bool isDescendent(Operation *op, Block *block) { + Block *b = op->getBlock(); + while (b != nullptr) { + if (b == block) + return true; + b = b->getParentOp()->getBlock(); + } return false; } +static bool gatherDFG(Operation *op, Block *block, + SmallVector &dfg) { + // BFS (filo) + SmallVector oprs; + bool leadsToLoad = false; + for (auto operand : op->getOperands()) { + if (Operation *pop = operand.getDefiningOp()) { + if (isDescendent(pop, block)) { + // only move ops that reside in same block + if (pop->getBlock() == block) + dfg.push_back(pop); + oprs.push_back(pop); + leadsToLoad |= isa(pop); + } else { + // only operands from current block or ancestor + assert(isDescendent(block->getParentOp(), pop->getBlock())); + } + } + } + // check sub-regions + for (auto &subregion : op->getRegions()) { + for (auto &subblock : subregion) { + for (auto &sop : subblock) { + if (gatherDFG(&sop, block, dfg)) + leadsToLoad = true; + } + } + } + + // process next level ops + for (auto *op : oprs) { + if (gatherDFG(op, block, dfg)) + leadsToLoad = true; + } + return leadsToLoad; +} + class TritonAMDGPUReorderInstructionsPass : public TritonAMDGPUReorderInstructionsBase< TritonAMDGPUReorderInstructionsPass> { @@ -52,36 +97,53 @@ class TritonAMDGPUReorderInstructionsPass m.walk([&](Operation *op) { if (!willIncreaseRegisterPressure(op)) return; - auto user_begin = op->user_begin(); - auto user_end = op->user_end(); - if (std::distance(user_begin, user_end) != 1) + if (!op->hasOneUse()) return; - if (user_begin->getParentOfType() == + Operation *user = 
op->getUses().begin()->getOwner(); + if (user->getParentOfType() == op->getParentOfType()) return; - opToMove.insert({op, *user_begin}); + opToMove.insert({op, user}); }); for (auto &kv : opToMove) kv.first->moveBefore(kv.second); + opToMove.clear(); // Move LocalLoadOp and LocalAllocOp immediately after their operands. m.walk([&](Operation *op) { - if (!isa(op)) { + if (!isa(op) || + op->getNumOperands() < 1) { return; } - Operation *argOp = op->getOperand(0).getDefiningOp(); - if (!argOp) - return; - moveAfter(op, argOp); + if (Operation *argOp = op->getOperand(0).getDefiningOp()) + moveAfter(op, argOp); }); // Move transpositions just after their definition - opToMove.clear(); m.walk([&](triton::TransOp op) { Operation *argOp = op.getSrc().getDefiningOp(); if (!argOp) return; moveAfter(op, argOp); }); - return; + SmallVector moveOps; + m.walk([&](triton::gpu::LocalStoreOp op) { + // Move local stores early if it's global load is outside loop + moveOps.push_back(op); + }); + m.walk([&](triton::LoadOp op) { + // Move global loads early (prefetch) + moveOps.push_back(op); + }); + for (auto op : moveOps) { + // 0. gather DFG + Block *block = op->getBlock(); + SmallVector dfg{op}; + bool leadsToLoad = gatherDFG(op, block, dfg); + if (!isa(op) || !leadsToLoad) { + // 1. 
move to beginning of enclosing block + for (auto *op : dfg) + op->moveAfter(block, block->begin()); + } + } } }; From 047c2c146358f9eddd86745626a08390159131c1 Mon Sep 17 00:00:00 2001 From: SJW Date: Mon, 17 Jun 2024 16:17:54 +0000 Subject: [PATCH 03/36] * consolidated/fixed stream-pipeliner tests --- test/TritonGPU/amd/amd-loop-pipeline.mlir | 1636 ------------------ test/TritonGPU/amd/amd-stream-pipeline.mlir | 1670 ++++++++++++++++++- 2 files changed, 1631 insertions(+), 1675 deletions(-) delete mode 100644 test/TritonGPU/amd/amd-loop-pipeline.mlir diff --git a/test/TritonGPU/amd/amd-loop-pipeline.mlir b/test/TritonGPU/amd/amd-loop-pipeline.mlir deleted file mode 100644 index fbad2dc50ce5..000000000000 --- a/test/TritonGPU/amd/amd-loop-pipeline.mlir +++ /dev/null @@ -1,1636 +0,0 @@ -// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline=num_stages=2 | FileCheck %s - -// 4 warps -// matmul: 128x32 @ 32x128 -> 128x128 -#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> -#ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> -#BLs0 = #triton_gpu.slice<{parent=#BL, dim=0}> -#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> -#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> -#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> -#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> - -// CHECK-LABEL: tt.func @matmul_loop -// CHECK: %[[LOCAL_ALLOC_10:.*]] = triton_gpu.local_alloc -// CHECK: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc -// CHECK: %[[CMPI_12:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_12]] -// CHECK: %[[LOAD_14:.*]] = tt.load %{{.*}}, %[[SPLAT_13]] -// CHECK: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_12]] -// CHECK: %[[LOAD_16:.*]] = tt.load %{{.*}}, %[[SPLAT_15]], %{{.*}} -// CHECK: 
%[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_17]] -// CHECK: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_16]], %[[MEMDESC_SUBVIEW_18]] -// CHECK: %{{.*}}:7 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_18]]) - -// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_20]] -// CHECK: %[[ADDI_22:.*]] = arith.addi %[[ARG9]], %{{.*}} -// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_22]], %{{.*}} -// CHECK: %[[SELECT_24:.*]] = arith.select %[[CMPI_23]], %[[ADDI_22]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_25:.*]] = triton_gpu.local_load %[[ARG11]] -// CHECK: %[[CONVERT_LAYOUT_26:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_25]] -// CHECK: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] -// CHECK: %[[CONVERT_LAYOUT_28:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_27]] -// CHECK: %[[MULF_29:.*]] = arith.mulf %[[CONVERT_LAYOUT_28]], %{{.*}} -// CHECK: %[[DOT_30:.*]] = tt.dot %[[CONVERT_LAYOUT_26]], %[[MULF_29]], %[[ARG8]] -// CHECK: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG6]], %{{.*}} -// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG7]], %{{.*}} -// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_21]] -// CHECK: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]] -// CHECK: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_21]] -// CHECK: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} -// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG10]], %{{.*}} -// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// CHECK: %[[SELECT_39:.*]] = arith.select 
%[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] -// CHECK: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] -// CHECK: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_24]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] -// CHECK: } - -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_10]] -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] - -module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.target" = "cuda:80"} { -tt.func @matmul_loop(%lb : index, %ub : index, %step : index, - %A : !tt.ptr {tt.divisibility = 16 : i32}, - %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { - // A ptrs - %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> - %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> - %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> - %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> - %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - // B ptrs - %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> - %b_tmp0 = tt.make_range {end = 128: i32, start = 0: i32} : tensor<128xi32, #BLs0> - %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> - %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> - %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> - - - %a_mask = arith.constant dense : tensor<128x32xi1, #AL> - %a_other = 
arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> - %b_mask = arith.constant dense : tensor<32x128xi1, #BL> - %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> - %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> - - %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> - %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> - - %b_scale = arith.constant dense<4.> : tensor<32x128xf16, #B> - - %loop:3 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { - %a_ = tt.load %a_ptr : tensor<128x32x!tt.ptr, #AL> - %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> - %b__ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> - %b_ = triton_gpu.convert_layout %b__ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> - %b = arith.mulf %b_, %b_scale: tensor<32x128xf16, #B> - - %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> - - %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> - scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> - } - tt.return %loop#2: tensor<128x128xf32, #C> -} - -// CHECK-LABEL: tt.func @matmul_loop_nested -// CHECK: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc -// CHECK: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc -// CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] -// CHECK: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} -// CHECK: %[[SPLAT_16:.*]] = tt.splat %[[CMPI_13]] -// CHECK: %[[LOAD_17:.*]] = tt.load %{{.*}}, %[[SPLAT_16]], %{{.*}} -// CHECK: 
%[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_18]] -// CHECK: %[[MEMDESC_SUBVIEW_19:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_17]], %[[MEMDESC_SUBVIEW_19]] -// CHECK: %{{.*}}:7 = scf.for %[[ARG7:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_19]]) - -// CHECK: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG7]], %[[SUBI_21]] -// CHECK: %[[ADDI_23:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_24:.*]] = arith.cmpi slt, %[[ADDI_23]], %{{.*}} -// CHECK: %[[SELECT_25:.*]] = arith.select %[[CMPI_24]], %[[ADDI_23]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_26:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[CONVERT_LAYOUT_27:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_26]] -// CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG14]] -// CHECK: %[[CONVERT_LAYOUT_29:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_28]] -// CHECK: %[[DOT_30:.*]] = tt.dot %[[CONVERT_LAYOUT_27]], %[[CONVERT_LAYOUT_29]], %[[ARG10]] -// CHECK: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_22]] -// CHECK: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]], %{{.*}} -// CHECK: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_22]] -// CHECK: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} -// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} -// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// CHECK: 
%[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] -// CHECK: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] -// CHECK: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_25]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] -// CHECK: } -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] -// CHECK: scf.yield %{{.*}}#2 -// CHECK: } -tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index, - %A : !tt.ptr {tt.divisibility = 16 : i32}, - %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C>{ - - %c_start = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> - %loop1:1 = scf.for %iv0 = %lb to %ub step %step iter_args(%c_init = %c_start) -> (tensor<128x128xf32, #C>) { - // A ptrs - %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> - %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> - %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> - %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> - %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - // B ptrs - %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> - %b_tmp0 = tt.make_range {end = 128: i32, start = 0: i32} : tensor<128xi32, #BLs0> - %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> - %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> - %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> - - %a_mask = arith.constant 
dense : tensor<128x32xi1, #AL> - %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> - %b_mask = arith.constant dense : tensor<32x128xi1, #BL> - %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> - - %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> - %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> - - %loop2:3 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { - %a_ = tt.load %a_ptr, %a_mask, %a_other : tensor<128x32x!tt.ptr, #AL> - %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> - %b_ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> - %b = triton_gpu.convert_layout %b_ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> - - %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> - - %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> - scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> - } - - scf.yield %loop2#2 : tensor<128x128xf32, #C> - } - tt.return %loop1#0 : tensor<128x128xf32, #C> -} - -// CHECK-LABEL: tt.func @matmul_loop_single_pipeline -// CHECK: %[[LOAD_10:.*]] = tt.load %{{.*}}, %{{.*}}, %{{.*}} -// CHECK: %[[CONVERT_LAYOUT_11:.*]] = triton_gpu.convert_layout %[[LOAD_10]] -// CHECK: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc -// CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] -// CHECK: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: 
triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_16]] -// CHECK: %{{.*}}:5 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %[[MEMDESC_SUBVIEW_16]]) -// CHECK: %[[SUBI_18:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_19:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_18]] -// CHECK: %[[ADDI_20:.*]] = arith.addi %[[ARG8]], %{{.*}} -// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ADDI_20]], %{{.*}} -// CHECK: %[[SELECT_22:.*]] = arith.select %[[CMPI_21]], %[[ADDI_20]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG10]] -// CHECK: %[[CONVERT_LAYOUT_24:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_23]] -// CHECK: %[[DOT_25:.*]] = tt.dot %[[CONVERT_LAYOUT_11]], %[[CONVERT_LAYOUT_24]], %[[ARG7]] -// CHECK: %[[ADDPTR_26:.*]] = tt.addptr %[[ARG6]], %{{.*}} -// CHECK: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_19]] -// CHECK: %[[LOAD_28:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_27]], %{{.*}} -// CHECK: %[[ADDI_29:.*]] = arith.addi %[[ARG9]], %{{.*}} -// CHECK: %[[CMPI_30:.*]] = arith.cmpi slt, %[[ADDI_29]], %{{.*}} -// CHECK: %[[SELECT_31:.*]] = arith.select %[[CMPI_30]], %[[ADDI_29]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_32:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_31]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_28]], %[[MEMDESC_SUBVIEW_32]] -// CHECK: scf.yield %[[ADDPTR_26]], %[[DOT_25]], %[[SELECT_22]], %[[SELECT_31]], %[[MEMDESC_SUBVIEW_32]] -// CHECK: } -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] -tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, - %A : !tt.ptr {tt.divisibility = 16 : i32}, - %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { - // A ptrs - %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> - %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> - 
%a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> - %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> - %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - // B ptrs - %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> - %b_tmp0 = tt.make_range {end = 128: i32, start = 0: i32} : tensor<128xi32, #BLs0> - %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> - %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> - %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> - - %a_mask = arith.constant dense : tensor<128x32xi1, #AL> - %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> - - %a_ = tt.load %a_ptr_init, %a_mask, %a_other : tensor<128x32x!tt.ptr, #AL> - %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> - - %b_mask = arith.constant dense : tensor<32x128xi1, #BL> - %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> - %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> - - %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> - - %loop:2 = scf.for %iv = %lb to %ub step %step iter_args(%b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { - %b_ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> - %b = triton_gpu.convert_layout %b_ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> - %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> - %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> - scf.yield %next_b_ptr, %c : tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> - } - tt.return %loop#1 : tensor<128x128xf32, #C> -} - -// CHECK-LABEL: tt.func 
@indirect_bmm_scalar -// CHECK: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc -// CHECK: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc -// CHECK: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] -// CHECK: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] -// CHECK: %[[LOAD_5:.*]] = tt.load %{{.*}}, %[[CMPI_2]] -// CHECK: %[[MULI_6:.*]] = arith.muli %{{.*}}, %[[LOAD_5]] -// CHECK: %[[SPLAT_7:.*]] = tt.splat %[[MULI_6]] -// CHECK: %[[ADDPTR_8:.*]] = tt.addptr %{{.*}}, %[[SPLAT_7]] -// CHECK: %[[SPLAT_9:.*]] = tt.splat %[[CMPI_2]] -// CHECK: %[[LOAD_10:.*]] = tt.load %[[ADDPTR_8]], %[[SPLAT_9]] -// CHECK: %[[CMPI_11:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// CHECK: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} -// CHECK: %[[ADDPTR_13:.*]] = tt.addptr %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_11]] -// CHECK: %[[LOAD_15:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_14]] -// CHECK: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_13]], %[[CMPI_11]] -// CHECK: %[[MULI_17:.*]] = arith.muli %{{.*}}, %[[LOAD_16]] -// CHECK: %[[SPLAT_18:.*]] = tt.splat %[[MULI_17]] -// CHECK: %[[ADDPTR_19:.*]] = tt.addptr %{{.*}}, %[[SPLAT_18]] -// CHECK: %[[SPLAT_20:.*]] = tt.splat %[[CMPI_11]] -// CHECK: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_19]], %[[SPLAT_20]] -// CHECK: %[[MEMDESC_SUBVIEW_22:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_4]], %[[MEMDESC_SUBVIEW_22]] -// CHECK: %[[MEMDESC_SUBVIEW_23:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_10]], %[[MEMDESC_SUBVIEW_23]] -// CHECK: %{{.*}}:9 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[ADDPTR_12]], %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_22]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_23]], 
%[[ARG14:.*]] = %[[LOAD_15]], %[[ARG15:.*]] = %[[LOAD_21]]) - -// CHECK: %[[SUBI_25:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] -// CHECK: %[[ADDI_27:.*]] = arith.addi %[[ARG10]], %{{.*}} -// CHECK: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} -// CHECK: %[[SELECT_29:.*]] = arith.select %[[CMPI_28]], %[[ADDI_27]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_30:.*]] = triton_gpu.local_load %[[ARG12]] -// CHECK: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[CONVERT_LAYOUT_32:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_30]] -// CHECK: %[[CONVERT_LAYOUT_33:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_31]] -// CHECK: %[[DOT_34:.*]] = tt.dot %[[CONVERT_LAYOUT_32]], %[[CONVERT_LAYOUT_33]], %[[ARG7]] -// CHECK: %[[ADDPTR_35:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: %[[SPLAT_37:.*]] = tt.splat %[[CMPI_26]] -// CHECK: %[[LOAD_38:.*]] = tt.load %[[ADDPTR_35]], %[[SPLAT_37]] -// CHECK: %[[LOAD_39:.*]] = tt.load %[[ADDPTR_36]], %[[CMPI_26]] -// CHECK: %[[MULI_40:.*]] = arith.muli %{{.*}}, %[[LOAD_39]] -// CHECK: %[[SPLAT_41:.*]] = tt.splat %[[MULI_40]] -// CHECK: %[[ADDPTR_42:.*]] = tt.addptr %{{.*}}, %[[SPLAT_41]] -// CHECK: %[[SPLAT_43:.*]] = tt.splat %[[CMPI_26]] -// CHECK: %[[LOAD_44:.*]] = tt.load %[[ADDPTR_42]], %[[SPLAT_43]] -// CHECK: %[[ADDI_45:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_46:.*]] = arith.cmpi slt, %[[ADDI_45]], %{{.*}} -// CHECK: %[[SELECT_47:.*]] = arith.select %[[CMPI_46]], %[[ADDI_45]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_47]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_48]] -// CHECK: %[[MEMDESC_SUBVIEW_49:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_47]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[ARG15]], %[[MEMDESC_SUBVIEW_49]] -// CHECK: 
scf.yield %[[DOT_34]], %[[ADDPTR_35]], %[[ADDPTR_36]], %[[SELECT_29]], %[[SELECT_47]], %[[MEMDESC_SUBVIEW_48]], %[[MEMDESC_SUBVIEW_49]], %[[LOAD_38]], %[[LOAD_44]] -// CHECK: } - -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] - -tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32}, - %76: index, - %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, - %75: !tt.ptr, - %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, - %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ - %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> - %c4_i32 = arith.constant 4 : i32 - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i32 = arith.constant 1 : i32 - %79:3 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, !tt.ptr) { - %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> - %83 = tt.load %arg21 : !tt.ptr - %84 = arith.muli %77, %83 : i64 - %85 = tt.splat %84 : i64 -> tensor<16x16xi64, #BL> - %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> - %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> - %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> - %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> - %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> - %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> - %92 = tt.addptr %arg21, %c1_i32 : !tt.ptr, i32 - scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, !tt.ptr - } {tt.num_stages = 3 : i32} - tt.return %79#0 : tensor<16x16xf32, #C> -} - -// CHECK-LABEL: tt.func @indirect_bmm_scalar_dist_one 
-// CHECK: %[[LOAD_0:.*]] = tt.load %{{.*}} -// CHECK: %[[ADDPTR_1:.*]] = tt.addptr %{{.*}}, %{{.*}} -// CHECK: %[[LOCAL_ALLOC_2:.*]] = triton_gpu.local_alloc -// CHECK: %[[LOCAL_ALLOC_3:.*]] = triton_gpu.local_alloc -// CHECK: %[[CMPI_4:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_5:.*]] = tt.splat %[[CMPI_4]] -// CHECK: %[[LOAD_6:.*]] = tt.load %{{.*}}, %[[SPLAT_5]] -// CHECK: %[[LOAD_7:.*]] = tt.load %[[ADDPTR_1]], %[[CMPI_4]] -// CHECK: %[[MULI_8:.*]] = arith.muli %{{.*}}, %[[LOAD_0]] -// CHECK: %[[SPLAT_9:.*]] = tt.splat %[[MULI_8]] -// CHECK: %[[ADDPTR_10:.*]] = tt.addptr %{{.*}}, %[[SPLAT_9]] -// CHECK: %[[SPLAT_11:.*]] = tt.splat %[[CMPI_4]] -// CHECK: %[[LOAD_12:.*]] = tt.load %[[ADDPTR_10]], %[[SPLAT_11]] -// CHECK: %[[ADDPTR_13:.*]] = tt.addptr %[[ADDPTR_1]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_14:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_6]], %[[MEMDESC_SUBVIEW_14]] -// CHECK: %[[MEMDESC_SUBVIEW_15:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_12]], %[[MEMDESC_SUBVIEW_15]] -// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %[[LOAD_7]], %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_14]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_15]]) - -// CHECK: %[[SUBI_17:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_17]] -// CHECK: %[[ADDI_19:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ADDI_19]], %{{.*}} -// CHECK: %[[SELECT_21:.*]] = arith.select %[[CMPI_20]], %[[ADDI_19]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_22:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG14]] -// CHECK: 
%[[CONVERT_LAYOUT_24:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_22]] -// CHECK: %[[CONVERT_LAYOUT_25:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_23]] -// CHECK: %[[DOT_26:.*]] = tt.dot %[[CONVERT_LAYOUT_24]], %[[CONVERT_LAYOUT_25]], %[[ARG7]] -// CHECK: %[[ADDPTR_27:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[SPLAT_28:.*]] = tt.splat %[[CMPI_18]] -// CHECK: %[[LOAD_29:.*]] = tt.load %[[ADDPTR_27]], %[[SPLAT_28]] -// CHECK: %[[LOAD_30:.*]] = tt.load %[[ARG9]], %[[CMPI_18]] -// CHECK: %[[MULI_31:.*]] = arith.muli %{{.*}}, %[[ARG10]] -// CHECK: %[[SPLAT_32:.*]] = tt.splat %[[MULI_31]] -// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %{{.*}}, %[[SPLAT_32]] -// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_18]] -// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_34]] -// CHECK: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} -// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_29]], %[[MEMDESC_SUBVIEW_40]] -// CHECK: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_41]] -// CHECK: scf.yield %[[DOT_26]], %[[ADDPTR_27]], %[[ADDPTR_36]], %[[LOAD_30]], %[[SELECT_21]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] -// CHECK: } -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_2]] -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_3]] - -tt.func @indirect_bmm_scalar_dist_one(%77: i64 {tt.divisibility=16: i32}, - %76: index, - %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, - %75: !tt.ptr, - %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, - %60: 
tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ - %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> - %c4_i32 = arith.constant 4 : i32 - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i32 = arith.constant 1 : i32 - %50 = tt.load %75 : !tt.ptr - %51 = tt.addptr %75, %c1_i32 : !tt.ptr, i32 - %79:4 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %51, %arg22 = %50) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, !tt.ptr, i64) { - %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> - %83 = tt.load %arg21 : !tt.ptr - %84 = arith.muli %77, %arg22 : i64 - %85 = tt.splat %84 : i64 -> tensor<16x16xi64, #BL> - %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> - %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> - %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> - %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> - %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> - %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> - %92 = tt.addptr %arg21, %c1_i32 : !tt.ptr, i32 - scf.yield %90, %91, %92, %83 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, !tt.ptr, i64 - } - tt.return %79#0 : tensor<16x16xf32, #C> -} - -// CHECK-LABEL: tt.func @indirect_bmm_vector -// CHECK: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc -// CHECK: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc -// CHECK: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] -// CHECK: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] -// CHECK: %[[CMPI_5:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// CHECK: %[[ADDPTR_6:.*]] = tt.addptr %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_7:.*]] = tt.splat %[[CMPI_2]] -// CHECK: %[[LOAD_8:.*]] = 
tt.load %{{.*}}, %[[SPLAT_7]] -// CHECK: %[[EXPAND_DIMS_9:.*]] = tt.expand_dims %[[LOAD_4]] {axis = 1 : i32} -// CHECK: %[[BROADCAST_10:.*]] = tt.broadcast %[[EXPAND_DIMS_9]] -// CHECK: %[[MULI_11:.*]] = arith.muli %{{.*}}, %[[BROADCAST_10]] -// CHECK: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %[[MULI_11]] -// CHECK: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_2]] -// CHECK: %[[LOAD_14:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_13]] -// CHECK: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_5]] -// CHECK: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_6]], %[[SPLAT_15]] -// CHECK: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_8]], %[[MEMDESC_SUBVIEW_17]] -// CHECK: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_18]] -// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_6]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[LOAD_16]]) - -// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] -// CHECK: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] -// CHECK: %[[ADDI_24:.*]] = arith.addi %[[ARG10]], %{{.*}} -// CHECK: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} -// CHECK: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] -// CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[CONVERT_LAYOUT_29:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_27]] -// CHECK: %[[CONVERT_LAYOUT_30:.*]] = triton_gpu.convert_layout 
%[[LOCAL_LOAD_28]] -// CHECK: %[[DOT_31:.*]] = tt.dot %[[CONVERT_LAYOUT_29]], %[[CONVERT_LAYOUT_30]], %[[ARG7]] -// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] -// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] -// CHECK: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} -// CHECK: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] -// CHECK: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] -// CHECK: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] -// CHECK: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] -// CHECK: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] -// CHECK: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] -// CHECK: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] -// CHECK: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} -// CHECK: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_46]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] -// CHECK: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_46]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] -// CHECK: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] -// CHECK: } -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] - -tt.func @indirect_bmm_vector(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, - %76: index, - %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, - %75: tensor<16x!tt.ptr, #BLs1>, - %78: 
tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, - %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ - %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> - %c4_i32 = arith.constant 4 : i32 - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i32 = arith.constant 1 : i32 - %c1_i32_splat = tt.splat %c1_i32 : i32 -> tensor<16xi32, #BLs1> - %79:3 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1>) { - %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> - %83 = tt.load %arg21 : tensor<16x!tt.ptr, #BLs1> - %84 = tt.expand_dims %83 {axis=1: i32}: tensor<16xi64, #BLs1> -> tensor<16x1xi64, #BL> - %850 = tt.broadcast %84 : tensor<16x1xi64, #BL> -> tensor<16x16xi64, #BL> - %85 = arith.muli %77, %850 : tensor<16x16xi64, #BL> - %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> - %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> - %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> - %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> - %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> - %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> - %92 = tt.addptr %arg21, %c1_i32_splat : tensor<16x!tt.ptr, #BLs1>, tensor<16xi32, #BLs1> - scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1> - } {tt.num_stages = 3 : i32} - tt.return %79#0 : tensor<16x16xf32, #C> -} - -// CHECK-LABEL: tt.func @post_load_inv -// CHECK: scf.for -// CHECK-DAG: %[[IV:.*]] = arith.index_cast -// CHECK: %[[NEXT_IV:.*]] = arith.addi %[[IV]], %c1_i32 : i32 -// CHECK: arith.index_cast -// CHECK-NOT: arith.addi %[[NEXT_IV]] 
-tt.func @post_load_inv(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, - %arg1: !tt.ptr {tt.divisibility = 16 : i32}, - %arg2: !tt.ptr {tt.divisibility = 16 : i32}, - %arg3: i32 {tt.divisibility = 16 : i32}, - %arg4: i32 {tt.divisibility = 16 : i32}, - %arg5: i32 {tt.divisibility = 16 : i32}, - %arg6: i32 {tt.divisibility = 16 : i32}, - %arg7: i32 {tt.divisibility = 16 : i32}, - %arg8: i32 {tt.divisibility = 16 : i32}) -> tensor<32x32xf32, #C> { - %c0_index = arith.constant 0 : index - %c1_index = arith.constant 1 : index - %c1_i32 = arith.constant 1 : i32 - %c32_i32 = arith.constant 32 : i32 - %84 = arith.constant 900 : index - %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #C> - %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #AL> - %50 = tt.splat %arg3 : i32 -> tensor<1x32xi32, #AL> - %59 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %81 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %66 = tt.splat %arg4 : i32 -> tensor<32x1xi32, #AL> - %60 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %82 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %85:3 = scf.for %arg9 = %c0_index to %84 step %c1_index iter_args(%arg10 = %cst, %arg11 = %59, %arg12 = %81) -> (tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>) { - %130 = arith.index_cast %arg9 : index to i32 - %107 = arith.muli %130, %c32_i32 : i32 - %108 = arith.subi %arg5, %107 : i32 - %109 = tt.splat %108 : i32 -> tensor<1x32xi32, #AL> - %110 = arith.cmpi "slt", %50, %109 : tensor<1x32xi32, #AL> - %111 = tt.broadcast %110 : tensor<1x32xi1, #AL> -> tensor<32x32xi1, #AL> - %112 = tt.load %arg11, %111, %cst_0 : tensor<32x32x!tt.ptr, #AL> - %113 = tt.splat %108 : i32 -> tensor<32x1xi32, #AL> - %114 = arith.cmpi "slt", %66, %113 : tensor<32x1xi32, #AL> - %115 = tt.broadcast %114 : tensor<32x1xi1, #AL> -> tensor<32x32xi1, #AL> - %116 = tt.load %arg12, %115, %cst_0 : tensor<32x32x!tt.ptr, #AL> - %117 = 
triton_gpu.convert_layout %112 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> - %118 = triton_gpu.convert_layout %116 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 1}>> - %119 = tt.dot %117, %118, %arg10 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 1}>> -> tensor<32x32xf32, #C> - %131 = arith.index_cast %arg9 : index to i32 - %120 = arith.addi %131, %c1_i32 : i32 - %121 = arith.muli %120, %c32_i32 : i32 - %122 = tt.splat %121 : i32 -> tensor<32x32xi32, #AL> - %123 = tt.addptr %60, %122 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> - %124 = arith.muli %121, %arg7 : i32 - %125 = tt.splat %124 : i32 -> tensor<32x32xi32, #AL> - %126 = tt.addptr %82, %125 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> - scf.yield %119, %123, %126 : tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL> - } - tt.return %85#0 : tensor<32x32xf32, #C> -} - -// CHECK-LABEL: tt.func @cross_iter_dep -// TODO: enable pipelining with distance of 2 -// CHECK-NOT: triton_gpu.local_load -// CHECK: scf.for -// CHECK: scf.yield -tt.func @cross_iter_dep(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, - %arg1: !tt.ptr {tt.divisibility = 16 : i32}, - %arg2: !tt.ptr {tt.divisibility = 16 : i32}, - %arg3: i32 {tt.divisibility = 16 : i32}, - %arg4: i32 {tt.divisibility = 16 : i32}, - %arg5: i32 {tt.divisibility = 16 : i32}, - %arg6: i32 {tt.divisibility = 16 : i32}, - %arg7: i32 {tt.divisibility = 16 : i32}, - %arg8: i32 {tt.divisibility = 16 : i32}) -> tensor<32x32xf32, #C> { - %c0_i32 = arith.constant 0 : index - %118 = arith.constant 32 : index - %c1_i32 = arith.constant 1 : index - %c2_i32 = arith.constant 2 : i32 - %c32_i32 = arith.constant 32 : i32 - %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #C> - %cst_1 = arith.constant 
dense<0.000000e+00> : tensor<32x32xf32, #AL> - %78 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %110 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %112 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %113 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %116 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %65 = tt.splat %arg3 : i32 -> tensor<1x32xi32, #AL> - %88 = tt.splat %arg4 : i32 -> tensor<32x1xi32, #AL> - %80 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %119:5 = scf.for %arg9 = %c0_i32 to %118 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %78, %arg12 = %110, %arg13 = %113, %arg14 = %116) -> (tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>) { - %161 = arith.index_cast %arg9 : index to i32 - %141 = arith.muli %161, %c32_i32 : i32 - %142 = arith.subi %arg5, %141 : i32 - %143 = tt.splat %142 : i32 -> tensor<1x32xi32, #AL> - %144 = arith.cmpi "slt", %65, %143 : tensor<1x32xi32, #AL> - %145 = tt.broadcast %144 : tensor<1x32xi1, #AL> -> tensor<32x32xi1, #AL> - %146 = tt.load %arg11, %145, %cst_1 : tensor<32x32x!tt.ptr, #AL> - %147 = tt.splat %142 : i32 -> tensor<32x1xi32, #AL> - %148 = arith.cmpi "slt", %88, %147 : tensor<32x1xi32, #AL> - %149 = tt.broadcast %148 : tensor<32x1xi1, #AL> -> tensor<32x32xi1, #AL> - %150 = tt.load %arg12, %149, %cst_1 : tensor<32x32x!tt.ptr, #AL> - %151 = triton_gpu.convert_layout %146 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> - %152 = triton_gpu.convert_layout %150 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 1}>> - %153 = tt.dot %151, %152, %arg10 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 1}>> -> tensor<32x32xf32, #C> - %162 = arith.index_cast %arg9 : index 
to i32 - %154 = arith.addi %162, %c2_i32 : i32 - %155 = arith.muli %154, %c32_i32 : i32 - %156 = tt.splat %155 : i32 -> tensor<32x32xi32, #AL> - %157 = tt.addptr %80, %156 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> - %158 = arith.muli %155, %arg7 : i32 - %159 = tt.splat %158 : i32 -> tensor<32x32xi32, #AL> - %160 = tt.addptr %112, %159 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> - scf.yield %153, %arg13, %arg14, %157, %160 : tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL> - } - tt.return %119#0 : tensor<32x32xf32, #C> -} - -// CHECK-LABEL: tt.func @dep_arg_two_uses -// CHECK: tt.expand_dims -// CHECK: tt.expand_dims -// CHECK: tt.expand_dims %arg5 -// CHECK-NEXT: tt.expand_dims %arg5 -// CHECK: %[[PTR0:.*]] = tt.splat %arg6 -// CHECK: %[[PTR1:.*]] = tt.addptr %[[PTR0]] -// CHECK-NEXT: tt.load %[[PTR1]] -tt.func @dep_arg_two_uses(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, - %arg1: !tt.ptr {tt.divisibility = 16 : i32}, - %arg2: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { - %23 = arith.constant 100 : index - %c64 = arith.constant 64 : i64 - %56 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %57 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %58 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #BL}>> - %83 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %85 = tt.splat %c64 : i64 -> tensor<1x32xi64, #AL> - %86 = tt.splat %c64 : i64 -> tensor<1x32xi64, #AL> - %68 = tt.splat %arg0 : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> - %c32_index = arith.constant 32 : index - %c32_i32 = arith.index_cast %c32_index : index to i32 - %80 = tt.splat %arg2 : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> - %cst_6 = 
arith.constant dense<0.000000e+00> : tensor<32x128xf32, #BL> - %88 = arith.truncf %cst_6 : tensor<32x128xf32, #BL> to tensor<32x128xf16, #BL> - %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #C> - %90 = tt.splat %c64 : i64 -> tensor<32x128xi64, #BL> - %92 = tt.addptr %arg1, %c32_i32 : !tt.ptr, i32 - %c0_index = arith.constant 0 : index - %91:5 = scf.for %arg19 = %c0_index to %23 step %c32_index iter_args(%arg20 = %68, %arg21 = %83, %arg22 = %92, %arg23 = %cst, %arg24 = %80) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>>, !tt.ptr, tensor<128x128xf32, #C>, tensor<32x128x!tt.ptr, #BL>) { - %1750 = arith.subi %23, %arg19 : index - %175 = arith.index_cast %1750 : index to i32 - %176 = tt.splat %175 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %177 = tt.splat %175 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #BL}>> - %178 = arith.cmpi "slt", %57, %176 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %179 = arith.cmpi "slt", %58, %177 : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #BL}>> - %180 = tt.expand_dims %178 {axis = 0 : i32} : tensor<32xi1, #triton_gpu.slice<{dim = 0, parent = #AL}>> -> tensor<1x32xi1, #AL> - %181 = tt.expand_dims %179 {axis = 1 : i32} : tensor<32xi1, #triton_gpu.slice<{dim = 1, parent = #BL}>> -> tensor<32x1xi1, #BL> - %182 = tt.expand_dims %arg21 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> -> tensor<1x32xi32, #AL> - %183 = tt.expand_dims %arg21 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> -> tensor<1x32xi32, #AL> - %184 = arith.extsi %182 : tensor<1x32xi32, #AL> to tensor<1x32xi64, #AL> - %185 = arith.extsi %183 : tensor<1x32xi32, #AL> to tensor<1x32xi64, #AL> - %186 = arith.muli %184, %85 : tensor<1x32xi64, #AL> - %187 = arith.muli %185, %86 : tensor<1x32xi64, #AL> - %188 = tt.broadcast %186 : tensor<1x32xi64, #AL> -> tensor<128x32xi64, #AL> - %189 = 
tt.broadcast %187 : tensor<1x32xi64, #AL> -> tensor<128x32xi64, #AL> - %190 = tt.addptr %arg20, %188 : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi64, #AL> - %191 = tt.addptr %arg20, %189 : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi64, #AL> - %192 = tt.broadcast %180 : tensor<1x32xi1, #AL> -> tensor<128x32xi1, #AL> - %193 = tt.load %191, %192 : tensor<128x32x!tt.ptr, #AL> - %194 = tt.splat %arg22 : !tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %195 = tt.addptr %194, %56 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #AL}>>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %196 = tt.load %195 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %197 = tt.addptr %arg22, %c32_i32 : !tt.ptr, i32 - %198 = tt.broadcast %181 : tensor<32x1xi1, #BL> -> tensor<32x128xi1, #BL> - %199 = tt.load %arg24, %198, %88 : tensor<32x128x!tt.ptr, #BL> - %200 = triton_gpu.convert_layout %193 : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 2}>> - %201 = triton_gpu.convert_layout %199 : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 2}>> - %202 = tt.dot %200, %201, %arg23 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 2}>> -> tensor<128x128xf32, #C> - %203 = tt.addptr %arg24, %90 : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi64, #BL> - scf.yield %190, %196, %197, %202, %203 : tensor<128x32x!tt.ptr, #AL>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>>, !tt.ptr, tensor<128x128xf32, #C>, tensor<32x128x!tt.ptr, #BL> - } - tt.return %91#3 : tensor<128x128xf32, #C> -} -} // end module - -// ----- - -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], 
warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { -// CHECK-LABEL: tt.func @load_two_users - tt.func @load_two_users(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { - %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> - %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> - %c0_i64 = arith.constant 0 : i64 - %c0_i32 = arith.constant 0 : i32 - %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> - %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %c8_i32 = arith.constant 8 : i32 - %0 = tt.addptr %arg0, %c0_i64 : !tt.ptr, i64 - %1 = tt.addptr %arg1, %c0_i64 : !tt.ptr, i64 - %2 = tt.splat %1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> - %3 = tt.addptr %2, %cst_0 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> - %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %6 = tt.broadcast %3 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> - %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> - %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %9 = tt.load %8 : tensor<128x64x!tt.ptr, 
#blocked1> - %10 = tt.splat %0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> - %11 = tt.addptr %10, %cst : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> - %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> - %14 = tt.broadcast %11 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> - %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> - %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - // CHECK: triton_gpu.local_store - // CHECK: scf.for - // CHECK: tt.dot - // CHECK: tt.dot - // CHECK: tt.load - // CHECK: triton_gpu.local_store - // CHECK: scf.yield - - %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 { - %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %20 = triton_gpu.convert_layout %18 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> - %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> - %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> - %25 = tt.trans %24 {order=array} : 
!tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> - %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> - scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> - } - tt.return %17#0, %17#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> - } -} - -// ----- - -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [0, 1], hasLeadingOffset = false}> -#shared1 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { -// CHECK-LABEL: tt.func @load_two_users_incompatible_layouts - tt.func @load_two_users_incompatible_layouts(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { - %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> - %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> - %c0_i64 = arith.constant 0 : i64 - %c0_i32 = arith.constant 0 : i32 - %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> - %cst_2 = arith.constant 
dense<0.000000e+00> : tensor<128x64xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %c8_i32 = arith.constant 8 : i32 - %0 = tt.addptr %arg0, %c0_i64 : !tt.ptr, i64 - %1 = tt.addptr %arg1, %c0_i64 : !tt.ptr, i64 - %2 = tt.splat %1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> - %3 = tt.addptr %2, %cst_0 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> - %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %6 = tt.broadcast %3 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> - %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> - %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %9 = tt.load %8 : tensor<128x64x!tt.ptr, #blocked1> - %10 = tt.splat %0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> - %11 = tt.addptr %10, %cst : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> - %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> - %14 = tt.broadcast %11 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> - %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> - %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - // CHECK-NOT: triton_gpu.local_store - // CHECK: scf.for - %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 { - %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> 
tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %20 = triton_gpu.convert_layout %18 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> - %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> - %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> - %25 = tt.trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> - %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> - scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> - } - tt.return %17#0, %17#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> - } -} - -// ----- - -// CHECK-LABEL: tt.func public @nested_loops -// CHECK: scf.for -// CHECK: triton_gpu.local_alloc -// CHECK-NOT: triton_gpu.local_alloc -// CHECK: scf.for -// CHECK: scf.yield -// CHECK-DIS: scf.yield -// -// The following code has the structure: -// -// ``` -// for { -// %a = load() -// for { -// %b = load() -// dot(%a, %b) -// } -// } -// ``` -// -// Only the outer for should be pipelined. 
The regression this tests -// causes an assertion to fail while pipelining the outer `for`, in -// particular while predicating the operations scheduled to be emitted -// in the prologue. -// -// We check that there is no allocation before the first occurrence of -// scf.for because that would mean that the first load `%a = load()` -// would be pipelined. -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { - tt.func public @nested_loops(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { - %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> - %cst_0 = arith.constant dense<320> : tensor<32x1xi32, #blocked> - %c0_i32 = arith.constant 0 : i32 - %c1_i32 = arith.constant 1 : i32 - %c32_i32 = arith.constant 32 : i32 - %c10_i32 = arith.constant 10 : i32 - %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %1 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %2 = tt.expand_dims %1 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> - %3 = arith.muli %2, %cst_0 : tensor<32x1xi32, #blocked> - %4 = tt.splat %arg1 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> - %5 = tt.addptr %4, %3 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> - %6 = tt.broadcast %5 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> - %7 = tt.splat %arg0 : !tt.ptr -> 
tensor<32x1x!tt.ptr, #blocked> - %8 = tt.splat %arg3 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> - scf.for %arg4 = %c0_i32 to %c10_i32 step %c1_i32 : i32 { - %9 = arith.muli %arg4, %c32_i32 : i32 - %10 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %11 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %12 = arith.addi %10, %0 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %13 = arith.addi %11, %1 : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %14 = tt.expand_dims %12 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> - %15 = tt.broadcast %14 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> - %16 = tt.addptr %6, %15 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %17 = tt.load %16 : tensor<32x32x!tt.ptr, #blocked> - %18 = tt.expand_dims %13 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> - %19 = arith.muli %18, %cst_0 : tensor<32x1xi32, #blocked> - %20 = tt.addptr %7, %19 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> - %21 = tt.broadcast %20 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> - %22 = tt.addptr %8, %19 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> - %23 = tt.broadcast %22 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> - scf.for %arg5 = %c0_i32 to %c10_i32 step %c1_i32 : i32 { - %24 = arith.muli %arg5, %c32_i32 : i32 - %25 = tt.splat %24 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %26 = arith.addi %25, %0 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %27 = tt.expand_dims %26 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> - %28 = tt.broadcast %27 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, 
#blocked> - %29 = tt.addptr %21, %28 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %30 = tt.load %29 : tensor<32x32x!tt.ptr, #blocked> - %31 = triton_gpu.convert_layout %30 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %32 = triton_gpu.convert_layout %17 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %33 = tt.dot %31, %32, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> - %34 = tt.addptr %23, %28 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %35 = triton_gpu.convert_layout %33 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> - tt.store %34, %35 : tensor<32x32x!tt.ptr, #blocked> - } - } - tt.return - } -} // end module - -// ----- - -// CHECK-LABEL: tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de -// CHECK-NOT: triton_gpu.convert_layout {{.*}} : tensor<32x64xf32, #shared> -> tensor<32x64xf32, #shared1> - -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { - tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: 
!tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} { - %cst = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma> - %c64_i32 = arith.constant 64 : i32 - %c0_i32 = arith.constant 0 : i32 - %c32_i32 = arith.constant 32 : i32 - %0 = tt.get_program_id x : i32 - %1 = arith.muli %0, %c64_i32 : i32 - %2 = tt.get_program_id y : i32 - %3 = tt.load %arg3 : !tt.ptr - %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %5 = tt.splat %1 : i32 -> tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %6 = arith.addi %5, %4 : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> - %8 = tt.splat %3 : i64 -> tensor<64x1xi64, #blocked> - %9 = arith.extsi %7 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> - %10 = arith.addi %8, %9 : tensor<64x1xi64, #blocked> - %11 = arith.extsi %arg5 : i32 to i64 - %12 = tt.splat %11 : i64 -> tensor<64x1xi64, #blocked> - %13 = arith.muli %10, %12 : tensor<64x1xi64, #blocked> - %14 = arith.muli %2, %arg5 : i32 - %15 = arith.extsi %14 : i32 to i64 - %16 = tt.splat %15 : i64 -> tensor<64x1xi64, #blocked> - %17 = arith.addi %13, %16 : tensor<64x1xi64, #blocked> - %18 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %19 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %20 = tt.expand_dims %18 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> - %21 = tt.expand_dims %19 {axis = 0 : i32} : 
tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %22 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked> - %23 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked1> - %24 = arith.muli %20, %22 : tensor<1x64xi32, #blocked> - %25 = arith.muli %21, %23 : tensor<1x64xi32, #blocked1> - %26 = tt.broadcast %17 : tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> - %27 = arith.extsi %24 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> - %28 = arith.extsi %25 : tensor<1x64xi32, #blocked1> to tensor<1x64xi64, #blocked1> - %29 = tt.broadcast %27 : tensor<1x64xi64, #blocked> -> tensor<64x64xi64, #blocked> - %30 = arith.addi %26, %29 : tensor<64x64xi64, #blocked> - %31 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %32 = tt.expand_dims %31 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> - %33 = tt.splat %3 : i64 -> tensor<32x1xi64, #blocked1> - %34 = arith.extsi %32 : tensor<32x1xi32, #blocked1> to tensor<32x1xi64, #blocked1> - %35 = arith.addi %33, %34 : tensor<32x1xi64, #blocked1> - %36 = tt.splat %11 : i64 -> tensor<32x1xi64, #blocked1> - %37 = arith.muli %35, %36 : tensor<32x1xi64, #blocked1> - %38 = tt.splat %15 : i64 -> tensor<32x1xi64, #blocked1> - %39 = arith.addi %37, %38 : tensor<32x1xi64, #blocked1> - %40 = tt.broadcast %39 : tensor<32x1xi64, #blocked1> -> tensor<32x64xi64, #blocked1> - %41 = tt.broadcast %28 : tensor<1x64xi64, #blocked1> -> tensor<32x64xi64, #blocked1> - %42 = arith.addi %40, %41 : tensor<32x64xi64, #blocked1> - %43 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %44 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %45 = tt.expand_dims %43 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = 
#blocked1}>> -> tensor<1x32xi32, #blocked1> - %46 = tt.expand_dims %44 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> - %47 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked1> - %48 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked> - %49 = arith.muli %45, %47 : tensor<1x32xi32, #blocked1> - %50 = arith.muli %46, %48 : tensor<1x32xi32, #blocked> - %51 = tt.broadcast %39 : tensor<32x1xi64, #blocked1> -> tensor<32x32xi64, #blocked1> - %52 = arith.extsi %49 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> - %53 = arith.extsi %50 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> - %54 = tt.broadcast %52 : tensor<1x32xi64, #blocked1> -> tensor<32x32xi64, #blocked1> - %55 = arith.addi %51, %54 : tensor<32x32xi64, #blocked1> - %56 = tt.splat %arg0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> - %57 = tt.addptr %56, %30 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi64, #blocked> - %58 = tt.splat %arg1 : !tt.ptr -> tensor<32x64x!tt.ptr, #blocked1> - %59 = tt.addptr %58, %42 : tensor<32x64x!tt.ptr, #blocked1>, tensor<32x64xi64, #blocked1> - %60 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> - %61 = tt.addptr %60, %55 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi64, #blocked1> - %62 = tt.load %57 : tensor<64x64x!tt.ptr, #blocked> - %63 = scf.for %arg6 = %c0_i32 to %c64_i32 step %c32_i32 iter_args(%arg7 = %cst) -> (tensor<64x32xf32, #mma>) : i32 { - %70 = tt.load %59 : tensor<32x64x!tt.ptr, #blocked1> - %71 = triton_gpu.convert_layout %62 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> - %73 = tt.trans %72 {order=array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> - %74 = triton_gpu.local_load %73 
: !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %75 = tt.dot %71, %74, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> - %76 = tt.load %61 : tensor<32x32x!tt.ptr, #blocked1> - %77 = triton_gpu.convert_layout %75 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %78 = triton_gpu.convert_layout %76 : tensor<32x32xf32, #blocked1> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %79 = tt.dot %77, %78, %arg7 : tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> - scf.yield %79 : tensor<64x32xf32, #mma> - } - %64 = tt.broadcast %17 : tensor<64x1xi64, #blocked> -> tensor<64x32xi64, #blocked> - %65 = tt.broadcast %53 : tensor<1x32xi64, #blocked> -> tensor<64x32xi64, #blocked> - %66 = arith.addi %64, %65 : tensor<64x32xi64, #blocked> - %67 = tt.splat %arg4 : !tt.ptr -> tensor<64x32x!tt.ptr, #blocked> - %68 = tt.addptr %67, %66 : tensor<64x32x!tt.ptr, #blocked>, tensor<64x32xi64, #blocked> - %69 = triton_gpu.convert_layout %63 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #blocked> - tt.store %68, %69 : tensor<64x32x!tt.ptr, #blocked> - tt.return - } -} // end module - -// ----- -// CHECK-DIS: #[[$SHARED_LAYOUT:shared.*]] = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}> -// CHECK-LABEL: tt.func @indirect_load_shared_layout -// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_6]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = 
%[[MEMDESC_SUBVIEW_17]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[LOAD_16]]) - -// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] -// CHECK: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] -// CHECK: %[[ADDI_24:.*]] = arith.addi %[[ARG10]], %{{.*}} -// CHECK: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} -// CHECK: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] -// CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[CONVERT_LAYOUT_29:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_27]] -// CHECK: %[[CONVERT_LAYOUT_30:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_28]] -// CHECK: %[[DOT_31:.*]] = tt.dot %[[CONVERT_LAYOUT_29]], %[[CONVERT_LAYOUT_30]], %[[ARG7]] -// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] -// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] -// CHECK: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} -// CHECK: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] -// CHECK: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] -// CHECK: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] -// CHECK: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] -// CHECK: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] -// CHECK: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] -// CHECK: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] -// CHECK: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} -// CHECK: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview 
%[[LOCAL_ALLOC_0]][%[[SELECT_46]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] -// CHECK: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_46]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] -// CHECK: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] -// CHECK: } - -#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> -#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> -#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> -#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> -#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> -module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { -tt.func @indirect_load_shared_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, - %76: index, - %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, - %75: tensor<16x!tt.ptr, #BLs1>, - %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, - %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ - %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> - %c4_i32 = arith.constant 4 : i32 - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i32 = arith.constant 1 : i32 - %c1_i32_splat = tt.splat %c1_i32 : i32 -> tensor<16xi32, #BLs1> - %79:3 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, 
tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1>) { - %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> - %83 = tt.load %arg21 : tensor<16x!tt.ptr, #BLs1> - %84 = tt.expand_dims %83 {axis=1: i32}: tensor<16xi64, #BLs1> -> tensor<16x1xi64, #BL> - %850 = tt.broadcast %84 : tensor<16x1xi64, #BL> -> tensor<16x16xi64, #BL> - %85 = arith.muli %77, %850 : tensor<16x16xi64, #BL> - %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> - %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> - %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> - %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> - %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> - %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> - %92 = tt.addptr %arg21, %c1_i32_splat : tensor<16x!tt.ptr, #BLs1>, tensor<16xi32, #BLs1> - scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1> - } {tt.num_stages = 3 : i32} - tt.return %79#0 : tensor<16x16xf32, #C> -} -} - - -// ----- - -// CHECK-LABEL: @kernel_yield_constant -// CHECK: tt.load -// CHECK: triton_gpu.memdesc_subview -// CHECK: triton_gpu.local_store -// CHECK: scf.for -// CHECK: tt.load -// CHECK: triton_gpu.memdesc_subview -// CHECK: triton_gpu.local_store -// CHECK: tt.return -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { - tt.func public @kernel_yield_constant(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, 
tt.max_divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { - %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> - %cst1 = arith.constant dense<1.000000e+00> : tensor<32x32xf32, #mma> - %c0_i32 = arith.constant 0 : i32 - %c1_i32 = arith.constant 1 : i32 - %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked> - %c32_i32 = arith.constant 32 : i32 - %c31_i32 = arith.constant 31 : i32 - %cst_1 = arith.constant dense<2.000000e+00> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %0 = tt.get_program_id x : i32 - %7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %12 = arith.addi %arg4, %c31_i32 : i32 - %13 = arith.divsi %12, %c32_i32 : i32 - %14 = tt.expand_dims %7 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> - %22 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> - %34 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> - %42 = scf.for %arg7 = %c0_i32 to %13 step %c1_i32 iter_args(%arg8 = %cst) -> (tensor<32x32xf32, #mma>) : i32 { - %43 = arith.muli %arg7, %c32_i32 : i32 - %44 = arith.muli %43, %arg5 : i32 - %45 = tt.splat %44 : i32 -> tensor<32x32xi32, #blocked> - %46 = tt.addptr %22, %45 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %47 = arith.subi %arg4, %43 : i32 - %48 = tt.splat %47 : i32 -> tensor<32x1xi32, #blocked> - %49 = arith.cmpi slt, %14, %48 : tensor<32x1xi32, #blocked> - %50 = tt.broadcast %49 : tensor<32x1xi1, #blocked> -> tensor<32x32xi1, #blocked> - %51 = tt.load %46, %50, %cst_0 : tensor<32x32x!tt.ptr, #blocked> - 
%52 = triton_gpu.convert_layout %51 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %53 = tt.dot %cst_1, %52, %arg8 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> - %54 = triton_gpu.convert_layout %53 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> - tt.store %34, %54 : tensor<32x32x!tt.ptr, #blocked> - scf.yield %cst1 : tensor<32x32xf32, #mma> - } - tt.return - } -} - - -// ----- - -// CHECK-LABEL: tt.func public @add_kernel -// CHECK: %[[LOAD_11:.*]] = tt.load %{{.*}}, %{{.*}} -// CHECK: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} -// CHECK: %[[LOAD_13:.*]] = tt.load %[[ADDPTR_12]], %{{.*}} -// CHECK: %[[ADDI_14:.*]] = arith.addi %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_15:.*]] = tt.splat %[[ADDI_14]] -// CHECK: %[[ADDI_16:.*]] = arith.addi %[[SPLAT_15]], %{{.*}} -// CHECK: %[[CMPI_17:.*]] = arith.cmpi slt, %[[ADDI_16]], %{{.*}} -// CHECK: %[[ADDPTR_18:.*]] = tt.addptr %{{.*}}, %[[ADDI_16]] -// CHECK: %[[LOAD_19:.*]] = tt.load %[[ADDPTR_18]], %[[CMPI_17]] -// CHECK: %[[ADDPTR_20:.*]] = tt.addptr %{{.*}}, %[[ADDI_16]] -// CHECK: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_20]], %[[CMPI_17]] -// CHECK: scf.for -#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> -module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { - tt.func public @add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { - %c1024_i32 = arith.constant 1024 : i32 - %c0_i32 = arith.constant 0 : i32 - %c1016800_i32 = arith.constant 1016800 : i32 
- %0 = tt.get_program_id x : i32 - %1 = arith.muli %0, %c1016800_i32 : i32 - %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> - %3 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> - %4 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - %5 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - %6 = tt.splat %arg2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - scf.for %arg4 = %c0_i32 to %c1016800_i32 step %c1024_i32 : i32 { - %7 = arith.addi %1, %arg4 : i32 - %8 = tt.splat %7 : i32 -> tensor<1024xi32, #blocked> - %9 = arith.addi %8, %2 : tensor<1024xi32, #blocked> - %10 = arith.cmpi slt, %9, %3 : tensor<1024xi32, #blocked> - %11 = tt.addptr %4, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %12 = tt.load %11, %10 : tensor<1024x!tt.ptr, #blocked> - %13 = tt.addptr %5, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %14 = tt.load %13, %10 : tensor<1024x!tt.ptr, #blocked> - %15 = arith.addf %12, %14 : tensor<1024xf32, #blocked> - %16 = tt.addptr %6, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - tt.store %16, %15, %10 : tensor<1024x!tt.ptr, #blocked> - } {tt.num_stages = 3 : i32} - tt.return - } -} - - -// ----- - -// CHECK-LABEL: tt.func public @nested_loops -// CHECK: %[[LOAD_10:.*]] = tt.load %{{.*}} -// CHECK: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc %[[LOAD_10]] -// CHECK: %[[TRANS_12:.*]] = tt.trans %[[LOCAL_ALLOC_11]] {order = array} -// CHECK: %[[LOCAL_LOAD_13:.*]] = triton_gpu.local_load %[[TRANS_12]] -// CHECK: %[[LOCAL_ALLOC_14:.*]] = triton_gpu.local_alloc -// CHECK: %[[LOAD_15:.*]] = tt.load %{{.*}}, %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_14]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_16]] -// CHECK: %{{.*}}:3 = scf.for %[[ARG2:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG3:.*]] = %{{.*}}-1_i32, %[[ARG4:.*]] = %{{.*}}, 
%[[ARG5:.*]] = %[[MEMDESC_SUBVIEW_16]]) - -// CHECK: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG2]], %{{.*}} -// CHECK: %[[ADDI_19:.*]] = arith.addi %[[ARG3]], %{{.*}} -// CHECK: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ADDI_19]], %{{.*}} -// CHECK: %[[SELECT_21:.*]] = arith.select %[[CMPI_20]], %[[ADDI_19]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_22:.*]] = triton_gpu.local_load %[[ARG5]] -// CHECK: %[[CONVERT_LAYOUT_23:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_22]] -// CHECK: %[[DOT_24:.*]] = tt.dot %[[CONVERT_LAYOUT_23]], %[[LOCAL_LOAD_13]], %{{.*}} -// CHECK: %[[CONVERT_LAYOUT_25:.*]] = triton_gpu.convert_layout %[[DOT_24]] -// CHECK: tt.store %{{.*}}, %[[CONVERT_LAYOUT_25]] -// CHECK: %[[SPLAT_26:.*]] = tt.splat %[[CMPI_18]] -// CHECK: %[[LOAD_27:.*]] = tt.load %{{.*}}, %[[SPLAT_26]] -// CHECK: %[[ADDI_28:.*]] = arith.addi %[[ARG4]], %{{.*}} -// CHECK: %[[CMPI_29:.*]] = arith.cmpi slt, %[[ADDI_28]], %{{.*}} -// CHECK: %[[SELECT_30:.*]] = arith.select %[[CMPI_29]], %[[ADDI_28]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_31:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_14]][%[[SELECT_30]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_27]], %[[MEMDESC_SUBVIEW_31]] -// CHECK: scf.yield %[[SELECT_21]], %[[SELECT_30]], %[[MEMDESC_SUBVIEW_31]] -// CHECK: } -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_14]] - -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [2, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 2], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -#shared1 = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { - tt.func public @nested_loops(%arg0: 
!tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { - %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %c2_i32 = arith.constant 2 : i32 - %c0_i32 = arith.constant 0 : i32 - %cst_0 = arith.constant dense<16> : tensor<16x1xi32, #blocked> - %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %1 = tt.expand_dims %0 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> - %2 = arith.muli %1, %cst_0 : tensor<16x1xi32, #blocked> - %3 = tt.splat %arg0 : !tt.ptr -> tensor<16x1x!tt.ptr, #blocked> - %4 = tt.addptr %3, %2 : tensor<16x1x!tt.ptr, #blocked>, tensor<16x1xi32, #blocked> - %5 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %6 = tt.expand_dims %5 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> - %7 = tt.broadcast %4 : tensor<16x1x!tt.ptr, #blocked> -> tensor<16x16x!tt.ptr, #blocked> - %8 = tt.broadcast %6 : tensor<1x16xi32, #blocked> -> tensor<16x16xi32, #blocked> - %9 = tt.addptr %7, %8 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> - scf.for %arg1 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { - %10 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> - %11 = triton_gpu.local_alloc %10 : (tensor<16x16xf32, #blocked>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> - %12 = tt.trans %11 {order = array} : !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> - %13 = triton_gpu.local_load %12 : !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - scf.for %arg2 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { - %14 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> - %15 = 
triton_gpu.convert_layout %14 : tensor<16x16xf32, #blocked> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %16 = tt.dot %15, %13, %cst : tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<16x16xf32, #mma> - %17 = triton_gpu.convert_layout %16 : tensor<16x16xf32, #mma> -> tensor<16x16xf32, #blocked> - tt.store %9, %17 : tensor<16x16x!tt.ptr, #blocked> - } - } - tt.return - } -} - -// ----- - -// This test triggered some failure in the verifier, so we only -// included a simple check for the kernel name. -// CHECK-LABEL: @load_convert_layout -#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> -#ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> -#BLs0 = #triton_gpu.slice<{parent=#BL, dim=0}> -#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> -#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> -#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> -#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> - -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { -tt.func @load_convert_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, - %76: index, - %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, - %75: tensor<16x!tt.ptr, #BLs1>, - %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, - %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ - %1 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #BLs1> - %cst = arith.constant dense<0.000000e+00> : 
tensor<16x16xf32, #C> - %cst_0 = arith.constant dense<2> : tensor<16xi32, #BLs1> - %c4_i32 = arith.constant 4 : i32 - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i32 = arith.constant 1 : i32 - %c1_i32_splat = tt.splat %c1_i32 : i32 -> tensor<16xi32, #BLs1> - %15 = arith.cmpi slt, %1, %cst_0 : tensor<16xi32, #BLs1> - %79:3 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1>) { - %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> - %83 = tt.load %arg21, %15 : tensor<16x!tt.ptr, #BLs1> - %84 = tt.expand_dims %83 {axis=1: i32}: tensor<16xi64, #BLs1> -> tensor<16x1xi64, #BL> - %850 = tt.broadcast %84 : tensor<16x1xi64, #BL> -> tensor<16x16xi64, #BL> - %85 = arith.muli %77, %850 : tensor<16x16xi64, #BL> - %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> - %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> - %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> - %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> - %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> - %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> - %92 = tt.addptr %arg21, %c1_i32_splat : tensor<16x!tt.ptr, #BLs1>, tensor<16xi32, #BLs1> - scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1> - } {tt.num_stages = 3 : i32} - tt.return %79#0 : tensor<16x16xf32, #C> -} -} - - -// ----- - -// This test captured some ICE in MatmulLoopPipeline pass, so we only -// included a simple check for the kernel name. 
-// CHECK-LABEL: @matmul_indirect_pipeline -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 2], order = [0, 1]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 1], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { - tt.func public @matmul_indirect_pipeline(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { - %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %c2_i32 = arith.constant 2 : i32 - %c0_i32 = arith.constant 0 : i32 - %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %1 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %2 = tt.expand_dims %1 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> - %3 = tt.expand_dims %0 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> - %4 = tt.broadcast %2 : tensor<32x1xi32, #blocked> -> tensor<32x32xi32, #blocked> - %5 = tt.broadcast %3 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> - %6 = arith.addi %4, %5 : tensor<32x32xi32, #blocked> - %7 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> - %8 = tt.addptr %7, %6 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %9 = tt.load %8 : tensor<32x32x!tt.ptr, #blocked> - %10 = tt.splat %arg3 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> - %11 = tt.addptr %10, %6 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %12 = tt.splat %arg1 : 
!tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %13 = tt.addptr %12, %0 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %14 = tt.splat %arg2 : !tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - scf.for %arg4 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { - %15 = tt.load %13 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %16 = tt.addptr %14, %15 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi64, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %17 = tt.load %16 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %18 = tt.expand_dims %17 {axis = 0 : i32} : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xf32, #blocked> - %19 = tt.broadcast %18 : tensor<1x32xf32, #blocked> -> tensor<32x32xf32, #blocked> - %20 = arith.addf %9, %19 : tensor<32x32xf32, #blocked> - %21 = triton_gpu.convert_layout %9 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %22 = triton_gpu.convert_layout %20 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %23 = tt.dot %21, %22, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> - %24 = triton_gpu.convert_layout %23 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> - tt.store %11, %24 : tensor<32x32x!tt.ptr, #blocked> - } {tt.num_stages = 3 : i32} - tt.return - } -} - -// ----- - -// CHECK-LABEL: @dont_pipeline_128x1 -// CHECK-NOT: local_load{{.*}}128x1 -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, 
versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { - tt.func public @dont_pipeline_128x1(%arg6: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { - %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> - %c128_i32 = arith.constant 128 : i32 - %c0_i32 = arith.constant 0 : i32 - %c64_i32 = arith.constant 64 : i32 - %cst_4 = arith.constant dense<-1.000000e+30> : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - - %99:1 = scf.for %arg25 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg31 = %cst_4) -> (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) : i32 { - %94 = tt.splat %arg6 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> - %151 = tt.load %94 : tensor<128x1x!tt.ptr, #blocked> - %161 = triton_gpu.convert_layout %151 : tensor<128x1xi32, #blocked> -> tensor<128x1xi32, #mma> - %162 = tt.broadcast %161 : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> - %170 = arith.sitofp %162 : tensor<128x64xi32, #mma> to tensor<128x64xf32, #mma> - - %173 = "tt.reduce"(%170) <{axis = 1 : i32}> ({ - ^bb0(%arg33: f32, %arg34: f32): - %207 = arith.maxnumf %arg33, %arg34 : f32 - tt.reduce.return %207 : f32 - }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %175 = arith.maxnumf %arg31, %173 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - - %201 = arith.truncf %170 : tensor<128x64xf32, #mma> to tensor<128x64xf16, #mma> - %202 = triton_gpu.convert_layout %201 : tensor<128x64xf16, #mma> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - - %192 = arith.constant dense<0.> : tensor<128x64xf32, #mma> - %203 = arith.constant dense<0.> : tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %204 = tt.dot %202, %203, %192 : 
tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> - - scf.yield %175 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - } - tt.return - } -} - -// ----- - -// Check that the dependencies across ops of different nesting does not cause crash or -// incorrect schedule that fails to pipeline. -// CHECK-LABEL: @matmul_nested_ops -// CHECK: triton_gpu.local_load - -#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> -#ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> -#BLs0 = #triton_gpu.slice<{parent=#BL, dim=0}> -#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> -#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> -#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> -#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> - -module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.target" = "cuda:80"} { -tt.func @matmul_nested_ops(%lb : index, %ub : index, %step : index, - %A : !tt.ptr {tt.divisibility = 16 : i32}, - %B : !tt.ptr {tt.divisibility = 16 : i32}, - %ext : index) -> tensor<128x128xf32, #C> { - // A ptrs - %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> - %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> - %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> - %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> - %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - // B ptrs - %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> - %b_tmp0 = tt.make_range {end = 128: i32, start = 0: 
i32} : tensor<128xi32, #BLs0> - %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> - %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> - %b_ptr = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> - - %a_mask = arith.constant dense : tensor<128x32xi1, #AL> - %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> - %b_mask = arith.constant dense : tensor<32x128xi1, #BL> - %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> - %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> - - %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> - - %b_ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> - %b = triton_gpu.convert_layout %b_ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> - - %loop:2 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<128x128xf32, #C>) { - %cnd = arith.cmpi slt, %iv, %ext : index - %inc_a_ptr = scf.if %cnd -> (tensor<128x32x!tt.ptr, #AL>) { - %a_ptr_ = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - scf.yield %a_ptr_ : tensor<128x32x!tt.ptr, #AL> - } else { - scf.yield %a_ptr : tensor<128x32x!tt.ptr, #AL> - } - %a_ = tt.load %inc_a_ptr : tensor<128x32x!tt.ptr, #AL> - %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> - - %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> - - %next_a_ptr = tt.addptr %inc_a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - scf.yield %next_a_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<128x128xf32, #C> - } - tt.return %loop#1: tensor<128x128xf32, #C> -} -} - -// ----- - -// Pipeline the if ops at the beginning and the end of the loop -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], 
warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> -#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { - // CHECK-LABEL: dot_prologue_epilogue - // CHECK: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} - tt.func @dot_prologue_epilogue(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { - %cst = arith.constant dense<0> : tensor<64x16xi32, #blocked> - %cst2 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> - %c0_i32 = arith.constant 0 : i32 - %cst_0 = arith.constant dense<0> : tensor<1x16xi32, #blocked> - %cst_1 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> - %c0_i64 = arith.constant 0 : i64 - %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> - %c1_i32 = arith.constant 1 : i32 - %c8_i32 = arith.constant 8 : i32 - %2 = tt.splat %arg1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> - %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %6 = tt.broadcast %2 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, 
#blocked1> - %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> - %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %10 = tt.splat %arg0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> - %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> - %14 = tt.broadcast %10 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> - %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> - %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - // CHECK: %[[C0:.*]] = arith.constant 0 : i32 - // CHECK: scf.for %[[IND_VAR:.*]] = %[[C0]] - // CHECK-NOT load - // CHECK: %[[CND:.*]] = arith.cmpi slt, %[[IND_VAR]], %[[EXT]] - // CHECK: scf.if %[[CND]] - // CHECK: dot - // CHECK: scf.if %[[CND]] - // CHECK: arith.mulf - // CHECK: scf.yield - // CHECK-NOT: tt.addptr - // CHECK: scf.yield - %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %16, %arg6 = %8) -> (tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>) : i32 { - %9 = tt.load %arg6 : tensor<128x64x!tt.ptr, #blocked1> - %cnd = arith.cmpi slt, %arg3, %ext : i32 - %inc_ptr = scf.if %cnd -> tensor<64x16x!tt.ptr, #blocked> { - %ptr = tt.addptr %arg5, %inc : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - scf.yield %ptr : tensor<64x16x!tt.ptr, #blocked> - } else { - scf.yield %arg5 : tensor<64x16x!tt.ptr, #blocked> - } - %18 = tt.load %inc_ptr : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> 
!tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> - %acc_ = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { - %acc_zero = arith.mulf %acc, %cst_2 : tensor<128x16xf32, #mma1> - scf.yield %acc_zero : tensor<128x16xf32, #mma1> - } else { - scf.yield %acc : tensor<128x16xf32, #mma1> - } - %22 = tt.addptr %arg5, %cst : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - %23 = tt.addptr %arg6, %cst2 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - scf.yield %acc_, %22, %23 : tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1> - } - tt.return %17#0 : tensor<128x16xf32, #mma1> - } -} - -// ----- - -// Verify that uses of the ops scheduled in partucular place of the loop (like epilogue if) are correctly scheduled too. 
-#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> -#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { - // CHECK-LABEL: pipeline_downstream_dependencies - // CHECK: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} - tt.func @pipeline_downstream_dependencies(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { - %cst = arith.constant dense<0> : tensor<64x16xi32, #blocked> - %cst1 = arith.constant dense<1> : tensor<64x16xi32, #blocked> - %cst2 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> - %c0_i32 = arith.constant 0 : i32 - %cst_0 = arith.constant dense<0> : tensor<1x16xi32, #blocked> - %cst_1 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> - %c0_i64 = arith.constant 0 : i64 - %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> - %c1_i32 = arith.constant 1 : i32 - %c8_i32 = arith.constant 8 : i32 - %2 = tt.splat %arg1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> - %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, 
#triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %6 = tt.broadcast %2 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> - %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> - %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %10 = tt.splat %arg0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> - %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> - %14 = tt.broadcast %10 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> - %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> - %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - // CHECK: %[[C0:.*]] = arith.constant 0 : i32 - // CHECK: scf.for %[[IND_VAR:.*]] = %[[C0]] - // CHECK-NOT load - // CHECK: dot - // CHECK: %[[CND:.*]] = arith.cmpi slt, %[[IND_VAR]], %[[EXT]] - // CHECK: %[[IFRET:.*]]:2 = scf.if %[[CND]] - // CHECK: arith.mulf - // CHECK: scf.yield - // CHECK: tt.addptr {{.*}}, %[[IFRET]]#1 - // CHECK: scf.yield - %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %16, %arg6 = %8) -> (tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>) : i32 { - %9 = tt.load %arg6 : tensor<128x64x!tt.ptr, #blocked1> - %18 = tt.load %arg5 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !tt.memdesc<128x64xf16, #shared, 
#triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> - %cnd = arith.cmpi slt, %arg3, %ext : i32 - %if_ret:2 = scf.if %cnd -> (tensor<128x16xf32, #mma1>, tensor<64x16xi32, #blocked>) { - %acc_zero = arith.mulf %acc, %cst_2 : tensor<128x16xf32, #mma1> - scf.yield %acc_zero, %cst : tensor<128x16xf32, #mma1>, tensor<64x16xi32, #blocked> - } else { - scf.yield %acc, %cst1 : tensor<128x16xf32, #mma1>, tensor<64x16xi32, #blocked> - } - %22 = tt.addptr %arg5, %if_ret#1 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - %23 = tt.addptr %arg6, %cst2 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - scf.yield %if_ret#0, %22, %23 : tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1> - } - tt.return %17#0 : tensor<128x16xf32, #mma1> - } -} - -// ----- - -// CHECK-LABEL: @masked_add_kernel -// CHECK: %[[CONSTANT:.*]] = arith.constant dense<0xFF800000> -// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -// CHECK: scf.for -// CHECK: arith.select -// CHECK: arith.select -// CHECK: arith.addf -// CHECK: %[[A:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -// CHECK: %[[B:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] - -#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> -module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { - tt.func public @masked_add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { - %c1024_i32 = arith.constant 1024 : i32 - 
%c0_i32 = arith.constant 0 : i32 - %c1016800_i32 = arith.constant 1016800 : i32 - %cst = arith.constant dense<0xFF800000> : tensor<1024xf32, #blocked> - %0 = tt.get_program_id x : i32 - %1 = arith.muli %0, %c1016800_i32 : i32 - %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> - %3 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> - %4 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - %5 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - %6 = tt.splat %arg2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - scf.for %arg4 = %c0_i32 to %c1016800_i32 step %c1024_i32 : i32 { - %7 = arith.addi %1, %arg4 : i32 - %8 = tt.splat %7 : i32 -> tensor<1024xi32, #blocked> - %9 = arith.addi %8, %2 : tensor<1024xi32, #blocked> - %10 = arith.cmpi slt, %9, %3 : tensor<1024xi32, #blocked> - %11 = tt.addptr %4, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %12 = tt.load %11, %10, %cst : tensor<1024x!tt.ptr, #blocked> - %13 = tt.addptr %5, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %14 = tt.load %13, %10, %cst : tensor<1024x!tt.ptr, #blocked> - %15 = arith.addf %12, %14 : tensor<1024xf32, #blocked> - %16 = tt.addptr %6, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - tt.store %16, %15, %10 : tensor<1024x!tt.ptr, #blocked> - } {tt.num_stages = 3 : i32} - tt.return - } -} diff --git a/test/TritonGPU/amd/amd-stream-pipeline.mlir b/test/TritonGPU/amd/amd-stream-pipeline.mlir index 4b2de3336413..fe2ea9da65a2 100644 --- a/test/TritonGPU/amd/amd-stream-pipeline.mlir +++ b/test/TritonGPU/amd/amd-stream-pipeline.mlir @@ -1,44 +1,1636 @@ -// RUN: triton-opt %s -split-input-file --tritonamdgpu-stream-pipeline | FileCheck %s - -// CHECK-LABEL: @check_stream_pipeline_epilogue -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 4], warpsPerCTA = [1, 1], order = [1, 0]}> -#mma = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 1], 
instrShape = [32, 32], isTransposed = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, triton_gpu.target = "hip:gfx90a", "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @check_stream_pipeline_epilogue(%Aptr: tensor<32x32x!tt.ptr, #blocked>, %Bptr : tensor<32x32x!tt.ptr, #blocked>, %arg4 : i32, %arg5 : i1) { - %cst_0 = arith.constant dense<16> : tensor<32x32xi32, #blocked> - %cst_2 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked> - %cst_5 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> - %c0_i32 = arith.constant 0 : i32 - %c1_i32 = arith.constant 1 : i32 - // CHECK: scf.for {{.*}} = %[[LB:.*]] to %[[UB:.*]] step %[[STEP:.*]] iter_args({{.*}}) - %36:3 = scf.for %arg9 = %c0_i32 to %arg4 step %c1_i32 iter_args(%arg10 = %cst_5, %arg12 = %Aptr, %arg13 = %Bptr) -> (tensor<32x32xf32, #mma>, tensor<32x32x!tt.ptr, #blocked>, tensor<32x32x!tt.ptr, #blocked>) : i32 { - %61 = arith.muli %arg9, %arg4 : i32 - %62 = arith.cmpi slt, %arg4, %61 : i32 - %63 = tt.splat %62 : i1 -> tensor<32x32xi1, #blocked> - // This load will not be pipelined - %66 = tt.load %arg12, %63 : tensor<32x32x!tt.ptr, #blocked> - // This load will be pipelined - %70 = tt.load %arg13 : tensor<32x32x!tt.ptr, #blocked> - %71 = triton_gpu.convert_layout %66 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %72 = triton_gpu.convert_layout %70 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %73 = tt.dot %71, %72, %arg10 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> - // This scf.if will make load at %66 non-pipelineable - %74 = scf.if %arg5 -> (tensor<32x32xf32, #blocked>){ - scf.yield %66 : tensor<32x32xf32, #blocked> +// RUN: triton-opt %s -split-input-file 
-tritonamdgpu-stream-pipeline=num_stages=2 | FileCheck %s + +// 4 warps +// matmul: 128x32 @ 32x128 -> 128x128 +#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> +#BLs0 = #triton_gpu.slice<{parent=#BL, dim=0}> +#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> +#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> +#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> +#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> + +// CHECK-LABEL: tt.func @matmul_loop +// CHECK: %[[LOCAL_ALLOC_10:.*]] = triton_gpu.local_alloc +// CHECK: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc +// CHECK: %[[CMPI_12:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_12]] +// CHECK: %[[LOAD_14:.*]] = tt.load %{{.*}}, %[[SPLAT_13]] +// CHECK: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_12]] +// CHECK: %[[LOAD_16:.*]] = tt.load %{{.*}}, %[[SPLAT_15]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_17]] +// CHECK: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_16]], %[[MEMDESC_SUBVIEW_18]] +// CHECK: %{{.*}}:7 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_18]]) + +// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_20]] +// CHECK: %[[ADDI_22:.*]] = arith.addi %[[ARG9]], %{{.*}} +// 
CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_22]], %{{.*}} +// CHECK: %[[SELECT_24:.*]] = arith.select %[[CMPI_23]], %[[ADDI_22]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_25:.*]] = triton_gpu.local_load %[[ARG11]] +// CHECK: %[[CONVERT_LAYOUT_26:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_25]] +// CHECK: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] +// CHECK: %[[CONVERT_LAYOUT_28:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_27]] +// CHECK: %[[MULF_29:.*]] = arith.mulf %[[CONVERT_LAYOUT_28]], %{{.*}} +// CHECK: %[[DOT_30:.*]] = tt.dot %[[CONVERT_LAYOUT_26]], %[[MULF_29]], %[[ARG8]] +// CHECK: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG6]], %{{.*}} +// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG7]], %{{.*}} +// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_21]] +// CHECK: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]] +// CHECK: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_21]] +// CHECK: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} +// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] +// CHECK: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] +// CHECK: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_24]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// CHECK: } + +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_10]] +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] + +module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.target" = "cuda:80"} { +tt.func @matmul_loop(%lb : index, %ub 
: index, %step : index, + %A : !tt.ptr {tt.divisibility = 16 : i32}, + %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { + // A ptrs + %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> + %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> + %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> + %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> + %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + // B ptrs + %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> + %b_tmp0 = tt.make_range {end = 128: i32, start = 0: i32} : tensor<128xi32, #BLs0> + %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> + %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> + %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> + + + %a_mask = arith.constant dense : tensor<128x32xi1, #AL> + %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> + %b_mask = arith.constant dense : tensor<32x128xi1, #BL> + %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> + %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> + + %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> + %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> + + %b_scale = arith.constant dense<4.> : tensor<32x128xf16, #B> + + %loop:3 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { + %a_ = tt.load %a_ptr : tensor<128x32x!tt.ptr, #AL> + %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> + %b__ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> + 
%b_ = triton_gpu.convert_layout %b__ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> + %b = arith.mulf %b_, %b_scale: tensor<32x128xf16, #B> + + %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> + + %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> + scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> + } + tt.return %loop#2: tensor<128x128xf32, #C> +} + +// CHECK-LABEL: tt.func @matmul_loop_nested +// CHECK: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc +// CHECK: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc +// CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] +// CHECK: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} +// CHECK: %[[SPLAT_16:.*]] = tt.splat %[[CMPI_13]] +// CHECK: %[[LOAD_17:.*]] = tt.load %{{.*}}, %[[SPLAT_16]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_18]] +// CHECK: %[[MEMDESC_SUBVIEW_19:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_17]], %[[MEMDESC_SUBVIEW_19]] +// CHECK: %{{.*}}:7 = scf.for %[[ARG7:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_19]]) + +// CHECK: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG7]], %[[SUBI_21]] +// CHECK: %[[ADDI_23:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_24:.*]] = arith.cmpi slt, 
%[[ADDI_23]], %{{.*}} +// CHECK: %[[SELECT_25:.*]] = arith.select %[[CMPI_24]], %[[ADDI_23]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_26:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[CONVERT_LAYOUT_27:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_26]] +// CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG14]] +// CHECK: %[[CONVERT_LAYOUT_29:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_28]] +// CHECK: %[[DOT_30:.*]] = tt.dot %[[CONVERT_LAYOUT_27]], %[[CONVERT_LAYOUT_29]], %[[ARG10]] +// CHECK: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_22]] +// CHECK: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]], %{{.*}} +// CHECK: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_22]] +// CHECK: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} +// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} +// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] +// CHECK: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] +// CHECK: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_25]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// CHECK: } +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] +// CHECK: scf.yield %{{.*}}#2 +// CHECK: } +tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index, + %A : !tt.ptr {tt.divisibility = 16 : i32}, + %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C>{ + + %c_start = 
arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> + %loop1:1 = scf.for %iv0 = %lb to %ub step %step iter_args(%c_init = %c_start) -> (tensor<128x128xf32, #C>) { + // A ptrs + %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> + %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> + %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> + %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> + %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + // B ptrs + %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> + %b_tmp0 = tt.make_range {end = 128: i32, start = 0: i32} : tensor<128xi32, #BLs0> + %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> + %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> + %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> + + %a_mask = arith.constant dense : tensor<128x32xi1, #AL> + %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> + %b_mask = arith.constant dense : tensor<32x128xi1, #BL> + %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> + + %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> + %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> + + %loop2:3 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { + %a_ = tt.load %a_ptr, %a_mask, %a_other : tensor<128x32x!tt.ptr, #AL> + %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> + %b_ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> + %b = triton_gpu.convert_layout %b_ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> + + 
%c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> + + %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> + scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> + } + + scf.yield %loop2#2 : tensor<128x128xf32, #C> + } + tt.return %loop1#0 : tensor<128x128xf32, #C> +} + +// CHECK-LABEL: tt.func @matmul_loop_single_pipeline +// CHECK: %[[LOAD_10:.*]] = tt.load %{{.*}}, %{{.*}}, %{{.*}} +// CHECK: %[[CONVERT_LAYOUT_11:.*]] = triton_gpu.convert_layout %[[LOAD_10]] +// CHECK: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc +// CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] +// CHECK: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_16]] +// CHECK: %{{.*}}:5 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %[[MEMDESC_SUBVIEW_16]]) +// CHECK: %[[SUBI_18:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_19:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_18]] +// CHECK: %[[ADDI_20:.*]] = arith.addi %[[ARG8]], %{{.*}} +// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ADDI_20]], %{{.*}} +// CHECK: %[[SELECT_22:.*]] = arith.select %[[CMPI_21]], %[[ADDI_20]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG10]] +// CHECK: %[[CONVERT_LAYOUT_24:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_23]] +// CHECK: %[[DOT_25:.*]] = tt.dot %[[CONVERT_LAYOUT_11]], %[[CONVERT_LAYOUT_24]], %[[ARG7]] +// CHECK: %[[ADDPTR_26:.*]] = 
tt.addptr %[[ARG6]], %{{.*}} +// CHECK: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_19]] +// CHECK: %[[LOAD_28:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_27]], %{{.*}} +// CHECK: %[[ADDI_29:.*]] = arith.addi %[[ARG9]], %{{.*}} +// CHECK: %[[CMPI_30:.*]] = arith.cmpi slt, %[[ADDI_29]], %{{.*}} +// CHECK: %[[SELECT_31:.*]] = arith.select %[[CMPI_30]], %[[ADDI_29]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_32:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_31]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_28]], %[[MEMDESC_SUBVIEW_32]] +// CHECK: scf.yield %[[ADDPTR_26]], %[[DOT_25]], %[[SELECT_22]], %[[SELECT_31]], %[[MEMDESC_SUBVIEW_32]] +// CHECK: } +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] +tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, + %A : !tt.ptr {tt.divisibility = 16 : i32}, + %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { + // A ptrs + %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> + %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> + %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> + %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> + %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + // B ptrs + %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> + %b_tmp0 = tt.make_range {end = 128: i32, start = 0: i32} : tensor<128xi32, #BLs0> + %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> + %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> + %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> + + %a_mask = arith.constant dense : tensor<128x32xi1, #AL> + %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> + + %a_ = tt.load %a_ptr_init, 
%a_mask, %a_other : tensor<128x32x!tt.ptr<f16>, #AL> + %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> + + %b_mask = arith.constant dense<true> : tensor<32x128xi1, #BL> + %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> + %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> + + %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> + + %loop:2 = scf.for %iv = %lb to %ub step %step iter_args(%b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<32x128x!tt.ptr<f16>, #BL>, tensor<128x128xf32, #C>) { + %b_ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr<f16>, #BL> + %b = triton_gpu.convert_layout %b_ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> + %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> + %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr<f16>, #BL>, tensor<32x128xi32, #BL> + scf.yield %next_b_ptr, %c : tensor<32x128x!tt.ptr<f16>, #BL>, tensor<128x128xf32, #C> + } + tt.return %loop#1 : tensor<128x128xf32, #C> +} + +// CHECK-LABEL: tt.func @indirect_bmm_scalar +// CHECK: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc +// CHECK: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc +// CHECK: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] +// CHECK: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] +// CHECK: %[[LOAD_5:.*]] = tt.load %{{.*}}, %[[CMPI_2]] +// CHECK: %[[MULI_6:.*]] = arith.muli %{{.*}}, %[[LOAD_5]] +// CHECK: %[[SPLAT_7:.*]] = tt.splat %[[MULI_6]] +// CHECK: %[[ADDPTR_8:.*]] = tt.addptr %{{.*}}, %[[SPLAT_7]] +// CHECK: %[[SPLAT_9:.*]] = tt.splat %[[CMPI_2]] +// CHECK: %[[LOAD_10:.*]] = tt.load %[[ADDPTR_8]], %[[SPLAT_9]] +// CHECK: %[[CMPI_11:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// CHECK: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} +// CHECK: %[[ADDPTR_13:.*]] = tt.addptr %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_11]] +// CHECK: %[[LOAD_15:.*]] = 
tt.load %[[ADDPTR_12]], %[[SPLAT_14]] +// CHECK: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_13]], %[[CMPI_11]] +// CHECK: %[[MULI_17:.*]] = arith.muli %{{.*}}, %[[LOAD_16]] +// CHECK: %[[SPLAT_18:.*]] = tt.splat %[[MULI_17]] +// CHECK: %[[ADDPTR_19:.*]] = tt.addptr %{{.*}}, %[[SPLAT_18]] +// CHECK: %[[SPLAT_20:.*]] = tt.splat %[[CMPI_11]] +// CHECK: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_19]], %[[SPLAT_20]] +// CHECK: %[[MEMDESC_SUBVIEW_22:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_4]], %[[MEMDESC_SUBVIEW_22]] +// CHECK: %[[MEMDESC_SUBVIEW_23:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_10]], %[[MEMDESC_SUBVIEW_23]] +// CHECK: %{{.*}}:9 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[ADDPTR_12]], %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_22]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_23]], %[[ARG14:.*]] = %[[LOAD_15]], %[[ARG15:.*]] = %[[LOAD_21]]) + +// CHECK: %[[SUBI_25:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] +// CHECK: %[[ADDI_27:.*]] = arith.addi %[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} +// CHECK: %[[SELECT_29:.*]] = arith.select %[[CMPI_28]], %[[ADDI_27]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_30:.*]] = triton_gpu.local_load %[[ARG12]] +// CHECK: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[CONVERT_LAYOUT_32:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_30]] +// CHECK: %[[CONVERT_LAYOUT_33:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_31]] +// CHECK: %[[DOT_34:.*]] = tt.dot %[[CONVERT_LAYOUT_32]], %[[CONVERT_LAYOUT_33]], %[[ARG7]] +// CHECK: %[[ADDPTR_35:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// 
CHECK: %[[SPLAT_37:.*]] = tt.splat %[[CMPI_26]] +// CHECK: %[[LOAD_38:.*]] = tt.load %[[ADDPTR_35]], %[[SPLAT_37]] +// CHECK: %[[LOAD_39:.*]] = tt.load %[[ADDPTR_36]], %[[CMPI_26]] +// CHECK: %[[MULI_40:.*]] = arith.muli %{{.*}}, %[[LOAD_39]] +// CHECK: %[[SPLAT_41:.*]] = tt.splat %[[MULI_40]] +// CHECK: %[[ADDPTR_42:.*]] = tt.addptr %{{.*}}, %[[SPLAT_41]] +// CHECK: %[[SPLAT_43:.*]] = tt.splat %[[CMPI_26]] +// CHECK: %[[LOAD_44:.*]] = tt.load %[[ADDPTR_42]], %[[SPLAT_43]] +// CHECK: %[[ADDI_45:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_46:.*]] = arith.cmpi slt, %[[ADDI_45]], %{{.*}} +// CHECK: %[[SELECT_47:.*]] = arith.select %[[CMPI_46]], %[[ADDI_45]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_47]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_48]] +// CHECK: %[[MEMDESC_SUBVIEW_49:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_47]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[ARG15]], %[[MEMDESC_SUBVIEW_49]] +// CHECK: scf.yield %[[DOT_34]], %[[ADDPTR_35]], %[[ADDPTR_36]], %[[SELECT_29]], %[[SELECT_47]], %[[MEMDESC_SUBVIEW_48]], %[[MEMDESC_SUBVIEW_49]], %[[LOAD_38]], %[[LOAD_44]] +// CHECK: } + +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] + +tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32}, + %76: index, + %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, + %75: !tt.ptr, + %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, + %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> + %c4_i32 = arith.constant 4 : i32 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %c1_i32 = arith.constant 1 : i32 + %79:3 = scf.for 
%arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr<f16>, #AL>, !tt.ptr<i64>) { + %82 = tt.load %arg20 : tensor<16x16x!tt.ptr<f16>, #AL> + %83 = tt.load %arg21 : !tt.ptr<i64> + %84 = arith.muli %77, %83 : i64 + %85 = tt.splat %84 : i64 -> tensor<16x16xi64, #BL> + %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr<f16>, #BL>, tensor<16x16xi64, #BL> + %87 = tt.load %86 : tensor<16x16x!tt.ptr<f16>, #BL> + %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> + %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> + %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> + %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr<f16>, #AL>, tensor<16x16xi32, #AL> + %92 = tt.addptr %arg21, %c1_i32 : !tt.ptr<i64>, i32 + scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr<f16>, #AL>, !tt.ptr<i64> + } {tt.num_stages = 3 : i32} + tt.return %79#0 : tensor<16x16xf32, #C> +} + +// CHECK-LABEL: tt.func @indirect_bmm_scalar_dist_one +// CHECK: %[[LOAD_0:.*]] = tt.load %{{.*}} +// CHECK: %[[ADDPTR_1:.*]] = tt.addptr %{{.*}}, %{{.*}} +// CHECK: %[[LOCAL_ALLOC_2:.*]] = triton_gpu.local_alloc +// CHECK: %[[LOCAL_ALLOC_3:.*]] = triton_gpu.local_alloc +// CHECK: %[[CMPI_4:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_5:.*]] = tt.splat %[[CMPI_4]] +// CHECK: %[[LOAD_6:.*]] = tt.load %{{.*}}, %[[SPLAT_5]] +// CHECK: %[[LOAD_7:.*]] = tt.load %[[ADDPTR_1]], %[[CMPI_4]] +// CHECK: %[[MULI_8:.*]] = arith.muli %{{.*}}, %[[LOAD_0]] +// CHECK: %[[SPLAT_9:.*]] = tt.splat %[[MULI_8]] +// CHECK: %[[ADDPTR_10:.*]] = tt.addptr %{{.*}}, %[[SPLAT_9]] +// CHECK: %[[SPLAT_11:.*]] = tt.splat %[[CMPI_4]] +// CHECK: %[[LOAD_12:.*]] = tt.load %[[ADDPTR_10]], %[[SPLAT_11]] +// CHECK: %[[ADDPTR_13:.*]] = tt.addptr %[[ADDPTR_1]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_14:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%{{.*}}, %{{.*}}, %{{.*}}] 
+// CHECK: triton_gpu.local_store %[[LOAD_6]], %[[MEMDESC_SUBVIEW_14]] +// CHECK: %[[MEMDESC_SUBVIEW_15:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_12]], %[[MEMDESC_SUBVIEW_15]] +// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %[[LOAD_7]], %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_14]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_15]]) + +// CHECK: %[[SUBI_17:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_17]] +// CHECK: %[[ADDI_19:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ADDI_19]], %{{.*}} +// CHECK: %[[SELECT_21:.*]] = arith.select %[[CMPI_20]], %[[ADDI_19]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_22:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG14]] +// CHECK: %[[CONVERT_LAYOUT_24:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_22]] +// CHECK: %[[CONVERT_LAYOUT_25:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_23]] +// CHECK: %[[DOT_26:.*]] = tt.dot %[[CONVERT_LAYOUT_24]], %[[CONVERT_LAYOUT_25]], %[[ARG7]] +// CHECK: %[[ADDPTR_27:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[SPLAT_28:.*]] = tt.splat %[[CMPI_18]] +// CHECK: %[[LOAD_29:.*]] = tt.load %[[ADDPTR_27]], %[[SPLAT_28]] +// CHECK: %[[LOAD_30:.*]] = tt.load %[[ARG9]], %[[CMPI_18]] +// CHECK: %[[MULI_31:.*]] = arith.muli %{{.*}}, %[[ARG10]] +// CHECK: %[[SPLAT_32:.*]] = tt.splat %[[MULI_31]] +// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %{{.*}}, %[[SPLAT_32]] +// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_18]] +// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_34]] +// CHECK: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} +// CHECK: 
%[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_29]], %[[MEMDESC_SUBVIEW_40]] +// CHECK: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_41]] +// CHECK: scf.yield %[[DOT_26]], %[[ADDPTR_27]], %[[ADDPTR_36]], %[[LOAD_30]], %[[SELECT_21]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// CHECK: } +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_2]] +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_3]] + +tt.func @indirect_bmm_scalar_dist_one(%77: i64 {tt.divisibility=16: i32}, + %76: index, + %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, + %75: !tt.ptr, + %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, + %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> + %c4_i32 = arith.constant 4 : i32 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %c1_i32 = arith.constant 1 : i32 + %50 = tt.load %75 : !tt.ptr + %51 = tt.addptr %75, %c1_i32 : !tt.ptr, i32 + %79:4 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %51, %arg22 = %50) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, !tt.ptr, i64) { + %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> + %83 = tt.load %arg21 : !tt.ptr + %84 = arith.muli %77, %arg22 : i64 + %85 = tt.splat %84 : i64 -> tensor<16x16xi64, #BL> + %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> + %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> + %88 = 
triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> + %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> + %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> + %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> + %92 = tt.addptr %arg21, %c1_i32 : !tt.ptr, i32 + scf.yield %90, %91, %92, %83 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, !tt.ptr, i64 + } + tt.return %79#0 : tensor<16x16xf32, #C> +} + +// CHECK-LABEL: tt.func @indirect_bmm_vector +// CHECK: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc +// CHECK: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc +// CHECK: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] +// CHECK: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] +// CHECK: %[[CMPI_5:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// CHECK: %[[ADDPTR_6:.*]] = tt.addptr %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_7:.*]] = tt.splat %[[CMPI_2]] +// CHECK: %[[LOAD_8:.*]] = tt.load %{{.*}}, %[[SPLAT_7]] +// CHECK: %[[EXPAND_DIMS_9:.*]] = tt.expand_dims %[[LOAD_4]] {axis = 1 : i32} +// CHECK: %[[BROADCAST_10:.*]] = tt.broadcast %[[EXPAND_DIMS_9]] +// CHECK: %[[MULI_11:.*]] = arith.muli %{{.*}}, %[[BROADCAST_10]] +// CHECK: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %[[MULI_11]] +// CHECK: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_2]] +// CHECK: %[[LOAD_14:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_13]] +// CHECK: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_5]] +// CHECK: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_6]], %[[SPLAT_15]] +// CHECK: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_8]], %[[MEMDESC_SUBVIEW_17]] +// CHECK: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_18]] +// CHECK: 
%{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_6]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[LOAD_16]]) + +// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] +// CHECK: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] +// CHECK: %[[ADDI_24:.*]] = arith.addi %[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} +// CHECK: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] +// CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[CONVERT_LAYOUT_29:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_27]] +// CHECK: %[[CONVERT_LAYOUT_30:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_28]] +// CHECK: %[[DOT_31:.*]] = tt.dot %[[CONVERT_LAYOUT_29]], %[[CONVERT_LAYOUT_30]], %[[ARG7]] +// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] +// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] +// CHECK: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} +// CHECK: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] +// CHECK: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] +// CHECK: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] +// CHECK: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] +// CHECK: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] +// CHECK: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] +// CHECK: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] +// CHECK: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} +// 
CHECK: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} +// CHECK: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] +// CHECK: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] +// CHECK: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] +// CHECK: } +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] + +tt.func @indirect_bmm_vector(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, + %76: index, + %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, + %75: tensor<16x!tt.ptr, #BLs1>, + %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, + %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> + %c4_i32 = arith.constant 4 : i32 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %c1_i32 = arith.constant 1 : i32 + %c1_i32_splat = tt.splat %c1_i32 : i32 -> tensor<16xi32, #BLs1> + %79:3 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1>) { + %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> + %83 = tt.load %arg21 : tensor<16x!tt.ptr, #BLs1> + %84 = tt.expand_dims %83 {axis=1: i32}: tensor<16xi64, #BLs1> -> tensor<16x1xi64, #BL> + %850 = tt.broadcast %84 : tensor<16x1xi64, #BL> -> tensor<16x16xi64, #BL> 
+ %85 = arith.muli %77, %850 : tensor<16x16xi64, #BL> + %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> + %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> + %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> + %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> + %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> + %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> + %92 = tt.addptr %arg21, %c1_i32_splat : tensor<16x!tt.ptr, #BLs1>, tensor<16xi32, #BLs1> + scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1> + } {tt.num_stages = 3 : i32} + tt.return %79#0 : tensor<16x16xf32, #C> +} + +// CHECK-LABEL: tt.func @post_load_inv +// CHECK: scf.for +// CHECK-DAG: %[[IV:.*]] = arith.index_cast +// CHECK: %[[NEXT_IV:.*]] = arith.addi %[[IV]], %c1_i32 : i32 +// CHECK: arith.index_cast +// CHECK-NOT: arith.addi %[[NEXT_IV]] +tt.func @post_load_inv(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, + %arg1: !tt.ptr {tt.divisibility = 16 : i32}, + %arg2: !tt.ptr {tt.divisibility = 16 : i32}, + %arg3: i32 {tt.divisibility = 16 : i32}, + %arg4: i32 {tt.divisibility = 16 : i32}, + %arg5: i32 {tt.divisibility = 16 : i32}, + %arg6: i32 {tt.divisibility = 16 : i32}, + %arg7: i32 {tt.divisibility = 16 : i32}, + %arg8: i32 {tt.divisibility = 16 : i32}) -> tensor<32x32xf32, #C> { + %c0_index = arith.constant 0 : index + %c1_index = arith.constant 1 : index + %c1_i32 = arith.constant 1 : i32 + %c32_i32 = arith.constant 32 : i32 + %84 = arith.constant 900 : index + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #C> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #AL> + %50 = tt.splat %arg3 : i32 -> tensor<1x32xi32, #AL> + %59 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %81 = tt.splat %arg1 : !tt.ptr -> 
tensor<32x32x!tt.ptr, #AL> + %66 = tt.splat %arg4 : i32 -> tensor<32x1xi32, #AL> + %60 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %82 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %85:3 = scf.for %arg9 = %c0_index to %84 step %c1_index iter_args(%arg10 = %cst, %arg11 = %59, %arg12 = %81) -> (tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>) { + %130 = arith.index_cast %arg9 : index to i32 + %107 = arith.muli %130, %c32_i32 : i32 + %108 = arith.subi %arg5, %107 : i32 + %109 = tt.splat %108 : i32 -> tensor<1x32xi32, #AL> + %110 = arith.cmpi "slt", %50, %109 : tensor<1x32xi32, #AL> + %111 = tt.broadcast %110 : tensor<1x32xi1, #AL> -> tensor<32x32xi1, #AL> + %112 = tt.load %arg11, %111, %cst_0 : tensor<32x32x!tt.ptr, #AL> + %113 = tt.splat %108 : i32 -> tensor<32x1xi32, #AL> + %114 = arith.cmpi "slt", %66, %113 : tensor<32x1xi32, #AL> + %115 = tt.broadcast %114 : tensor<32x1xi1, #AL> -> tensor<32x32xi1, #AL> + %116 = tt.load %arg12, %115, %cst_0 : tensor<32x32x!tt.ptr, #AL> + %117 = triton_gpu.convert_layout %112 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> + %118 = triton_gpu.convert_layout %116 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 1}>> + %119 = tt.dot %117, %118, %arg10 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 1}>> -> tensor<32x32xf32, #C> + %131 = arith.index_cast %arg9 : index to i32 + %120 = arith.addi %131, %c1_i32 : i32 + %121 = arith.muli %120, %c32_i32 : i32 + %122 = tt.splat %121 : i32 -> tensor<32x32xi32, #AL> + %123 = tt.addptr %60, %122 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> + %124 = arith.muli %121, %arg7 : i32 + %125 = tt.splat %124 : i32 -> tensor<32x32xi32, #AL> + %126 = tt.addptr %82, %125 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> + scf.yield 
%119, %123, %126 : tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL> + } + tt.return %85#0 : tensor<32x32xf32, #C> +} + +// CHECK-LABEL: tt.func @cross_iter_dep +// TODO: enable pipelining with distance of 2 +// CHECK-NOT: triton_gpu.local_load +// CHECK: scf.for +// CHECK: scf.yield +tt.func @cross_iter_dep(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, + %arg1: !tt.ptr {tt.divisibility = 16 : i32}, + %arg2: !tt.ptr {tt.divisibility = 16 : i32}, + %arg3: i32 {tt.divisibility = 16 : i32}, + %arg4: i32 {tt.divisibility = 16 : i32}, + %arg5: i32 {tt.divisibility = 16 : i32}, + %arg6: i32 {tt.divisibility = 16 : i32}, + %arg7: i32 {tt.divisibility = 16 : i32}, + %arg8: i32 {tt.divisibility = 16 : i32}) -> tensor<32x32xf32, #C> { + %c0_i32 = arith.constant 0 : index + %118 = arith.constant 32 : index + %c1_i32 = arith.constant 1 : index + %c2_i32 = arith.constant 2 : i32 + %c32_i32 = arith.constant 32 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #C> + %cst_1 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #AL> + %78 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %110 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %112 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %113 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %116 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %65 = tt.splat %arg3 : i32 -> tensor<1x32xi32, #AL> + %88 = tt.splat %arg4 : i32 -> tensor<32x1xi32, #AL> + %80 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> + %119:5 = scf.for %arg9 = %c0_i32 to %118 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %78, %arg12 = %110, %arg13 = %113, %arg14 = %116) -> (tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>) { + %161 = arith.index_cast %arg9 : index to i32 + %141 = arith.muli %161, %c32_i32 : i32 + %142 = arith.subi %arg5, %141 : i32 + %143 = tt.splat 
%142 : i32 -> tensor<1x32xi32, #AL> + %144 = arith.cmpi "slt", %65, %143 : tensor<1x32xi32, #AL> + %145 = tt.broadcast %144 : tensor<1x32xi1, #AL> -> tensor<32x32xi1, #AL> + %146 = tt.load %arg11, %145, %cst_1 : tensor<32x32x!tt.ptr, #AL> + %147 = tt.splat %142 : i32 -> tensor<32x1xi32, #AL> + %148 = arith.cmpi "slt", %88, %147 : tensor<32x1xi32, #AL> + %149 = tt.broadcast %148 : tensor<32x1xi1, #AL> -> tensor<32x32xi1, #AL> + %150 = tt.load %arg12, %149, %cst_1 : tensor<32x32x!tt.ptr, #AL> + %151 = triton_gpu.convert_layout %146 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> + %152 = triton_gpu.convert_layout %150 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 1}>> + %153 = tt.dot %151, %152, %arg10 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 1}>> -> tensor<32x32xf32, #C> + %162 = arith.index_cast %arg9 : index to i32 + %154 = arith.addi %162, %c2_i32 : i32 + %155 = arith.muli %154, %c32_i32 : i32 + %156 = tt.splat %155 : i32 -> tensor<32x32xi32, #AL> + %157 = tt.addptr %80, %156 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> + %158 = arith.muli %155, %arg7 : i32 + %159 = tt.splat %158 : i32 -> tensor<32x32xi32, #AL> + %160 = tt.addptr %112, %159 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> + scf.yield %153, %arg13, %arg14, %157, %160 : tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL> + } + tt.return %119#0 : tensor<32x32xf32, #C> +} + +// CHECK-LABEL: tt.func @dep_arg_two_uses +// CHECK: tt.expand_dims +// CHECK: tt.expand_dims +// CHECK: tt.expand_dims %arg5 +// CHECK-NEXT: tt.expand_dims %arg5 +// CHECK: %[[PTR0:.*]] = tt.splat %arg6 +// CHECK: %[[PTR1:.*]] = tt.addptr %[[PTR0]] +// CHECK-NEXT: tt.load %[[PTR1]] +tt.func @dep_arg_two_uses(%arg0: 
!tt.ptr {tt.divisibility = 16 : i32}, + %arg1: !tt.ptr {tt.divisibility = 16 : i32}, + %arg2: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { + %23 = arith.constant 100 : index + %c64 = arith.constant 64 : i64 + %56 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> + %57 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> + %58 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #BL}>> + %83 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> + %85 = tt.splat %c64 : i64 -> tensor<1x32xi64, #AL> + %86 = tt.splat %c64 : i64 -> tensor<1x32xi64, #AL> + %68 = tt.splat %arg0 : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> + %c32_index = arith.constant 32 : index + %c32_i32 = arith.index_cast %c32_index : index to i32 + %80 = tt.splat %arg2 : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> + %cst_6 = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #BL> + %88 = arith.truncf %cst_6 : tensor<32x128xf32, #BL> to tensor<32x128xf16, #BL> + %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #C> + %90 = tt.splat %c64 : i64 -> tensor<32x128xi64, #BL> + %92 = tt.addptr %arg1, %c32_i32 : !tt.ptr, i32 + %c0_index = arith.constant 0 : index + %91:5 = scf.for %arg19 = %c0_index to %23 step %c32_index iter_args(%arg20 = %68, %arg21 = %83, %arg22 = %92, %arg23 = %cst, %arg24 = %80) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>>, !tt.ptr, tensor<128x128xf32, #C>, tensor<32x128x!tt.ptr, #BL>) { + %1750 = arith.subi %23, %arg19 : index + %175 = arith.index_cast %1750 : index to i32 + %176 = tt.splat %175 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> + %177 = tt.splat %175 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #BL}>> + %178 = arith.cmpi "slt", %57, %176 
: tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> + %179 = arith.cmpi "slt", %58, %177 : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #BL}>> + %180 = tt.expand_dims %178 {axis = 0 : i32} : tensor<32xi1, #triton_gpu.slice<{dim = 0, parent = #AL}>> -> tensor<1x32xi1, #AL> + %181 = tt.expand_dims %179 {axis = 1 : i32} : tensor<32xi1, #triton_gpu.slice<{dim = 1, parent = #BL}>> -> tensor<32x1xi1, #BL> + %182 = tt.expand_dims %arg21 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> -> tensor<1x32xi32, #AL> + %183 = tt.expand_dims %arg21 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> -> tensor<1x32xi32, #AL> + %184 = arith.extsi %182 : tensor<1x32xi32, #AL> to tensor<1x32xi64, #AL> + %185 = arith.extsi %183 : tensor<1x32xi32, #AL> to tensor<1x32xi64, #AL> + %186 = arith.muli %184, %85 : tensor<1x32xi64, #AL> + %187 = arith.muli %185, %86 : tensor<1x32xi64, #AL> + %188 = tt.broadcast %186 : tensor<1x32xi64, #AL> -> tensor<128x32xi64, #AL> + %189 = tt.broadcast %187 : tensor<1x32xi64, #AL> -> tensor<128x32xi64, #AL> + %190 = tt.addptr %arg20, %188 : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi64, #AL> + %191 = tt.addptr %arg20, %189 : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi64, #AL> + %192 = tt.broadcast %180 : tensor<1x32xi1, #AL> -> tensor<128x32xi1, #AL> + %193 = tt.load %191, %192 : tensor<128x32x!tt.ptr, #AL> + %194 = tt.splat %arg22 : !tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #AL}>> + %195 = tt.addptr %194, %56 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #AL}>>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> + %196 = tt.load %195 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #AL}>> + %197 = tt.addptr %arg22, %c32_i32 : !tt.ptr, i32 + %198 = tt.broadcast %181 : tensor<32x1xi1, #BL> -> tensor<32x128xi1, #BL> + %199 = tt.load %arg24, %198, %88 : tensor<32x128x!tt.ptr, #BL> + %200 = triton_gpu.convert_layout %193 : 
tensor<128x32xf16, #AL> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 2}>> + %201 = triton_gpu.convert_layout %199 : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 2}>> + %202 = tt.dot %200, %201, %arg23 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 2}>> -> tensor<128x128xf32, #C> + %203 = tt.addptr %arg24, %90 : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi64, #BL> + scf.yield %190, %196, %197, %202, %203 : tensor<128x32x!tt.ptr, #AL>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>>, !tt.ptr, tensor<128x128xf32, #C>, tensor<32x128x!tt.ptr, #BL> + } + tt.return %91#3 : tensor<128x128xf32, #C> +} +} // end module + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +// CHECK-LABEL: tt.func @load_two_users + tt.func @load_two_users(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { + %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> + %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> + %c0_i64 = arith.constant 0 : i64 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = 
arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = arith.constant 8 : i32 + %0 = tt.addptr %arg0, %c0_i64 : !tt.ptr, i64 + %1 = tt.addptr %arg1, %c0_i64 : !tt.ptr, i64 + %2 = tt.splat %1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> + %3 = tt.addptr %2, %cst_0 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %6 = tt.broadcast %3 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> + %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> + %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %9 = tt.load %8 : tensor<128x64x!tt.ptr, #blocked1> + %10 = tt.splat %0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> + %11 = tt.addptr %10, %cst : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> + %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %14 = tt.broadcast %11 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> + %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> + %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + // CHECK: triton_gpu.local_store + // CHECK: scf.for + // CHECK: tt.dot + // CHECK: tt.dot + // CHECK: tt.load + // CHECK: triton_gpu.local_store + // CHECK: scf.yield + + %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) 
-> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 { + %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> + %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %20 = triton_gpu.convert_layout %18 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> + %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> + %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> + %25 = tt.trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> + %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> + scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> + } + tt.return %17#0, %17#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> + } +} + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = 
#triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [0, 1], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +// CHECK-LABEL: tt.func @load_two_users_incompatible_layouts + tt.func @load_two_users_incompatible_layouts(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { + %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> + %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> + %c0_i64 = arith.constant 0 : i64 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = arith.constant 8 : i32 + %0 = tt.addptr %arg0, %c0_i64 : !tt.ptr, i64 + %1 = tt.addptr %arg1, %c0_i64 : !tt.ptr, i64 + %2 = tt.splat %1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> + %3 = tt.addptr %2, %cst_0 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %6 = tt.broadcast %3 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> + %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> + %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %9 = tt.load %8 : tensor<128x64x!tt.ptr, 
#blocked1> + %10 = tt.splat %0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> + %11 = tt.addptr %10, %cst : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> + %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %14 = tt.broadcast %11 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> + %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> + %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + // CHECK-NOT: triton_gpu.local_store + // CHECK: scf.for + %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 { + %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> + %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %20 = triton_gpu.convert_layout %18 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> + %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> + %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> + %25 = tt.trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, 
#triton_gpu.shared_memory> + %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> + scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> + } + tt.return %17#0, %17#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> + } +} + +// ----- + +// CHECK-LABEL: tt.func public @nested_loops +// CHECK: scf.for +// CHECK: triton_gpu.local_alloc +// CHECK-NOT: triton_gpu.local_alloc +// CHECK: scf.for +// CHECK: scf.yield +// CHECK-DIS: scf.yield +// +// The following code has the structure: +// +// ``` +// for { +// %a = load() +// for { +// %b = load() +// dot(%a, %b) +// } +// } +// ``` +// +// Only the outer for should be pipelined. The regression this tests +// causes an assertion to fail while pipelining the outer `for`, in +// particular while predicating the operations scheduled to be emitted +// in the prologue. +// +// We check that there is no allocation before the first occurrence of +// scf.for because that would mean that the first load `%a = load()` +// would be pipelined. 
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @nested_loops(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> + %cst_0 = arith.constant dense<320> : tensor<32x1xi32, #blocked> + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %c32_i32 = arith.constant 32 : i32 + %c10_i32 = arith.constant 10 : i32 + %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %1 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %2 = tt.expand_dims %1 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> + %3 = arith.muli %2, %cst_0 : tensor<32x1xi32, #blocked> + %4 = tt.splat %arg1 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> + %5 = tt.addptr %4, %3 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> + %6 = tt.broadcast %5 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> + %7 = tt.splat %arg0 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> + %8 = tt.splat %arg3 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> + scf.for %arg4 = %c0_i32 to %c10_i32 step %c1_i32 : i32 { + %9 = arith.muli %arg4, %c32_i32 : i32 + %10 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %11 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 1, 
parent = #blocked}>> + %12 = arith.addi %10, %0 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %13 = arith.addi %11, %1 : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %14 = tt.expand_dims %12 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %15 = tt.broadcast %14 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> + %16 = tt.addptr %6, %15 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %17 = tt.load %16 : tensor<32x32x!tt.ptr, #blocked> + %18 = tt.expand_dims %13 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> + %19 = arith.muli %18, %cst_0 : tensor<32x1xi32, #blocked> + %20 = tt.addptr %7, %19 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> + %21 = tt.broadcast %20 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> + %22 = tt.addptr %8, %19 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> + %23 = tt.broadcast %22 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> + scf.for %arg5 = %c0_i32 to %c10_i32 step %c1_i32 : i32 { + %24 = arith.muli %arg5, %c32_i32 : i32 + %25 = tt.splat %24 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %26 = arith.addi %25, %0 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %27 = tt.expand_dims %26 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %28 = tt.broadcast %27 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> + %29 = tt.addptr %21, %28 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %30 = tt.load %29 : tensor<32x32x!tt.ptr, #blocked> + %31 = triton_gpu.convert_layout %30 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %32 = triton_gpu.convert_layout %17 : tensor<32x32xf32, 
#blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %33 = tt.dot %31, %32, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> + %34 = tt.addptr %23, %28 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %35 = triton_gpu.convert_layout %33 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> + tt.store %34, %35 : tensor<32x32x!tt.ptr, #blocked> + } + } + tt.return + } +} // end module + +// ----- + +// CHECK-LABEL: tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de +// CHECK-NOT: triton_gpu.convert_layout {{.*}} : tensor<32x64xf32, #shared> -> tensor<32x64xf32, #shared1> + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma> + %c64_i32 = 
arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c32_i32 = arith.constant 32 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.get_program_id y : i32 + %3 = tt.load %arg3 : !tt.ptr + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %5 = tt.splat %1 : i32 -> tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %6 = arith.addi %5, %4 : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %8 = tt.splat %3 : i64 -> tensor<64x1xi64, #blocked> + %9 = arith.extsi %7 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> + %10 = arith.addi %8, %9 : tensor<64x1xi64, #blocked> + %11 = arith.extsi %arg5 : i32 to i64 + %12 = tt.splat %11 : i64 -> tensor<64x1xi64, #blocked> + %13 = arith.muli %10, %12 : tensor<64x1xi64, #blocked> + %14 = arith.muli %2, %arg5 : i32 + %15 = arith.extsi %14 : i32 to i64 + %16 = tt.splat %15 : i64 -> tensor<64x1xi64, #blocked> + %17 = arith.addi %13, %16 : tensor<64x1xi64, #blocked> + %18 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %19 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %20 = tt.expand_dims %18 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> + %21 = tt.expand_dims %19 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %22 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked> + %23 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked1> + %24 = arith.muli %20, %22 : tensor<1x64xi32, #blocked> + %25 = arith.muli %21, %23 : tensor<1x64xi32, #blocked1> + %26 = tt.broadcast %17 : 
tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> + %27 = arith.extsi %24 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> + %28 = arith.extsi %25 : tensor<1x64xi32, #blocked1> to tensor<1x64xi64, #blocked1> + %29 = tt.broadcast %27 : tensor<1x64xi64, #blocked> -> tensor<64x64xi64, #blocked> + %30 = arith.addi %26, %29 : tensor<64x64xi64, #blocked> + %31 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %32 = tt.expand_dims %31 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> + %33 = tt.splat %3 : i64 -> tensor<32x1xi64, #blocked1> + %34 = arith.extsi %32 : tensor<32x1xi32, #blocked1> to tensor<32x1xi64, #blocked1> + %35 = arith.addi %33, %34 : tensor<32x1xi64, #blocked1> + %36 = tt.splat %11 : i64 -> tensor<32x1xi64, #blocked1> + %37 = arith.muli %35, %36 : tensor<32x1xi64, #blocked1> + %38 = tt.splat %15 : i64 -> tensor<32x1xi64, #blocked1> + %39 = arith.addi %37, %38 : tensor<32x1xi64, #blocked1> + %40 = tt.broadcast %39 : tensor<32x1xi64, #blocked1> -> tensor<32x64xi64, #blocked1> + %41 = tt.broadcast %28 : tensor<1x64xi64, #blocked1> -> tensor<32x64xi64, #blocked1> + %42 = arith.addi %40, %41 : tensor<32x64xi64, #blocked1> + %43 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %44 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %45 = tt.expand_dims %43 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> + %46 = tt.expand_dims %44 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %47 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked1> + %48 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked> + %49 = arith.muli %45, %47 : tensor<1x32xi32, 
#blocked1> + %50 = arith.muli %46, %48 : tensor<1x32xi32, #blocked> + %51 = tt.broadcast %39 : tensor<32x1xi64, #blocked1> -> tensor<32x32xi64, #blocked1> + %52 = arith.extsi %49 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> + %53 = arith.extsi %50 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> + %54 = tt.broadcast %52 : tensor<1x32xi64, #blocked1> -> tensor<32x32xi64, #blocked1> + %55 = arith.addi %51, %54 : tensor<32x32xi64, #blocked1> + %56 = tt.splat %arg0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> + %57 = tt.addptr %56, %30 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi64, #blocked> + %58 = tt.splat %arg1 : !tt.ptr -> tensor<32x64x!tt.ptr, #blocked1> + %59 = tt.addptr %58, %42 : tensor<32x64x!tt.ptr, #blocked1>, tensor<32x64xi64, #blocked1> + %60 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> + %61 = tt.addptr %60, %55 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi64, #blocked1> + %62 = tt.load %57 : tensor<64x64x!tt.ptr, #blocked> + %63 = scf.for %arg6 = %c0_i32 to %c64_i32 step %c32_i32 iter_args(%arg7 = %cst) -> (tensor<64x32xf32, #mma>) : i32 { + %70 = tt.load %59 : tensor<32x64x!tt.ptr, #blocked1> + %71 = triton_gpu.convert_layout %62 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> + %73 = tt.trans %72 {order=array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> + %74 = triton_gpu.local_load %73 : !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %75 = tt.dot %71, %74, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> 
tensor<64x32xf32, #mma> + %76 = tt.load %61 : tensor<32x32x!tt.ptr, #blocked1> + %77 = triton_gpu.convert_layout %75 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %78 = triton_gpu.convert_layout %76 : tensor<32x32xf32, #blocked1> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %79 = tt.dot %77, %78, %arg7 : tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> + scf.yield %79 : tensor<64x32xf32, #mma> + } + %64 = tt.broadcast %17 : tensor<64x1xi64, #blocked> -> tensor<64x32xi64, #blocked> + %65 = tt.broadcast %53 : tensor<1x32xi64, #blocked> -> tensor<64x32xi64, #blocked> + %66 = arith.addi %64, %65 : tensor<64x32xi64, #blocked> + %67 = tt.splat %arg4 : !tt.ptr -> tensor<64x32x!tt.ptr, #blocked> + %68 = tt.addptr %67, %66 : tensor<64x32x!tt.ptr, #blocked>, tensor<64x32xi64, #blocked> + %69 = triton_gpu.convert_layout %63 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #blocked> + tt.store %68, %69 : tensor<64x32x!tt.ptr, #blocked> + tt.return + } +} // end module + +// ----- +// CHECK-DIS: #[[$SHARED_LAYOUT:shared.*]] = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}> +// CHECK-LABEL: tt.func @indirect_load_shared_layout +// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) + +// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] +// CHECK: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} +// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] +// CHECK: %[[ADDI_24:.*]] = arith.addi 
%[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} +// CHECK: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] +// CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] +// CHECK: %[[CONVERT_LAYOUT_29:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_27]] +// CHECK: %[[CONVERT_LAYOUT_30:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_28]] +// CHECK: %[[DOT_31:.*]] = tt.dot %[[CONVERT_LAYOUT_29]], %[[CONVERT_LAYOUT_30]], %[[ARG7]] +// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] +// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] +// CHECK: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} +// CHECK: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] +// CHECK: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] +// CHECK: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] +// CHECK: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] +// CHECK: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] +// CHECK: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] +// CHECK: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] +// CHECK: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} +// CHECK: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] +// CHECK: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] +// CHECK: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], 
%[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] +// CHECK: } + +#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> +#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> +#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> +#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> +module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +tt.func @indirect_load_shared_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, + %76: index, + %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, + %75: tensor<16x!tt.ptr, #BLs1>, + %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, + %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> + %c4_i32 = arith.constant 4 : i32 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %c1_i32 = arith.constant 1 : i32 + %c1_i32_splat = tt.splat %c1_i32 : i32 -> tensor<16xi32, #BLs1> + %79:3 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1>) { + %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> + %83 = tt.load %arg21 : tensor<16x!tt.ptr, #BLs1> + %84 = tt.expand_dims %83 {axis=1: i32}: tensor<16xi64, #BLs1> -> tensor<16x1xi64, #BL> + %850 = tt.broadcast %84 : tensor<16x1xi64, #BL> -> tensor<16x16xi64, #BL> + %85 = arith.muli %77, %850 : tensor<16x16xi64, #BL> + %86 = tt.addptr 
%60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> + %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> + %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> + %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> + %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> + %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> + %92 = tt.addptr %arg21, %c1_i32_splat : tensor<16x!tt.ptr, #BLs1>, tensor<16xi32, #BLs1> + scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1> + } {tt.num_stages = 3 : i32} + tt.return %79#0 : tensor<16x16xf32, #C> +} +} + + +// ----- + +// CHECK-LABEL: @kernel_yield_constant +// CHECK: tt.load +// CHECK: triton_gpu.memdesc_subview +// CHECK: triton_gpu.local_store +// CHECK: scf.for +// CHECK: tt.load +// CHECK: triton_gpu.memdesc_subview +// CHECK: triton_gpu.local_store +// CHECK: tt.return +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> +module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @kernel_yield_constant(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : 
tensor<32x32xf32, #mma> + %cst1 = arith.constant dense<1.000000e+00> : tensor<32x32xf32, #mma> + %c0_i32 = arith.constant 0 : i32 + %c1_i32 = arith.constant 1 : i32 + %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked> + %c32_i32 = arith.constant 32 : i32 + %c31_i32 = arith.constant 31 : i32 + %cst_1 = arith.constant dense<2.000000e+00> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %0 = tt.get_program_id x : i32 + %7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %12 = arith.addi %arg4, %c31_i32 : i32 + %13 = arith.divsi %12, %c32_i32 : i32 + %14 = tt.expand_dims %7 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> + %22 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> + %34 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> + %42 = scf.for %arg7 = %c0_i32 to %13 step %c1_i32 iter_args(%arg8 = %cst) -> (tensor<32x32xf32, #mma>) : i32 { + %43 = arith.muli %arg7, %c32_i32 : i32 + %44 = arith.muli %43, %arg5 : i32 + %45 = tt.splat %44 : i32 -> tensor<32x32xi32, #blocked> + %46 = tt.addptr %22, %45 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %47 = arith.subi %arg4, %43 : i32 + %48 = tt.splat %47 : i32 -> tensor<32x1xi32, #blocked> + %49 = arith.cmpi slt, %14, %48 : tensor<32x1xi32, #blocked> + %50 = tt.broadcast %49 : tensor<32x1xi1, #blocked> -> tensor<32x32xi1, #blocked> + %51 = tt.load %46, %50, %cst_0 : tensor<32x32x!tt.ptr, #blocked> + %52 = triton_gpu.convert_layout %51 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %53 = tt.dot %cst_1, %52, %arg8 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> + %54 = triton_gpu.convert_layout 
%53 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> + tt.store %34, %54 : tensor<32x32x!tt.ptr, #blocked> + scf.yield %cst1 : tensor<32x32xf32, #mma> + } + tt.return + } +} + + +// ----- + +// CHECK-LABEL: tt.func public @add_kernel +// CHECK: %[[LOAD_11:.*]] = tt.load %{{.*}}, %{{.*}} +// CHECK: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} +// CHECK: %[[LOAD_13:.*]] = tt.load %[[ADDPTR_12]], %{{.*}} +// CHECK: %[[ADDI_14:.*]] = arith.addi %{{.*}}, %{{.*}} +// CHECK: %[[SPLAT_15:.*]] = tt.splat %[[ADDI_14]] +// CHECK: %[[ADDI_16:.*]] = arith.addi %[[SPLAT_15]], %{{.*}} +// CHECK: %[[CMPI_17:.*]] = arith.cmpi slt, %[[ADDI_16]], %{{.*}} +// CHECK: %[[ADDPTR_18:.*]] = tt.addptr %{{.*}}, %[[ADDI_16]] +// CHECK: %[[LOAD_19:.*]] = tt.load %[[ADDPTR_18]], %[[CMPI_17]] +// CHECK: %[[ADDPTR_20:.*]] = tt.addptr %{{.*}}, %[[ADDI_16]] +// CHECK: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_20]], %[[CMPI_17]] +// CHECK: scf.for +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + %c0_i32 = arith.constant 0 : i32 + %c1016800_i32 = arith.constant 1016800 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1016800_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> + %4 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %5 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %6 = tt.splat %arg2 : !tt.ptr -> 
tensor<1024x!tt.ptr, #blocked> + scf.for %arg4 = %c0_i32 to %c1016800_i32 step %c1024_i32 : i32 { + %7 = arith.addi %1, %arg4 : i32 + %8 = tt.splat %7 : i32 -> tensor<1024xi32, #blocked> + %9 = arith.addi %8, %2 : tensor<1024xi32, #blocked> + %10 = arith.cmpi slt, %9, %3 : tensor<1024xi32, #blocked> + %11 = tt.addptr %4, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %12 = tt.load %11, %10 : tensor<1024x!tt.ptr, #blocked> + %13 = tt.addptr %5, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %14 = tt.load %13, %10 : tensor<1024x!tt.ptr, #blocked> + %15 = arith.addf %12, %14 : tensor<1024xf32, #blocked> + %16 = tt.addptr %6, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + tt.store %16, %15, %10 : tensor<1024x!tt.ptr, #blocked> + } {tt.num_stages = 3 : i32} + tt.return + } +} + + +// ----- + +// CHECK-LABEL: tt.func public @nested_loops +// CHECK: %[[LOAD_10:.*]] = tt.load %{{.*}} +// CHECK: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc %[[LOAD_10]] +// CHECK: %[[TRANS_12:.*]] = tt.trans %[[LOCAL_ALLOC_11]] {order = array} +// CHECK: %[[LOCAL_LOAD_13:.*]] = triton_gpu.local_load %[[TRANS_12]] +// CHECK: %[[LOCAL_ALLOC_14:.*]] = triton_gpu.local_alloc +// CHECK: %[[LOAD_15:.*]] = tt.load %{{.*}}, %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_14]][%{{.*}}, %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_16]] +// CHECK: %{{.*}}:3 = scf.for %[[ARG2:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG3:.*]] = %{{.*}}-1_i32, %[[ARG4:.*]] = %{{.*}}, %[[ARG5:.*]] = %[[MEMDESC_SUBVIEW_16]]) + +// CHECK: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG2]], %{{.*}} +// CHECK: %[[ADDI_19:.*]] = arith.addi %[[ARG3]], %{{.*}} +// CHECK: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ADDI_19]], %{{.*}} +// CHECK: %[[SELECT_21:.*]] = arith.select %[[CMPI_20]], %[[ADDI_19]], %{{.*}} +// CHECK: %[[LOCAL_LOAD_22:.*]] = triton_gpu.local_load %[[ARG5]] +// CHECK: 
%[[CONVERT_LAYOUT_23:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_22]] +// CHECK: %[[DOT_24:.*]] = tt.dot %[[CONVERT_LAYOUT_23]], %[[LOCAL_LOAD_13]], %{{.*}} +// CHECK: %[[CONVERT_LAYOUT_25:.*]] = triton_gpu.convert_layout %[[DOT_24]] +// CHECK: tt.store %{{.*}}, %[[CONVERT_LAYOUT_25]] +// CHECK: %[[SPLAT_26:.*]] = tt.splat %[[CMPI_18]] +// CHECK: %[[LOAD_27:.*]] = tt.load %{{.*}}, %[[SPLAT_26]] +// CHECK: %[[ADDI_28:.*]] = arith.addi %[[ARG4]], %{{.*}} +// CHECK: %[[CMPI_29:.*]] = arith.cmpi slt, %[[ADDI_28]], %{{.*}} +// CHECK: %[[SELECT_30:.*]] = arith.select %[[CMPI_29]], %[[ADDI_28]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_31:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_14]][%[[SELECT_30]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[LOAD_27]], %[[MEMDESC_SUBVIEW_31]] +// CHECK: scf.yield %[[SELECT_21]], %[[SELECT_30]], %[[MEMDESC_SUBVIEW_31]] +// CHECK: } +// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_14]] + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [2, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 2], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @nested_loops(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c2_i32 = arith.constant 2 : i32 + %c0_i32 = arith.constant 0 : i32 + %cst_0 = arith.constant dense<16> : tensor<16x1xi32, #blocked> + %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, 
#triton_gpu.slice<{dim = 1, parent = #blocked}>> + %1 = tt.expand_dims %0 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> + %2 = arith.muli %1, %cst_0 : tensor<16x1xi32, #blocked> + %3 = tt.splat %arg0 : !tt.ptr -> tensor<16x1x!tt.ptr, #blocked> + %4 = tt.addptr %3, %2 : tensor<16x1x!tt.ptr, #blocked>, tensor<16x1xi32, #blocked> + %5 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %6 = tt.expand_dims %5 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> + %7 = tt.broadcast %4 : tensor<16x1x!tt.ptr, #blocked> -> tensor<16x16x!tt.ptr, #blocked> + %8 = tt.broadcast %6 : tensor<1x16xi32, #blocked> -> tensor<16x16xi32, #blocked> + %9 = tt.addptr %7, %8 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> + scf.for %arg1 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { + %10 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> + %11 = triton_gpu.local_alloc %10 : (tensor<16x16xf32, #blocked>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> + %12 = tt.trans %11 {order = array} : !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> + %13 = triton_gpu.local_load %12 : !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + scf.for %arg2 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { + %14 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> + %15 = triton_gpu.convert_layout %14 : tensor<16x16xf32, #blocked> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %16 = tt.dot %15, %13, %cst : tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<16x16xf32, #mma> + %17 = 
triton_gpu.convert_layout %16 : tensor<16x16xf32, #mma> -> tensor<16x16xf32, #blocked> + tt.store %9, %17 : tensor<16x16x!tt.ptr, #blocked> + } + } + tt.return + } +} + +// ----- + +// This test triggered some failure in the verifier, so we only +// included a simple check for the kernel name. +// CHECK-LABEL: @load_convert_layout +#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> +#BLs0 = #triton_gpu.slice<{parent=#BL, dim=0}> +#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> +#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> +#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> +#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> + +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +tt.func @load_convert_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, + %76: index, + %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, + %75: tensor<16x!tt.ptr, #BLs1>, + %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, + %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ + %1 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #BLs1> + %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> + %cst_0 = arith.constant dense<2> : tensor<16xi32, #BLs1> + %c4_i32 = arith.constant 4 : i32 + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c0_i64 = arith.constant 0 : i64 + %c1_i32 = arith.constant 1 : i32 + %c1_i32_splat = tt.splat %c1_i32 : i32 -> tensor<16xi32, #BLs1> + %15 = arith.cmpi slt, %1, %cst_0 : tensor<16xi32, #BLs1> 
+ %79:3 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1>) { + %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> + %83 = tt.load %arg21, %15 : tensor<16x!tt.ptr, #BLs1> + %84 = tt.expand_dims %83 {axis=1: i32}: tensor<16xi64, #BLs1> -> tensor<16x1xi64, #BL> + %850 = tt.broadcast %84 : tensor<16x1xi64, #BL> -> tensor<16x16xi64, #BL> + %85 = arith.muli %77, %850 : tensor<16x16xi64, #BL> + %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> + %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> + %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> + %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> + %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> + %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> + %92 = tt.addptr %arg21, %c1_i32_splat : tensor<16x!tt.ptr, #BLs1>, tensor<16xi32, #BLs1> + scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1> + } {tt.num_stages = 3 : i32} + tt.return %79#0 : tensor<16x16xf32, #C> +} +} + + +// ----- + +// This test captured some ICE in MatmulLoopPipeline pass, so we only +// included a simple check for the kernel name. 
+// CHECK-LABEL: @matmul_indirect_pipeline +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 2], order = [0, 1]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 1], instrShape = [16, 8]}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @matmul_indirect_pipeline(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c2_i32 = arith.constant 2 : i32 + %c0_i32 = arith.constant 0 : i32 + %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %1 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %2 = tt.expand_dims %1 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> + %3 = tt.expand_dims %0 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %4 = tt.broadcast %2 : tensor<32x1xi32, #blocked> -> tensor<32x32xi32, #blocked> + %5 = tt.broadcast %3 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> + %6 = arith.addi %4, %5 : tensor<32x32xi32, #blocked> + %7 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> + %8 = tt.addptr %7, %6 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %9 = tt.load %8 : tensor<32x32x!tt.ptr, #blocked> + %10 = tt.splat %arg3 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> + %11 = tt.addptr %10, %6 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> + %12 = tt.splat %arg1 : 
!tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %13 = tt.addptr %12, %0 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %14 = tt.splat %arg2 : !tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + scf.for %arg4 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { + %15 = tt.load %13 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %16 = tt.addptr %14, %15 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi64, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %17 = tt.load %16 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %18 = tt.expand_dims %17 {axis = 0 : i32} : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xf32, #blocked> + %19 = tt.broadcast %18 : tensor<1x32xf32, #blocked> -> tensor<32x32xf32, #blocked> + %20 = arith.addf %9, %19 : tensor<32x32xf32, #blocked> + %21 = triton_gpu.convert_layout %9 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %22 = triton_gpu.convert_layout %20 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %23 = tt.dot %21, %22, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> + %24 = triton_gpu.convert_layout %23 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> + tt.store %11, %24 : tensor<32x32x!tt.ptr, #blocked> + } {tt.num_stages = 3 : i32} + tt.return + } +} + +// ----- + +// CHECK-LABEL: @dont_pipeline_128x1 +// CHECK-NOT: local_load{{.*}}128x1 +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, 
versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @dont_pipeline_128x1(%arg6: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> + %c128_i32 = arith.constant 128 : i32 + %c0_i32 = arith.constant 0 : i32 + %c64_i32 = arith.constant 64 : i32 + %cst_4 = arith.constant dense<-1.000000e+30> : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> + + %99:1 = scf.for %arg25 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg31 = %cst_4) -> (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) : i32 { + %94 = tt.splat %arg6 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> + %151 = tt.load %94 : tensor<128x1x!tt.ptr, #blocked> + %161 = triton_gpu.convert_layout %151 : tensor<128x1xi32, #blocked> -> tensor<128x1xi32, #mma> + %162 = tt.broadcast %161 : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> + %170 = arith.sitofp %162 : tensor<128x64xi32, #mma> to tensor<128x64xf32, #mma> + + %173 = "tt.reduce"(%170) <{axis = 1 : i32}> ({ + ^bb0(%arg33: f32, %arg34: f32): + %207 = arith.maxnumf %arg33, %arg34 : f32 + tt.reduce.return %207 : f32 + }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> + %175 = arith.maxnumf %arg31, %173 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> + + %201 = arith.truncf %170 : tensor<128x64xf32, #mma> to tensor<128x64xf16, #mma> + %202 = triton_gpu.convert_layout %201 : tensor<128x64xf16, #mma> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + + %192 = arith.constant dense<0.> : tensor<128x64xf32, #mma> + %203 = arith.constant dense<0.> : tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %204 = tt.dot %202, %203, %192 : 
tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> + + scf.yield %175 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> + } + tt.return + } +} + +// ----- + +// Check that the dependencies across ops of different nesting does not cause crash or +// incorrect schedule that fails to pipeline. +// CHECK-LABEL: @matmul_nested_ops +// CHECK: triton_gpu.local_load + +#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> +#BLs0 = #triton_gpu.slice<{parent=#BL, dim=0}> +#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> +#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> +#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> +#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> + +module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.target" = "cuda:80"} { +tt.func @matmul_nested_ops(%lb : index, %ub : index, %step : index, + %A : !tt.ptr {tt.divisibility = 16 : i32}, + %B : !tt.ptr {tt.divisibility = 16 : i32}, + %ext : index) -> tensor<128x128xf32, #C> { + // A ptrs + %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> + %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> + %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> + %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> + %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + // B ptrs + %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> + %b_tmp0 = tt.make_range {end = 128: i32, start = 0: 
i32} : tensor<128xi32, #BLs0> + %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> + %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> + %b_ptr = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> + + %a_mask = arith.constant dense : tensor<128x32xi1, #AL> + %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> + %b_mask = arith.constant dense : tensor<32x128xi1, #BL> + %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> + %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> + + %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> + + %b_ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> + %b = triton_gpu.convert_layout %b_ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> + + %loop:2 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<128x128xf32, #C>) { + %cnd = arith.cmpi slt, %iv, %ext : index + %inc_a_ptr = scf.if %cnd -> (tensor<128x32x!tt.ptr, #AL>) { + %a_ptr_ = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + scf.yield %a_ptr_ : tensor<128x32x!tt.ptr, #AL> + } else { + scf.yield %a_ptr : tensor<128x32x!tt.ptr, #AL> + } + %a_ = tt.load %inc_a_ptr : tensor<128x32x!tt.ptr, #AL> + %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> + + %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> + + %next_a_ptr = tt.addptr %inc_a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> + scf.yield %next_a_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<128x128xf32, #C> + } + tt.return %loop#1: tensor<128x128xf32, #C> +} +} + +// ----- + +// Pipeline the if ops at the beginning and the end of the loop +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], 
warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> +#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + // CHECK-LABEL: dot_prologue_epilogue + // CHECK: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} + tt.func @dot_prologue_epilogue(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + %cst = arith.constant dense<0> : tensor<64x16xi32, #blocked> + %cst2 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %cst_0 = arith.constant dense<0> : tensor<1x16xi32, #blocked> + %cst_1 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> + %c0_i64 = arith.constant 0 : i64 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = arith.constant 8 : i32 + %2 = tt.splat %arg1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %6 = tt.broadcast %2 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, 
#blocked1> + %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> + %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %10 = tt.splat %arg0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> + %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %14 = tt.broadcast %10 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> + %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> + %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + // CHECK: %[[C0:.*]] = arith.constant 0 : i32 + // CHECK: scf.for %[[IND_VAR:.*]] = %[[C0]] + // CHECK-NOT load + // CHECK: %[[CND:.*]] = arith.cmpi slt, %[[IND_VAR]], %[[EXT]] + // CHECK: scf.if %[[CND]] + // CHECK: dot + // CHECK: scf.if %[[CND]] + // CHECK: arith.mulf + // CHECK: scf.yield + // CHECK-NOT: tt.addptr + // CHECK: scf.yield + %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %16, %arg6 = %8) -> (tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>) : i32 { + %9 = tt.load %arg6 : tensor<128x64x!tt.ptr, #blocked1> + %cnd = arith.cmpi slt, %arg3, %ext : i32 + %inc_ptr = scf.if %cnd -> tensor<64x16x!tt.ptr, #blocked> { + %ptr = tt.addptr %arg5, %inc : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + scf.yield %ptr : tensor<64x16x!tt.ptr, #blocked> + } else { + scf.yield %arg5 : tensor<64x16x!tt.ptr, #blocked> + } + %18 = tt.load %inc_ptr : tensor<64x16x!tt.ptr, #blocked> + %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> 
!tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %acc_ = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { + %acc_zero = arith.mulf %acc, %cst_2 : tensor<128x16xf32, #mma1> + scf.yield %acc_zero : tensor<128x16xf32, #mma1> + } else { + scf.yield %acc : tensor<128x16xf32, #mma1> + } + %22 = tt.addptr %arg5, %cst : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + %23 = tt.addptr %arg6, %cst2 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + scf.yield %acc_, %22, %23 : tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1> + } + tt.return %17#0 : tensor<128x16xf32, #mma1> + } +} + +// ----- + +// Verify that uses of the ops scheduled in partucular place of the loop (like epilogue if) are correctly scheduled too. 
+#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> +#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> +module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + // CHECK-LABEL: pipeline_downstream_dependencies + // CHECK: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} + tt.func @pipeline_downstream_dependencies(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { + %cst = arith.constant dense<0> : tensor<64x16xi32, #blocked> + %cst1 = arith.constant dense<1> : tensor<64x16xi32, #blocked> + %cst2 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> + %c0_i32 = arith.constant 0 : i32 + %cst_0 = arith.constant dense<0> : tensor<1x16xi32, #blocked> + %cst_1 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> + %c0_i64 = arith.constant 0 : i64 + %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = arith.constant 8 : i32 + %2 = tt.splat %arg1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, 
#triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %6 = tt.broadcast %2 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> + %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> + %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %10 = tt.splat %arg0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> + %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %14 = tt.broadcast %10 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> + %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> + %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + // CHECK: %[[C0:.*]] = arith.constant 0 : i32 + // CHECK: scf.for %[[IND_VAR:.*]] = %[[C0]] + // CHECK-NOT load + // CHECK: dot + // CHECK: %[[CND:.*]] = arith.cmpi slt, %[[IND_VAR]], %[[EXT]] + // CHECK: %[[IFRET:.*]]:2 = scf.if %[[CND]] + // CHECK: arith.mulf + // CHECK: scf.yield + // CHECK: tt.addptr {{.*}}, %[[IFRET]]#1 + // CHECK: scf.yield + %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %16, %arg6 = %8) -> (tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>) : i32 { + %9 = tt.load %arg6 : tensor<128x64x!tt.ptr, #blocked1> + %18 = tt.load %arg5 : tensor<64x16x!tt.ptr, #blocked> + %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> + %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !tt.memdesc<128x64xf16, #shared, 
#triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> + %cnd = arith.cmpi slt, %arg3, %ext : i32 + %if_ret:2 = scf.if %cnd -> (tensor<128x16xf32, #mma1>, tensor<64x16xi32, #blocked>) { + %acc_zero = arith.mulf %acc, %cst_2 : tensor<128x16xf32, #mma1> + scf.yield %acc_zero, %cst : tensor<128x16xf32, #mma1>, tensor<64x16xi32, #blocked> } else { - scf.yield %cst_2: tensor<32x32xf32, #blocked> + scf.yield %acc, %cst1 : tensor<128x16xf32, #mma1>, tensor<64x16xi32, #blocked> } - %75 = tt.addptr %arg12, %cst_0 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %76 = tt.addptr %arg13, %cst_0 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - scf.yield %73, %75, %76 : tensor<32x32xf32, #mma>, tensor<32x32x!tt.ptr, #blocked>, tensor<32x32x!tt.ptr, #blocked> - } - // CHECK: %[[C1:.*]] = arith.constant 1 : i32 - // CHECK: %[[t0:.*]] = arith.subi %[[UB:.*]], %[[C1]] - // CHECK: %[[t1:.*]] = arith.subi %[[t0]], %[[LB]] - // CHECK: %[[t2:.*]] = arith.divui %[[t1]], %[[STEP]] - // CHECK: %[[t3:.*]] = arith.muli %[[t2]], %[[STEP]] - // CHECK: %[[PPLUB:.*]] = arith.addi %[[LB]], %[[t3]] - // CHECK: arith.muli %[[PPLUB]], {{.*}} + %22 = tt.addptr %arg5, %if_ret#1 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + %23 = tt.addptr %arg6, %cst2 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + scf.yield %if_ret#0, %22, %23 : tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1> + } + tt.return %17#0 : tensor<128x16xf32, #mma1> + } +} + +// ----- + +// CHECK-LABEL: @masked_add_kernel +// CHECK: %[[CONSTANT:.*]] = arith.constant dense<0xFF800000> +// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// CHECK: scf.for +// CHECK: arith.select +// CHECK: arith.select +// CHECK: 
arith.addf +// CHECK: %[[A:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// CHECK: %[[B:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] + +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @masked_add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 + %c0_i32 = arith.constant 0 : i32 + %c1016800_i32 = arith.constant 1016800 : i32 + %cst = arith.constant dense<0xFF800000> : tensor<1024xf32, #blocked> + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c1016800_i32 : i32 + %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> + %3 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> + %4 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %5 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + %6 = tt.splat %arg2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> + scf.for %arg4 = %c0_i32 to %c1016800_i32 step %c1024_i32 : i32 { + %7 = arith.addi %1, %arg4 : i32 + %8 = tt.splat %7 : i32 -> tensor<1024xi32, #blocked> + %9 = arith.addi %8, %2 : tensor<1024xi32, #blocked> + %10 = arith.cmpi slt, %9, %3 : tensor<1024xi32, #blocked> + %11 = tt.addptr %4, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %12 = tt.load %11, %10, %cst : tensor<1024x!tt.ptr, #blocked> + %13 = tt.addptr %5, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> + %14 = tt.load %13, %10, %cst : tensor<1024x!tt.ptr, #blocked> + %15 = arith.addf %12, %14 : tensor<1024xf32, #blocked> + %16 = tt.addptr %6, %9 : tensor<1024x!tt.ptr, #blocked>, 
tensor<1024xi32, #blocked> + tt.store %16, %15, %10 : tensor<1024x!tt.ptr, #blocked> + } {tt.num_stages = 3 : i32} tt.return } } From 989150f127bcfc494dc87bcec45606b58ed66925 Mon Sep 17 00:00:00 2001 From: SJW Date: Mon, 17 Jun 2024 20:53:04 +0000 Subject: [PATCH 04/36] * updated test --- test/TritonGPU/amd/amd-stream-pipeline.mlir | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/TritonGPU/amd/amd-stream-pipeline.mlir b/test/TritonGPU/amd/amd-stream-pipeline.mlir index fe2ea9da65a2..7ac05abfb222 100644 --- a/test/TritonGPU/amd/amd-stream-pipeline.mlir +++ b/test/TritonGPU/amd/amd-stream-pipeline.mlir @@ -1034,9 +1034,9 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : // CHECK: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} // CHECK: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} // CHECK: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// CHECK: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_46]], %{{.*}}, %{{.*}}] // CHECK: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] -// CHECK: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// CHECK: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_46]], %{{.*}}, %{{.*}}] // CHECK: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] // CHECK: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] // CHECK: } From 50914163a5f9e7830c8ef6e7cd3181f436063d8f Mon Sep 17 00:00:00 2001 From: SJW Date: Thu, 20 Jun 2024 17:15:45 +0000 Subject: [PATCH 05/36] * Find insertion point for loads/local_stores as early as possible - check for last atomic (sync?) 
- also check for other accesses to the source --- .../ReorderInstructions.cpp | 52 +++++++++++++++++-- .../TritonAMDGPUTransforms/StreamPipeline.cpp | 13 +++-- 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp index a7d7a9783c7b..00107fbc2e31 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp @@ -79,6 +79,41 @@ static bool gatherDFG(Operation *op, Block *block, return leadsToLoad; } +static bool hasAtomic(Operation *op) { + if (isa(op)) + return true; + for (auto &subregion : op->getRegions()) { + for (auto &subblock : subregion) { + for (auto &sop : subblock) { + if (hasAtomic(&sop)) + return true; + } + } + } + return false; +} + +static llvm::ilist::iterator findEarlyLocation( + Block *block, Operation *op, Value src) { + auto loc = block->begin(); + for (auto bi = block->begin(); bi != block->end(); ++bi) { + auto *bop = &*bi; + if (bop == op) // don't move later than current location + break; + if (src) { + // check for ops accessing src + for (auto opr : op->getOperands()) { + if (opr == src) + loc = bi; + } + } + // atomics used for syncronization? 
+ if (hasAtomic(bop)) + loc = bi; + } + return loc; +} + class TritonAMDGPUReorderInstructionsPass : public TritonAMDGPUReorderInstructionsBase< TritonAMDGPUReorderInstructionsPass> { @@ -125,12 +160,13 @@ class TritonAMDGPUReorderInstructionsPass moveAfter(op, argOp); }); SmallVector moveOps; + // Move local stores early if it's global load is outside loop m.walk([&](triton::gpu::LocalStoreOp op) { - // Move local stores early if it's global load is outside loop moveOps.push_back(op); }); + // Move global loads early (prefetch) + // - these should be moved last m.walk([&](triton::LoadOp op) { - // Move global loads early (prefetch) moveOps.push_back(op); }); for (auto op : moveOps) { @@ -139,9 +175,17 @@ class TritonAMDGPUReorderInstructionsPass SmallVector dfg{op}; bool leadsToLoad = gatherDFG(op, block, dfg); if (!isa(op) || !leadsToLoad) { + Value src; + if (auto ld = dyn_cast(op)) + src = ld.getPtr(); + // 0. find earliest insertion point + auto loc = findEarlyLocation(block, op, src); // 1. 
move to beginning of enclosing block - for (auto *op : dfg) - op->moveAfter(block, block->begin()); + for (auto *op : dfg) { + // only move up (not down) + if (loc->isBeforeInBlock(op)) + op->moveAfter(block, loc); + } } } } diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp index 19f8eee829fe..fbdcb99b857a 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp @@ -105,13 +105,6 @@ static void createAsyncCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc, tt::MemDescType allocTy = cast(alloc.getType()); SmallVector copyOffsets(allocTy.getRank(), zero); copyOffsets[0] = insertIdx; - Attribute sharedMemorySpace = - triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); - tt::MemDescType subviewTy = tt::MemDescType::get( - allocTy.getShape().drop_front(), allocTy.getElementType(), - allocTy.getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); - auto view = - builder.create(loc, subviewTy, alloc, copyOffsets); Operation *copy = builder.clone(*loadOp); auto [stage, cluster] = schedule[loadOp]; @@ -121,11 +114,17 @@ static void createAsyncCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc, // Extract part. SmallVector loadOffsets(allocTy.getRank(), zero); loadOffsets[0] = extractIdx; + Attribute sharedMemorySpace = + triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); + tt::MemDescType subviewTy = tt::MemDescType::get( + allocTy.getShape().drop_front(), allocTy.getElementType(), + allocTy.getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); auto viewLoad = builder.create(loc, subviewTy, alloc, loadOffsets); Operation *lds_store = builder.create(loc, copy->getResult(0), viewLoad); { + // Clean up old local caches. 
SmallVector allocsToErase; for (Operation *user : loadOp->getUsers()) { if (auto alloc = dyn_cast(user)) { From d42830ba2cf1fbdb514bea9a2de0b6be42c60701 Mon Sep 17 00:00:00 2001 From: SJW Date: Tue, 25 Jun 2024 19:07:10 +0000 Subject: [PATCH 06/36] * Reorder with BFS to keep relative order. --- .../amd/amd-reorder-instructions.mlir | 209 ++++++++---------- .../ReorderInstructions.cpp | 130 +++++------ 2 files changed, 144 insertions(+), 195 deletions(-) diff --git a/test/TritonGPU/amd/amd-reorder-instructions.mlir b/test/TritonGPU/amd/amd-reorder-instructions.mlir index 3b332c8a4148..e2870515654e 100644 --- a/test/TritonGPU/amd/amd-reorder-instructions.mlir +++ b/test/TritonGPU/amd/amd-reorder-instructions.mlir @@ -25,6 +25,41 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war } // ----- +// Move loads (and independent local_stores) as early as possible. +// These tests are generated by Stream Pipelining tests from amd-stream-pipeline.mlir. +// For example in the matmul_loop below, the scf.for loop looks like this after pipeliner: +// scf.for ... { +// // stage 1 +// %a = tt.local_load %a_tile +// %b = tt.local_load %b_tile +// tt.dot %c, %a, %b +// // stage 0 +// %aptr = tt.addptr %aptr, %k +// %a_next = tt.load %aptr +// %bptr = tt.addptr %bptr, %k +// %b_next = tt.load %bptr +// tt.local_store %a_next +// tt.local_store %b_next +// yield +// } +// +// Should convert to : +// scf.for ... 
{ +// // stage 0.a +// %aptr = tt.addptr %aptr, %k +// %a_next = tt.load %aptr +// %bptr = tt.addptr %bptr, %k +// %b_next = tt.load %bptr +// // stage 1 +// %a = tt.local_load %a_tile +// %b = tt.local_load %b_tile +// tt.dot %c, %a, %b +// // stage 0.b +// tt.local_store %a_next +// tt.local_store %b_next +// yield +// } + #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = []}> @@ -126,18 +161,18 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: %[[FOR_0:.*]] = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}) // CHECK: %[[SPLAT_1:.*]] = tt.splat %{{.*}} -// CHECK: %[[CMPI_2:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} -// CHECK: %[[MAKE_RANGE_3:.*]] = tt.make_range {end = 32 : i32, start = 0 : i32} -// CHECK: %[[EXPAND_DIMS_4:.*]] = tt.expand_dims %[[MAKE_RANGE_3]] {axis = 0 : i32} -// CHECK: %[[BROADCAST_5:.*]] = tt.broadcast %[[EXPAND_DIMS_4]] -// CHECK: %[[SPLAT_6:.*]] = tt.splat %[[CMPI_2]] +// CHECK: %[[MAKE_RANGE_2:.*]] = tt.make_range {end = 32 : i32, start = 0 : i32} +// CHECK: %[[EXPAND_DIMS_3:.*]] = tt.expand_dims %[[MAKE_RANGE_2]] {axis = 0 : i32} +// CHECK: %[[CMPI_4:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// CHECK: %[[BROADCAST_5:.*]] = tt.broadcast %[[EXPAND_DIMS_3]] +// CHECK: %[[SPLAT_6:.*]] = tt.splat %[[CMPI_4]] // CHECK: %[[ADDPTR_7:.*]] = tt.addptr %[[SPLAT_1]], %[[BROADCAST_5]] // CHECK: %[[LOAD_8:.*]] = tt.load %[[ADDPTR_7]], %[[SPLAT_6]], %{{.*}} // CHECK: %[[MAKE_RANGE_9:.*]] = tt.make_range {end = 128 : i32, start = 0 : i32} // CHECK: %[[EXPAND_DIMS_10:.*]] = tt.expand_dims %[[MAKE_RANGE_9]] {axis = 0 : i32} // CHECK: %[[BROADCAST_11:.*]] = tt.broadcast %[[EXPAND_DIMS_10]] 
// CHECK: %[[SPLAT_12:.*]] = tt.splat %{{.*}} -// CHECK: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_2]] +// CHECK: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_4]] // CHECK: %[[ADDPTR_14:.*]] = tt.addptr %[[SPLAT_12]], %[[BROADCAST_11]] // CHECK: %[[LOAD_15:.*]] = tt.load %[[ADDPTR_14]], %[[SPLAT_13]], %{{.*}} // CHECK: %[[LOCAL_ALLOC_16:.*]] = triton_gpu.local_alloc @@ -171,6 +206,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: triton_gpu.local_store %[[LOAD_28]], %[[MEMDESC_SUBVIEW_39]] // CHECK: scf.yield %[[ADDPTR_24]], %[[ADDPTR_27]], %[[DOT_34]], %[[SELECT_31]], %[[SELECT_37]], %[[MEMDESC_SUBVIEW_38]], %[[MEMDESC_SUBVIEW_39]] // CHECK: } + // CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_16]] // CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_17]] // CHECK: scf.yield %{{.*}}#2 @@ -307,6 +343,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.return %16#1 : tensor<128x128xf32, #mma> } +// This example tests that tt.load overlaps with independent ttg.local_store which +// overlaps with independent tt.dot. 
+ // CHECK-LABEL: tt.func @indirect_bmm_scalar // CHECK: %{{.*}}:9 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}, %[[ARG15:.*]] = %{{.*}}) @@ -605,24 +644,24 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: %[[INDEX_CAST_21:.*]] = arith.index_cast %[[ADDI_20]] // CHECK: %[[MULI_22:.*]] = arith.muli %[[INDEX_CAST_21]], %{{.*}} // CHECK: %[[SUBI_23:.*]] = arith.subi %{{.*}}, %[[MULI_22]] -// CHECK: %[[SPLAT_24:.*]] = tt.splat %[[SUBI_23]] -// CHECK: %[[CMPI_25:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_24]] -// CHECK: %[[BROADCAST_26:.*]] = tt.broadcast %[[CMPI_25]] -// CHECK: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_19]] -// CHECK: %[[INDEX_CAST_28:.*]] = arith.index_cast %[[ARG9]] -// CHECK: %[[ADDI_29:.*]] = arith.addi %[[INDEX_CAST_28]], %{{.*}} -// CHECK: %[[MULI_30:.*]] = arith.muli %[[ADDI_29]], %{{.*}} -// CHECK: %[[SPLAT_31:.*]] = tt.splat %[[MULI_30]] -// CHECK: %[[ANDI_32:.*]] = arith.andi %[[SPLAT_27]], %[[BROADCAST_26]] +// CHECK: %[[INDEX_CAST_24:.*]] = arith.index_cast %[[ARG9]] +// CHECK: %[[SPLAT_25:.*]] = tt.splat %[[SUBI_23]] +// CHECK: %[[ADDI_26:.*]] = arith.addi %[[INDEX_CAST_24]], %{{.*}} +// CHECK: %[[CMPI_27:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_25]] +// CHECK: %[[MULI_28:.*]] = arith.muli %[[ADDI_26]], %{{.*}} +// CHECK: %[[BROADCAST_29:.*]] = tt.broadcast %[[CMPI_27]] +// CHECK: %[[SPLAT_30:.*]] = tt.splat %[[CMPI_19]] +// CHECK: %[[SPLAT_31:.*]] = tt.splat %[[MULI_28]] +// CHECK: %[[ANDI_32:.*]] = arith.andi %[[SPLAT_30]], %[[BROADCAST_29]] // CHECK: %[[ADDPTR_33:.*]] = tt.addptr %{{.*}}, %[[SPLAT_31]] // CHECK: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_33]], %[[ANDI_32]], %{{.*}} // CHECK: %[[SPLAT_35:.*]] = tt.splat %[[SUBI_23]] // CHECK: %[[CMPI_36:.*]] = arith.cmpi slt, %{{.*}}, 
%[[SPLAT_35]] -// CHECK: %[[BROADCAST_37:.*]] = tt.broadcast %[[CMPI_36]] -// CHECK: %[[SPLAT_38:.*]] = tt.splat %[[CMPI_19]] -// CHECK: %[[MULI_39:.*]] = arith.muli %[[MULI_30]], %{{.*}} -// CHECK: %[[SPLAT_40:.*]] = tt.splat %[[MULI_39]] -// CHECK: %[[ANDI_41:.*]] = arith.andi %[[SPLAT_38]], %[[BROADCAST_37]] +// CHECK: %[[MULI_37:.*]] = arith.muli %[[MULI_28]], %{{.*}} +// CHECK: %[[BROADCAST_38:.*]] = tt.broadcast %[[CMPI_36]] +// CHECK: %[[SPLAT_39:.*]] = tt.splat %[[CMPI_19]] +// CHECK: %[[SPLAT_40:.*]] = tt.splat %[[MULI_37]] +// CHECK: %[[ANDI_41:.*]] = arith.andi %[[SPLAT_39]], %[[BROADCAST_38]] // CHECK: %[[ADDPTR_42:.*]] = tt.addptr %{{.*}}, %[[SPLAT_40]] // CHECK: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_42]], %[[ANDI_41]], %{{.*}} // CHECK: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} @@ -718,91 +757,19 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.return %18#0 : tensor<32x32xf32, #mma> } -// CHECK-LABEL: tt.func @cross_iter_dep -// CHECK: %{{.*}}:5 = scf.for %[[ARG9:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) - -// CHECK: %[[INDEX_CAST_9:.*]] = arith.index_cast %[[ARG9]] -// CHECK: %[[MULI_10:.*]] = arith.muli %[[INDEX_CAST_9]], %{{.*}} -// CHECK: %[[SUBI_11:.*]] = arith.subi %{{.*}}, %[[MULI_10]] -// CHECK: %[[SPLAT_12:.*]] = tt.splat %[[SUBI_11]] -// CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_12]] -// CHECK: %[[BROADCAST_14:.*]] = tt.broadcast %[[CMPI_13]] -// CHECK: %[[LOAD_15:.*]] = tt.load %[[ARG11]], %[[BROADCAST_14]], %{{.*}} -// CHECK: %[[SPLAT_16:.*]] = tt.splat %[[SUBI_11]] -// CHECK: %[[CMPI_17:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_16]] -// CHECK: %[[BROADCAST_18:.*]] = tt.broadcast %[[CMPI_17]] -// CHECK: %[[LOAD_19:.*]] = tt.load %[[ARG12]], %[[BROADCAST_18]], %{{.*}} -// CHECK: %[[CONVERT_LAYOUT_20:.*]] = triton_gpu.convert_layout %[[LOAD_15]] -// 
CHECK: %[[CONVERT_LAYOUT_21:.*]] = triton_gpu.convert_layout %[[LOAD_19]] -// CHECK: %[[DOT_22:.*]] = tt.dot %[[CONVERT_LAYOUT_20]], %[[CONVERT_LAYOUT_21]], %[[ARG10]] -// CHECK: %[[INDEX_CAST_23:.*]] = arith.index_cast %[[ARG9]] -// CHECK: %[[ADDI_24:.*]] = arith.addi %[[INDEX_CAST_23]], %{{.*}} -// CHECK: %[[MULI_25:.*]] = arith.muli %[[ADDI_24]], %{{.*}} -// CHECK: %[[SPLAT_26:.*]] = tt.splat %[[MULI_25]] -// CHECK: %[[ADDPTR_27:.*]] = tt.addptr %{{.*}}, %[[SPLAT_26]] -// CHECK: %[[MULI_28:.*]] = arith.muli %[[MULI_25]], %{{.*}} -// CHECK: %[[SPLAT_29:.*]] = tt.splat %[[MULI_28]] -// CHECK: %[[ADDPTR_30:.*]] = tt.addptr %{{.*}}, %[[SPLAT_29]] -// CHECK: scf.yield %[[DOT_22]], %[[ARG13]], %[[ARG14]], %[[ADDPTR_27]], %[[ADDPTR_30]] -// CHECK: } - - tt.func @cross_iter_dep(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) -> tensor<32x32xf32, #mma> { - %c0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1 = arith.constant 1 : index - %c2_i32 = arith.constant 2 : i32 - %c32_i32 = arith.constant 32 : i32 - %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> - %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked1> - %0 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> - %1 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> - %2 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> - %3 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> - %4 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> - %5 = tt.splat %arg3 : i32 -> tensor<1x32xi32, #blocked1> - %6 = tt.splat %arg4 : i32 -> tensor<32x1xi32, #blocked1> - %7 = tt.splat %arg2 : 
!tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> - %8:5 = scf.for %arg9 = %c0 to %c32 step %c1 iter_args(%arg10 = %cst, %arg11 = %0, %arg12 = %1, %arg13 = %3, %arg14 = %4) -> (tensor<32x32xf32, #mma>, tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32x!tt.ptr, #blocked1>) { - %9 = arith.index_cast %arg9 : index to i32 - %10 = arith.muli %9, %c32_i32 : i32 - %11 = arith.subi %arg5, %10 : i32 - %12 = tt.splat %11 : i32 -> tensor<32x1xi32, #blocked1> - %13 = arith.cmpi slt, %6, %12 : tensor<32x1xi32, #blocked1> - %14 = tt.broadcast %13 : tensor<32x1xi1, #blocked1> -> tensor<32x32xi1, #blocked1> - %15 = tt.load %arg12, %14, %cst_0 : tensor<32x32x!tt.ptr, #blocked1> - %16 = tt.splat %11 : i32 -> tensor<1x32xi32, #blocked1> - %17 = arith.cmpi slt, %5, %16 : tensor<1x32xi32, #blocked1> - %18 = tt.broadcast %17 : tensor<1x32xi1, #blocked1> -> tensor<32x32xi1, #blocked1> - %19 = tt.load %arg11, %18, %cst_0 : tensor<32x32x!tt.ptr, #blocked1> - %20 = triton_gpu.convert_layout %19 : tensor<32x32xf32, #blocked1> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %21 = triton_gpu.convert_layout %15 : tensor<32x32xf32, #blocked1> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %22 = tt.dot %20, %21, %arg10 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> - %23 = arith.index_cast %arg9 : index to i32 - %24 = arith.addi %23, %c2_i32 : i32 - %25 = arith.muli %24, %c32_i32 : i32 - %26 = tt.splat %25 : i32 -> tensor<32x32xi32, #blocked1> - %27 = tt.addptr %7, %26 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi32, #blocked1> - %28 = arith.muli %25, %arg7 : i32 - %29 = tt.splat %28 : i32 -> tensor<32x32xi32, #blocked1> - %30 = tt.addptr %2, %29 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi32, #blocked1> - scf.yield %22, 
%arg13, %arg14, %27, %30 : tensor<32x32xf32, #mma>, tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32x!tt.ptr, #blocked1> - } - tt.return %8#0 : tensor<32x32xf32, #mma> - } - // CHECK-LABEL: tt.func @dep_arg_two_uses // CHECK: %{{.*}}:5 = scf.for %[[ARG3:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %{{.*}}, %[[ARG5:.*]] = %{{.*}}, %[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}) // CHECK: %[[SUBI_8:.*]] = arith.subi %{{.*}}, %[[ARG3]] // CHECK: %[[INDEX_CAST_9:.*]] = arith.index_cast %[[SUBI_8]] -// CHECK: %[[SPLAT_10:.*]] = tt.splat %[[INDEX_CAST_9]] -// CHECK: %[[CMPI_11:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_10]] -// CHECK: %[[EXPAND_DIMS_12:.*]] = tt.expand_dims %[[CMPI_11]] {axis = 0 : i32} -// CHECK: %[[EXPAND_DIMS_13:.*]] = tt.expand_dims %[[ARG5]] {axis = 0 : i32} -// CHECK: %[[EXTSI_14:.*]] = arith.extsi %[[EXPAND_DIMS_13]] -// CHECK: %[[MULI_15:.*]] = arith.muli %[[EXTSI_14]], %{{.*}} -// CHECK: %[[BROADCAST_16:.*]] = tt.broadcast %[[MULI_15]] -// CHECK: %[[BROADCAST_17:.*]] = tt.broadcast %[[EXPAND_DIMS_12]] +// CHECK: %[[EXPAND_DIMS_10:.*]] = tt.expand_dims %[[ARG5]] {axis = 0 : i32} +// CHECK: %[[SPLAT_11:.*]] = tt.splat %[[INDEX_CAST_9]] +// CHECK: %[[EXTSI_12:.*]] = arith.extsi %[[EXPAND_DIMS_10]] +// CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_11]] +// CHECK: %[[MULI_14:.*]] = arith.muli %[[EXTSI_12]], %{{.*}} +// CHECK: %[[EXPAND_DIMS_15:.*]] = tt.expand_dims %[[CMPI_13]] {axis = 0 : i32} +// CHECK: %[[BROADCAST_16:.*]] = tt.broadcast %[[MULI_14]] +// CHECK: %[[BROADCAST_17:.*]] = tt.broadcast %[[EXPAND_DIMS_15]] // CHECK: %[[ADDPTR_18:.*]] = tt.addptr %[[ARG4]], %[[BROADCAST_16]] // CHECK: %[[LOAD_19:.*]] = tt.load %[[ADDPTR_18]], %[[BROADCAST_17]] // CHECK: %[[SPLAT_20:.*]] = tt.splat %[[ARG6]] @@ -880,6 +847,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // ----- + #blocked = 
#triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> @@ -1073,23 +1041,22 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: %[[BROADCAST_13:.*]] = tt.broadcast %[[EXPAND_DIMS_12]] // CHECK: %[[ADDPTR_14:.*]] = tt.addptr %{{.*}}, %[[BROADCAST_13]] // CHECK: %[[LOAD_15:.*]] = tt.load %[[ADDPTR_14]] -// CHECK: %[[EXPAND_DIMS_16:.*]] = tt.expand_dims %{{.*}} {axis = 0 : i32} -// CHECK: %[[SPLAT_17:.*]] = tt.splat %[[MULI_9]] -// CHECK: %[[ADDI_18:.*]] = arith.addi %[[SPLAT_17]], %{{.*}} -// CHECK: %[[EXPAND_DIMS_19:.*]] = tt.expand_dims %[[ADDI_18]] {axis = 1 : i32} -// CHECK: %[[MULI_20:.*]] = arith.muli %[[EXPAND_DIMS_19]], %{{.*}} -// CHECK: %[[ADDPTR_21:.*]] = tt.addptr %{{.*}}, %[[MULI_20]] -// CHECK: %[[BROADCAST_22:.*]] = tt.broadcast %[[EXPAND_DIMS_16]] +// CHECK: %[[SPLAT_16:.*]] = tt.splat %[[MULI_9]] +// CHECK: %[[ADDI_17:.*]] = arith.addi %[[SPLAT_16]], %{{.*}} +// CHECK: %[[EXPAND_DIMS_18:.*]] = tt.expand_dims %[[ADDI_17]] {axis = 1 : i32} +// CHECK: %[[MULI_19:.*]] = arith.muli %[[EXPAND_DIMS_18]], %{{.*}} +// CHECK: %[[EXPAND_DIMS_20:.*]] = tt.expand_dims %{{.*}} {axis = 0 : i32} +// CHECK: %[[ADDPTR_21:.*]] = tt.addptr %{{.*}}, %[[MULI_19]] +// CHECK: %[[BROADCAST_22:.*]] = tt.broadcast %[[EXPAND_DIMS_20]] // CHECK: %[[BROADCAST_23:.*]] = tt.broadcast %[[ADDPTR_21]] // CHECK: %[[ADDPTR_24:.*]] = tt.addptr %[[BROADCAST_23]], %[[BROADCAST_22]] // CHECK: %[[LOAD_25:.*]] = tt.load %[[ADDPTR_24]] -// CHECK: %[[ADDPTR_26:.*]] = tt.addptr %{{.*}}, %[[MULI_20]] +// CHECK: %[[ADDPTR_26:.*]] = tt.addptr %{{.*}}, %[[MULI_19]] // CHECK: %[[BROADCAST_27:.*]] = tt.broadcast %[[ADDPTR_26]] // CHECK: %[[LOCAL_ALLOC_28:.*]] = 
triton_gpu.local_alloc // CHECK: %[[MEMDESC_SUBVIEW_29:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_28]][%{{.*}}, %{{.*}}, %{{.*}}] // CHECK: triton_gpu.local_store %[[LOAD_25]], %[[MEMDESC_SUBVIEW_29]] // CHECK: %{{.*}}:4 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}-1_i32, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[MEMDESC_SUBVIEW_29]], %[[ARG9:.*]] = %[[BROADCAST_22]]) - // CHECK: %[[CMPI_31:.*]] = arith.cmpi slt, %[[ARG5]], %{{.*}} // CHECK: %[[ADDI_32:.*]] = arith.addi %[[ARG5]], %{{.*}} // CHECK: %[[MULI_33:.*]] = arith.muli %[[ADDI_32]], %{{.*}} @@ -1479,11 +1446,11 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: %[[SPLAT_21:.*]] = tt.splat %[[SUBI_20]] // CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_21]] // CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG7]], %[[SUBI_17]] -// CHECK: %[[BROADCAST_24:.*]] = tt.broadcast %[[CMPI_22]] -// CHECK: %[[SPLAT_25:.*]] = tt.splat %[[CMPI_23]] -// CHECK: %[[MULI_26:.*]] = arith.muli %[[MULI_19]], %{{.*}} -// CHECK: %[[SPLAT_27:.*]] = tt.splat %[[MULI_26]] -// CHECK: %[[ANDI_28:.*]] = arith.andi %[[SPLAT_25]], %[[BROADCAST_24]] +// CHECK: %[[MULI_24:.*]] = arith.muli %[[MULI_19]], %{{.*}} +// CHECK: %[[BROADCAST_25:.*]] = tt.broadcast %[[CMPI_22]] +// CHECK: %[[SPLAT_26:.*]] = tt.splat %[[CMPI_23]] +// CHECK: %[[SPLAT_27:.*]] = tt.splat %[[MULI_24]] +// CHECK: %[[ANDI_28:.*]] = arith.andi %[[SPLAT_26]], %[[BROADCAST_25]] // CHECK: %[[ADDPTR_29:.*]] = tt.addptr %{{.*}}, %[[SPLAT_27]] // CHECK: %[[LOAD_30:.*]] = tt.load %[[ADDPTR_29]], %[[ANDI_28]], %{{.*}} // CHECK: %[[ADDI_31:.*]] = arith.addi %[[ARG9]], %{{.*}} @@ -1958,18 +1925,18 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: %{{.*}}:5 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] 
= %{{.*}}) // CHECK: %[[SUBI_19:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_19]] -// CHECK: %[[ADDI_21:.*]] = arith.addi %[[ARG6]], %{{.*}} -// CHECK: %[[ADDPTR_22:.*]] = tt.addptr %[[ARG10]], %{{.*}} -// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_21]], %{{.*}} -// CHECK: %[[SPLAT_24:.*]] = tt.splat %[[CMPI_20]] +// CHECK: %[[ADDI_20:.*]] = arith.addi %[[ARG6]], %{{.*}} +// CHECK: %[[ADDPTR_21:.*]] = tt.addptr %[[ARG10]], %{{.*}} +// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_19]] +// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_20]], %{{.*}} +// CHECK: %[[SPLAT_24:.*]] = tt.splat %[[CMPI_22]] // CHECK: %[[IF_25:.*]] = scf.if %[[CMPI_23]] -> (tensor<128x32x!tt.ptr, #blocked1>) { -// CHECK: %[[ADDPTR_37:.*]] = tt.addptr %[[ADDPTR_22]], %{{.*}} +// CHECK: %[[ADDPTR_37:.*]] = tt.addptr %[[ADDPTR_21]], %{{.*}} // CHECK: scf.yield %[[ADDPTR_37]] // CHECK: } else { -// CHECK: scf.yield %[[ADDPTR_22]] +// CHECK: scf.yield %[[ADDPTR_21]] // CHECK: } // CHECK: %[[LOAD_26:.*]] = tt.load %[[IF_25]], %[[SPLAT_24]] diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp index 00107fbc2e31..01dee425562b 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp @@ -21,6 +21,8 @@ #define GEN_PASS_CLASSES #include "TritonAMDGPUTransforms/Passes.h" +#include + using namespace mlir; static bool willIncreaseRegisterPressure(Operation *op) { @@ -32,73 +34,56 @@ static bool willIncreaseRegisterPressure(Operation *op) { return false; } -static bool isDescendent(Operation *op, Block *block) { - Block *b = op->getBlock(); - while (b != nullptr) { - if (b == block) - return true; - b = b->getParentOp()->getBlock(); - } - return false; -} - +// Gather cone of DFG from the op's basic block. 
+// - Collect dfg breadth first to keep relative order and +// reverse order for insertion after. An op may be captured +// multiple times if DFG reconverges and it will be moved multiple +// times to keep dominance correctness. +// - Returns bool if this DFG leads to a load op. This +// condition is not desirable for moving ttg.local_stores +// early. static bool gatherDFG(Operation *op, Block *block, SmallVector &dfg) { - // BFS (filo) - SmallVector oprs; bool leadsToLoad = false; - for (auto operand : op->getOperands()) { - if (Operation *pop = operand.getDefiningOp()) { - if (isDescendent(pop, block)) { - // only move ops that reside in same block - if (pop->getBlock() == block) - dfg.push_back(pop); - oprs.push_back(pop); - leadsToLoad |= isa(pop); - } else { - // only operands from current block or ancestor - assert(isDescendent(block->getParentOp(), pop->getBlock())); - } - } - } - // check sub-regions - for (auto &subregion : op->getRegions()) { - for (auto &subblock : subregion) { - for (auto &sop : subblock) { - if (gatherDFG(&sop, block, dfg)) - leadsToLoad = true; + + std::list oprs{op}; + auto checkOperands = [&](Operation *cop) { + for (auto operand : cop->getOperands()) { + if (Operation *oprOp = operand.getDefiningOp()) { + Block *oprBlk = oprOp->getBlock(); + if (block->findAncestorOpInBlock(*oprOp)) { + // only move ops that reside in same block + if (oprBlk == block) + dfg.push_back(oprOp); + oprs.push_back(oprOp); + leadsToLoad |= isa(oprOp); + } else { + // should always be in parent block + assert(oprBlk->findAncestorOpInBlock(*block->getParentOp())); + } } } - } + }; - // process next level ops - for (auto *op : oprs) { - if (gatherDFG(op, block, dfg)) - leadsToLoad = true; + // BFS (filo) + while (oprs.size()) { + Operation *nop = oprs.front(); + oprs.pop_front(); + // check next op and sub-regions + nop->walk(checkOperands); } return leadsToLoad; } -static bool hasAtomic(Operation *op) { - if (isa(op)) - return true; - for (auto &subregion 
: op->getRegions()) { - for (auto &subblock : subregion) { - for (auto &sop : subblock) { - if (hasAtomic(&sop)) - return true; - } - } - } - return false; -} - -static llvm::ilist::iterator findEarlyLocation( - Block *block, Operation *op, Value src) { +// Search thru block to find earliest insertion point for move +// op. This can be either an atomic op or last usage of source pointer. +// Search ends when move op encountered. +static llvm::ilist::iterator +findEarlyInsertionPoint(Block *block, Operation *move, Value src) { auto loc = block->begin(); for (auto bi = block->begin(); bi != block->end(); ++bi) { - auto *bop = &*bi; - if (bop == op) // don't move later than current location + auto *op = &*bi; + if (op == move) // don't move later than current location break; if (src) { // check for ops accessing src @@ -108,8 +93,10 @@ static llvm::ilist::iterator findEarlyLocation( } } // atomics used for syncronization? - if (hasAtomic(bop)) - loc = bi; + op->walk([&](Operation *wop) { + if (isa(wop)) + loc = bi; + }); } return loc; } @@ -160,15 +147,12 @@ class TritonAMDGPUReorderInstructionsPass moveAfter(op, argOp); }); SmallVector moveOps; - // Move local stores early if it's global load is outside loop - m.walk([&](triton::gpu::LocalStoreOp op) { - moveOps.push_back(op); - }); - // Move global loads early (prefetch) - // - these should be moved last - m.walk([&](triton::LoadOp op) { - moveOps.push_back(op); - }); + // Move local stores early if dependence distance greater than + // one iteration. + m.walk([&](triton::gpu::LocalStoreOp op) { moveOps.push_back(op); }); + // Move global loads early (prefetch). These should be first in + // the block since they have the longest latency. + m.walk([&](triton::LoadOp op) { moveOps.push_back(op); }); for (auto op : moveOps) { // 0. gather DFG Block *block = op->getBlock(); @@ -178,14 +162,12 @@ class TritonAMDGPUReorderInstructionsPass Value src; if (auto ld = dyn_cast(op)) src = ld.getPtr(); - // 0. 
find earliest insertion point - auto loc = findEarlyLocation(block, op, src); - // 1. move to beginning of enclosing block - for (auto *op : dfg) { - // only move up (not down) - if (loc->isBeforeInBlock(op)) - op->moveAfter(block, loc); - } + auto ip = findEarlyInsertionPoint(block, op, src); + // Remove ops that already precede the insertion point. + llvm::remove_if( + dfg, [&](Operation *op) { return !ip->isBeforeInBlock(op); }); + for (auto *op : dfg) + op->moveAfter(block, ip); } } } From 768ed95309ea773cc09a719f97666d68d53a0205 Mon Sep 17 00:00:00 2001 From: SJW Date: Tue, 25 Jun 2024 19:30:28 +0000 Subject: [PATCH 07/36] * fixed pruning --- .../amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp index 01dee425562b..70caa21f4020 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp @@ -164,7 +164,7 @@ class TritonAMDGPUReorderInstructionsPass src = ld.getPtr(); auto ip = findEarlyInsertionPoint(block, op, src); // Remove ops that already precede the insertion point. 
- llvm::remove_if( + llvm::erase_if( dfg, [&](Operation *op) { return !ip->isBeforeInBlock(op); }); for (auto *op : dfg) op->moveAfter(block, ip); From 452a3fa739236cb6f33751300f72a897c3fdf12e Mon Sep 17 00:00:00 2001 From: SJW Date: Wed, 26 Jun 2024 23:25:18 +0000 Subject: [PATCH 08/36] * updated test --- test/TritonGPU/amd/amd-stream-pipeline.mlir | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/test/TritonGPU/amd/amd-stream-pipeline.mlir b/test/TritonGPU/amd/amd-stream-pipeline.mlir index 7ac05abfb222..de6fcf4a9216 100644 --- a/test/TritonGPU/amd/amd-stream-pipeline.mlir +++ b/test/TritonGPU/amd/amd-stream-pipeline.mlir @@ -1462,7 +1462,7 @@ tt.func @matmul_nested_ops(%lb : index, %ub : index, %step : index, #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { // CHECK-LABEL: dot_prologue_epilogue - // CHECK: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} + // CHECK-SAME: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} tt.func @dot_prologue_epilogue(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %cst = arith.constant dense<0> : tensor<64x16xi32, #blocked> %cst2 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> @@ -1486,8 +1486,8 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> // CHECK: %[[C0:.*]] = arith.constant 0 : i32 - // CHECK: scf.for %[[IND_VAR:.*]] = %[[C0]] - // CHECK-NOT load + // CHECK: scf.for %[[IND_VAR:.*]] = %[[C0]] to + // CHECK-NOT: load // CHECK: 
%[[CND:.*]] = arith.cmpi slt, %[[IND_VAR]], %[[EXT]] // CHECK: scf.if %[[CND]] // CHECK: dot @@ -1559,8 +1559,9 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> // CHECK: %[[C0:.*]] = arith.constant 0 : i32 - // CHECK: scf.for %[[IND_VAR:.*]] = %[[C0]] - // CHECK-NOT load + // CHECK: scf.for %[[IND_VAR:.*]] = %[[C0]] to + // CHECK: load + // CHECK-NOT: load // CHECK: dot // CHECK: %[[CND:.*]] = arith.cmpi slt, %[[IND_VAR]], %[[EXT]] // CHECK: %[[IFRET:.*]]:2 = scf.if %[[CND]] From e344245e412b0e86169f2f6c8b3b545038da3724 Mon Sep 17 00:00:00 2001 From: SJW Date: Thu, 27 Jun 2024 19:17:46 +0000 Subject: [PATCH 09/36] * invert order of loads and local_stores --- .../ReorderInstructions.cpp | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp index 70caa21f4020..f46b5a2d6460 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp @@ -147,14 +147,13 @@ class TritonAMDGPUReorderInstructionsPass moveAfter(op, argOp); }); SmallVector moveOps; - // Move local stores early if dependence distance greater than - // one iteration. - m.walk([&](triton::gpu::LocalStoreOp op) { moveOps.push_back(op); }); - // Move global loads early (prefetch). These should be first in - // the block since they have the longest latency. + // Move global loads early to prefetch. m.walk([&](triton::LoadOp op) { moveOps.push_back(op); }); + // Move local_stores early if dependence distance greater than + // one iteration. Best perf on GEMM when these precede global loads. 
+ m.walk([&](triton::gpu::LocalStoreOp op) { moveOps.push_back(op); }); for (auto op : moveOps) { - // 0. gather DFG + // 0. Gather use-def chain in block. Block *block = op->getBlock(); SmallVector dfg{op}; bool leadsToLoad = gatherDFG(op, block, dfg); @@ -163,9 +162,12 @@ class TritonAMDGPUReorderInstructionsPass if (auto ld = dyn_cast(op)) src = ld.getPtr(); auto ip = findEarlyInsertionPoint(block, op, src); - // Remove ops that already precede the insertion point. - llvm::erase_if( - dfg, [&](Operation *op) { return !ip->isBeforeInBlock(op); }); + // Remove ops that already precede the insertion point. This + // is done before moves happen to avoid N^2 complexity in + // `Operation::isBeforeInBlock`. + llvm::erase_if(dfg, + [&](Operation *op) { return !ip->isBeforeInBlock(op); }); + // Move ops to insertion point. for (auto *op : dfg) op->moveAfter(block, ip); } From cd8018dd9a671ea24239d451363cc931870734cd Mon Sep 17 00:00:00 2001 From: SJW Date: Tue, 16 Jul 2024 18:17:52 +0000 Subject: [PATCH 10/36] * Removed outer loop pipelining. 
It does not improve perf and may be replaced with loop fusion * Reorder will not move loads/local_stores over loops --- .../amd/amd-reorder-instructions.mlir | 38 ++-- .../ReorderInstructions.cpp | 5 +- .../TritonAMDGPUTransforms/StreamPipeline.cpp | 199 +++++------------- 3 files changed, 76 insertions(+), 166 deletions(-) diff --git a/test/TritonGPU/amd/amd-reorder-instructions.mlir b/test/TritonGPU/amd/amd-reorder-instructions.mlir index e2870515654e..9b05df4b1f9f 100644 --- a/test/TritonGPU/amd/amd-reorder-instructions.mlir +++ b/test/TritonGPU/amd/amd-reorder-instructions.mlir @@ -350,31 +350,31 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: %{{.*}}:9 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}, %[[ARG15:.*]] = %{{.*}}) // CHECK: %[[SUBI_25:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] -// CHECK: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_26]] -// CHECK: %[[ADDPTR_28:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[LOAD_29:.*]] = tt.load %[[ADDPTR_28]], %[[SPLAT_27]] -// CHECK: %[[ADDPTR_30:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: %[[LOAD_31:.*]] = tt.load %[[ADDPTR_30]], %[[CMPI_26]] -// CHECK: %[[MULI_32:.*]] = arith.muli %{{.*}}, %[[LOAD_31]] -// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[MULI_32]] -// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_26]] -// CHECK: %[[ADDPTR_35:.*]] = tt.addptr %{{.*}}, %[[SPLAT_33]] -// CHECK: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_35]], %[[SPLAT_34]] -// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_40:.*]] = 
triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_39]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_40]] -// CHECK: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_39]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[ARG15]], %[[MEMDESC_SUBVIEW_41]] +// CHECK: %[[ADDI_26:.*]] = arith.addi %[[ARG11]], %{{.*}} +// CHECK: %[[CMPI_27:.*]] = arith.cmpi slt, %[[ADDI_26]], %{{.*}} +// CHECK: %[[SELECT_28:.*]] = arith.select %[[CMPI_27]], %[[ADDI_26]], %{{.*}} +// CHECK: %[[MEMDESC_SUBVIEW_29:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_28]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_29]] +// CHECK: %[[MEMDESC_SUBVIEW_30:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_28]], %{{.*}}, %{{.*}}] +// CHECK: triton_gpu.local_store %[[ARG15]], %[[MEMDESC_SUBVIEW_30]] +// CHECK: %[[CMPI_31:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] +// CHECK: %[[SPLAT_32:.*]] = tt.splat %[[CMPI_31]] +// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// CHECK: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_32]] +// CHECK: %[[ADDPTR_35:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// CHECK: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_35]], %[[CMPI_31]] +// CHECK: %[[MULI_37:.*]] = arith.muli %{{.*}}, %[[LOAD_36]] +// CHECK: %[[SPLAT_38:.*]] = tt.splat %[[MULI_37]] +// CHECK: %[[SPLAT_39:.*]] = tt.splat %[[CMPI_31]] +// CHECK: %[[ADDPTR_40:.*]] = tt.addptr %{{.*}}, %[[SPLAT_38]] +// CHECK: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_40]], %[[SPLAT_39]] // CHECK: %[[ADDI_42:.*]] = arith.addi %[[ARG10]], %{{.*}} // CHECK: %[[CMPI_43:.*]] = arith.cmpi slt, %[[ADDI_42]], %{{.*}} // CHECK: %[[SELECT_44:.*]] = arith.select %[[CMPI_43]], %[[ADDI_42]], %{{.*}} // CHECK: %[[LOCAL_LOAD_45:.*]] = triton_gpu.local_load %[[ARG12]] // CHECK: %[[LOCAL_LOAD_46:.*]] = triton_gpu.local_load %[[ARG13]] // CHECK: %[[DOT_47:.*]] = tt.dot %[[LOCAL_LOAD_45]], %[[LOCAL_LOAD_46]], %[[ARG7]] -// CHECK: scf.yield 
%[[DOT_47]], %[[ADDPTR_28]], %[[ADDPTR_30]], %[[SELECT_44]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]], %[[LOAD_29]], %[[LOAD_36]] +// CHECK: scf.yield %[[DOT_47]], %[[ADDPTR_33]], %[[ADDPTR_35]], %[[SELECT_44]], %[[SELECT_28]], %[[MEMDESC_SUBVIEW_29]], %[[MEMDESC_SUBVIEW_30]], %[[LOAD_34]], %[[LOAD_41]] // CHECK: } tt.func @indirect_bmm_scalar(%arg0: i64 {tt.divisibility = 16 : i32}, %arg1: index, %arg2: tensor<16x16x!tt.ptr, #blocked1> {tt.contiguity = 2 : i32, tt.divisibility = 16 : i32}, %arg3: !tt.ptr, %arg4: tensor<16x16xi32, #blocked1> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg5: tensor<16x16x!tt.ptr, #blocked> {tt.contiguity = 16 : i32, tt.divisibility = 16 : i32}) -> tensor<16x16xf32, #mma> { diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp index f46b5a2d6460..6de4d455a60b 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp @@ -96,6 +96,8 @@ findEarlyInsertionPoint(Block *block, Operation *move, Value src) { op->walk([&](Operation *wop) { if (isa(wop)) loc = bi; + if (isa(wop)) + loc = bi; }); } return loc; @@ -152,8 +154,9 @@ class TritonAMDGPUReorderInstructionsPass // Move local_stores early if dependence distance greater than // one iteration. Best perf on GEMM when these precede global loads. m.walk([&](triton::gpu::LocalStoreOp op) { moveOps.push_back(op); }); + for (auto op : moveOps) { - // 0. Gather use-def chain in block. + // Gather use-def chain in block. 
Block *block = op->getBlock(); SmallVector dfg{op}; bool leadsToLoad = gatherDFG(op, block, dfg); diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp index fbdcb99b857a..ab5b397d1f2b 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp @@ -24,7 +24,7 @@ // Software pipeliners are usually separated into two pieces, one that create a // modulo schedule and an expander that rewrites the loop and emits a prologue // and epilogue. This pass first calls a helper that will pre-process the IR -// to create async operations and create a modulo schedule. Then we call the +// to create stream operations and create a modulo schedule. Then we call the // expander to generate the prologue and new loop. //===----------------------------------------------------------------------===// @@ -41,9 +41,6 @@ using namespace mlir; namespace tt = mlir::triton; namespace ttg = mlir::triton::gpu; -// TODO: We can extra some helpers into common utilities once we add more -// schedules. - namespace { struct LoadInfo { @@ -69,12 +66,12 @@ static void appendToYield(scf::ForOp forOp, ArrayRef newOperands) { yieldOp->erase(); } -static void createAsyncCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc, - Value insertIdx, Value extractIdx, - tt::CoarseSchedule &schedule, - tt::CoarseSchedule::Cluster prefetchCluster, - llvm::MapVector &loadToInfo, - int numStages) { +static void createStreamCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc, + Value insertIdx, Value extractIdx, + tt::CoarseSchedule &schedule, + tt::CoarseSchedule::Cluster prefetchCluster, + llvm::MapVector &loadToInfo, + int numStages) { OpBuilder builder(forOp); Value zero = builder.create(forOp.getLoc(), 0, 32); // Replace the load with insert/extract slice. 
@@ -140,8 +137,7 @@ static void createAsyncCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc, builder.create(loc, loadOp.getType(), viewLoad); auto result = sharedLoad->getResults(); - // Create a select for non-zero other values as they are not handled by - // AsyncCopyGlobalToLocalOp for now. + // Create a select for non-zero other values. Value other = loadOp.getOther(); if (other && !isZeroConst(other)) { auto select = builder.create( @@ -235,7 +231,7 @@ loadOpsToIndirectionLevelAndUse(scf::ForOp forOp) { [&](Operation *op, int distance, Operation *use) { if (!seen.insert(op).second) return; - if (isa(op)) { + if (isa(op)) { // TODO: What if there are multiple uses at different distances? loadOpToIndLevelAndUse.push_back(std::make_tuple(op, distance, use)); use = op; @@ -261,7 +257,7 @@ loadOpsToIndirectionLevelAndUse(scf::ForOp forOp) { // that are not directly used by dot ops. if (forOp->hasAttr(tt::kNumStagesAttrName)) { for (Operation &op : forOp.getBody()->without_terminator()) { - if (!isa(op)) + if (!isa(op)) dfs(&op, 0, &op); } } @@ -281,32 +277,28 @@ assignMemoryLayouts(llvm::SmallVector> continue; LoadInfo loadInfo; - if (auto loadOp = dyn_cast(op)) { - assert(!isLoadFromTensorPtr(loadOp) && - "Block ptr should have been lowered before this pass."); - auto ptr = loadOp.getPtr(); - unsigned vec = axisInfoAnalysis.getPtrContiguity(ptr); - if (auto mask = loadOp.getMask()) - vec = std::min(vec, axisInfoAnalysis.getMaskAlignment(mask)); + auto loadOp = dyn_cast(op); + assert(!isLoadFromTensorPtr(loadOp) && + "Block ptr should have been lowered before this pass."); + auto ptr = loadOp.getPtr(); + unsigned vec = axisInfoAnalysis.getPtrContiguity(ptr); + if (auto mask = loadOp.getMask()) + vec = std::min(vec, axisInfoAnalysis.getMaskAlignment(mask)); - auto tensorTy = dyn_cast(ptr.getType()); - if (!tensorTy) - continue; - auto ty = - cast(tensorTy.getElementType()).getPointeeType(); - unsigned width = vec * ty.getIntOrFloatBitWidth(); - - // We do not 
pipeline all loads for the following reasons: - // 1. On nvidia GPUs, cp.async's cp-size can only be 4, 8, or 16. - // 2. It's likely that pipling small loads won't offer much performance - // improvement and may even hurt performance by increasing register - // pressure. - LDBG("Load " << *loadOp << " has width " << width); - if (width < 32) - continue; - } + auto tensorTy = dyn_cast(ptr.getType()); + if (!tensorTy) + continue; + + auto ty = cast(tensorTy.getElementType()).getPointeeType(); + unsigned width = vec * ty.getIntOrFloatBitWidth(); + + // Limit shared memory sharing to width >= 32 elements. + LDBG("Load " << *loadOp << " has width " << width); + if (width < 32) + continue; if (use->hasTrait()) { + // Only use shared memory when feeding a dot op loadInfo.usedByDot = true; loadInfo.sharedEncoding = getSharedEncIfAllUsersAreDotEnc(op->getResult(0)).value_or(nullptr); @@ -327,9 +319,7 @@ assignMemoryLayouts(llvm::SmallVector> // encoding. if (!loadInfo.sharedEncoding) { // Also pipeline in-register buffers. - if (auto loadOp = dyn_cast(op)) { - loadInfo.blockedEncoding = getBlockedEncoding(loadOp, axisInfoAnalysis); - } + loadInfo.blockedEncoding = getBlockedEncoding(loadOp, axisInfoAnalysis); } loadToInfo[op] = loadInfo; @@ -412,66 +402,6 @@ scheduleLoads(scf::ForOp forOp, tt::CoarseSchedule &schedule, return loadToInfo; } -// Schedule the prologue and epilogue `if` ops in the loop, pushing them as -// close to the loop boundaries as possible. Return the cluster after the -// prologue (or the beginning of the loop if there is no prologue). -static tt::CoarseSchedule::Cluster -schedulePrologueAndEpilogue(scf::ForOp forOp, tt::CoarseSchedule &schedule, - DenseSet &rootUsers, int numStages) { - tt::CoarseSchedule::Cluster afterPrologue = schedule.clusters.begin(); - - // Look for the IfOp that is in the backward slice any of the currently - // scheduled ops and put it at the beginning of the loop. - DenseMap ifsToStage; - // Go stage by stage. 
- for (int stage = 0; stage < numStages; stage++) { - for (auto [op, stage_, cluster] : schedule.getOpsInOrder(forOp)) { - if (stage_ != stage) - continue; - SetVector backwardSlice; - BackwardSliceOptions opt; - opt.omitBlockArguments = true; - getBackwardSlice((Operation *)op, &backwardSlice, opt); - - for (auto op : backwardSlice) { - if (auto ifOp = dyn_cast(op)) { - ifsToStage.insert({ifOp, stage}); - } - } - } - } - tt::CoarseSchedule::Cluster prologueCluster = schedule.clusters.newAtFront(); - for (auto [ifOp, stage] : ifsToStage) { - schedule.insert(ifOp, stage, prologueCluster); - } - - // Look for the IfOp that is in the forward slice of the root users and put it - // at the end of the loop. - tt::CoarseSchedule::Cluster epilogueCluster = schedule.clusters.newAtBack(); - for (auto rootUser : rootUsers) { - SetVector forwardSlice; - getForwardSlice(rootUser, &forwardSlice); - - int stage = schedule[rootUser].first; - for (auto op : forwardSlice) { - scf::IfOp ifOp = dyn_cast(op); - if (ifOp == nullptr) { - // check if the op is in the body of an if op that's part of the loop - auto parentOp = op->getParentOp(); - if (parentOp != nullptr && - parentOp->getParentOp() == forOp.getOperation()) { - ifOp = dyn_cast(parentOp); - } - } - if (ifOp) { - schedule.insertIfAbsent(ifOp, stage, - epilogueCluster); // after prefetch extracts - } - } - } - return afterPrologue; -} - // Add dependencies of anchor ops to the coarse schedule. Schedule them to // the same stage and ordering cluster as the anchor op. static void scheduleDependencies(scf::ForOp forOp, tt::CoarseSchedule &schedule, @@ -600,9 +530,9 @@ static Value createAlloc(scf::ForOp &forOp, Operation *loadOp, // Convert load ops into their asyn version and apply multi-buffering based on // the required number of buffers. 
static SmallVector -createAsyncOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule, - llvm::MapVector &loadToInfo, - int numStages) { +createStreamOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule, + llvm::MapVector &loadToInfo, + int numStages) { // Calculate the number of buffers needed for each load. // TODO pawel: we could do more fine-grained allocation here and // allocate only the number of buffers that specific loads need. @@ -677,8 +607,8 @@ createAsyncOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule, for (auto &pair : asyncLoads) { if (auto loadOp = dyn_cast(pair.first)) { - createAsyncCopy(forOp, loadOp, pair.second, insertIdx, extractIdx, - schedule, prefetchCluster, loadToInfo, numStages); + createStreamCopy(forOp, loadOp, pair.second, insertIdx, extractIdx, + schedule, prefetchCluster, loadToInfo, numStages); } } SmallVector newYieldOperands = {insertIdx, extractIdx}; @@ -709,19 +639,14 @@ preProcessLoopAndGetSchedule2(scf::ForOp &forOp, int numStages, // Convert the loads into async loads and create the allocs. SmallVector allocs = - createAsyncOps(forOp, coarseSchedule, loadToInfo, numStages); + createStreamOps(forOp, coarseSchedule, loadToInfo, numStages); LLVM_DEBUG({ - LDBG("Coarse schedule with async loads:"); + LDBG("Coarse schedule with stream loads:"); coarseSchedule.dump(); }); - tt::CoarseSchedule::Cluster afterPrologue = - schedulePrologueAndEpilogue(forOp, coarseSchedule, rootUsers, numStages); - LLVM_DEBUG({ - LDBG("Coarse schedule with prologue and epilogue:"); - coarseSchedule.dump(); - }); + tt::CoarseSchedule::Cluster afterPrologue = coarseSchedule.clusters.begin(); scheduleDependencies(forOp, coarseSchedule, numStages); LLVM_DEBUG({ @@ -768,7 +693,7 @@ preProcessLoopAndGetSchedule2(scf::ForOp &forOp, int numStages, } // Return true if the preconditions for pipelining the loop are met. -static bool preCondition(scf::ForOp forOp) { +static bool preConditionInner(scf::ForOp forOp) { // Skip loop with distance > 1 for now. 
// TODO: relax the constraint in the expander. if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(), @@ -791,22 +716,9 @@ static bool preCondition(scf::ForOp forOp) { return true; } -static void tryAndPipelineOuterLoop(scf::ForOp forOp) { - mlir::triton::PipeliningOption options; - bool foundSchedule = false; - // Limit 2 stages to not require extra shared memory. - foundSchedule = getOuterLoopSchedule(forOp, /*numStage=*/2, options); - if (!foundSchedule) - return; - IRRewriter rewriter(forOp->getContext()); - rewriter.setInsertionPoint(forOp); - FailureOr newForOp = - mlir::triton::pipelineForLoop(rewriter, forOp, options); -} - static bool pipelineLoop(scf::ForOp forOp, int numStages) { mlir::triton::PipeliningOption options; - if (!preCondition(forOp)) + if (!preConditionInner(forOp)) return false; bool foundSchedule = false; @@ -851,29 +763,24 @@ struct PipelinePass : public TritonAMDGPUStreamPipelineBase { if (loops.empty()) return; - llvm::SmallSetVector outerLoops; + bool pipelined = false; for (scf::ForOp forOp : loops) { auto outerLoop = dyn_cast(forOp->getParentOp()); int loopNumStages = getNumStagesOrDefault(forOp); - bool pipelined = pipelineLoop(forOp, loopNumStages); - if (pipelined && outerLoop && getNumStagesOrDefault(outerLoop) > 1) - outerLoops.insert(outerLoop); + pipelined |= pipelineLoop(forOp, loopNumStages); } - // Clean up arithmetic before applying the next level of pipelining to - // simplify the IR. - auto arithDialect = - getOperation().getContext()->getLoadedDialect(); - RewritePatternSet patterns(getOperation().getContext()); - arithDialect->getCanonicalizationPatterns(patterns); - if (applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)) - .failed()) - return signalPassFailure(); - - // Try to pipeline the outer loop to overlap the prologue and epilogue of - // the inner loop. 
- for (scf::ForOp outerLoop : outerLoops) - tryAndPipelineOuterLoop(outerLoop); + if (pipelined) { + // Clean up arithmetic before applying the next level of pipelining to + // simplify the IR. + auto arithDialect = + getOperation().getContext()->getLoadedDialect(); + RewritePatternSet patterns(getOperation().getContext()); + arithDialect->getCanonicalizationPatterns(patterns); + if (applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)) + .failed()) + signalPassFailure(); + } } }; } // anonymous namespace From faf95cb7278f84aca5127deb19ee7a5eec6e81ae Mon Sep 17 00:00:00 2001 From: SJW Date: Wed, 17 Jul 2024 17:25:04 +0000 Subject: [PATCH 11/36] * cleanup tests --- .../amd/amd-reorder-instructions.mlir | 78 +++++++++---------- test/TritonGPU/amd/amd-stream-pipeline.mlir | 76 +++++++++--------- 2 files changed, 77 insertions(+), 77 deletions(-) diff --git a/test/TritonGPU/amd/amd-reorder-instructions.mlir b/test/TritonGPU/amd/amd-reorder-instructions.mlir index 9b05df4b1f9f..97b52dfbb75f 100644 --- a/test/TritonGPU/amd/amd-reorder-instructions.mlir +++ b/test/TritonGPU/amd/amd-reorder-instructions.mlir @@ -60,15 +60,15 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war // yield // } -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = []}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase 
= 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> #shared2 = #triton_gpu.shared<{vec = 8, perPhase = 4, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> #shared3 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> #shared4 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80"} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32, triton_gpu.target = "hip:gfx942"} { // CHECK-LABEL: tt.func @matmul_loop // CHECK: %{{.*}}:7 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}) @@ -848,12 +848,12 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : 
i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func @load_two_users // CHECK: %{{.*}}:5 = scf.for %[[ARG2:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG3:.*]] = %{{.*}}, %[[ARG4:.*]] = %{{.*}}, %[[ARG5:.*]] = %{{.*}}-1_i32, %[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}) @@ -940,12 +940,12 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [0, 1], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func @load_two_users_incompatible_layouts // CHECK: %{{.*}}:5 = scf.for %[[ARG2:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG3:.*]] = %{{.*}}, %[[ARG4:.*]] = %{{.*}}, %[[ARG5:.*]] = %{{.*}}-1_i32, %[[ARG6:.*]] = 
%{{.*}}-1_i32, %[[ARG7:.*]] = %{{.*}}) @@ -1026,10 +1026,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func public @nested_loops // CHECK: scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} : i32 { @@ -1159,13 +1159,13 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [2, 2], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = 
false}> #shared2 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de // CHECK: %{{.*}}:5 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}) @@ -1321,11 +1321,11 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = []}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 4, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:86", "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:86", "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func @indirect_load_shared_layout // CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step 
%{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) @@ -1431,10 +1431,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:86", "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:86", "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func public @kernel_yield_constant // CHECK: %{{.*}}:4 = scf.for %[[ARG7:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}) @@ -1531,8 +1531,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:90", "triton_gpu.threads-per-warp" = 32 : i32} { +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", 
"triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func public @add_kernel // CHECK: %{{.*}}:10 = scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG5:.*]] = %{{.*}}-1_i32, %[[ARG6:.*]] = %{{.*}}-1_i32, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) @@ -1625,11 +1625,11 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [2, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 4], warpsPerCTA = [2, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 2], instrShape = [16, 8]}> #shared = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func public @nested_loops // CHECK: scf.for %[[ARG1:.*]] = %{{.*}} to %{{.*}} step %{{.*}} : i32 { @@ -1713,11 +1713,11 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : } // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], 
threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = []}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 4, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func @load_convert_layout // CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) @@ -1830,9 +1830,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 2], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 1], instrShape = [16, 8]}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func public @matmul_indirect_pipeline // CHECK: %{{.*}}:4 = scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} 
iter_args(%[[ARG5:.*]] = %{{.*}}-1_i32, %[[ARG6:.*]] = %{{.*}}-1_i32, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}) @@ -1915,11 +1915,11 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : } // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = []}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80"} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32, triton_gpu.target = "hip:gfx942"} { // CHECK-LABEL: tt.func @matmul_nested_ops // CHECK: %{{.*}}:5 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}) @@ -2017,12 +2017,12 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = 
#triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func @dot_prologue_epilogue // CHECK: %{{.*}}:6 = scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG5:.*]] = %{{.*}}, %[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}) @@ -2120,12 +2120,12 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = 
true}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:80", "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func @pipeline_downstream_dependencies // CHECK: %{{.*}}:6 = scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG5:.*]] = %{{.*}}, %[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}) @@ -2211,8 +2211,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:90", "triton_gpu.threads-per-warp" = 32 : i32} { +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func public @masked_add_kernel // CHECK: %{{.*}}:10 = scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG5:.*]] = %{{.*}}-1_i32, %[[ARG6:.*]] = %{{.*}}-1_i32, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) diff --git a/test/TritonGPU/amd/amd-stream-pipeline.mlir b/test/TritonGPU/amd/amd-stream-pipeline.mlir index de6fcf4a9216..3d7b65024bda 100644 --- a/test/TritonGPU/amd/amd-stream-pipeline.mlir +++ b/test/TritonGPU/amd/amd-stream-pipeline.mlir @@ -55,7 +55,7 @@ // CHECK: triton_gpu.local_dealloc 
%[[LOCAL_ALLOC_10]] // CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] -module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.target" = "cuda:80"} { +module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.target" = "hip:gfx942"} { tt.func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr {tt.divisibility = 16 : i32}, %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { @@ -702,12 +702,12 @@ tt.func @dep_arg_two_uses(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func @load_two_users tt.func @load_two_users(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { %cst = arith.constant dense<0> : 
tensor<1x16xi32, #blocked> @@ -762,12 +762,12 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [0, 1], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: tt.func @load_two_users_incompatible_layouts tt.func @load_two_users_incompatible_layouts(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> @@ -844,9 +844,9 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : // We check that there is no allocation before the first occurrence of // scf.for because that would mean that the first load `%a = load()` // would be pipelined. 
-#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @nested_loops(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> %cst_0 = arith.constant dense<320> : tensor<32x1xi32, #blocked> @@ -904,12 +904,12 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : // CHECK-LABEL: tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de // CHECK-NOT: triton_gpu.convert_layout {{.*}} : tensor<32x64xf32, #shared> -> tensor<32x64xf32, #shared1> -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [2, 2], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase 
= 4, order = [0, 1], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma> %c64_i32 = arith.constant 64 : i32 @@ -1041,13 +1041,13 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : // CHECK: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] // CHECK: } -#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> #BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> #C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> #A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> #B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> -module attributes {"triton_gpu.target" = 
"cuda:86", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func @indirect_load_shared_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, %76: index, %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, @@ -1092,9 +1092,9 @@ tt.func @indirect_load_shared_layout(%77: tensor<16x16xi64, #BL> {tt.divisibilit // CHECK: triton_gpu.memdesc_subview // CHECK: triton_gpu.local_store // CHECK: tt.return -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @kernel_yield_constant(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> %cst1 = arith.constant 
dense<1.000000e+00> : tensor<32x32xf32, #mma> @@ -1147,8 +1147,8 @@ module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : // CHECK: %[[ADDPTR_20:.*]] = tt.addptr %{{.*}}, %[[ADDI_16]] // CHECK: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_20]], %[[CMPI_17]] // CHECK: scf.for -#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> -module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { %c1024_i32 = arith.constant 1024 : i32 %c0_i32 = arith.constant 0 : i32 @@ -1211,11 +1211,11 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : // CHECK: } // CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_14]] -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [2, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [2, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 2], instrShape = [16, 8]}> #shared = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "cuda:80", 
"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @nested_loops(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> %c1_i32 = arith.constant 1 : i32 @@ -1254,8 +1254,8 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : // This test triggered some failure in the verifier, so we only // included a simple check for the kernel name. // CHECK-LABEL: @load_convert_layout -#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> #ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> #BLs0 = #triton_gpu.slice<{parent=#BL, dim=0}> #BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> @@ -1263,7 +1263,7 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : #A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> #B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func @load_convert_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: 
i32}, %76: index, %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, @@ -1305,9 +1305,9 @@ tt.func @load_convert_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i3 // This test captured some ICE in MatmulLoopPipeline pass, so we only // included a simple check for the kernel name. // CHECK-LABEL: @matmul_indirect_pipeline -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 2], order = [0, 1]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 2], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 1], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @matmul_indirect_pipeline(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> %c1_i32 = arith.constant 1 : i32 @@ -1351,7 +1351,7 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : // CHECK-NOT: local_load{{.*}}128x1 #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.target" = "hip:gfx942", 
"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { tt.func public @dont_pipeline_128x1(%arg6: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> %c128_i32 = arith.constant 128 : i32 @@ -1393,8 +1393,8 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : // CHECK-LABEL: @matmul_nested_ops // CHECK: triton_gpu.local_load -#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> +#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> #ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> #BLs0 = #triton_gpu.slice<{parent=#BL, dim=0}> #BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> @@ -1402,7 +1402,7 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : #A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> #B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> -module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.target" = "cuda:80"} { +module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.target" = "hip:gfx942"} { tt.func @matmul_nested_ops(%lb : index, %ub : index, %step : index, %A : !tt.ptr {tt.divisibility = 16 : i32}, %B : !tt.ptr {tt.divisibility = 16 : i32}, @@ -1454,13 +1454,13 @@ tt.func @matmul_nested_ops(%lb : index, %ub : index, %step : index, // ----- // Pipeline the if ops at the beginning and the end of the loop -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], 
threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> #mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: dot_prologue_epilogue // CHECK-SAME: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} tt.func @dot_prologue_epilogue(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { @@ -1526,13 +1526,13 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : // ----- // Verify that uses of the ops scheduled in partucular place of the loop (like epilogue if) are correctly scheduled too. 
-#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> #mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { // CHECK-LABEL: pipeline_downstream_dependencies // CHECK: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} tt.func @pipeline_downstream_dependencies(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { @@ -1605,8 +1605,8 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : // CHECK: %[[A:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] // CHECK: %[[B:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> -module attributes {"triton_gpu.target" = "cuda:90", 
"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}> +module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @masked_add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { %c1024_i32 = arith.constant 1024 : i32 %c0_i32 = arith.constant 0 : i32 From c0ff506fd586dd86961d1c73c7addd8649f72152 Mon Sep 17 00:00:00 2001 From: SJW Date: Mon, 22 Jul 2024 18:34:24 +0000 Subject: [PATCH 12/36] * Restore old stream-pipeliner and moved new to StreamPipelineV2.cpp * Added TRITONAMD_OLD_STREAM_PIPELINER env variable to temporarily select old pipeliner --- third_party/amd/backend/compiler.py | 6 +- .../include/TritonAMDGPUTransforms/Passes.h | 4 +- .../include/TritonAMDGPUTransforms/Passes.td | 13 + .../lib/TritonAMDGPUTransforms/CMakeLists.txt | 1 + .../TritonAMDGPUTransforms/StreamPipeline.cpp | 1421 +++++++++-------- .../StreamPipelineV2.cpp | 791 +++++++++ third_party/amd/python/triton_amd.cc | 6 +- 7 files changed, 1563 insertions(+), 679 deletions(-) create mode 100644 third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py index 640fdf3200ed..713d5543925f 100644 --- a/third_party/amd/backend/compiler.py +++ b/third_party/amd/backend/compiler.py @@ -150,7 +150,11 @@ def make_ttgir(mod, metadata, options): amd.passes.ttgpuir.add_optimize_epilogue(pm) passes.ttgpuir.add_optimize_dot_operands(pm, True) if amd.has_matrix_core_feature(options.arch): - amd.passes.ttgpuir.add_stream_pipeline(pm, 
options.num_stages) + if os.getenv("TRITONAMD_OLD_STREAM_PIPELINE", "0") == "1": + if options.num_stages == 0: + amd.passes.ttgpuir.add_stream_pipeline(pm) + else: + amd.passes.ttgpuir.add_stream_pipelinev2(pm, options.num_stages) passes.common.add_canonicalizer(pm) passes.ttgpuir.add_optimize_dot_operands(pm, True) passes.ttgpuir.add_remove_layout_conversions(pm) diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.h b/third_party/amd/include/TritonAMDGPUTransforms/Passes.h index 914bce6fd644..293ee924f05e 100644 --- a/third_party/amd/include/TritonAMDGPUTransforms/Passes.h +++ b/third_party/amd/include/TritonAMDGPUTransforms/Passes.h @@ -6,7 +6,9 @@ namespace mlir { -std::unique_ptr createTritonAMDGPUStreamPipelinePass(int numStages = 2); +std::unique_ptr createTritonAMDGPUStreamPipelinePass(); + +std::unique_ptr createTritonAMDGPUStreamPipelineV2Pass(int numStages = 2); std::unique_ptr createTritonAMDGPUAccelerateMatmulPass(std::string archGenName = std::string(), diff --git a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td index 5f61e649bfdf..5a6df1827fe4 100644 --- a/third_party/amd/include/TritonAMDGPUTransforms/Passes.td +++ b/third_party/amd/include/TritonAMDGPUTransforms/Passes.td @@ -14,6 +14,19 @@ def TritonAMDGPUStreamPipeline : Pass<"tritonamdgpu-stream-pipeline", "mlir::Mod let constructor = "mlir::createTritonAMDGPUStreamPipelinePass()"; let dependentDialects = []; +} + +def TritonAMDGPUStreamPipelineV2 : Pass<"tritonamdgpu-stream-pipeline-v2", "mlir::ModuleOp"> { + let summary = "pipeline"; + + let description = [{ + Pipeline global loads through registers to shared memory while computing on previous + tile + }]; + + let constructor = "mlir::createTritonAMDGPUStreamPipelineV2Pass()"; + + let dependentDialects = []; let options = [ Option<"numStages", "num_stages", diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt 
b/third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt index d96860c3ef90..5bacc66a1161 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt +++ b/third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt @@ -3,6 +3,7 @@ add_triton_library(TritonAMDGPUTransforms OptimizeEpilogue.cpp ReorderInstructions.cpp StreamPipeline.cpp + StreamPipelineV2.cpp MfmaGroup.cpp DEPENDS diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp index ab5b397d1f2b..8bdf9d11751d 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp @@ -3,789 +3,860 @@ #include "mlir/IR/IRMapping.h" #include "mlir/IR/TypeUtilities.h" #include "mlir/Interfaces/SideEffectInterfaces.h" -#include "mlir/Support/LLVM.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "triton/Analysis/AxisInfo.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" -#include "triton/Dialect/TritonGPU/Transforms/Passes.h" -#include "triton/Dialect/TritonGPU/Transforms/PipelineExpander.h" -#include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h" -#include "triton/Dialect/TritonGPU/Transforms/Schedule.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" -#include "triton/Tools/Sys/GetEnv.hpp" -#include "llvm/Support/Debug.h" - -#include +#include "llvm/ADT/MapVector.h" //===----------------------------------------------------------------------===// -// This file will create a schedule that will be handed over to the pipeline -// expander. -// Software pipeliners are usually separated into two pieces, one that create a -// modulo schedule and an expander that rewrites the loop and emits a prologue -// and epilogue. This pass first calls a helper that will pre-process the IR -// to create stream operations and create a modulo schedule. 
Then we call the -// expander to generate the prologue and new loop. +// This file implements stream software pipelining for loops. The implementation +// here is inspired by the pipeline pass in Triton and the rocMLIR pipeliner. +// +// We divide the loop body into the following phases: +// a. Pre-load operations: for instance, index computation. +// b. Load operations: loading from global memory to shared memory. +// c. Compute operations: for instance, Triton dot. +// d. Post-load operations: for instance, index computation. +// +// To pipeline the loop, we need to: +// - Find all the dependencies of the load operations. +// - Prologue: Hoist the pipelinable load operations and shared memory store +// for the ramp up stage +// - Pipelined Loop: Assemble the loop body minus last iteration +// - Prefetch next tile from global into regs (while computing from previous) +// - Non-load loop body +// - Store next tile into shared mem +// - Epilogue: Peeled non-load loop body for last iteration +// //===----------------------------------------------------------------------===// +using llvm::MapVector; +using namespace mlir; +namespace ttg = triton::gpu; + #define GEN_PASS_CLASSES #include "TritonAMDGPUTransforms/Passes.h.inc" -#define DEBUG_TYPE "tritonamdgpu-stream-pipeline" -#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") -#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") +namespace { -#define int_attr(num) builder.getI64IntegerAttr(num) +class LoopPipeliner { + /// Cache of ForOp and YieldOp related to this pipeliner. + scf::ForOp forOp; + scf::YieldOp yieldOp; -using namespace mlir; -namespace tt = mlir::triton; -namespace ttg = mlir::triton::gpu; + bool peelLastIter = true; -namespace { + /// The new pipelined ForOp. + scf::ForOp pplForOp; -struct LoadInfo { - // Layout of the data in the shared memory. - ttg::SharedEncodingAttr sharedEncoding = nullptr; - // Blocked encoding is used for loads not used by the dot. 
- ttg::BlockedEncodingAttr blockedEncoding = nullptr; - int distToUse = 0; - bool usedByDot = false; -}; + /// Loads to be pipelined + SetVector validLoads; + /// The value that each load will be mapped to (after layout conversion) + DenseMap convertMapping; + /// load => buffer + DenseMap loadsBuffer; + /// load => buffer type (with shared layout after swizzling) + DenseMap loadsBufferType; -} // namespace + /// Iterator values + Value nextLoopCond; -// Replace the ForOp's yield with a new one with the given operands appended. -static void appendToYield(scf::ForOp forOp, ArrayRef newOperands) { - // Fix up the yield op. - Operation *yieldOp = forOp.getBody()->getTerminator(); - SmallVector operands(yieldOp->getOperands()); - operands.append(newOperands.begin(), newOperands.end()); + /// Yield values + SmallVector yieldValues; - OpBuilder builder(yieldOp); - builder.create(yieldOp->getLoc(), operands); - yieldOp->erase(); -} + /// The number of stages in the pipeline is fixed to '2' for + /// analysis since there will be a current buffer stored in + /// shared mem and a next buffer stored in regs. + int numStages = 2; -static void createStreamCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc, - Value insertIdx, Value extractIdx, - tt::CoarseSchedule &schedule, - tt::CoarseSchedule::Cluster prefetchCluster, - llvm::MapVector &loadToInfo, - int numStages) { - OpBuilder builder(forOp); - Value zero = builder.create(forOp.getLoc(), 0, 32); - // Replace the load with insert/extract slice. - builder.setInsertionPoint(loadOp); - Location loc = loadOp.getLoc(); - Value src = loadOp.getPtr(); - Value mask = loadOp.getMask(); - Value other = loadOp.getOther(); - if (!isExpensiveLoadOrStore(loadOp) && loadToInfo[loadOp].blockedEncoding) { - // For inexpensive loads that do not directly feed into dot ops - // we want to use optimal layout for the data. 
- ttg::BlockedEncodingAttr encoding = loadToInfo[loadOp].blockedEncoding; - auto convertBlockLayout = [&](Value src) { - auto ty = cast(src.getType()); - auto newTy = - RankedTensorType::get(ty.getShape(), ty.getElementType(), encoding); - auto cvt = - builder.create(loadOp->getLoc(), newTy, src); - return cvt.getResult(); - }; - src = convertBlockLayout(src); - if (mask) - mask = convertBlockLayout(mask); - if (other) - other = convertBlockLayout(other); + /// Arg indicies + size_t depArgsBeginIdx; + DenseMap depArgsIdx; + + /// value (in loop) => value at stage N + DenseMap> valueMapping; + /// loop iter arg => value + DenseMap depArgsMapping; + + /// forOp value => pplForOp value + IRMapping curMapping; + /// forOp value => prefetch value + IRMapping nextMapping; + + /// Dependency ops by program order + SmallVector orderedDeps; + + SetVector currentDeps; + + /// block arguments that loads depend on + SetVector depArgs; + + /// operation => source operand defined stages + DenseMap> immediateOpStages; + + /// operations that loads depend on + SetVector depOps; + + /// Collect values that `v` depends on and are defined inside the loop + void collectValueDep(Value v, int stage, SetVector &deps, + SetVector &args); + + /// Collect all op dependencies + void collectDeps(SetVector &ops, + MapVector> &opDeps); + + void collectDepChain(Operation *op, SetVector &ops); + + /// Check if none of the for-ops has valid uses + LogicalResult checkOpUses(); + + /// Check if ops have dependencies that are not pipelinable + LogicalResult checkOpDeps(); + + void createBufferTypes(); + + void createOrderedDeps(); + + void createCurrentDeps(); + + /// Return the stage at which `v` is defined prior to `stage` + int getValueDefStage(Value v, int stage); + + /// Map `origin` to `newValue` at `stage` + void setValueMapping(Value origin, Value newValue, int stage); + + /// Map `origin` to `newValue` at `stage` according to the association between + /// yieldOp and forOp + void 
setValueMappingYield(Value origin, Value newValue, int stage); + + /// Map `origin` to `newValue` at the next stage according to the association + /// between yieldOp and forOp + void setValueMappingYield(Value origin, Value newValue); + + /// Return the value mapped to `origin` at `stage`, if it exists. + Value lookupOrDefault(Value origin, int stage); + + Value getLoadMask(triton::LoadOp loadOp, Value mappedMask, Value loopCond, + OpBuilder &builder); + /// Collect all args of the new loop + SmallVector collectNewLoopArgs(); + + /// Clone the forOp and return the new forOp + scf::ForOp cloneForOp(ArrayRef newLoopArgs, OpBuilder &builder); + + void updateLoadMask(triton::LoadOp loadOp, Value newMask); + /// Prefetch the next iteration for `pplForOp` + void prefetchNextBuffer(OpBuilder &builder); + void cloneCurrentBody(OpBuilder &builder); + void storeNextBuffer(OpBuilder &builder); + + bool isLoadChain(Operation *op) const; + + /// Assemble `pplForOp`'s yield op + void finalizeYield(OpBuilder &builder); + +public: + LoopPipeliner(scf::ForOp forOp) : forOp(forOp) { + yieldOp = cast(forOp.getBody()->getTerminator()); } - tt::MemDescType allocTy = cast(alloc.getType()); - SmallVector copyOffsets(allocTy.getRank(), zero); - copyOffsets[0] = insertIdx; - Operation *copy = builder.clone(*loadOp); - - auto [stage, cluster] = schedule[loadOp]; - schedule.erase(loadOp); - schedule.insert(copy, stage, cluster); - - // Extract part. - SmallVector loadOffsets(allocTy.getRank(), zero); - loadOffsets[0] = extractIdx; - Attribute sharedMemorySpace = - triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); - tt::MemDescType subviewTy = tt::MemDescType::get( - allocTy.getShape().drop_front(), allocTy.getElementType(), - allocTy.getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); - auto viewLoad = - builder.create(loc, subviewTy, alloc, loadOffsets); - Operation *lds_store = - builder.create(loc, copy->getResult(0), viewLoad); - { - // Clean up old local caches. 
- SmallVector allocsToErase; - for (Operation *user : loadOp->getUsers()) { - if (auto alloc = dyn_cast(user)) { - alloc.replaceAllUsesWith(viewLoad.getResult()); - allocsToErase.push_back(alloc); - } - } - for (auto alloc : allocsToErase) { - alloc.erase(); - } + /// Collect loads to pipeline. Return success if we can pipeline this loop + LogicalResult initialize(); - auto sharedLoad = - builder.create(loc, loadOp.getType(), viewLoad); - auto result = sharedLoad->getResults(); + /// Emit pipelined loads (before loop body) + void emitPrologue(); - // Create a select for non-zero other values. - Value other = loadOp.getOther(); - if (other && !isZeroConst(other)) { - auto select = builder.create( - loc, loadOp.getType(), mask, sharedLoad.getResult(), other); - result = select->getResults(); - } + /// emit pipelined loads (after loop body) + void emitEpilogue(DenseMap &newResults); - loadOp->replaceAllUsesWith(result); + /// create the new ForOp (add new args & insert prefetched ops) + scf::ForOp createNewForOp(); + + friend struct PipelinePass; +}; - // Prefetch load if is used by the dot. 
- if (loadToInfo[loadOp].usedByDot) { - schedule.insert(lds_store, numStages - 2, prefetchCluster); - schedule.insert(viewLoad, numStages - 2, prefetchCluster); +void LoopPipeliner::collectValueDep(Value v, int stage, + SetVector &deps, + SetVector &args) { + // Since we only need to peel the loop numStages-1 times, don't worry + // about depends that are too far away + if (stage < 0) + return; + + // Loop-invariant value, skip + if (v.getParentRegion() != &forOp.getRegion()) + return; + + if (Operation *op = v.getDefiningOp()) { + if (!deps.contains(op)) { + deps.insert(op); + for (Value opr : op->getOperands()) + collectValueDep(opr, stage, deps, args); + } + } else if (auto arg = dyn_cast(v)) { + if (arg.getArgNumber() > 0) { + args.insert(arg); + collectValueDep(yieldOp->getOperand(arg.getArgNumber() - 1), stage - 1, + deps, args); } } - loadOp.erase(); } -// If all the transitive uses of the given value have are used by a convert to -// the same dot operand encoding, return true and get the shared encoding that -// needs to be used to be compatible with users' layouts. -static std::optional -getSharedEncIfAllUsersAreDotEnc(Value val) { - ttg::SharedEncodingAttr attr; - for (Operation *user : val.getUsers()) { - ttg::SharedEncodingAttr tempAttr; - if (user->getNumResults() != 1) - return std::nullopt; - if (auto memDesc = - dyn_cast(user->getResult(0).getType())) { - // First time we find a shared encoding in the chain, save it and try to - // use it if it is compatible with the other users. 
- tempAttr = cast(memDesc.getEncoding()); - if (!getSharedEncIfAllUsersAreDotEnc(user->getResult(0)).has_value()) - return std::nullopt; - } else { - if (!isa(user)) - return std::nullopt; - auto dotOpEnc = dyn_cast( - cast(user->getResult(0).getType()).getEncoding()); - if (!dotOpEnc) - return std::nullopt; - auto srcTy = cast(val.getType()); - auto CTALayout = ttg::getCTALayout(srcTy.getEncoding()); - auto order = ttg::getOrder(srcTy.getEncoding()); - unsigned bitWidth = srcTy.getElementType().getIntOrFloatBitWidth(); - tempAttr = ttg::SharedEncodingAttr::get( - val.getContext(), dotOpEnc, srcTy.getShape(), - ttg::getOrder(srcTy.getEncoding()), - ttg::getCTALayout(srcTy.getEncoding()), - srcTy.getElementType().getIntOrFloatBitWidth(), /*needTrans=*/false); +void LoopPipeliner::collectDeps( + SetVector &ops, + MapVector> &valueDeps) { + for (auto op : ops) { + for (Value v : op->getOperands()) { + SetVector deps; + SetVector args; + collectValueDep(v, numStages - 1, deps, args); + valueDeps[op] = deps; } - // Check that the shared encodings needed by the users are compatible. 
- if (!tempAttr || (attr != nullptr && attr != tempAttr)) - return std::nullopt; - attr = tempAttr; } - return attr; } -static ttg::BlockedEncodingAttr -getBlockedEncoding(tt::LoadOp loadOp, tt::ModuleAxisInfoAnalysis &axisInfo) { - Value src = loadOp.getPtr(); - auto ty = cast(src.getType()); - auto mod = loadOp->getParentOfType(); - int numWarps = ttg::TritonGPUDialect::getNumWarps(mod); - int threadsPerWarp = ttg::TritonGPUDialect::getThreadsPerWarp(mod); - tt::AxisInfo::DimVectorT contiguity = - axisInfo.getAxisInfo(src)->getContiguity(); - SmallVector order = argSort(contiguity); - unsigned currPerThread = getNumElementsPerThread(loadOp, order, axisInfo); - SmallVector sizePerThread(order.size(), 1); - sizePerThread[order[0]] = currPerThread; - ttg::CTALayoutAttr ctaLayout = ttg::getCTALayout(ty.getEncoding()); - return ttg::BlockedEncodingAttr::get(loadOp->getContext(), ty.getShape(), - sizePerThread, order, numWarps, - threadsPerWarp, ctaLayout); -} +LogicalResult LoopPipeliner::checkOpUses() { + SetVector ops; + // We cannot use forOp.walk(...) here because we only want to visit the + // operations in the loop body block. Nested blocks are handled separately. + for (Operation &op : forOp) { + if (auto loadOp = dyn_cast(&op)) + ops.insert(&op); + } -// Create a map from load ops to their indirection level and the -// final use of the load op (another load op, or a dot op). -// Indirection level is "0" for the load op directly used by the dot op, -// "1" for the load op used by the load op used by the dot op, and so on. -static llvm::SmallVector> -loadOpsToIndirectionLevelAndUse(scf::ForOp forOp) { - llvm::SmallVector> - loadOpToIndLevelAndUse; - DenseSet seen; - - std::function dfs = - [&](Operation *op, int distance, Operation *use) { - if (!seen.insert(op).second) - return; - if (isa(op)) { - // TODO: What if there are multiple uses at different distances? 
- loadOpToIndLevelAndUse.push_back(std::make_tuple(op, distance, use)); - use = op; - distance++; + // Collect all ops' dependencies + MapVector> opDeps; + collectDeps(ops, opDeps); + + for (Operation *op : ops) { + auto loadOp = dyn_cast(op); + // Don't pipeline valid loads that depend on other valid loads + // (Because if a valid load depends on another valid load, this load needs + // to wait on the other load in the prologue, which is against the point + // of the pipeline pass) + bool isCandidate = true; + for (Operation *other : ops) + if (isa(other)) + if (opDeps[op].contains(other)) { + isCandidate = false; + break; } - for (Value operand : op->getOperands()) { - Value v = operand; - Operation *defOp = v.getDefiningOp(); - if (defOp && defOp->getBlock() == op->getBlock()) { - dfs(defOp, distance, use); - } - } - }; + // We only pipeline loads that have one covert_layout (to dot_op) use + // TODO: lift this constraint in the future + if (isCandidate && loadOp.getResult().hasOneUse()) { + isCandidate = false; + Operation *use = *loadOp.getResult().getUsers().begin(); + + // Advance to the first conversion as long as the use resides in shared + // memory and it has a single use itself + while (use) { + if (use->getNumResults() != 1 || !use->getResult(0).hasOneUse()) + break; + auto tensorType = + dyn_cast(use->getResult(0).getType()); + if (!tensorType || + !isa(tensorType.getEncoding())) + break; + use = *use->getResult(0).getUsers().begin(); + } - for (Operation &op : forOp.getBody()->without_terminator()) { - if (!op.hasTrait()) - continue; - seen.clear(); - dfs(&op, 0, &op); - } + // TODO: handle fp_to_fp conversions in between + if (auto convertLayout = llvm::dyn_cast(use)) + if (auto tensorType = + dyn_cast(convertLayout.getResult().getType())) + if (auto dotOpEnc = dyn_cast( + tensorType.getEncoding())) { + isCandidate = true; + convertMapping[loadOp] = convertLayout; + } + } else + isCandidate = false; - // If the loop has numStages attribute, also 
consider pipelining other loads - // that are not directly used by dot ops. - if (forOp->hasAttr(tt::kNumStagesAttrName)) { - for (Operation &op : forOp.getBody()->without_terminator()) { - if (!isa(op)) - dfs(&op, 0, &op); - } + if (isCandidate) + validLoads.insert(op); } - return loadOpToIndLevelAndUse; + return validLoads.empty() ? failure() : success(); } -static llvm::MapVector -assignMemoryLayouts(llvm::SmallVector> - &loadOpToIndLevelAndUse, - tt::ModuleAxisInfoAnalysis &axisInfoAnalysis) { - llvm::MapVector loadToInfo; - - for (auto &[op, dist, use] : loadOpToIndLevelAndUse) { - if (loadToInfo.count(op)) - // TODO pawel: err, we'd need to verify that the distance is the same - continue; - LoadInfo loadInfo; - - auto loadOp = dyn_cast(op); - assert(!isLoadFromTensorPtr(loadOp) && - "Block ptr should have been lowered before this pass."); - auto ptr = loadOp.getPtr(); - unsigned vec = axisInfoAnalysis.getPtrContiguity(ptr); - if (auto mask = loadOp.getMask()) - vec = std::min(vec, axisInfoAnalysis.getMaskAlignment(mask)); - - auto tensorTy = dyn_cast(ptr.getType()); - if (!tensorTy) - continue; - - auto ty = cast(tensorTy.getElementType()).getPointeeType(); - unsigned width = vec * ty.getIntOrFloatBitWidth(); - - // Limit shared memory sharing to width >= 32 elements. - LDBG("Load " << *loadOp << " has width " << width); - if (width < 32) - continue; - - if (use->hasTrait()) { - // Only use shared memory when feeding a dot op - loadInfo.usedByDot = true; - loadInfo.sharedEncoding = - getSharedEncIfAllUsersAreDotEnc(op->getResult(0)).value_or(nullptr); - } else if (auto loadOp = dyn_cast(use)) { - // The use of this loadOp is another loadOp. If the use is not in the - // loadsToPipeline already, it means that the use is not valid for - // pipelining for some reason. We should skip this loadOp, too. Note that - // we have an assumption that distAndUse.second (i.e. the use of this - // loadOp) has already be processed in a previous loop iteration. 
This - // assumption is held by how loadOpsToIndirectionLevelAndUse recursively - // collects loadOpToIndLevelAndUse using DFS. - if (loadToInfo.count(loadOp) == 0) { - continue; +LogicalResult LoopPipeliner::checkOpDeps() { + /// arg => source operand defined stages + DenseMap> immediateArgStages; + SetVector nonImmediateDepArgs; + SetVector nonImmediateOps; + for (Operation *op : validLoads) { + for (Value v : op->getOperands()) { + SetVector deps; + SetVector args; + collectValueDep(v, numStages - 1, deps, args); + int defStage = getValueDefStage(v, numStages - 1); + if (defStage < 0) { + // assert(defStage >= 0 && + // "newLoopArgs has null args without a define op. Consider + // either " "rewrite the loop to reduce cross iteration + // dependencies or " "increase the num_stages value."); + return failure(); + } + bool immediate = args.size() > 0; + for (auto *dep : deps) { + depOps.insert(dep); + if (immediate) + immediateOpStages[dep].insert(defStage); + else + nonImmediateOps.insert(dep); + } + for (auto arg : args) { + depArgs.insert(arg); + if (immediate) + immediateArgStages[arg].insert(defStage); + else + nonImmediateDepArgs.insert(arg); } } + } - // If we still don't have a shared encoding, try a "generic" shared - // encoding. - if (!loadInfo.sharedEncoding) { - // Also pipeline in-register buffers. - loadInfo.blockedEncoding = getBlockedEncoding(loadOp, axisInfoAnalysis); - } + // XXX: We could remove the following constraints if we can rematerialize in + // the loop. + // Check if immediateDepArgs and nonImmediateDepArgs are disjoint. + for (auto &[arg, stages] : immediateArgStages) { + assert(stages.size() == 1 && + "Triton doesn't support an argument provides values for " + "immediate operands of loads from multiple stages. 
Consider " + "removing post load instructions dependency on this argument."); + assert(!(nonImmediateDepArgs.contains(arg) && + stages.contains(numStages - 2)) && + "Loop-carried arguments provide values for both immediate and " + "non-immediate operands of loads. Please consider removing " + "pre/post load instructions dependency on this argument."); + } - loadToInfo[op] = loadInfo; + // Check if immediateOps and nonImmediateOps are disjoint. + for (auto &[op, stages] : immediateOpStages) { + assert(stages.size() == 1 && + "Triton doesn't support an operation provides values for " + "immediate operands of loads from multiple stages. Consider " + "removing post load instructions dependency on this argument."); + assert(!(nonImmediateOps.contains(op) && stages.contains(numStages - 2)) && + "Operations provide values for both immediate and " + "non-immediate operands of loads. Please consider " + "removing pre/post load instructions dependency on this " + "operation."); } + return success(); +} - return loadToInfo; +// helpers +void LoopPipeliner::setValueMapping(Value origin, Value newValue, int stage) { + if (valueMapping.find(origin) == valueMapping.end()) + valueMapping[origin] = SmallVector(numStages); + valueMapping[origin][stage] = newValue; } -static llvm::MapVector -scheduleLoads(scf::ForOp forOp, tt::CoarseSchedule &schedule, - DenseSet &rootUsers, int numStages) { - ModuleOp moduleOp = forOp->getParentOfType(); - tt::ModuleAxisInfoAnalysis axisInfoAnalysis(moduleOp); - - // Get all loads that are (transitively) used by dot ops and their distance - // to the dot op. 
- llvm::SmallVector> - loadOpToIndLevelAndUse = loadOpsToIndirectionLevelAndUse(forOp); - LLVM_DEBUG({ - LDBG("Found " << loadOpToIndLevelAndUse.size() << " loads to pipeline:"); - for (const auto &[l, i, u] : loadOpToIndLevelAndUse) { - LDBG(" - load: " << *l); - LDBG(" at indirection level: " << i); - LDBG(" used by op: " << *u); +void LoopPipeliner::setValueMappingYield(Value origin, Value newValue, + int stage) { + for (OpOperand &operand : origin.getUses()) { + if (operand.getOwner() == yieldOp) { + auto yieldIdx = operand.getOperandNumber(); + auto value = forOp.getRegionIterArgs()[yieldIdx]; + setValueMapping(value, newValue, stage); } - }); - if (loadOpToIndLevelAndUse.empty()) - return {}; - - // Check which loads are good for pipelining, and assign them - // memory layouts. - llvm::MapVector loadToInfo = - assignMemoryLayouts(loadOpToIndLevelAndUse, axisInfoAnalysis); - - if (loadToInfo.empty()) - return {}; - - // Calculate the stage distance between applicable loads. - int maxIndirectionLevel = -1; - for (auto [loadOp, dist, use] : loadOpToIndLevelAndUse) { - if (loadToInfo.count(loadOp) == 0) - continue; - maxIndirectionLevel = std::max(maxIndirectionLevel, dist); } - unsigned stagesBetweenLoads = - ceil(numStages - 2, maxIndirectionLevel + 1); - - tt::CoarseSchedule::Cluster rootUsersCluster = schedule.clusters.newAtFront(); - // Put the root uses of the loads in the last stage. 
- for (auto &[loadOp, dist, use] : loadOpToIndLevelAndUse) { - if (loadToInfo.count(loadOp) == 0) - continue; - // Non-LoadOp(s) are the root uses of all LoadOp(s) and should be - // always present in the opInfo - if (!isa(use)) { - schedule.insert(use, numStages - 1, rootUsersCluster); - rootUsers.insert(use); +} + +void LoopPipeliner::setValueMappingYield(Value origin, Value newValue) { + for (OpOperand &operand : origin.getUses()) { + if (operand.getOwner() == yieldOp) { + auto yieldIdx = operand.getOperandNumber(); + auto depYieldIdx = depArgsIdx[forOp.getRegionIterArgs()[yieldIdx]]; + auto originArg = forOp.getRegionIterArgs()[yieldIdx]; + nextMapping.map(originArg, newValue); + auto newArg = pplForOp.getRegionIterArgs()[depYieldIdx]; + if (!depArgsMapping.contains(newArg)) + depArgsMapping[newArg] = newValue; } } +} - SmallVector loadsClusters; - for (int i = 0; i < maxIndirectionLevel + 1; i++) { - loadsClusters.push_back(schedule.clusters.newAtBack()); - } - // Assign stages to the loads. - for (auto [loadOp, indLevel, _] : loadOpToIndLevelAndUse) { - if (loadToInfo.count(loadOp) == 0) - continue; - int stage = (maxIndirectionLevel - indLevel) * stagesBetweenLoads; - schedule.insert(loadOp, stage, loadsClusters[indLevel]); +Value LoopPipeliner::lookupOrDefault(Value origin, int stage) { + if (valueMapping.find(origin) == valueMapping.end()) + return origin; + return valueMapping[origin][stage]; +} + +void LoopPipeliner::createBufferTypes() { + for (auto loadCvt : convertMapping) { + auto loadOp = loadCvt.first; + Value cvt = loadCvt.second; + auto dotOpEnc = cast( + cast(cvt.getType()).getEncoding()); + auto ty = cast(loadOp.getType()); + SmallVector bufferShape(ty.getShape().begin(), + ty.getShape().end()); + Type eType = ty.getElementType(); + auto blockedEnc = cast(ty.getEncoding()); + auto CTALayout = ttg::getCTALayout(ty.getEncoding()); + // unsigned bitWidth = dotOpEnc.getMMAv2kWidth() + // ? 
32 / dotOpEnc.getMMAv2kWidth() + // : ty.getElementType().getIntOrFloatBitWidth(); + auto sharedEnc = ttg::SharedEncodingAttr::get( + ty.getContext(), dotOpEnc, ty.getShape(), + ttg::getOrder(ty.getEncoding()), CTALayout, eType); + loadsBufferType[loadOp] = triton::MemDescType::get( + bufferShape, eType, sharedEnc, + triton::gpu::SharedMemorySpaceAttr::get(ty.getContext()), + /*mutableMemory=*/true); } +} - // Distance from the load to the use. - for (auto [loadOp, _, use] : loadOpToIndLevelAndUse) { - if (loadToInfo.count(loadOp) == 0) - continue; - loadToInfo[loadOp].distToUse = schedule[use].first - schedule[loadOp].first; +void LoopPipeliner::createOrderedDeps() { + for (Operation &op : forOp.getBody()->without_terminator()) { + if (depOps.contains(&op)) + orderedDeps.push_back(&op); + else if (op.getNumResults() > 0 && validLoads.contains(&op)) + orderedDeps.push_back(&op); } + assert(depOps.size() + validLoads.size() == orderedDeps.size() && + "depOps contains invalid values"); +} - return loadToInfo; +void LoopPipeliner::collectDepChain(Operation *op, + SetVector &ops) { + if (op->getNumResults() == 1 && validLoads.contains(op)) + return; + if (!ops.contains(op)) { + ops.insert(op); + for (Value opr : op->getOperands()) + if (Operation *oprOp = opr.getDefiningOp()) + collectDepChain(oprOp, ops); + } } -// Add dependencies of anchor ops to the coarse schedule. Schedule them to -// the same stage and ordering cluster as the anchor op. -static void scheduleDependencies(scf::ForOp forOp, tt::CoarseSchedule &schedule, - int numStages) { - SmallVector> - opsInOrder = schedule.getOpsInOrder(forOp); - // Schedule dependencies stage by stage. 
- for (int stage = 0; stage < numStages; stage++) { - for (auto [op, stage_, cluster] : opsInOrder) { - if (stage_ != stage) - continue; - schedule.insertDepsOfOp(op, stage, cluster, false); - } +void LoopPipeliner::createCurrentDeps() { + for (Operation &op : forOp.getBody()->without_terminator()) { + if (!llvm::is_contained(orderedDeps, &op)) + collectDepChain(&op, currentDeps); } } -// Find dependencies with distance of 1. They will go to the next stage, -// but in the cluster before the current op. -static void scheduleDistanceOneDependencies(scf::ForOp forOp, - tt::CoarseSchedule &schedule, - int numStages) { - auto getNestedOperands = [](Operation *op) -> SmallVector { - SmallVector operands; - op->walk([&](Operation *nestedOp) { - for (Value operand : nestedOp->getOperands()) { - if (operand.getParentBlock()->getParentOp()->isAncestor(nestedOp)) - operands.push_back(operand); +int LoopPipeliner::getValueDefStage(Value v, int stage) { + if (stage < 0) + return -1; + if (auto arg = dyn_cast(v)) { + if (arg.getArgNumber() > 0) + return getValueDefStage(yieldOp->getOperand(arg.getArgNumber() - 1), + stage - 1); + llvm_unreachable("Loop induction variable should not be a dependency"); + } else + return stage; +} + +LogicalResult LoopPipeliner::initialize() { + if (checkOpUses().failed()) + return failure(); + + if (checkOpDeps().failed()) + return failure(); + + createBufferTypes(); + + createOrderedDeps(); + + createCurrentDeps(); + + return success(); +} + +Value LoopPipeliner::getLoadMask(triton::LoadOp loadOp, Value mappedMask, + Value loopCond, OpBuilder &builder) { + if (!peelLastIter) { + // add mask for last iteration when not peeled to epilogue + Value mask = loadOp.getMask(); + Type maskType = triton::getI1SameShape(loadOp.getType()); + Value newMask; + if (mask) { + Value cond = loopCond; + if (isa(maskType)) { + cond = + builder.create(mask.getLoc(), maskType, loopCond); } - }); - return operands; - }; - - // Mapping from the cluster to the cluster 
before it. - DenseMap - dist1Cluster; - for (auto &op : forOp.getBody()->without_terminator()) { - if (schedule.count(&op) == 0) - continue; - auto [stage, cluster] = schedule[&op]; - // Can't schedule past the last stage. - if (stage == numStages - 1) - continue; - for (Value operand : getNestedOperands(&op)) { - if (auto arg = dyn_cast(operand)) { - if (arg.getArgNumber() > 0 && arg.getOwner() == op.getBlock()) { - auto yieldOp = op.getBlock()->getTerminator(); - Value v = yieldOp->getOperand(arg.getArgNumber() - 1); - Operation *defOp = v.getDefiningOp(); - if (defOp && schedule.count(defOp) == 0) { - if (isa(defOp)) { - // Exception: Schedule loads with a distance of 1 together - // with the current op. - schedule.insertIfAbsent(defOp, stage, cluster); - schedule.insertDepsOfOp(defOp, stage, cluster, true); - } else { - if (dist1Cluster.count(&cluster) == 0) { - dist1Cluster[&cluster] = schedule.clusters.newBefore(cluster); - } - schedule.insertIfAbsent(defOp, stage + 1, dist1Cluster[&cluster]); - schedule.insertDepsOfOp(defOp, stage + 1, dist1Cluster[&cluster], - true); - } - } - } + newMask = builder.create(mask.getLoc(), mappedMask, cond); + } else { + if (isa(maskType)) { + newMask = builder.create(loopCond.getLoc(), maskType, + loopCond); + } else { + newMask = loopCond; } } + return newMask; } + // use original mask when peeling last iteration bc the loop will not do + // extra loads for the tail of the pipeline + return mappedMask; } -static void -scheduleRemainingToLastStage(scf::ForOp forOp, tt::CoarseSchedule &schedule, - tt::CoarseSchedule::Cluster afterPrologue, - int numStages) { - // Assign the rest of the ops to the last stage. - // Take care of the ordering of the ops - uses cannot be scheduled to the - // cluster before the definition. 
- DenseMap opToCluster; - for (auto &op : forOp.getBody()->without_terminator()) { - if (schedule.count(&op) == 0) { - opToCluster[&op] = afterPrologue; +bool LoopPipeliner::isLoadChain(Operation *op) const { + if (auto cvtOp = dyn_cast(op)) { + Value loadVal = cvtOp.getSrc(); + if (auto f2fOp = dyn_cast(op)) + loadVal = f2fOp.getSrc(); + if (validLoads.contains(loadVal.getDefiningOp())) { + if (isa(cvtOp.getType().getEncoding())) + return true; } } - SmallVector queue; - for (auto [op, stage, cluster] : schedule.getOpsInOrder(forOp)) { - // We really only care about the producers from the last stage. - // Others will be scheduled before these ops anyway. - if (stage == numStages - 1) { - queue.push_back(op); - } + return false; +} + +void LoopPipeliner::emitPrologue() { + /// forOp block args => forOp operands + /// forOp iterator => lower bound + IRMapping prologueMap; + OpBuilder builder(forOp); + // Get init operands for loop carried values + for (BlockArgument &arg : forOp.getRegionIterArgs()) { + OpOperand &operand = *forOp.getTiedLoopInit(arg); + prologueMap.map(arg, operand.get()); } - while (!queue.empty()) { - Operation *op = queue.pop_back_val(); - for (auto user : op->getUsers()) { - if (opToCluster.count(user)) { - tt::CoarseSchedule::Cluster userCluster = opToCluster[user]; - tt::CoarseSchedule::Cluster opCluster = schedule[op].second; - if (*userCluster < *opCluster) { - opToCluster[user] = opCluster; - queue.push_back(user); + + // Emit prologue + // Map IV to lower bound + prologueMap.map(forOp.getInductionVar(), forOp.getLowerBound()); + + // Emit Iteration 0 loads, etc + for (Operation *op : orderedDeps) { + Operation *newOp = nullptr; + if (validLoads.contains(op)) { + auto loadOp = cast(op); + // Load from global -> regs + auto newLoadOp = cloneWithInferType(builder, op, prologueMap); + Value loadVal = newLoadOp->getResult(0); + // Convert from regs to shared mem + newOp = builder.create( + loadOp.getLoc(), loadsBufferType[loadOp], loadVal); + 
Value cvtVal = newOp->getResult(0); + prologueMap.map(loadOp->getResult(0), cvtVal); + loadsBuffer[op] = cvtVal; + } else { + newOp = cloneWithInferType(builder, op, prologueMap); + } + // Capture loop carried results for pipelined for input + for (unsigned idx : llvm::seq(unsigned(0), op->getNumResults())) + setValueMappingYield(op->getResult(idx), newOp->getResult(idx), 1); + } // for (Operation *op : orderedDeps) +} + +void LoopPipeliner::emitEpilogue(DenseMap &newResults) { + if (!peelLastIter) + return; + OpBuilder builder(pplForOp); + builder.setInsertionPointAfter(pplForOp); + + IRMapping epilogueMap; + // Map 'for' iteration args to pipelined-for results + auto args = forOp.getRegionIterArgs(); + for (uint32_t i = 0; i < args.size(); ++i) + epilogueMap.map(args[i], pplForOp.getResult(i)); + for (auto *loadOp : validLoads) + epilogueMap.map(loadOp->getResult(0), loadsBuffer[loadOp]); + + // This is computing the upper bound of the pipelined loop as: + // pplUpperBound = lb+((ub-1-lb)/step)*step + Location loc = forOp.getLoc(); + Value ub = forOp.getUpperBound(); + Value lb = forOp.getLowerBound(); + Value step = forOp.getStep(); + Value one = builder.create(loc, 1, 32); + + // pplRange = ub-1-lb + Value pplRange = builder.create( + loc, builder.create(loc, ub, one), lb); + + // pplIters = (pplrRange/step)*step + Value pplIters = builder.create( + loc, builder.create(loc, pplRange, step), step); + + // pplUpperBound = lb+pplIters + Value pplUpperBound = builder.create(loc, lb, pplIters); + epilogueMap.map(forOp.getInductionVar(), pplUpperBound); + + const auto &yieldOprs = yieldOp.getOperands(); + // Clone the loop body after the new ForOp + // , replace original args with results of the new ForOp. 
+ for (Operation &op : forOp.getBody()->without_terminator()) { + if (currentDeps.contains(&op)) { + Operation *newOp = nullptr; + if (isLoadChain(&op)) { + if (auto cvt = dyn_cast(&op)) { + Value mappedValue = epilogueMap.lookup(cvt.getSrc()); + if (isa(mappedValue.getType())) { + auto newCvt = builder.create( + cvt.getLoc(), cvt.getType(), mappedValue); + epilogueMap.map(cvt.getResult(), newCvt); + newOp = newCvt; + } + } + if (!newOp) + newOp = builder.clone(op, epilogueMap); + } else { + newOp = cloneWithInferType(builder, &op, epilogueMap); + } + // substitute for these results for the results of the new for loop + for (const auto &pair : llvm::zip(op.getResults(), newOp->getResults())) { + auto val = std::get<0>(pair); + auto it = llvm::find(yieldOprs, val); + if (it != yieldOprs.end()) { + uint32_t idx = std::distance(yieldOprs.begin(), it); + newResults[forOp->getResult(idx)] = std::get<1>(pair); } } } } - for (auto [op, cluster] : opToCluster) { - schedule.insert(op, numStages - 1, cluster); - } } -// Create an allocation that can hold distance number of loadOp shapes. 
-static Value createAlloc(scf::ForOp &forOp, Operation *loadOp, - ttg::SharedEncodingAttr sharedEnc, unsigned distance) { - OpBuilder builder(forOp); - Attribute sharedMemorySpace = - triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); - auto ty = cast(loadOp->getResultTypes()[0]); - SmallVector bufferShape(ty.getShape().begin(), ty.getShape().end()); - bufferShape.insert(bufferShape.begin(), distance); - Type memdescType = mlir::triton::MemDescType::get( - bufferShape, ty.getElementType(), sharedEnc, sharedMemorySpace, - /*mutableMemory*/ true); - Value alloc = builder.create( - loadOp->getLoc(), memdescType, Value()); - return alloc; -} +SmallVector LoopPipeliner::collectNewLoopArgs() { + // Order of new args: + // (original args) + // (shared mem buffers for each load) + // (depArgs at stage numStages - 1) + + // We need this to update operands for yield + // original block arg => new arg's idx + SmallVector newLoopArgs; + for (auto v : forOp.getInitArgs()) { + newLoopArgs.push_back(lookupOrDefault(v, numStages - 1)); /*1*/ + } -// Convert load ops into their asyn version and apply multi-buffering based on -// the required number of buffers. -static SmallVector -createStreamOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule, - llvm::MapVector &loadToInfo, - int numStages) { - // Calculate the number of buffers needed for each load. - // TODO pawel: we could do more fine-grained allocation here and - // allocate only the number of buffers that specific loads need. - // Instead, we allocate the maximum number of buffers needed by any load. 
- int numBuffers = - llvm::max_element(llvm::make_second_range(loadToInfo), [](auto &lhs, - auto &rhs) { - return lhs.distToUse < rhs.distToUse; - })->distToUse; - - SmallVector> asyncLoads; - SmallVector allocs; - for (auto &[loadOp, info] : loadToInfo) { - // assert(info.sharedEncoding && "LoadOp shared encoding not defined."); - if (info.sharedEncoding) { - Value alloc = createAlloc(forOp, loadOp, info.sharedEncoding, numBuffers); - assert(alloc && "Failed to create alloc for the async load."); - allocs.push_back(alloc); - asyncLoads.emplace_back(loadOp, alloc); - } + // Loop carried vals + depArgsBeginIdx = newLoopArgs.size(); + for (auto depArg : depArgs) { + depArgsIdx[depArg] = newLoopArgs.size(); + newLoopArgs.push_back(valueMapping[depArg][numStages - 1]); /*1*/ } - IRRewriter builder(forOp.getContext()); - builder.setInsertionPoint(forOp); + return newLoopArgs; +} - Location loc = forOp.getLoc(); - // Create two new counters to index into the allocs. - Value minusOne = builder.create(loc, -1, 32); - Value zero = builder.create(loc, 0, 32); - Value one = builder.create(loc, 1, 32); - Value insertIdx = minusOne; - Value extractIdx = minusOne; - Value phase = Value(); - Value numBuffersVal = - builder.create(loc, numBuffers, 32); - SmallVector newOperands; - newOperands.push_back(insertIdx); - newOperands.push_back(extractIdx); - - unsigned newOperandIndex = forOp.getBody()->getNumArguments(); - // Patch the loop to add the new loop carried dependencies. 
- scf::ForOp newForOp = - replaceForOpWithNewSignature(builder, forOp, newOperands); - forOp.erase(); - forOp = newForOp; - insertIdx = newForOp.getBody()->getArgument(newOperandIndex); - extractIdx = newForOp.getBody()->getArgument(newOperandIndex + 1); - if (phase) { - phase = newForOp.getBody()->getArgument(newOperandIndex + 2); +scf::ForOp LoopPipeliner::cloneForOp(ArrayRef newLoopArgs, + OpBuilder &builder) { + auto loc = forOp.getLoc(); + // Peel off the last iteration + auto pplUpperBound = forOp.getUpperBound(); + if (peelLastIter) + pplUpperBound = + builder.create(loc, pplUpperBound, forOp.getStep()); + + // Clone the original ForOp + pplForOp = builder.create( + loc, forOp.getLowerBound(), pplUpperBound, forOp.getStep(), newLoopArgs); + + // Set mapping on body of the new ForOp + builder.setInsertionPointToStart(pplForOp.getBody()); + for (const auto &arg : llvm::enumerate(forOp.getRegionIterArgs())) + curMapping.map(arg.value(), pplForOp.getRegionIterArgs()[arg.index()]); + for (auto *loadOp : validLoads) + curMapping.map(loadOp->getResult(0), loadsBuffer[loadOp]); + curMapping.map(forOp.getInductionVar(), pplForOp.getInductionVar()); + + nextMapping = curMapping; + // Map the dep args of the next iteration to the dep args of the current + auto iterArgs = pplForOp.getRegionIterArgs(); + size_t argIdx = 0; + for (auto depArg : depArgs) { + BlockArgument nextArg = iterArgs[argIdx + depArgsBeginIdx]; + nextMapping.map(depArg, nextArg); + ++argIdx; } - // Create two counters for the insert and extract indices to avoid creating - // long liverange. 
- builder.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin()); - insertIdx = builder.create(loc, insertIdx, one); - Value cndIns = builder.create(loc, arith::CmpIPredicate::slt, - insertIdx, numBuffersVal); - insertIdx = builder.create(loc, cndIns, insertIdx, zero); - - extractIdx = builder.create(loc, extractIdx, one); - Value cndExt = builder.create(loc, arith::CmpIPredicate::slt, - extractIdx, numBuffersVal); - extractIdx = builder.create(loc, cndExt, extractIdx, zero); - if (phase) { - Value nextPhase = builder.create(loc, phase, one); - phase = builder.create(loc, cndExt, phase, nextPhase); + // Compute next IV for pre-loads + Value iv = pplForOp.getInductionVar(); + curMapping.map(forOp.getInductionVar(), iv); + Value nextIV = + builder.create(iv.getLoc(), iv, pplForOp.getStep()); + nextMapping.map(forOp.getInductionVar(), nextIV); + nextLoopCond = + builder.create(nextIV.getLoc(), arith::CmpIPredicate::slt, + nextIV, pplForOp.getUpperBound()); + + return pplForOp; +} + +void LoopPipeliner::updateLoadMask(triton::LoadOp loadOp, Value newMask) { + if (newMask) { + if (loadOp->getNumOperands() > 1) + loadOp->setOperand(1, newMask); + else { + auto mask = loadOp.getMaskMutable(); + mask.assign(newMask); + } } +} - // Create a cluster for the prefetches. It may end up being empty, but this - // is OK. 
- tt::CoarseSchedule::Cluster prefetchCluster = schedule.clusters.newAtBack(); +void LoopPipeliner::prefetchNextBuffer(OpBuilder &builder) { + // Emit prefetch loads of next buffer before compute of current buffer + for (Operation *op : orderedDeps) { + Operation *nextOp = nullptr; + if (validLoads.contains(op)) { + // Update loading mask + auto loadOp = llvm::cast(op); + auto mask = loadOp.getMask(); + // pre-load global -> regs + Value newMask = getLoadMask(loadOp, nextMapping.lookupOrDefault(mask), + nextLoopCond, builder); + if (mask) { + // If mask is defined outside the loop, don't update the map more than + // once + if (!(forOp.isDefinedOutsideOfLoop(mask) && nextMapping.contains(mask))) + nextMapping.map(loadOp.getMask(), newMask); + newMask = nextMapping.lookupOrDefault(mask); + } + auto newOp = builder.clone(*op, nextMapping); + updateLoadMask(cast(newOp), newMask); + } else if (!immediateOpStages[op].contains(numStages - 2)) { + Operation *nextOp = builder.clone(*op, nextMapping); + if (auto loadOp = dyn_cast(op)) { + if (auto newMask = getLoadMask( + loadOp, nextMapping.lookupOrDefault(loadOp.getMask()), + nextLoopCond, builder)) { + updateLoadMask(cast(nextOp), newMask); + } + } - for (auto &pair : asyncLoads) { - if (auto loadOp = dyn_cast(pair.first)) { - createStreamCopy(forOp, loadOp, pair.second, insertIdx, extractIdx, - schedule, prefetchCluster, loadToInfo, numStages); + for (unsigned dstIdx : llvm::seq(unsigned(0), op->getNumResults())) + nextMapping.map(op->getResult(dstIdx), nextOp->getResult(dstIdx)); + for (unsigned dstIdx : llvm::seq(unsigned(0), op->getNumResults())) + setValueMappingYield(op->getResult(dstIdx), nextOp->getResult(dstIdx)); } } - SmallVector newYieldOperands = {insertIdx, extractIdx}; - if (phase) - newYieldOperands.push_back(phase); - // Patch the yield with the updated counters. 
- appendToYield(forOp, newYieldOperands); - - return allocs; } -static bool -preProcessLoopAndGetSchedule2(scf::ForOp &forOp, int numStages, - mlir::triton::PipeliningOption &options) { - // Schedule the loads and root ops (dot ops) in the loop. This will give us - // a scaffold for the final schedule. - DenseSet rootUsers; - tt::CoarseSchedule coarseSchedule(numStages); - llvm::MapVector loadToInfo = - scheduleLoads(forOp, coarseSchedule, rootUsers, numStages); - if (loadToInfo.empty()) - return false; - - LLVM_DEBUG({ - LDBG("Coarse schedule loads only:"); - coarseSchedule.dump(); - }); - - // Convert the loads into async loads and create the allocs. - SmallVector allocs = - createStreamOps(forOp, coarseSchedule, loadToInfo, numStages); - - LLVM_DEBUG({ - LDBG("Coarse schedule with stream loads:"); - coarseSchedule.dump(); - }); - - tt::CoarseSchedule::Cluster afterPrologue = coarseSchedule.clusters.begin(); - - scheduleDependencies(forOp, coarseSchedule, numStages); - LLVM_DEBUG({ - LDBG("Coarse schedule with dependencies:"); - coarseSchedule.dump(); - }); - - scheduleDistanceOneDependencies(forOp, coarseSchedule, numStages); - LLVM_DEBUG({ - LDBG("Coarse schedule with dist 1:"); - coarseSchedule.dump(); - }); - - scheduleRemainingToLastStage(forOp, coarseSchedule, afterPrologue, numStages); - LLVM_DEBUG({ - LDBG("Final coarse schedule:"); - coarseSchedule.dump(); - }); - - // Create the final schedule for the kernel loop. This will dictate the - // stages and order of operations to the pipeline expander. - std::vector> schedule = - coarseSchedule.createFinalSchedule(forOp); - - // Fill out the pipeline options. 
- options.getScheduleFn = - [schedule](scf::ForOp forOp, - std::vector> &s) { - s = std::move(schedule); - }; - options.peelEpilogue = false; - options.predicateFn = tt::predicateOp; - options.supportDynamicLoops = true; - options.annotateFn = [](Operation *op, - mlir::triton::PipeliningOption::PipelinerPart part, - unsigned iteration) {}; - // Insert a wait 0 after the loop - OpBuilder builder(forOp); - builder.setInsertionPointAfter(forOp); - // Explicitly deallocate allocated tensors after the wait op - for (auto alloc : allocs) - builder.create(forOp.getLoc(), alloc); - return true; +void LoopPipeliner::cloneCurrentBody(OpBuilder &builder) { + auto loc = forOp.getLoc(); + // only add instructions that are not part of the restructuring + for (Operation &op : forOp.getBody()->without_terminator()) { + if (currentDeps.contains(&op)) { + Operation *newOp = nullptr; + if (isLoadChain(&op)) { + if (auto cvt = dyn_cast(&op)) { + Value mappedValue = curMapping.lookup(cvt.getSrc()); + if (isa(mappedValue.getType())) { + auto newCvt = builder.create( + cvt.getLoc(), cvt.getType(), mappedValue); + curMapping.map(cvt.getResult(), newCvt); + newOp = newCvt; + } + } + if (!newOp) + newOp = builder.clone(op, curMapping); + } else { + newOp = cloneWithInferType(builder, &op, curMapping); + } + } + } } -// Return true if the preconditions for pipelining the loop are met. -static bool preConditionInner(scf::ForOp forOp) { - // Skip loop with distance > 1 for now. - // TODO: relax the constraint in the expander. - if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(), - [](Value operand) { - Operation *def = operand.getDefiningOp(); - return !def; - })) - return false; - // Don't pipeline outer loops. 
- if (forOp - ->walk([&](Operation *op) { - if (forOp.getOperation() == op) - return WalkResult::advance(); - if (isa(op)) - return WalkResult::interrupt(); - return WalkResult::advance(); - }) - .wasInterrupted()) - return false; - return true; -} +void LoopPipeliner::storeNextBuffer(OpBuilder &builder) { + // Store the next buffer at the end of the loop body for the next iteration + for (Operation *op : orderedDeps) { + if (!validLoads.contains(op)) { + if (immediateOpStages[op].contains(numStages - 2)) { + Operation *nextOp = builder.clone(*op, nextMapping); + if (auto loadOp = dyn_cast(op)) { + auto newMask = + getLoadMask(loadOp, nextMapping.lookupOrDefault(loadOp.getMask()), + nextLoopCond, builder); + updateLoadMask(cast(nextOp), newMask); + } -static bool pipelineLoop(scf::ForOp forOp, int numStages) { - mlir::triton::PipeliningOption options; - if (!preConditionInner(forOp)) - return false; + for (unsigned dstIdx : llvm::seq(unsigned(0), op->getNumResults())) + setValueMappingYield(op->getResult(dstIdx), + nextOp->getResult(dstIdx)); + } + } + } - bool foundSchedule = false; - foundSchedule = preProcessLoopAndGetSchedule2(forOp, numStages, options); + // PL loads -> store next to shared + for (auto *loadOp : validLoads) { + Value loadVal = nextMapping.lookup(loadOp->getResult(0)); + // then store regs -> shared + Value storeBuf = loadsBuffer[loadOp]; + builder.create(loadOp->getLoc(), loadVal, storeBuf); + } - // TODO: add more pipelines strategy. 
- if (!foundSchedule) - return false; + // Some values have not been used by any ops in the loop body + for (BlockArgument arg : forOp.getRegionIterArgs()) + setValueMappingYield(arg, pplForOp.getRegionIterArgs()[depArgsIdx[arg]]); +} - IRRewriter rewriter(forOp->getContext()); - rewriter.setInsertionPoint(forOp); - FailureOr newForOp = - mlir::triton::pipelineForLoop(rewriter, forOp, options); +void LoopPipeliner::finalizeYield(OpBuilder &builder) { + SmallVector yieldValues; + for (const auto &opr : llvm::enumerate(yieldOp->getOperands())) { + if (curMapping.contains(opr.value())) + yieldValues.push_back(curMapping.lookup(opr.value())); + else + yieldValues.push_back(pplForOp.getRegionIterArgs()[opr.index()]); + } + for (size_t i = 0; i < depArgsMapping.size(); ++i) { + auto arg = pplForOp.getRegionIterArgs()[depArgsBeginIdx + i]; + assert(depArgsMapping.count(arg) && "Missing loop-carried value"); + yieldValues.push_back(depArgsMapping[arg]); + } - if (failed(newForOp)) - return false; - return true; + builder.setInsertionPointToEnd(pplForOp.getBody()); + builder.create(yieldOp->getLoc(), yieldValues); } -namespace { +scf::ForOp LoopPipeliner::createNewForOp() { + OpBuilder builder(forOp); + auto newLoopArgs = collectNewLoopArgs(); + cloneForOp(newLoopArgs, builder); + prefetchNextBuffer(builder); + cloneCurrentBody(builder); + storeNextBuffer(builder); + finalizeYield(builder); + return pplForOp; +} + +// Stream Pipeline struct PipelinePass : public TritonAMDGPUStreamPipelineBase { PipelinePass() = default; - PipelinePass(int32_t numStages) { this->numStages = numStages; } - - int getNumStagesOrDefault(scf::ForOp forOp) { - // Use the attribute attached to the loop if it exists otherwise use the - // global control. 
- if (auto attr = - forOp->getAttrOfType(mlir::triton::kNumStagesAttrName)) - return attr.getInt(); - return numStages; - } void runOnOperation() override { - SmallVector loops; - getOperation()->walk([&](scf::ForOp forOp) { - // Bail out for loops with num_stage <= 1. - if (getNumStagesOrDefault(forOp) > 1) - loops.push_back(forOp); + // Pre-processing + // we make sure element-wise ops are done *after* the conversion + // to dot operands + // we can achieve this with simple recursive pattern matching + // MLIRContext *context = &getContext(); + // mlir::RewritePatternSet patterns(context); + // patterns.add(context); + // auto didPreprocess = + // applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); + + // Do the pipelining + getOperation()->walk([&](scf::ForOp forOp) -> void { + LoopPipeliner pipeliner(forOp); + + if (pipeliner.initialize().failed()) + return; + + pipeliner.emitPrologue(); + scf::ForOp pplForOp = pipeliner.createNewForOp(); + DenseMap newResults; + for (unsigned i = 0; i < forOp->getNumResults(); ++i) + newResults[forOp->getResult(i)] = pplForOp->getResult(i); + pipeliner.emitEpilogue(newResults); + + // Replace the original loop + for (auto &pair : newResults) + std::get<0>(pair).replaceAllUsesWith(std::get<1>(pair)); + forOp->erase(); }); - - if (loops.empty()) - return; - - bool pipelined = false; - for (scf::ForOp forOp : loops) { - auto outerLoop = dyn_cast(forOp->getParentOp()); - int loopNumStages = getNumStagesOrDefault(forOp); - pipelined |= pipelineLoop(forOp, loopNumStages); - } - - if (pipelined) { - // Clean up arithmetic before applying the next level of pipelining to - // simplify the IR. 
- auto arithDialect = - getOperation().getContext()->getLoadedDialect(); - RewritePatternSet patterns(getOperation().getContext()); - arithDialect->getCanonicalizationPatterns(patterns); - if (applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)) - .failed()) - signalPassFailure(); - } } }; } // anonymous namespace -std::unique_ptr -mlir::createTritonAMDGPUStreamPipelinePass(int numStages) { - return std::make_unique(numStages); +std::unique_ptr mlir::createTritonAMDGPUStreamPipelinePass() { + return std::make_unique(); } diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp new file mode 100644 index 000000000000..6c438f6d1b80 --- /dev/null +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -0,0 +1,791 @@ +#include "TritonAMDGPUTransforms/Passes.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "triton/Analysis/AxisInfo.h" +#include "triton/Analysis/Utility.h" +#include "triton/Dialect/TritonGPU/IR/Dialect.h" +#include "triton/Dialect/TritonGPU/Transforms/Passes.h" +#include "triton/Dialect/TritonGPU/Transforms/PipelineExpander.h" +#include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h" +#include "triton/Dialect/TritonGPU/Transforms/Schedule.h" +#include "triton/Dialect/TritonGPU/Transforms/Utility.h" +#include "triton/Tools/Sys/GetEnv.hpp" +#include "llvm/Support/Debug.h" + +#include + +//===----------------------------------------------------------------------===// +// This file will create a schedule that will be handed over to the pipeline +// expander. 
+// Software pipeliners are usually separated into two pieces, one that create a +// modulo schedule and an expander that rewrites the loop and emits a prologue +// and epilogue. This pass first calls a helper that will pre-process the IR +// to create stream operations and create a modulo schedule. Then we call the +// expander to generate the prologue and new loop. +//===----------------------------------------------------------------------===// + +#define GEN_PASS_CLASSES +#include "TritonAMDGPUTransforms/Passes.h.inc" + +#define DEBUG_TYPE "tritonamdgpu-stream-pipeline-v2" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") + +#define int_attr(num) builder.getI64IntegerAttr(num) + +using namespace mlir; +namespace tt = mlir::triton; +namespace ttg = mlir::triton::gpu; + +namespace { + +struct LoadInfo { + // Layout of the data in the shared memory. + ttg::SharedEncodingAttr sharedEncoding = nullptr; + // Blocked encoding is used for loads not used by the dot. + ttg::BlockedEncodingAttr blockedEncoding = nullptr; + int distToUse = 0; + bool usedByDot = false; +}; + +} // namespace + +// Replace the ForOp's yield with a new one with the given operands appended. +static void appendToYield(scf::ForOp forOp, ArrayRef newOperands) { + // Fix up the yield op. + Operation *yieldOp = forOp.getBody()->getTerminator(); + SmallVector operands(yieldOp->getOperands()); + operands.append(newOperands.begin(), newOperands.end()); + + OpBuilder builder(yieldOp); + builder.create(yieldOp->getLoc(), operands); + yieldOp->erase(); +} + +static void createStreamCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc, + Value insertIdx, Value extractIdx, + tt::CoarseSchedule &schedule, + tt::CoarseSchedule::Cluster prefetchCluster, + llvm::MapVector &loadToInfo, + int numStages) { + OpBuilder builder(forOp); + Value zero = builder.create(forOp.getLoc(), 0, 32); + // Replace the load with insert/extract slice. 
+ builder.setInsertionPoint(loadOp); + Location loc = loadOp.getLoc(); + Value src = loadOp.getPtr(); + Value mask = loadOp.getMask(); + Value other = loadOp.getOther(); + if (!isExpensiveLoadOrStore(loadOp) && loadToInfo[loadOp].blockedEncoding) { + // For inexpensive loads that do not directly feed into dot ops + // we want to use optimal layout for the data. + ttg::BlockedEncodingAttr encoding = loadToInfo[loadOp].blockedEncoding; + auto convertBlockLayout = [&](Value src) { + auto ty = cast(src.getType()); + auto newTy = + RankedTensorType::get(ty.getShape(), ty.getElementType(), encoding); + auto cvt = + builder.create(loadOp->getLoc(), newTy, src); + return cvt.getResult(); + }; + src = convertBlockLayout(src); + if (mask) + mask = convertBlockLayout(mask); + if (other) + other = convertBlockLayout(other); + } + + tt::MemDescType allocTy = cast(alloc.getType()); + SmallVector copyOffsets(allocTy.getRank(), zero); + copyOffsets[0] = insertIdx; + Operation *copy = builder.clone(*loadOp); + + auto [stage, cluster] = schedule[loadOp]; + schedule.erase(loadOp); + schedule.insert(copy, stage, cluster); + + // Extract part. + SmallVector loadOffsets(allocTy.getRank(), zero); + loadOffsets[0] = extractIdx; + Attribute sharedMemorySpace = + triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); + tt::MemDescType subviewTy = tt::MemDescType::get( + allocTy.getShape().drop_front(), allocTy.getElementType(), + allocTy.getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); + auto viewLoad = + builder.create(loc, subviewTy, alloc, loadOffsets); + Operation *lds_store = + builder.create(loc, copy->getResult(0), viewLoad); + { + // Clean up old local caches. 
+ SmallVector allocsToErase; + for (Operation *user : loadOp->getUsers()) { + if (auto alloc = dyn_cast(user)) { + alloc.replaceAllUsesWith(viewLoad.getResult()); + allocsToErase.push_back(alloc); + } + } + for (auto alloc : allocsToErase) { + alloc.erase(); + } + + auto sharedLoad = + builder.create(loc, loadOp.getType(), viewLoad); + auto result = sharedLoad->getResults(); + + // Create a select for non-zero other values. + Value other = loadOp.getOther(); + if (other && !isZeroConst(other)) { + auto select = builder.create( + loc, loadOp.getType(), mask, sharedLoad.getResult(), other); + result = select->getResults(); + } + + loadOp->replaceAllUsesWith(result); + + // Prefetch load if is used by the dot. + if (loadToInfo[loadOp].usedByDot) { + schedule.insert(lds_store, numStages - 2, prefetchCluster); + schedule.insert(viewLoad, numStages - 2, prefetchCluster); + } + } + loadOp.erase(); +} + +// If all the transitive uses of the given value have are used by a convert to +// the same dot operand encoding, return true and get the shared encoding that +// needs to be used to be compatible with users' layouts. +static std::optional +getSharedEncIfAllUsersAreDotEnc(Value val) { + ttg::SharedEncodingAttr attr; + for (Operation *user : val.getUsers()) { + ttg::SharedEncodingAttr tempAttr; + if (user->getNumResults() != 1) + return std::nullopt; + if (auto memDesc = + dyn_cast(user->getResult(0).getType())) { + // First time we find a shared encoding in the chain, save it and try to + // use it if it is compatible with the other users. 
+ tempAttr = cast(memDesc.getEncoding()); + if (!getSharedEncIfAllUsersAreDotEnc(user->getResult(0)).has_value()) + return std::nullopt; + } else { + if (!isa(user)) + return std::nullopt; + auto dotOpEnc = dyn_cast( + cast(user->getResult(0).getType()).getEncoding()); + if (!dotOpEnc) + return std::nullopt; + auto srcTy = cast(val.getType()); + auto CTALayout = ttg::getCTALayout(srcTy.getEncoding()); + auto order = ttg::getOrder(srcTy.getEncoding()); + unsigned bitWidth = srcTy.getElementType().getIntOrFloatBitWidth(); + tempAttr = ttg::SharedEncodingAttr::get( + val.getContext(), dotOpEnc, srcTy.getShape(), + ttg::getOrder(srcTy.getEncoding()), + ttg::getCTALayout(srcTy.getEncoding()), + srcTy.getElementType().getIntOrFloatBitWidth(), /*needTrans=*/false); + } + // Check that the shared encodings needed by the users are compatible. + if (!tempAttr || (attr != nullptr && attr != tempAttr)) + return std::nullopt; + attr = tempAttr; + } + return attr; +} + +static ttg::BlockedEncodingAttr +getBlockedEncoding(tt::LoadOp loadOp, tt::ModuleAxisInfoAnalysis &axisInfo) { + Value src = loadOp.getPtr(); + auto ty = cast(src.getType()); + auto mod = loadOp->getParentOfType(); + int numWarps = ttg::TritonGPUDialect::getNumWarps(mod); + int threadsPerWarp = ttg::TritonGPUDialect::getThreadsPerWarp(mod); + tt::AxisInfo::DimVectorT contiguity = + axisInfo.getAxisInfo(src)->getContiguity(); + SmallVector order = argSort(contiguity); + unsigned currPerThread = getNumElementsPerThread(loadOp, order, axisInfo); + SmallVector sizePerThread(order.size(), 1); + sizePerThread[order[0]] = currPerThread; + ttg::CTALayoutAttr ctaLayout = ttg::getCTALayout(ty.getEncoding()); + return ttg::BlockedEncodingAttr::get(loadOp->getContext(), ty.getShape(), + sizePerThread, order, numWarps, + threadsPerWarp, ctaLayout); +} + +// Create a map from load ops to their indirection level and the +// final use of the load op (another load op, or a dot op). 
+// Indirection level is "0" for the load op directly used by the dot op, +// "1" for the load op used by the load op used by the dot op, and so on. +static llvm::SmallVector> +loadOpsToIndirectionLevelAndUse(scf::ForOp forOp) { + llvm::SmallVector> + loadOpToIndLevelAndUse; + DenseSet seen; + + std::function dfs = + [&](Operation *op, int distance, Operation *use) { + if (!seen.insert(op).second) + return; + if (isa(op)) { + // TODO: What if there are multiple uses at different distances? + loadOpToIndLevelAndUse.push_back(std::make_tuple(op, distance, use)); + use = op; + distance++; + } + for (Value operand : op->getOperands()) { + Value v = operand; + Operation *defOp = v.getDefiningOp(); + if (defOp && defOp->getBlock() == op->getBlock()) { + dfs(defOp, distance, use); + } + } + }; + + for (Operation &op : forOp.getBody()->without_terminator()) { + if (!op.hasTrait()) + continue; + seen.clear(); + dfs(&op, 0, &op); + } + + // If the loop has numStages attribute, also consider pipelining other loads + // that are not directly used by dot ops. 
+ if (forOp->hasAttr(tt::kNumStagesAttrName)) { + for (Operation &op : forOp.getBody()->without_terminator()) { + if (!isa(op)) + dfs(&op, 0, &op); + } + } + + return loadOpToIndLevelAndUse; +} + +static llvm::MapVector +assignMemoryLayouts(llvm::SmallVector> + &loadOpToIndLevelAndUse, + tt::ModuleAxisInfoAnalysis &axisInfoAnalysis) { + llvm::MapVector loadToInfo; + + for (auto &[op, dist, use] : loadOpToIndLevelAndUse) { + if (loadToInfo.count(op)) + // TODO pawel: err, we'd need to verify that the distance is the same + continue; + LoadInfo loadInfo; + + auto loadOp = dyn_cast(op); + assert(!isLoadFromTensorPtr(loadOp) && + "Block ptr should have been lowered before this pass."); + auto ptr = loadOp.getPtr(); + unsigned vec = axisInfoAnalysis.getPtrContiguity(ptr); + if (auto mask = loadOp.getMask()) + vec = std::min(vec, axisInfoAnalysis.getMaskAlignment(mask)); + + auto tensorTy = dyn_cast(ptr.getType()); + if (!tensorTy) + continue; + + auto ty = cast(tensorTy.getElementType()).getPointeeType(); + unsigned width = vec * ty.getIntOrFloatBitWidth(); + + // Limit shared memory sharing to width >= 32 elements. + LDBG("Load " << *loadOp << " has width " << width); + if (width < 32) + continue; + + if (use->hasTrait()) { + // Only use shared memory when feeding a dot op + loadInfo.usedByDot = true; + loadInfo.sharedEncoding = + getSharedEncIfAllUsersAreDotEnc(op->getResult(0)).value_or(nullptr); + } else if (auto loadOp = dyn_cast(use)) { + // The use of this loadOp is another loadOp. If the use is not in the + // loadsToPipeline already, it means that the use is not valid for + // pipelining for some reason. We should skip this loadOp, too. Note that + // we have an assumption that distAndUse.second (i.e. the use of this + // loadOp) has already be processed in a previous loop iteration. This + // assumption is held by how loadOpsToIndirectionLevelAndUse recursively + // collects loadOpToIndLevelAndUse using DFS. 
+ if (loadToInfo.count(loadOp) == 0) { + continue; + } + } + + // If we still don't have a shared encoding, try a "generic" shared + // encoding. + if (!loadInfo.sharedEncoding) { + // Also pipeline in-register buffers. + loadInfo.blockedEncoding = getBlockedEncoding(loadOp, axisInfoAnalysis); + } + + loadToInfo[op] = loadInfo; + } + + return loadToInfo; +} + +static llvm::MapVector +scheduleLoads(scf::ForOp forOp, tt::CoarseSchedule &schedule, + DenseSet &rootUsers, int numStages) { + ModuleOp moduleOp = forOp->getParentOfType(); + tt::ModuleAxisInfoAnalysis axisInfoAnalysis(moduleOp); + + // Get all loads that are (transitively) used by dot ops and their distance + // to the dot op. + llvm::SmallVector> + loadOpToIndLevelAndUse = loadOpsToIndirectionLevelAndUse(forOp); + LLVM_DEBUG({ + LDBG("Found " << loadOpToIndLevelAndUse.size() << " loads to pipeline:"); + for (const auto &[l, i, u] : loadOpToIndLevelAndUse) { + LDBG(" - load: " << *l); + LDBG(" at indirection level: " << i); + LDBG(" used by op: " << *u); + } + }); + if (loadOpToIndLevelAndUse.empty()) + return {}; + + // Check which loads are good for pipelining, and assign them + // memory layouts. + llvm::MapVector loadToInfo = + assignMemoryLayouts(loadOpToIndLevelAndUse, axisInfoAnalysis); + + if (loadToInfo.empty()) + return {}; + + // Calculate the stage distance between applicable loads. + int maxIndirectionLevel = -1; + for (auto [loadOp, dist, use] : loadOpToIndLevelAndUse) { + if (loadToInfo.count(loadOp) == 0) + continue; + maxIndirectionLevel = std::max(maxIndirectionLevel, dist); + } + unsigned stagesBetweenLoads = + ceil(numStages - 2, maxIndirectionLevel + 1); + + tt::CoarseSchedule::Cluster rootUsersCluster = schedule.clusters.newAtFront(); + // Put the root uses of the loads in the last stage. 
+ for (auto &[loadOp, dist, use] : loadOpToIndLevelAndUse) { + if (loadToInfo.count(loadOp) == 0) + continue; + // Non-LoadOp(s) are the root uses of all LoadOp(s) and should be + // always present in the opInfo + if (!isa(use)) { + schedule.insert(use, numStages - 1, rootUsersCluster); + rootUsers.insert(use); + } + } + + SmallVector loadsClusters; + for (int i = 0; i < maxIndirectionLevel + 1; i++) { + loadsClusters.push_back(schedule.clusters.newAtBack()); + } + // Assign stages to the loads. + for (auto [loadOp, indLevel, _] : loadOpToIndLevelAndUse) { + if (loadToInfo.count(loadOp) == 0) + continue; + int stage = (maxIndirectionLevel - indLevel) * stagesBetweenLoads; + schedule.insert(loadOp, stage, loadsClusters[indLevel]); + } + + // Distance from the load to the use. + for (auto [loadOp, _, use] : loadOpToIndLevelAndUse) { + if (loadToInfo.count(loadOp) == 0) + continue; + loadToInfo[loadOp].distToUse = schedule[use].first - schedule[loadOp].first; + } + + return loadToInfo; +} + +// Add dependencies of anchor ops to the coarse schedule. Schedule them to +// the same stage and ordering cluster as the anchor op. +static void scheduleDependencies(scf::ForOp forOp, tt::CoarseSchedule &schedule, + int numStages) { + SmallVector> + opsInOrder = schedule.getOpsInOrder(forOp); + // Schedule dependencies stage by stage. + for (int stage = 0; stage < numStages; stage++) { + for (auto [op, stage_, cluster] : opsInOrder) { + if (stage_ != stage) + continue; + schedule.insertDepsOfOp(op, stage, cluster, false); + } + } +} + +// Find dependencies with distance of 1. They will go to the next stage, +// but in the cluster before the current op. 
+static void scheduleDistanceOneDependencies(scf::ForOp forOp, + tt::CoarseSchedule &schedule, + int numStages) { + auto getNestedOperands = [](Operation *op) -> SmallVector { + SmallVector operands; + op->walk([&](Operation *nestedOp) { + for (Value operand : nestedOp->getOperands()) { + if (operand.getParentBlock()->getParentOp()->isAncestor(nestedOp)) + operands.push_back(operand); + } + }); + return operands; + }; + + // Mapping from the cluster to the cluster before it. + DenseMap + dist1Cluster; + for (auto &op : forOp.getBody()->without_terminator()) { + if (schedule.count(&op) == 0) + continue; + auto [stage, cluster] = schedule[&op]; + // Can't schedule past the last stage. + if (stage == numStages - 1) + continue; + for (Value operand : getNestedOperands(&op)) { + if (auto arg = dyn_cast(operand)) { + if (arg.getArgNumber() > 0 && arg.getOwner() == op.getBlock()) { + auto yieldOp = op.getBlock()->getTerminator(); + Value v = yieldOp->getOperand(arg.getArgNumber() - 1); + Operation *defOp = v.getDefiningOp(); + if (defOp && schedule.count(defOp) == 0) { + if (isa(defOp)) { + // Exception: Schedule loads with a distance of 1 together + // with the current op. + schedule.insertIfAbsent(defOp, stage, cluster); + schedule.insertDepsOfOp(defOp, stage, cluster, true); + } else { + if (dist1Cluster.count(&cluster) == 0) { + dist1Cluster[&cluster] = schedule.clusters.newBefore(cluster); + } + schedule.insertIfAbsent(defOp, stage + 1, dist1Cluster[&cluster]); + schedule.insertDepsOfOp(defOp, stage + 1, dist1Cluster[&cluster], + true); + } + } + } + } + } + } +} + +static void +scheduleRemainingToLastStage(scf::ForOp forOp, tt::CoarseSchedule &schedule, + tt::CoarseSchedule::Cluster afterPrologue, + int numStages) { + // Assign the rest of the ops to the last stage. + // Take care of the ordering of the ops - uses cannot be scheduled to the + // cluster before the definition. 
+ DenseMap opToCluster; + for (auto &op : forOp.getBody()->without_terminator()) { + if (schedule.count(&op) == 0) { + opToCluster[&op] = afterPrologue; + } + } + SmallVector queue; + for (auto [op, stage, cluster] : schedule.getOpsInOrder(forOp)) { + // We really only care about the producers from the last stage. + // Others will be scheduled before these ops anyway. + if (stage == numStages - 1) { + queue.push_back(op); + } + } + while (!queue.empty()) { + Operation *op = queue.pop_back_val(); + for (auto user : op->getUsers()) { + if (opToCluster.count(user)) { + tt::CoarseSchedule::Cluster userCluster = opToCluster[user]; + tt::CoarseSchedule::Cluster opCluster = schedule[op].second; + if (*userCluster < *opCluster) { + opToCluster[user] = opCluster; + queue.push_back(user); + } + } + } + } + for (auto [op, cluster] : opToCluster) { + schedule.insert(op, numStages - 1, cluster); + } +} + +// Create an allocation that can hold distance number of loadOp shapes. +static Value createAlloc(scf::ForOp &forOp, Operation *loadOp, + ttg::SharedEncodingAttr sharedEnc, unsigned distance) { + OpBuilder builder(forOp); + Attribute sharedMemorySpace = + triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); + auto ty = cast(loadOp->getResultTypes()[0]); + SmallVector bufferShape(ty.getShape().begin(), ty.getShape().end()); + bufferShape.insert(bufferShape.begin(), distance); + Type memdescType = mlir::triton::MemDescType::get( + bufferShape, ty.getElementType(), sharedEnc, sharedMemorySpace, + /*mutableMemory*/ true); + Value alloc = builder.create( + loadOp->getLoc(), memdescType, Value()); + return alloc; +} + +// Convert load ops into their asyn version and apply multi-buffering based on +// the required number of buffers. +static SmallVector +createStreamOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule, + llvm::MapVector &loadToInfo, + int numStages) { + // Calculate the number of buffers needed for each load. 
+ // TODO pawel: we could do more fine-grained allocation here and + // allocate only the number of buffers that specific loads need. + // Instead, we allocate the maximum number of buffers needed by any load. + int numBuffers = + llvm::max_element(llvm::make_second_range(loadToInfo), [](auto &lhs, + auto &rhs) { + return lhs.distToUse < rhs.distToUse; + })->distToUse; + + SmallVector> asyncLoads; + SmallVector allocs; + for (auto &[loadOp, info] : loadToInfo) { + // assert(info.sharedEncoding && "LoadOp shared encoding not defined."); + if (info.sharedEncoding) { + Value alloc = createAlloc(forOp, loadOp, info.sharedEncoding, numBuffers); + assert(alloc && "Failed to create alloc for the async load."); + allocs.push_back(alloc); + asyncLoads.emplace_back(loadOp, alloc); + } + } + + IRRewriter builder(forOp.getContext()); + builder.setInsertionPoint(forOp); + + Location loc = forOp.getLoc(); + // Create two new counters to index into the allocs. + Value minusOne = builder.create(loc, -1, 32); + Value zero = builder.create(loc, 0, 32); + Value one = builder.create(loc, 1, 32); + Value insertIdx = minusOne; + Value extractIdx = minusOne; + Value phase = Value(); + Value numBuffersVal = + builder.create(loc, numBuffers, 32); + SmallVector newOperands; + newOperands.push_back(insertIdx); + newOperands.push_back(extractIdx); + + unsigned newOperandIndex = forOp.getBody()->getNumArguments(); + // Patch the loop to add the new loop carried dependencies. + scf::ForOp newForOp = + replaceForOpWithNewSignature(builder, forOp, newOperands); + forOp.erase(); + forOp = newForOp; + insertIdx = newForOp.getBody()->getArgument(newOperandIndex); + extractIdx = newForOp.getBody()->getArgument(newOperandIndex + 1); + if (phase) { + phase = newForOp.getBody()->getArgument(newOperandIndex + 2); + } + + // Create two counters for the insert and extract indices to avoid creating + // long liverange. 
+ builder.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin()); + insertIdx = builder.create(loc, insertIdx, one); + Value cndIns = builder.create(loc, arith::CmpIPredicate::slt, + insertIdx, numBuffersVal); + insertIdx = builder.create(loc, cndIns, insertIdx, zero); + + extractIdx = builder.create(loc, extractIdx, one); + Value cndExt = builder.create(loc, arith::CmpIPredicate::slt, + extractIdx, numBuffersVal); + extractIdx = builder.create(loc, cndExt, extractIdx, zero); + if (phase) { + Value nextPhase = builder.create(loc, phase, one); + phase = builder.create(loc, cndExt, phase, nextPhase); + } + + // Create a cluster for the prefetches. It may end up being empty, but this + // is OK. + tt::CoarseSchedule::Cluster prefetchCluster = schedule.clusters.newAtBack(); + + for (auto &pair : asyncLoads) { + if (auto loadOp = dyn_cast(pair.first)) { + createStreamCopy(forOp, loadOp, pair.second, insertIdx, extractIdx, + schedule, prefetchCluster, loadToInfo, numStages); + } + } + SmallVector newYieldOperands = {insertIdx, extractIdx}; + if (phase) + newYieldOperands.push_back(phase); + // Patch the yield with the updated counters. + appendToYield(forOp, newYieldOperands); + + return allocs; +} + +static bool +preProcessLoopAndGetSchedule2(scf::ForOp &forOp, int numStages, + mlir::triton::PipeliningOption &options) { + // Schedule the loads and root ops (dot ops) in the loop. This will give us + // a scaffold for the final schedule. + DenseSet rootUsers; + tt::CoarseSchedule coarseSchedule(numStages); + llvm::MapVector loadToInfo = + scheduleLoads(forOp, coarseSchedule, rootUsers, numStages); + if (loadToInfo.empty()) + return false; + + LLVM_DEBUG({ + LDBG("Coarse schedule loads only:"); + coarseSchedule.dump(); + }); + + // Convert the loads into async loads and create the allocs. 
+ SmallVector allocs = + createStreamOps(forOp, coarseSchedule, loadToInfo, numStages); + + LLVM_DEBUG({ + LDBG("Coarse schedule with stream loads:"); + coarseSchedule.dump(); + }); + + tt::CoarseSchedule::Cluster afterPrologue = coarseSchedule.clusters.begin(); + + scheduleDependencies(forOp, coarseSchedule, numStages); + LLVM_DEBUG({ + LDBG("Coarse schedule with dependencies:"); + coarseSchedule.dump(); + }); + + scheduleDistanceOneDependencies(forOp, coarseSchedule, numStages); + LLVM_DEBUG({ + LDBG("Coarse schedule with dist 1:"); + coarseSchedule.dump(); + }); + + scheduleRemainingToLastStage(forOp, coarseSchedule, afterPrologue, numStages); + LLVM_DEBUG({ + LDBG("Final coarse schedule:"); + coarseSchedule.dump(); + }); + + // Create the final schedule for the kernel loop. This will dictate the + // stages and order of operations to the pipeline expander. + std::vector> schedule = + coarseSchedule.createFinalSchedule(forOp); + + // Fill out the pipeline options. + options.getScheduleFn = + [schedule](scf::ForOp forOp, + std::vector> &s) { + s = std::move(schedule); + }; + options.peelEpilogue = false; + options.predicateFn = tt::predicateOp; + options.supportDynamicLoops = true; + options.annotateFn = [](Operation *op, + mlir::triton::PipeliningOption::PipelinerPart part, + unsigned iteration) {}; + // Insert a wait 0 after the loop + OpBuilder builder(forOp); + builder.setInsertionPointAfter(forOp); + // Explicitly deallocate allocated tensors after the wait op + for (auto alloc : allocs) + builder.create(forOp.getLoc(), alloc); + return true; +} + +// Return true if the preconditions for pipelining the loop are met. +static bool preConditionInner(scf::ForOp forOp) { + // Skip loop with distance > 1 for now. + // TODO: relax the constraint in the expander. + if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(), + [](Value operand) { + Operation *def = operand.getDefiningOp(); + return !def; + })) + return false; + // Don't pipeline outer loops. 
+ if (forOp + ->walk([&](Operation *op) { + if (forOp.getOperation() == op) + return WalkResult::advance(); + if (isa(op)) + return WalkResult::interrupt(); + return WalkResult::advance(); + }) + .wasInterrupted()) + return false; + return true; +} + +static bool pipelineLoop(scf::ForOp forOp, int numStages) { + mlir::triton::PipeliningOption options; + if (!preConditionInner(forOp)) + return false; + + bool foundSchedule = false; + foundSchedule = preProcessLoopAndGetSchedule2(forOp, numStages, options); + + // TODO: add more pipelines strategy. + if (!foundSchedule) + return false; + + IRRewriter rewriter(forOp->getContext()); + rewriter.setInsertionPoint(forOp); + FailureOr newForOp = + mlir::triton::pipelineForLoop(rewriter, forOp, options); + + if (failed(newForOp)) + return false; + return true; +} + +namespace { +struct PipelinePass : public TritonAMDGPUStreamPipelineV2Base { + PipelinePass() = default; + PipelinePass(int32_t numStages) { this->numStages = numStages; } + + int getNumStagesOrDefault(scf::ForOp forOp) { + // Use the attribute attached to the loop if it exists otherwise use the + // global control. + if (auto attr = + forOp->getAttrOfType(mlir::triton::kNumStagesAttrName)) + return attr.getInt(); + return numStages; + } + + void runOnOperation() override { + SmallVector loops; + getOperation()->walk([&](scf::ForOp forOp) { + // Bail out for loops with num_stage <= 1. + if (getNumStagesOrDefault(forOp) > 1) + loops.push_back(forOp); + }); + + if (loops.empty()) + return; + + bool pipelined = false; + for (scf::ForOp forOp : loops) { + auto outerLoop = dyn_cast(forOp->getParentOp()); + int loopNumStages = getNumStagesOrDefault(forOp); + pipelined |= pipelineLoop(forOp, loopNumStages); + } + + if (pipelined) { + // Clean up arithmetic before applying the next level of pipelining to + // simplify the IR. 
+ auto arithDialect = + getOperation().getContext()->getLoadedDialect(); + RewritePatternSet patterns(getOperation().getContext()); + arithDialect->getCanonicalizationPatterns(patterns); + if (applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)) + .failed()) + signalPassFailure(); + } + } +}; +} // anonymous namespace + +std::unique_ptr +mlir::createTritonAMDGPUStreamPipelineV2Pass(int numStages) { + return std::make_unique(numStages); +} diff --git a/third_party/amd/python/triton_amd.cc b/third_party/amd/python/triton_amd.cc index c91a2992e7b2..9839b5dab7ad 100644 --- a/third_party/amd/python/triton_amd.cc +++ b/third_party/amd/python/triton_amd.cc @@ -55,8 +55,10 @@ void init_triton_amd_passes_ttgpuir(py::module &&m) { mlir::createTritonAMDGPUOptimizeEpiloguePass); ADD_PASS_WRAPPER_0("add_reorder_instructions", mlir::createTritonAMDGPUReorderInstructionsPass); - ADD_PASS_WRAPPER_1("add_stream_pipeline", - mlir::createTritonAMDGPUStreamPipelinePass, int); + ADD_PASS_WRAPPER_0("add_stream_pipeline", + mlir::createTritonAMDGPUStreamPipelinePass); + ADD_PASS_WRAPPER_1("add_stream_pipelinev2", + mlir::createTritonAMDGPUStreamPipelineV2Pass, int); } void addControlConstant(llvm::Module *module, const char *name, From 96c326ba09a546aa7eb43d71d46f85400863d61e Mon Sep 17 00:00:00 2001 From: SJW Date: Mon, 22 Jul 2024 19:10:51 +0000 Subject: [PATCH 13/36] * register new pass tritonamdgpu-stream-pipeline-v2 * update test --- bin/RegisterTritonDialects.h | 1 + test/TritonGPU/amd/amd-stream-pipeline.mlir | 26 ++++++++++----------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/bin/RegisterTritonDialects.h b/bin/RegisterTritonDialects.h index 1bd1db9496ea..e36710882afb 100644 --- a/bin/RegisterTritonDialects.h +++ b/bin/RegisterTritonDialects.h @@ -58,6 +58,7 @@ inline void registerTritonDialects(mlir::DialectRegistry ®istry) { mlir::registerTritonAMDGPUOptimizeEpilogue(); mlir::registerTritonAMDGPUReorderInstructions(); 
mlir::registerTritonAMDGPUStreamPipeline(); + mlir::registerTritonAMDGPUStreamPipelineV2(); // TODO: register Triton & TritonGPU passes registry.insert 128x128 @@ -750,9 +750,9 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> - %25 = tt.trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> - %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %25 = tt.trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> + %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> } @@ -804,9 +804,9 @@ module attributes 
{"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> - %25 = tt.trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> - %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %25 = tt.trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> + %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> } @@ -981,9 +981,9 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 %63 = scf.for %arg6 = %c0_i32 to %c64_i32 step %c32_i32 iter_args(%arg7 = %cst) -> (tensor<64x32xf32, #mma>) : i32 { %70 = tt.load %59 
: tensor<32x64x!tt.ptr, #blocked1> %71 = triton_gpu.convert_layout %62 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> - %73 = tt.trans %72 {order=array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> - %74 = triton_gpu.local_load %73 : !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> + %73 = tt.trans %72 {order=array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable> + %74 = triton_gpu.local_load %73 : !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> %75 = tt.dot %71, %74, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> %76 = tt.load %61 : tensor<32x32x!tt.ptr, #blocked1> %77 = triton_gpu.convert_layout %75 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> @@ -1234,9 +1234,9 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 %9 = tt.addptr %7, %8 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> scf.for %arg1 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { %10 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> - %11 = triton_gpu.local_alloc %10 : (tensor<16x16xf32, #blocked>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> - %12 = 
tt.trans %11 {order = array} : !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> - %13 = triton_gpu.local_load %12 : !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %11 = triton_gpu.local_alloc %10 : (tensor<16x16xf32, #blocked>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> + %12 = tt.trans %11 {order = array} : !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory, mutable> + %13 = triton_gpu.local_load %12 : !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> scf.for %arg2 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { %14 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> %15 = triton_gpu.convert_layout %14 : tensor<16x16xf32, #blocked> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> From e4a89b31434263a35027a4fbc228bb33cfb68af8 Mon Sep 17 00:00:00 2001 From: SJW Date: Mon, 22 Jul 2024 19:34:58 +0000 Subject: [PATCH 14/36] * update tests --- .../amd/amd-reorder-instructions.mlir | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/TritonGPU/amd/amd-reorder-instructions.mlir b/test/TritonGPU/amd/amd-reorder-instructions.mlir index 97b52dfbb75f..9be5206bcdbc 100644 --- a/test/TritonGPU/amd/amd-reorder-instructions.mlir +++ b/test/TritonGPU/amd/amd-reorder-instructions.mlir @@ -924,8 +924,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %29 = tt.dot %27, %28, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> %30 = arith.truncf %29 : tensor<128x16xf32, #mma> to 
tensor<128x16xf16, #mma> %31 = triton_gpu.convert_layout %30 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %32 = tt.trans %arg7 {order = array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> - %33 = triton_gpu.local_load %32 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %32 = tt.trans %arg7 {order = array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> + %33 = triton_gpu.local_load %32 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %34 = tt.dot %31, %33, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> %35 = arith.addi %arg6, %c1_i32 : i32 %36 = arith.cmpi slt, %35, %c1_i32 : i32 @@ -1015,9 +1015,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %30 = tt.dot %28, %29, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> %31 = arith.truncf %30 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> %32 = triton_gpu.convert_layout %31 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %33 = triton_gpu.local_alloc %arg7 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> - %34 = tt.trans %33 {order = array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, 
#triton_gpu.shared_memory> - %35 = triton_gpu.local_load %34 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %33 = triton_gpu.local_alloc %arg7 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %34 = tt.trans %33 {order = array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> + %35 = triton_gpu.local_load %34 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> %36 = tt.dot %32, %35, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> scf.yield %30, %36, %24, %27, %21 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>, i32, i32, tensor<64x16xf16, #blocked> } @@ -1292,8 +1292,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %82 = arith.cmpi slt, %81, %c1_i32 : i32 %83 = arith.select %82, %81, %c0_i32 : i32 %84 = triton_gpu.convert_layout %59 : tensor<64x64xf32, #blocked1> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %85 = tt.trans %arg10 {order = array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<64x32xf32, #shared2, #triton_gpu.shared_memory> - %86 = triton_gpu.local_load %85 : !tt.memdesc<64x32xf32, #shared2, #triton_gpu.shared_memory> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %85 = tt.trans %arg10 {order = array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<64x32xf32, #shared2, #triton_gpu.shared_memory, mutable> + %86 = triton_gpu.local_load %85 : !tt.memdesc<64x32xf32, #shared2, 
#triton_gpu.shared_memory, mutable> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> %87 = tt.dot %84, %86, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> %88 = triton_gpu.convert_layout %87 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> %89 = triton_gpu.local_load %arg11 : !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> @@ -1682,9 +1682,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : scf.for %arg1 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { %10 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> %11 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> - %12 = triton_gpu.local_alloc %10 : (tensor<16x16xf32, #blocked>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> - %13 = tt.trans %12 {order = array} : !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> - %14 = triton_gpu.local_load %13 : !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %12 = triton_gpu.local_alloc %10 : (tensor<16x16xf32, #blocked>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> + %13 = tt.trans %12 {order = array} : !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory, mutable> + %14 = triton_gpu.local_load %13 : !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> %15 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf32, #shared, #triton_gpu.shared_memory, mutable> 
%16 = triton_gpu.memdesc_subview %15[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> triton_gpu.local_store %11, %16 : tensor<16x16xf32, #blocked> -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> From ee989333740bd0098b36ff2fc92c18c680c269a3 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Tue, 23 Jul 2024 06:02:13 +0000 Subject: [PATCH 15/36] Swap to disable new pipeline by default --- third_party/amd/backend/compiler.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py index 713d5543925f..9f2d0fd91fe4 100644 --- a/third_party/amd/backend/compiler.py +++ b/third_party/amd/backend/compiler.py @@ -28,7 +28,7 @@ def min_dot_size(target: GPUTarget): class HIPOptions: num_warps: int = 4 waves_per_eu: int = 1 - num_stages: int = 2 + num_stages: int = 0 num_ctas: int = 1 extern_libs: dict = None cluster_dims: tuple = (1, 1, 1) @@ -149,17 +149,20 @@ def make_ttgir(mod, metadata, options): passes.ttgpuir.add_remove_layout_conversions(pm) amd.passes.ttgpuir.add_optimize_epilogue(pm) passes.ttgpuir.add_optimize_dot_operands(pm, True) + use_new_pipeliner = os.getenv("TRITON_HIP_USE_NEW_STREAM_PIPELINE", "0") == "1" if amd.has_matrix_core_feature(options.arch): - if os.getenv("TRITONAMD_OLD_STREAM_PIPELINE", "0") == "1": + if use_new_pipeliner: + num_stages = options.num_stages if options.num_stages != 0 else 2 + amd.passes.ttgpuir.add_stream_pipelinev2(pm, num_stages) + else: if options.num_stages == 0: amd.passes.ttgpuir.add_stream_pipeline(pm) - else: - amd.passes.ttgpuir.add_stream_pipelinev2(pm, options.num_stages) passes.common.add_canonicalizer(pm) passes.ttgpuir.add_optimize_dot_operands(pm, True) passes.ttgpuir.add_remove_layout_conversions(pm) passes.ttgpuir.add_reduce_data_duplication(pm) - 
amd.passes.ttgpuir.add_reorder_instructions(pm) + if use_new_pipeliner or options.num_stages != 0: + amd.passes.ttgpuir.add_reorder_instructions(pm) passes.common.add_cse(pm) passes.common.add_symbol_dce(pm) pm.run(mod) From c464a84039d76b6d8bfa90d17a41888313146926 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Tue, 23 Jul 2024 23:56:30 +0000 Subject: [PATCH 16/36] Drop unused header includes --- .../amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index 6c438f6d1b80..bbeeaf837712 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -1,23 +1,16 @@ #include "TritonAMDGPUTransforms/Passes.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/IR/IRMapping.h" #include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Support/LLVM.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "triton/Analysis/AxisInfo.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" -#include "triton/Dialect/TritonGPU/Transforms/Passes.h" #include "triton/Dialect/TritonGPU/Transforms/PipelineExpander.h" #include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h" #include "triton/Dialect/TritonGPU/Transforms/Schedule.h" #include "triton/Dialect/TritonGPU/Transforms/Utility.h" -#include "triton/Tools/Sys/GetEnv.hpp" #include "llvm/Support/Debug.h" -#include - //===----------------------------------------------------------------------===// // This file will create a schedule that will be handed over to the pipeline // expander. 
From 1ceb6c61121adb73e42486be35eadcf35e88127e Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Wed, 24 Jul 2024 00:00:26 +0000 Subject: [PATCH 17/36] Drop changes to be exposed in future pull requests --- python/tutorials/03-matrix-multiplication.py | 10 +- .../amd/amd-reorder-instructions.mlir | 2281 ----------------- third_party/amd/backend/compiler.py | 12 +- .../ReorderInstructions.cpp | 127 +- 4 files changed, 25 insertions(+), 2405 deletions(-) diff --git a/python/tutorials/03-matrix-multiplication.py b/python/tutorials/03-matrix-multiplication.py index 8153509055f9..91f751207b8e 100644 --- a/python/tutorials/03-matrix-multiplication.py +++ b/python/tutorials/03-matrix-multiplication.py @@ -206,19 +206,19 @@ def get_hip_autotune_config(): return [ triton.Config( {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 1, 'waves_per_eu': 2}, - num_warps=4, num_stages=2), + num_warps=4, num_stages=0), triton.Config( {'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 16, 'GROUP_SIZE_M': 4, 'waves_per_eu': 2}, - num_warps=8, num_stages=2), + num_warps=8, num_stages=0), triton.Config( {'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 1, 'waves_per_eu': 2}, - num_warps=8, num_stages=2), + num_warps=8, num_stages=0), triton.Config( {'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'waves_per_eu': 3}, - num_warps=4, num_stages=2), + num_warps=4, num_stages=0), triton.Config( {'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 1, 'waves_per_eu': 8}, - num_warps=4, num_stages=2), + num_warps=4, num_stages=0), ] diff --git a/test/TritonGPU/amd/amd-reorder-instructions.mlir b/test/TritonGPU/amd/amd-reorder-instructions.mlir index 9be5206bcdbc..cb565d1f054d 100644 --- a/test/TritonGPU/amd/amd-reorder-instructions.mlir +++ b/test/TritonGPU/amd/amd-reorder-instructions.mlir @@ -23,2284 +23,3 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, 
"triton_gpu.threads-per-war tt.return } } - -// ----- -// Move loads (and independent local_stores) as early as possible. -// These tests are generated by Stream Pipelining tests from amd-stream-pipeline.mlir. -// For example in the matmul_loop below, the scf.for loop looks like this after pipeliner: -// scf.for ... { -// // stage 1 -// %a = tt.local_load %a_tile -// %b = tt.local_load %b_tile -// tt.dot %c, %a, %b -// // stage 0 -// %aptr = tt.addptr %aptr, %k -// %a_next = tt.load %aptr -// %bptr = tt.addptr %bptr, %k -// %b_next = tt.load %bptr -// tt.local_store %a_next -// tt.local_store %b_next -// yield -// } -// -// Should convert to : -// scf.for ... { -// // stage 0.a -// %aptr = tt.addptr %aptr, %k -// %a_next = tt.load %aptr -// %bptr = tt.addptr %bptr, %k -// %b_next = tt.load %bptr -// // stage 1 -// %a = tt.local_load %a_tile -// %b = tt.local_load %b_tile -// tt.dot %c, %a, %b -// // stage 0.b -// tt.local_store %a_next -// tt.local_store %b_next -// yield -// } - -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = []}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> -#shared2 = #triton_gpu.shared<{vec = 8, perPhase = 4, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> -#shared3 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> -#shared4 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, 
"triton_gpu.threads-per-warp" = 64 : i32, triton_gpu.target = "hip:gfx942"} { - -// CHECK-LABEL: tt.func @matmul_loop -// CHECK: %{{.*}}:7 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}) -// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_20]] -// CHECK: %[[SPLAT_22:.*]] = tt.splat %[[CMPI_21]] -// CHECK: %[[ADDPTR_23:.*]] = tt.addptr %[[ARG6]], %{{.*}} -// CHECK: %[[LOAD_24:.*]] = tt.load %[[ADDPTR_23]], %[[SPLAT_22]] -// CHECK: %[[SPLAT_25:.*]] = tt.splat %[[CMPI_21]] -// CHECK: %[[ADDPTR_26:.*]] = tt.addptr %[[ARG7]], %{{.*}} -// CHECK: %[[LOAD_27:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_25]], %{{.*}} -// CHECK: %[[ADDI_28:.*]] = arith.addi %[[ARG9]], %{{.*}} -// CHECK: %[[CMPI_29:.*]] = arith.cmpi slt, %[[ADDI_28]], %{{.*}} -// CHECK: %[[SELECT_30:.*]] = arith.select %[[CMPI_29]], %[[ADDI_28]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %[[ARG11]] -// CHECK: %[[LOCAL_LOAD_32:.*]] = triton_gpu.local_load %[[ARG12]] -// CHECK: %[[MULF_33:.*]] = arith.mulf %[[LOCAL_LOAD_32]], %{{.*}} -// CHECK: %[[DOT_34:.*]] = tt.dot %[[LOCAL_LOAD_31]], %[[MULF_33]], %[[ARG8]] -// CHECK: %[[ADDI_35:.*]] = arith.addi %[[ARG10]], %{{.*}} -// CHECK: %[[CMPI_36:.*]] = arith.cmpi slt, %[[ADDI_35]], %{{.*}} -// CHECK: %[[SELECT_37:.*]] = arith.select %[[CMPI_36]], %[[ADDI_35]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_38:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_37]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_24]], %[[MEMDESC_SUBVIEW_38]] -// CHECK: %[[MEMDESC_SUBVIEW_39:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_37]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_27]], %[[MEMDESC_SUBVIEW_39]] -// CHECK: scf.yield %[[ADDPTR_23]], %[[ADDPTR_26]], %[[DOT_34]], 
%[[SELECT_30]], %[[SELECT_37]], %[[MEMDESC_SUBVIEW_38]], %[[MEMDESC_SUBVIEW_39]] -// CHECK: } - - tt.func @matmul_loop(%arg0: index, %arg1: index, %arg2: index, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #mma> { - %c1_i32 = arith.constant 1 : i32 - %0 = arith.cmpi slt, %arg0, %arg1 : index - %1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> - %3 = tt.broadcast %2 : tensor<1x128xi32, #blocked> -> tensor<32x128xi32, #blocked> - %4 = tt.splat %arg4 : !tt.ptr -> tensor<32x128x!tt.ptr, #blocked> - %cst = arith.constant dense<0.000000e+00> : tensor<32x128xf16, #blocked> - %5 = tt.splat %0 : i1 -> tensor<32x128xi1, #blocked> - %6 = tt.addptr %4, %3 : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> - %7 = tt.load %6, %5, %cst : tensor<32x128x!tt.ptr, #blocked> - %8 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %9 = tt.expand_dims %8 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> - %10 = tt.broadcast %9 : tensor<1x32xi32, #blocked1> -> tensor<128x32xi32, #blocked1> - %11 = tt.splat %arg3 : !tt.ptr -> tensor<128x32x!tt.ptr, #blocked1> - %12 = tt.splat %0 : i1 -> tensor<128x32xi1, #blocked1> - %13 = tt.addptr %11, %10 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi32, #blocked1> - %14 = tt.load %13, %12 : tensor<128x32x!tt.ptr, #blocked1> - %c0_i32 = arith.constant 0 : i32 - %c-1_i32 = arith.constant -1 : i32 - %cst_0 = arith.constant dense<4.000000e+00> : tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %cst_1 = arith.constant dense<4> : tensor<32x128xi32, #blocked> - %cst_2 = arith.constant dense<4> : 
tensor<128x32xi32, #blocked1> - %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> - %15 = triton_gpu.local_alloc : () -> !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %16 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - %17 = triton_gpu.memdesc_subview %15[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %14, %17 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %18 = triton_gpu.memdesc_subview %16[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %7, %18 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - %19:7 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %13, %arg7 = %6, %arg8 = %cst_3, %arg9 = %c-1_i32, %arg10 = %c0_i32, %arg11 = %17, %arg12 = %18) -> (tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>) { - %20 = arith.subi %arg1, %arg2 : index - %21 = arith.cmpi slt, %arg5, %20 : index - %22 = tt.splat %21 : i1 -> tensor<32x128xi1, #blocked> - %23 = tt.addptr %arg7, %cst_1 : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> - %24 = tt.load %23, %22, %cst : tensor<32x128x!tt.ptr, #blocked> - %25 = tt.splat %21 : i1 -> tensor<128x32xi1, #blocked1> - %26 = tt.addptr %arg6, %cst_2 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi32, #blocked1> - %27 = tt.load %26, %25 : tensor<128x32x!tt.ptr, #blocked1> - %28 = arith.addi %arg9, %c1_i32 
: i32 - %29 = arith.cmpi slt, %28, %c1_i32 : i32 - %30 = arith.select %29, %28, %c0_i32 : i32 - %31 = triton_gpu.local_load %arg11 : !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %32 = triton_gpu.local_load %arg12 : !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %33 = arith.mulf %32, %cst_0 : tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %34 = tt.dot %31, %33, %arg8 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma> - %35 = arith.addi %arg10, %c1_i32 : i32 - %36 = arith.cmpi slt, %35, %c1_i32 : i32 - %37 = arith.select %36, %35, %c0_i32 : i32 - %38 = triton_gpu.memdesc_subview %15[%37, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %27, %38 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %39 = triton_gpu.memdesc_subview %16[%37, %c0_i32, %c0_i32] : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %24, %39 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - scf.yield %26, %23, %34, %30, %37, %38, %39 : tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - } - triton_gpu.local_dealloc %15 : !tt.memdesc<1x128x32xf16, #shared, 
#triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %16 : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - tt.return %19#2 : tensor<128x128xf32, #mma> - } - -// CHECK-LABEL: tt.func @matmul_loop_nested -// CHECK: %[[FOR_0:.*]] = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}) - -// CHECK: %[[SPLAT_1:.*]] = tt.splat %{{.*}} -// CHECK: %[[MAKE_RANGE_2:.*]] = tt.make_range {end = 32 : i32, start = 0 : i32} -// CHECK: %[[EXPAND_DIMS_3:.*]] = tt.expand_dims %[[MAKE_RANGE_2]] {axis = 0 : i32} -// CHECK: %[[CMPI_4:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} -// CHECK: %[[BROADCAST_5:.*]] = tt.broadcast %[[EXPAND_DIMS_3]] -// CHECK: %[[SPLAT_6:.*]] = tt.splat %[[CMPI_4]] -// CHECK: %[[ADDPTR_7:.*]] = tt.addptr %[[SPLAT_1]], %[[BROADCAST_5]] -// CHECK: %[[LOAD_8:.*]] = tt.load %[[ADDPTR_7]], %[[SPLAT_6]], %{{.*}} -// CHECK: %[[MAKE_RANGE_9:.*]] = tt.make_range {end = 128 : i32, start = 0 : i32} -// CHECK: %[[EXPAND_DIMS_10:.*]] = tt.expand_dims %[[MAKE_RANGE_9]] {axis = 0 : i32} -// CHECK: %[[BROADCAST_11:.*]] = tt.broadcast %[[EXPAND_DIMS_10]] -// CHECK: %[[SPLAT_12:.*]] = tt.splat %{{.*}} -// CHECK: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_4]] -// CHECK: %[[ADDPTR_14:.*]] = tt.addptr %[[SPLAT_12]], %[[BROADCAST_11]] -// CHECK: %[[LOAD_15:.*]] = tt.load %[[ADDPTR_14]], %[[SPLAT_13]], %{{.*}} -// CHECK: %[[LOCAL_ALLOC_16:.*]] = triton_gpu.local_alloc -// CHECK: %[[LOCAL_ALLOC_17:.*]] = triton_gpu.local_alloc -// CHECK: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_16]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_8]], %[[MEMDESC_SUBVIEW_18]] -// CHECK: %[[MEMDESC_SUBVIEW_19:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_17]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_19]] -// CHECK: %{{.*}}:7 = scf.for %[[ARG7:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG8:.*]] = %[[ADDPTR_7]], %[[ARG9:.*]] = 
%[[ADDPTR_14]], %[[ARG10:.*]] = %[[ARG6]], %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_19]]) - -// CHECK: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG7]], %[[SUBI_21]] -// CHECK: %[[SPLAT_23:.*]] = tt.splat %[[CMPI_22]] -// CHECK: %[[ADDPTR_24:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[LOAD_25:.*]] = tt.load %[[ADDPTR_24]], %[[SPLAT_23]], %{{.*}} -// CHECK: %[[SPLAT_26:.*]] = tt.splat %[[CMPI_22]] -// CHECK: %[[ADDPTR_27:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: %[[LOAD_28:.*]] = tt.load %[[ADDPTR_27]], %[[SPLAT_26]], %{{.*}} -// CHECK: %[[ADDI_29:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_30:.*]] = arith.cmpi slt, %[[ADDI_29]], %{{.*}} -// CHECK: %[[SELECT_31:.*]] = arith.select %[[CMPI_30]], %[[ADDI_29]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_32:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[LOCAL_LOAD_33:.*]] = triton_gpu.local_load %[[ARG14]] -// CHECK: %[[DOT_34:.*]] = tt.dot %[[LOCAL_LOAD_32]], %[[LOCAL_LOAD_33]], %[[ARG10]] -// CHECK: %[[ADDI_35:.*]] = arith.addi %[[ARG12]], %{{.*}} -// CHECK: %[[CMPI_36:.*]] = arith.cmpi slt, %[[ADDI_35]], %{{.*}} -// CHECK: %[[SELECT_37:.*]] = arith.select %[[CMPI_36]], %[[ADDI_35]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_38:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_16]][%[[SELECT_37]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_25]], %[[MEMDESC_SUBVIEW_38]] -// CHECK: %[[MEMDESC_SUBVIEW_39:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_17]][%[[SELECT_37]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_28]], %[[MEMDESC_SUBVIEW_39]] -// CHECK: scf.yield %[[ADDPTR_24]], %[[ADDPTR_27]], %[[DOT_34]], %[[SELECT_31]], %[[SELECT_37]], %[[MEMDESC_SUBVIEW_38]], %[[MEMDESC_SUBVIEW_39]] -// CHECK: } - -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_16]] -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_17]] -// CHECK: 
scf.yield %{{.*}}#2 -// CHECK: } - - tt.func @matmul_loop_nested(%arg0: index, %arg1: index, %arg2: index, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #mma> { - %c1_i32 = arith.constant 1 : i32 - %c0_i32 = arith.constant 0 : i32 - %c-1_i32 = arith.constant -1 : i32 - %cst = arith.constant dense<4> : tensor<32x128xi32, #blocked> - %cst_0 = arith.constant dense<4> : tensor<128x32xi32, #blocked1> - %cst_1 = arith.constant dense<0.000000e+00> : tensor<32x128xf16, #blocked> - %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x32xf16, #blocked1> - %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> - %0 = scf.for %arg5 = %arg0 to %arg1 step %arg2 iter_args(%arg6 = %cst_3) -> (tensor<128x128xf32, #mma>) { - %1 = tt.splat %arg3 : !tt.ptr -> tensor<128x32x!tt.ptr, #blocked1> - %2 = arith.cmpi slt, %arg0, %arg1 : index - %3 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %4 = tt.expand_dims %3 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> - %5 = tt.broadcast %4 : tensor<1x128xi32, #blocked> -> tensor<32x128xi32, #blocked> - %6 = tt.splat %arg4 : !tt.ptr -> tensor<32x128x!tt.ptr, #blocked> - %7 = tt.splat %2 : i1 -> tensor<32x128xi1, #blocked> - %8 = tt.addptr %6, %5 : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> - %9 = tt.load %8, %7, %cst_1 : tensor<32x128x!tt.ptr, #blocked> - %10 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %11 = tt.expand_dims %10 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> - %12 = tt.broadcast %11 : tensor<1x32xi32, #blocked1> -> tensor<128x32xi32, #blocked1> - %13 = tt.splat %2 : i1 -> tensor<128x32xi1, #blocked1> - %14 = tt.addptr %1, %12 : 
tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi32, #blocked1> - %15 = tt.load %14, %13, %cst_2 : tensor<128x32x!tt.ptr, #blocked1> - %16 = triton_gpu.local_alloc : () -> !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %17 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - %18 = triton_gpu.memdesc_subview %16[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %15, %18 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %19 = triton_gpu.memdesc_subview %17[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %9, %19 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - %20:7 = scf.for %arg7 = %arg0 to %arg1 step %arg2 iter_args(%arg8 = %14, %arg9 = %8, %arg10 = %arg6, %arg11 = %c-1_i32, %arg12 = %c0_i32, %arg13 = %18, %arg14 = %19) -> (tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>) { - %21 = arith.subi %arg1, %arg2 : index - %22 = arith.cmpi slt, %arg7, %21 : index - %23 = tt.splat %22 : i1 -> tensor<32x128xi1, #blocked> - %24 = tt.addptr %arg9, %cst : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> - %25 = tt.load %24, %23, %cst_1 : tensor<32x128x!tt.ptr, #blocked> - %26 = tt.splat %22 : i1 -> tensor<128x32xi1, #blocked1> - %27 = tt.addptr %arg8, %cst_0 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi32, #blocked1> - %28 = tt.load %27, %26, %cst_2 : tensor<128x32x!tt.ptr, 
#blocked1> - %29 = arith.addi %arg11, %c1_i32 : i32 - %30 = arith.cmpi slt, %29, %c1_i32 : i32 - %31 = arith.select %30, %29, %c0_i32 : i32 - %32 = triton_gpu.local_load %arg13 : !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %33 = triton_gpu.local_load %arg14 : !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %34 = tt.dot %32, %33, %arg10 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma> - %35 = arith.addi %arg12, %c1_i32 : i32 - %36 = arith.cmpi slt, %35, %c1_i32 : i32 - %37 = arith.select %36, %35, %c0_i32 : i32 - %38 = triton_gpu.memdesc_subview %16[%37, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %28, %38 : tensor<128x32xf16, #blocked1> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %39 = triton_gpu.memdesc_subview %17[%37, %c0_i32, %c0_i32] : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %25, %39 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - scf.yield %27, %24, %34, %31, %37, %38, %39 : tensor<128x32x!tt.ptr, #blocked1>, tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, i32, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - } - triton_gpu.local_dealloc %16 : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %17 : 
!tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - scf.yield %20#2 : tensor<128x128xf32, #mma> - } - tt.return %0 : tensor<128x128xf32, #mma> - } - -// CHECK-LABEL: tt.func @matmul_loop_single_pipeline -// CHECK: %{{.*}}:5 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}) - -// CHECK: %[[SUBI_17:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_17]] -// CHECK: %[[SPLAT_19:.*]] = tt.splat %[[CMPI_18]] -// CHECK: %[[ADDPTR_20:.*]] = tt.addptr %[[ARG6]], %{{.*}} -// CHECK: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_20]], %[[SPLAT_19]], %{{.*}} -// CHECK: %[[ADDI_22:.*]] = arith.addi %[[ARG8]], %{{.*}} -// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_22]], %{{.*}} -// CHECK: %[[SELECT_24:.*]] = arith.select %[[CMPI_23]], %[[ADDI_22]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_25:.*]] = triton_gpu.local_load %[[ARG10]] -// CHECK: %[[CONVERT_LAYOUT_26:.*]] = triton_gpu.convert_layout %{{.*}} -// CHECK: %[[DOT_27:.*]] = tt.dot %[[CONVERT_LAYOUT_26]], %[[LOCAL_LOAD_25]], %[[ARG7]] -// CHECK: %[[ADDI_28:.*]] = arith.addi %[[ARG9]], %{{.*}} -// CHECK: %[[CMPI_29:.*]] = arith.cmpi slt, %[[ADDI_28]], %{{.*}} -// CHECK: %[[SELECT_30:.*]] = arith.select %[[CMPI_29]], %[[ADDI_28]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_31:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_30]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_21]], %[[MEMDESC_SUBVIEW_31]] -// CHECK: scf.yield %[[ADDPTR_20]], %[[DOT_27]], %[[SELECT_24]], %[[SELECT_30]], %[[MEMDESC_SUBVIEW_31]] -// CHECK: } - - tt.func @matmul_loop_single_pipeline(%arg0: index, %arg1: index, %arg2: index, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #mma> { - %c1_i32 = arith.constant 1 : i32 - %0 = arith.cmpi slt, %arg0, %arg1 : index - %1 = 
tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x128xi32, #blocked> - %3 = tt.broadcast %2 : tensor<1x128xi32, #blocked> -> tensor<32x128xi32, #blocked> - %4 = tt.splat %arg4 : !tt.ptr -> tensor<32x128x!tt.ptr, #blocked> - %cst = arith.constant dense<0.000000e+00> : tensor<32x128xf16, #blocked> - %5 = tt.splat %0 : i1 -> tensor<32x128xi1, #blocked> - %6 = tt.addptr %4, %3 : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> - %7 = tt.load %6, %5, %cst : tensor<32x128x!tt.ptr, #blocked> - %8 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %9 = tt.expand_dims %8 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> - %10 = tt.broadcast %9 : tensor<1x32xi32, #blocked1> -> tensor<128x32xi32, #blocked1> - %11 = tt.splat %arg3 : !tt.ptr -> tensor<128x32x!tt.ptr, #blocked1> - %12 = tt.addptr %11, %10 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi32, #blocked1> - %13 = tt.load %12 : tensor<128x32x!tt.ptr, #blocked1> - %c0_i32 = arith.constant 0 : i32 - %c-1_i32 = arith.constant -1 : i32 - %cst_0 = arith.constant dense<4> : tensor<32x128xi32, #blocked> - %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> - %14 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - %15 = triton_gpu.memdesc_subview %14[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %7, %15 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - %16:5 = scf.for %arg5 = %arg0 to %arg1 step %arg2 
iter_args(%arg6 = %6, %arg7 = %cst_1, %arg8 = %c-1_i32, %arg9 = %c0_i32, %arg10 = %15) -> (tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, i32, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable>) { - %17 = arith.subi %arg1, %arg2 : index - %18 = arith.cmpi slt, %arg5, %17 : index - %19 = tt.splat %18 : i1 -> tensor<32x128xi1, #blocked> - %20 = tt.addptr %arg6, %cst_0 : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi32, #blocked> - %21 = tt.load %20, %19, %cst : tensor<32x128x!tt.ptr, #blocked> - %22 = arith.addi %arg8, %c1_i32 : i32 - %23 = arith.cmpi slt, %22, %c1_i32 : i32 - %24 = arith.select %23, %22, %c0_i32 : i32 - %25 = triton_gpu.local_load %arg10 : !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %26 = triton_gpu.convert_layout %13 : tensor<128x32xf16, #blocked1> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %27 = tt.dot %26, %25, %arg7 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma> - %28 = arith.addi %arg9, %c1_i32 : i32 - %29 = arith.cmpi slt, %28, %c1_i32 : i32 - %30 = arith.select %29, %28, %c0_i32 : i32 - %31 = triton_gpu.memdesc_subview %14[%30, %c0_i32, %c0_i32] : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %21, %31 : tensor<32x128xf16, #blocked> -> !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - scf.yield %20, %27, %24, %30, %31 : tensor<32x128x!tt.ptr, #blocked>, tensor<128x128xf32, #mma>, i32, i32, !tt.memdesc<32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - } - triton_gpu.local_dealloc %14 : !tt.memdesc<1x32x128xf16, #shared1, #triton_gpu.shared_memory, mutable> - 
tt.return %16#1 : tensor<128x128xf32, #mma> - } - -// This example tests that tt.load overlaps with independent ttg.local_store which -// overlaps with independent tt.dot. - -// CHECK-LABEL: tt.func @indirect_bmm_scalar -// CHECK: %{{.*}}:9 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}, %[[ARG15:.*]] = %{{.*}}) - -// CHECK: %[[SUBI_25:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[ADDI_26:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_27:.*]] = arith.cmpi slt, %[[ADDI_26]], %{{.*}} -// CHECK: %[[SELECT_28:.*]] = arith.select %[[CMPI_27]], %[[ADDI_26]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_29:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_28]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_29]] -// CHECK: %[[MEMDESC_SUBVIEW_30:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_28]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[ARG15]], %[[MEMDESC_SUBVIEW_30]] -// CHECK: %[[CMPI_31:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] -// CHECK: %[[SPLAT_32:.*]] = tt.splat %[[CMPI_31]] -// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_32]] -// CHECK: %[[ADDPTR_35:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_35]], %[[CMPI_31]] -// CHECK: %[[MULI_37:.*]] = arith.muli %{{.*}}, %[[LOAD_36]] -// CHECK: %[[SPLAT_38:.*]] = tt.splat %[[MULI_37]] -// CHECK: %[[SPLAT_39:.*]] = tt.splat %[[CMPI_31]] -// CHECK: %[[ADDPTR_40:.*]] = tt.addptr %{{.*}}, %[[SPLAT_38]] -// CHECK: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_40]], %[[SPLAT_39]] -// CHECK: %[[ADDI_42:.*]] = arith.addi %[[ARG10]], %{{.*}} -// CHECK: %[[CMPI_43:.*]] = arith.cmpi slt, %[[ADDI_42]], %{{.*}} -// CHECK: %[[SELECT_44:.*]] = 
arith.select %[[CMPI_43]], %[[ADDI_42]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_45:.*]] = triton_gpu.local_load %[[ARG12]] -// CHECK: %[[LOCAL_LOAD_46:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[DOT_47:.*]] = tt.dot %[[LOCAL_LOAD_45]], %[[LOCAL_LOAD_46]], %[[ARG7]] -// CHECK: scf.yield %[[DOT_47]], %[[ADDPTR_33]], %[[ADDPTR_35]], %[[SELECT_44]], %[[SELECT_28]], %[[MEMDESC_SUBVIEW_29]], %[[MEMDESC_SUBVIEW_30]], %[[LOAD_34]], %[[LOAD_41]] -// CHECK: } - - tt.func @indirect_bmm_scalar(%arg0: i64 {tt.divisibility = 16 : i32}, %arg1: index, %arg2: tensor<16x16x!tt.ptr, #blocked1> {tt.contiguity = 2 : i32, tt.divisibility = 16 : i32}, %arg3: !tt.ptr, %arg4: tensor<16x16xi32, #blocked1> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg5: tensor<16x16x!tt.ptr, #blocked> {tt.contiguity = 16 : i32, tt.divisibility = 16 : i32}) -> tensor<16x16xf32, #mma> { - %c2 = arith.constant 2 : index - %c1 = arith.constant 1 : index - %0 = arith.cmpi sgt, %arg1, %c1 : index - %c1_i32 = arith.constant 1 : i32 - %1 = tt.addptr %arg3, %c1_i32 : !tt.ptr, i32 - %2 = tt.load %1, %0 : !tt.ptr - %3 = arith.muli %arg0, %2 : i64 - %4 = tt.splat %3 : i64 -> tensor<16x16xi64, #blocked> - %5 = tt.splat %0 : i1 -> tensor<16x16xi1, #blocked> - %6 = tt.addptr %arg5, %4 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> - %7 = tt.load %6, %5 : tensor<16x16x!tt.ptr, #blocked> - %8 = tt.splat %0 : i1 -> tensor<16x16xi1, #blocked1> - %9 = tt.addptr %arg2, %arg4 : tensor<16x16x!tt.ptr, #blocked1>, tensor<16x16xi32, #blocked1> - %10 = tt.load %9, %8 : tensor<16x16x!tt.ptr, #blocked1> - %c0 = arith.constant 0 : index - %11 = arith.cmpi sgt, %arg1, %c0 : index - %12 = tt.load %arg3, %11 : !tt.ptr - %13 = arith.muli %arg0, %12 : i64 - %14 = tt.splat %13 : i64 -> tensor<16x16xi64, #blocked> - %15 = tt.splat %11 : i1 -> tensor<16x16xi1, #blocked> - %16 = tt.addptr %arg5, %14 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> - %17 = tt.load %16, %15 : tensor<16x16x!tt.ptr, 
#blocked> - %18 = tt.splat %11 : i1 -> tensor<16x16xi1, #blocked1> - %19 = tt.load %arg2, %18 : tensor<16x16x!tt.ptr, #blocked1> - %c2_i32 = arith.constant 2 : i32 - %c0_i32 = arith.constant 0 : i32 - %c-1_i32 = arith.constant -1 : i32 - %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> - %20 = triton_gpu.local_alloc : () -> !tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %21 = triton_gpu.local_alloc : () -> !tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %22 = triton_gpu.memdesc_subview %20[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %19, %22 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %23 = triton_gpu.memdesc_subview %21[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %17, %23 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %24:9 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args(%arg7 = %cst, %arg8 = %9, %arg9 = %1, %arg10 = %c-1_i32, %arg11 = %c0_i32, %arg12 = %22, %arg13 = %23, %arg14 = %10, %arg15 = %7) -> (tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, !tt.ptr, i32, i32, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16x16xf16, #blocked1>, tensor<16x16xf16, #blocked>) { - %25 = arith.subi %arg1, %c2 : index - %26 = arith.cmpi slt, %arg6, %25 : index - %27 = tt.addptr %arg9, %c1_i32 : !tt.ptr, i32 - %28 = tt.load %27, %26 : !tt.ptr - %29 = arith.muli %arg0, %28 : i64 - %30 = tt.splat %29 : i64 -> tensor<16x16xi64, #blocked> - %31 = tt.splat %26 : i1 -> tensor<16x16xi1, 
#blocked> - %32 = tt.addptr %arg5, %30 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> - %33 = tt.load %32, %31 : tensor<16x16x!tt.ptr, #blocked> - %34 = tt.splat %26 : i1 -> tensor<16x16xi1, #blocked1> - %35 = tt.addptr %arg8, %arg4 : tensor<16x16x!tt.ptr, #blocked1>, tensor<16x16xi32, #blocked1> - %36 = tt.load %35, %34 : tensor<16x16x!tt.ptr, #blocked1> - %37 = arith.addi %arg11, %c1_i32 : i32 - %38 = arith.cmpi slt, %37, %c2_i32 : i32 - %39 = arith.select %38, %37, %c0_i32 : i32 - %40 = triton_gpu.memdesc_subview %21[%39, %c0_i32, %c0_i32] : !tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %arg15, %40 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %41 = triton_gpu.memdesc_subview %20[%39, %c0_i32, %c0_i32] : !tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %arg14, %41 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %42 = arith.addi %arg10, %c1_i32 : i32 - %43 = arith.cmpi slt, %42, %c2_i32 : i32 - %44 = arith.select %43, %42, %c0_i32 : i32 - %45 = triton_gpu.local_load %arg12 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %46 = triton_gpu.local_load %arg13 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %47 = tt.dot %45, %46, %arg7 : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma> - scf.yield %47, %35, %27, %44, %39, %41, %40, %36, %33 : tensor<16x16xf32, 
#mma>, tensor<16x16x!tt.ptr, #blocked1>, !tt.ptr, i32, i32, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16x16xf16, #blocked1>, tensor<16x16xf16, #blocked> - } - triton_gpu.local_dealloc %20 : !tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %21 : !tt.memdesc<2x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - tt.return %24#0 : tensor<16x16xf32, #mma> - } - -// CHECK-LABEL: tt.func @indirect_bmm_scalar_dist_one -// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) - -// CHECK: %[[SUBI_17:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_17]] -// CHECK: %[[SPLAT_19:.*]] = tt.splat %[[CMPI_18]] -// CHECK: %[[ADDPTR_20:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_20]], %[[SPLAT_19]] -// CHECK: %[[LOAD_22:.*]] = tt.load %[[ARG9]], %[[CMPI_18]] -// CHECK: %[[MULI_23:.*]] = arith.muli %{{.*}}, %[[ARG10]] -// CHECK: %[[SPLAT_24:.*]] = tt.splat %[[MULI_23]] -// CHECK: %[[SPLAT_25:.*]] = tt.splat %[[CMPI_18]] -// CHECK: %[[ADDPTR_26:.*]] = tt.addptr %{{.*}}, %[[SPLAT_24]] -// CHECK: %[[LOAD_27:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_25]] -// CHECK: %[[ADDI_28:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_29:.*]] = arith.cmpi slt, %[[ADDI_28]], %{{.*}} -// CHECK: %[[SELECT_30:.*]] = arith.select %[[CMPI_29]], %[[ADDI_28]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[LOCAL_LOAD_32:.*]] = triton_gpu.local_load %[[ARG14]] -// CHECK: %[[DOT_33:.*]] = tt.dot %[[LOCAL_LOAD_31]], %[[LOCAL_LOAD_32]], %[[ARG7]] -// CHECK: %[[ADDPTR_34:.*]] = tt.addptr 
%[[ARG9]], %{{.*}} -// CHECK: %[[ADDI_35:.*]] = arith.addi %[[ARG12]], %{{.*}} -// CHECK: %[[CMPI_36:.*]] = arith.cmpi slt, %[[ADDI_35]], %{{.*}} -// CHECK: %[[SELECT_37:.*]] = arith.select %[[CMPI_36]], %[[ADDI_35]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_38:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_37]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_21]], %[[MEMDESC_SUBVIEW_38]] -// CHECK: %[[MEMDESC_SUBVIEW_39:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_37]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_27]], %[[MEMDESC_SUBVIEW_39]] -// CHECK: scf.yield %[[DOT_33]], %[[ADDPTR_20]], %[[ADDPTR_34]], %[[LOAD_22]], %[[SELECT_30]], %[[SELECT_37]], %[[MEMDESC_SUBVIEW_38]], %[[MEMDESC_SUBVIEW_39]] -// CHECK: } - - tt.func @indirect_bmm_scalar_dist_one(%arg0: i64 {tt.divisibility = 16 : i32}, %arg1: index, %arg2: tensor<16x16x!tt.ptr, #blocked1> {tt.contiguity = 2 : i32, tt.divisibility = 16 : i32}, %arg3: !tt.ptr, %arg4: tensor<16x16xi32, #blocked1> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg5: tensor<16x16x!tt.ptr, #blocked> {tt.contiguity = 16 : i32, tt.divisibility = 16 : i32}) -> tensor<16x16xf32, #mma> { - %c0_i32 = arith.constant 0 : i32 - %c0 = arith.constant 0 : index - %0 = arith.cmpi sgt, %arg1, %c0 : index - %1 = tt.load %arg3 : !tt.ptr - %2 = arith.muli %arg0, %1 : i64 - %3 = tt.splat %2 : i64 -> tensor<16x16xi64, #blocked> - %4 = tt.splat %0 : i1 -> tensor<16x16xi1, #blocked> - %5 = tt.addptr %arg5, %3 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> - %6 = tt.load %5, %4 : tensor<16x16x!tt.ptr, #blocked> - %c1_i32 = arith.constant 1 : i32 - %7 = tt.addptr %arg3, %c1_i32 : !tt.ptr, i32 - %8 = tt.load %7, %0 : !tt.ptr - %9 = tt.splat %0 : i1 -> tensor<16x16xi1, #blocked1> - %10 = tt.load %arg2, %9 : tensor<16x16x!tt.ptr, #blocked1> - %c-1_i32 = arith.constant -1 : i32 - %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> - %c1 = arith.constant 1 : index - %11 = 
triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %12 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %13 = tt.addptr %7, %c1_i32 : !tt.ptr, i32 - %14 = triton_gpu.memdesc_subview %11[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %10, %14 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %15 = triton_gpu.memdesc_subview %12[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %6, %15 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %16:8 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args(%arg7 = %cst, %arg8 = %arg2, %arg9 = %13, %arg10 = %8, %arg11 = %c-1_i32, %arg12 = %c0_i32, %arg13 = %14, %arg14 = %15) -> (tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, !tt.ptr, i64, i32, i32, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>) { - %17 = arith.subi %arg1, %c1 : index - %18 = arith.cmpi slt, %arg6, %17 : index - %19 = arith.muli %arg0, %arg10 : i64 - %20 = tt.splat %19 : i64 -> tensor<16x16xi64, #blocked> - %21 = tt.splat %18 : i1 -> tensor<16x16xi1, #blocked> - %22 = tt.addptr %arg5, %20 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> - %23 = tt.load %22, %21 : tensor<16x16x!tt.ptr, #blocked> - %24 = tt.load %arg9, %18 : !tt.ptr - %25 = tt.splat %18 : i1 -> tensor<16x16xi1, #blocked1> - %26 = tt.addptr %arg8, %arg4 : tensor<16x16x!tt.ptr, #blocked1>, tensor<16x16xi32, #blocked1> - %27 = tt.load %26, %25 : tensor<16x16x!tt.ptr, #blocked1> - %28 
= arith.addi %arg11, %c1_i32 : i32 - %29 = arith.cmpi slt, %28, %c1_i32 : i32 - %30 = arith.select %29, %28, %c0_i32 : i32 - %31 = triton_gpu.local_load %arg13 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %32 = triton_gpu.local_load %arg14 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %33 = tt.dot %31, %32, %arg7 : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma> - %34 = tt.addptr %arg9, %c1_i32 : !tt.ptr, i32 - %35 = arith.addi %arg12, %c1_i32 : i32 - %36 = arith.cmpi slt, %35, %c1_i32 : i32 - %37 = arith.select %36, %35, %c0_i32 : i32 - %38 = triton_gpu.memdesc_subview %11[%37, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %27, %38 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %39 = triton_gpu.memdesc_subview %12[%37, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %23, %39 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - scf.yield %33, %26, %34, %24, %30, %37, %38, %39 : tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, !tt.ptr, i64, i32, i32, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - } - triton_gpu.local_dealloc %11 : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %12 : 
!tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - tt.return %16#0 : tensor<16x16xf32, #mma> - } - -// CHECK-LABEL: tt.func @indirect_bmm_vector -// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) - -// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_21]] -// CHECK: %[[SPLAT_23:.*]] = tt.splat %[[CMPI_22]] -// CHECK: %[[ADDPTR_24:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[LOAD_25:.*]] = tt.load %[[ADDPTR_24]], %[[SPLAT_23]] -// CHECK: %[[EXPAND_DIMS_26:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} -// CHECK: %[[BROADCAST_27:.*]] = tt.broadcast %[[EXPAND_DIMS_26]] -// CHECK: %[[MULI_28:.*]] = arith.muli %{{.*}}, %[[BROADCAST_27]] -// CHECK: %[[SPLAT_29:.*]] = tt.splat %[[CMPI_22]] -// CHECK: %[[ADDPTR_30:.*]] = tt.addptr %{{.*}}, %[[MULI_28]] -// CHECK: %[[LOAD_31:.*]] = tt.load %[[ADDPTR_30]], %[[SPLAT_29]] -// CHECK: %[[CMPI_32:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] -// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_32]] -// CHECK: %[[ADDPTR_34:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_34]], %[[SPLAT_33]] -// CHECK: %[[ADDI_36:.*]] = arith.addi %[[ARG10]], %{{.*}} -// CHECK: %[[CMPI_37:.*]] = arith.cmpi slt, %[[ADDI_36]], %{{.*}} -// CHECK: %[[SELECT_38:.*]] = arith.select %[[CMPI_37]], %[[ADDI_36]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_39:.*]] = triton_gpu.local_load %[[ARG12]] -// CHECK: %[[LOCAL_LOAD_40:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[DOT_41:.*]] = tt.dot %[[LOCAL_LOAD_39]], %[[LOCAL_LOAD_40]], %[[ARG7]] -// CHECK: %[[ADDI_42:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_43:.*]] = arith.cmpi slt, 
%[[ADDI_42]], %{{.*}} -// CHECK: %[[SELECT_44:.*]] = arith.select %[[CMPI_43]], %[[ADDI_42]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_45:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_44]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_25]], %[[MEMDESC_SUBVIEW_45]] -// CHECK: %[[MEMDESC_SUBVIEW_46:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_44]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_31]], %[[MEMDESC_SUBVIEW_46]] -// CHECK: scf.yield %[[DOT_41]], %[[ADDPTR_24]], %[[ADDPTR_34]], %[[SELECT_38]], %[[SELECT_44]], %[[MEMDESC_SUBVIEW_45]], %[[MEMDESC_SUBVIEW_46]], %[[LOAD_35]] -// CHECK: } - - tt.func @indirect_bmm_vector(%arg0: tensor<16x16xi64, #blocked> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg1: index, %arg2: tensor<16x16x!tt.ptr, #blocked1> {tt.contiguity = 2 : i32, tt.divisibility = 16 : i32}, %arg3: tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg4: tensor<16x16xi32, #blocked1> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg5: tensor<16x16x!tt.ptr, #blocked> {tt.contiguity = 16 : i32, tt.divisibility = 16 : i32}) -> tensor<16x16xf32, #mma> { - %c2 = arith.constant 2 : index - %c1 = arith.constant 1 : index - %0 = arith.cmpi sgt, %arg1, %c1 : index - %cst = arith.constant dense<1> : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %1 = tt.splat %0 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %2 = tt.addptr %arg3, %cst : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %3 = tt.load %2, %1 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %c0 = arith.constant 0 : index - %4 = arith.cmpi sgt, %arg1, %c0 : index - %5 = tt.splat %4 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %6 = tt.load %arg3, %5 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %7 = tt.expand_dims %6 
{axis = 1 : i32} : tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi64, #blocked> - %8 = tt.broadcast %7 : tensor<16x1xi64, #blocked> -> tensor<16x16xi64, #blocked> - %9 = arith.muli %arg0, %8 : tensor<16x16xi64, #blocked> - %10 = tt.splat %4 : i1 -> tensor<16x16xi1, #blocked> - %11 = tt.addptr %arg5, %9 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> - %12 = tt.load %11, %10 : tensor<16x16x!tt.ptr, #blocked> - %13 = tt.splat %4 : i1 -> tensor<16x16xi1, #blocked1> - %14 = tt.load %arg2, %13 : tensor<16x16x!tt.ptr, #blocked1> - %c0_i32 = arith.constant 0 : i32 - %c-1_i32 = arith.constant -1 : i32 - %cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %15 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %16 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %17 = triton_gpu.memdesc_subview %15[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %14, %17 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %18 = triton_gpu.memdesc_subview %16[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %12, %18 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %19:8 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args(%arg7 = %cst_0, %arg8 = %arg2, %arg9 = %2, %arg10 = %c-1_i32, %arg11 = %c0_i32, %arg12 = %17, %arg13 = %18, %arg14 = %3) -> (tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, i32, 
!tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) { - %20 = arith.subi %arg1, %c2 : index - %21 = arith.cmpi slt, %arg6, %20 : index - %22 = tt.splat %21 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %23 = tt.addptr %arg9, %cst : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %24 = tt.load %23, %22 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %25 = arith.subi %arg1, %c1 : index - %26 = arith.cmpi slt, %arg6, %25 : index - %27 = tt.expand_dims %arg14 {axis = 1 : i32} : tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi64, #blocked> - %28 = tt.broadcast %27 : tensor<16x1xi64, #blocked> -> tensor<16x16xi64, #blocked> - %29 = arith.muli %arg0, %28 : tensor<16x16xi64, #blocked> - %30 = tt.splat %26 : i1 -> tensor<16x16xi1, #blocked> - %31 = tt.addptr %arg5, %29 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> - %32 = tt.load %31, %30 : tensor<16x16x!tt.ptr, #blocked> - %33 = tt.splat %26 : i1 -> tensor<16x16xi1, #blocked1> - %34 = tt.addptr %arg8, %arg4 : tensor<16x16x!tt.ptr, #blocked1>, tensor<16x16xi32, #blocked1> - %35 = tt.load %34, %33 : tensor<16x16x!tt.ptr, #blocked1> - %36 = arith.addi %arg10, %c1_i32 : i32 - %37 = arith.cmpi slt, %36, %c1_i32 : i32 - %38 = arith.select %37, %36, %c0_i32 : i32 - %39 = triton_gpu.local_load %arg12 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %40 = triton_gpu.local_load %arg13 : !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %41 = tt.dot %39, %40, %arg7 : tensor<16x16xf16, 
#triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma> - %42 = arith.addi %arg11, %c1_i32 : i32 - %43 = arith.cmpi slt, %42, %c1_i32 : i32 - %44 = arith.select %43, %42, %c0_i32 : i32 - %45 = triton_gpu.memdesc_subview %15[%44, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %35, %45 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - %46 = triton_gpu.memdesc_subview %16[%44, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %32, %46 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - scf.yield %41, %34, %23, %38, %44, %45, %46, %24 : tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, i32, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared2, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - } - triton_gpu.local_dealloc %15 : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %16 : !tt.memdesc<1x16x16xf16, #shared2, #triton_gpu.shared_memory, mutable> - tt.return %19#0 : tensor<16x16xf32, #mma> - } - -// CHECK-LABEL: tt.func @post_load_inv -// CHECK: %{{.*}}:5 = scf.for %[[ARG9:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) - -// CHECK: %[[CMPI_19:.*]] = arith.cmpi slt, %[[ARG9]], %{{.*}} -// CHECK: %[[ADDI_20:.*]] = 
arith.addi %[[ARG9]], %{{.*}} -// CHECK: %[[INDEX_CAST_21:.*]] = arith.index_cast %[[ADDI_20]] -// CHECK: %[[MULI_22:.*]] = arith.muli %[[INDEX_CAST_21]], %{{.*}} -// CHECK: %[[SUBI_23:.*]] = arith.subi %{{.*}}, %[[MULI_22]] -// CHECK: %[[INDEX_CAST_24:.*]] = arith.index_cast %[[ARG9]] -// CHECK: %[[SPLAT_25:.*]] = tt.splat %[[SUBI_23]] -// CHECK: %[[ADDI_26:.*]] = arith.addi %[[INDEX_CAST_24]], %{{.*}} -// CHECK: %[[CMPI_27:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_25]] -// CHECK: %[[MULI_28:.*]] = arith.muli %[[ADDI_26]], %{{.*}} -// CHECK: %[[BROADCAST_29:.*]] = tt.broadcast %[[CMPI_27]] -// CHECK: %[[SPLAT_30:.*]] = tt.splat %[[CMPI_19]] -// CHECK: %[[SPLAT_31:.*]] = tt.splat %[[MULI_28]] -// CHECK: %[[ANDI_32:.*]] = arith.andi %[[SPLAT_30]], %[[BROADCAST_29]] -// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %{{.*}}, %[[SPLAT_31]] -// CHECK: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_33]], %[[ANDI_32]], %{{.*}} -// CHECK: %[[SPLAT_35:.*]] = tt.splat %[[SUBI_23]] -// CHECK: %[[CMPI_36:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_35]] -// CHECK: %[[MULI_37:.*]] = arith.muli %[[MULI_28]], %{{.*}} -// CHECK: %[[BROADCAST_38:.*]] = tt.broadcast %[[CMPI_36]] -// CHECK: %[[SPLAT_39:.*]] = tt.splat %[[CMPI_19]] -// CHECK: %[[SPLAT_40:.*]] = tt.splat %[[MULI_37]] -// CHECK: %[[ANDI_41:.*]] = arith.andi %[[SPLAT_39]], %[[BROADCAST_38]] -// CHECK: %[[ADDPTR_42:.*]] = tt.addptr %{{.*}}, %[[SPLAT_40]] -// CHECK: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_42]], %[[ANDI_41]], %{{.*}} -// CHECK: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} -// CHECK: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_47:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[LOCAL_LOAD_48:.*]] = triton_gpu.local_load %[[ARG14]] -// CHECK: %[[DOT_49:.*]] = tt.dot %[[LOCAL_LOAD_47]], %[[LOCAL_LOAD_48]], %[[ARG10]] -// CHECK: %[[ADDI_50:.*]] = arith.addi %[[ARG12]], %{{.*}} -// CHECK: %[[CMPI_51:.*]] = arith.cmpi 
slt, %[[ADDI_50]], %{{.*}} -// CHECK: %[[SELECT_52:.*]] = arith.select %[[CMPI_51]], %[[ADDI_50]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_53:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_52]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_53]] -// CHECK: %[[MEMDESC_SUBVIEW_54:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_52]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_43]], %[[MEMDESC_SUBVIEW_54]] -// CHECK: scf.yield %[[DOT_49]], %[[SELECT_46]], %[[SELECT_52]], %[[MEMDESC_SUBVIEW_53]], %[[MEMDESC_SUBVIEW_54]] -// CHECK: } - - tt.func @post_load_inv(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) -> tensor<32x32xf32, #mma> { - %c899 = arith.constant 899 : index - %0 = tt.splat %arg5 : i32 -> tensor<32x1xi32, #blocked1> - %1 = tt.splat %arg4 : i32 -> tensor<32x1xi32, #blocked1> - %2 = arith.cmpi slt, %1, %0 : tensor<32x1xi32, #blocked1> - %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked1> - %3 = tt.broadcast %2 : tensor<32x1xi1, #blocked1> -> tensor<32x32xi1, #blocked1> - %4 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> - %5 = tt.load %4, %3, %cst : tensor<32x32x!tt.ptr, #blocked1> - %6 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked1> - %7 = tt.splat %arg3 : i32 -> tensor<1x32xi32, #blocked1> - %8 = arith.cmpi slt, %7, %6 : tensor<1x32xi32, #blocked1> - %9 = tt.broadcast %8 : tensor<1x32xi1, #blocked1> -> tensor<32x32xi1, #blocked1> - %10 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> - %11 = tt.load %10, %9, %cst : tensor<32x32x!tt.ptr, #blocked1> - %c0_i32 = arith.constant 0 : i32 - %c-1_i32 = arith.constant -1 
: i32 - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c1_i32 = arith.constant 1 : i32 - %c32_i32 = arith.constant 32 : i32 - %c900 = arith.constant 900 : index - %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> - %12 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> - %13 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> - %14 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> - %15 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> - %16 = triton_gpu.memdesc_subview %14[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %11, %16 : tensor<32x32xf32, #blocked1> -> !tt.memdesc<32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> - %17 = triton_gpu.memdesc_subview %15[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %5, %17 : tensor<32x32xf32, #blocked1> -> !tt.memdesc<32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> - %18:5 = scf.for %arg9 = %c0 to %c900 step %c1 iter_args(%arg10 = %cst_0, %arg11 = %c-1_i32, %arg12 = %c0_i32, %arg13 = %16, %arg14 = %17) -> (tensor<32x32xf32, #mma>, i32, i32, !tt.memdesc<32x32xf32, #shared3, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x32xf32, #shared4, #triton_gpu.shared_memory, mutable>) { - %19 = arith.cmpi slt, %arg9, %c899 : index - %20 = arith.addi %arg9, %c1 : index - %21 = arith.index_cast %20 : index to i32 - %22 = arith.muli %21, %c32_i32 : i32 - %23 = arith.subi %arg5, %22 : i32 - %24 = tt.splat %23 : i32 -> tensor<32x1xi32, #blocked1> - %25 = arith.cmpi slt, %1, %24 : tensor<32x1xi32, #blocked1> - %26 = tt.broadcast %25 : tensor<32x1xi1, 
#blocked1> -> tensor<32x32xi1, #blocked1> - %27 = tt.splat %19 : i1 -> tensor<32x32xi1, #blocked1> - %28 = arith.index_cast %arg9 : index to i32 - %29 = arith.addi %28, %c1_i32 : i32 - %30 = arith.muli %29, %c32_i32 : i32 - %31 = arith.muli %30, %arg7 : i32 - %32 = tt.splat %31 : i32 -> tensor<32x32xi32, #blocked1> - %33 = arith.andi %27, %26 : tensor<32x32xi1, #blocked1> - %34 = tt.addptr %13, %32 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi32, #blocked1> - %35 = tt.load %34, %33, %cst : tensor<32x32x!tt.ptr, #blocked1> - %36 = tt.splat %23 : i32 -> tensor<1x32xi32, #blocked1> - %37 = arith.cmpi slt, %7, %36 : tensor<1x32xi32, #blocked1> - %38 = tt.broadcast %37 : tensor<1x32xi1, #blocked1> -> tensor<32x32xi1, #blocked1> - %39 = tt.splat %19 : i1 -> tensor<32x32xi1, #blocked1> - %40 = tt.splat %30 : i32 -> tensor<32x32xi32, #blocked1> - %41 = arith.andi %39, %38 : tensor<32x32xi1, #blocked1> - %42 = tt.addptr %12, %40 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi32, #blocked1> - %43 = tt.load %42, %41, %cst : tensor<32x32x!tt.ptr, #blocked1> - %44 = arith.addi %arg11, %c1_i32 : i32 - %45 = arith.cmpi slt, %44, %c1_i32 : i32 - %46 = arith.select %45, %44, %c0_i32 : i32 - %47 = triton_gpu.local_load %arg13 : !tt.memdesc<32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %48 = triton_gpu.local_load %arg14 : !tt.memdesc<32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %49 = tt.dot %47, %48, %arg10 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> - %50 = arith.addi %arg12, %c1_i32 : i32 - %51 = arith.cmpi slt, %50, %c1_i32 : i32 - %52 = arith.select %51, %50, %c0_i32 : i32 - %53 = triton_gpu.memdesc_subview %14[%52, %c0_i32, %c0_i32] : 
!tt.memdesc<1x32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %43, %53 : tensor<32x32xf32, #blocked1> -> !tt.memdesc<32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> - %54 = triton_gpu.memdesc_subview %15[%52, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %35, %54 : tensor<32x32xf32, #blocked1> -> !tt.memdesc<32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> - scf.yield %49, %46, %52, %53, %54 : tensor<32x32xf32, #mma>, i32, i32, !tt.memdesc<32x32xf32, #shared3, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> - } - triton_gpu.local_dealloc %14 : !tt.memdesc<1x32x32xf32, #shared3, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %15 : !tt.memdesc<1x32x32xf32, #shared4, #triton_gpu.shared_memory, mutable> - tt.return %18#0 : tensor<32x32xf32, #mma> - } - -// CHECK-LABEL: tt.func @dep_arg_two_uses -// CHECK: %{{.*}}:5 = scf.for %[[ARG3:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG4:.*]] = %{{.*}}, %[[ARG5:.*]] = %{{.*}}, %[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}) - -// CHECK: %[[SUBI_8:.*]] = arith.subi %{{.*}}, %[[ARG3]] -// CHECK: %[[INDEX_CAST_9:.*]] = arith.index_cast %[[SUBI_8]] -// CHECK: %[[EXPAND_DIMS_10:.*]] = tt.expand_dims %[[ARG5]] {axis = 0 : i32} -// CHECK: %[[SPLAT_11:.*]] = tt.splat %[[INDEX_CAST_9]] -// CHECK: %[[EXTSI_12:.*]] = arith.extsi %[[EXPAND_DIMS_10]] -// CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_11]] -// CHECK: %[[MULI_14:.*]] = arith.muli %[[EXTSI_12]], %{{.*}} -// CHECK: %[[EXPAND_DIMS_15:.*]] = tt.expand_dims %[[CMPI_13]] {axis = 0 : i32} -// CHECK: %[[BROADCAST_16:.*]] = tt.broadcast %[[MULI_14]] -// CHECK: %[[BROADCAST_17:.*]] = tt.broadcast 
%[[EXPAND_DIMS_15]] -// CHECK: %[[ADDPTR_18:.*]] = tt.addptr %[[ARG4]], %[[BROADCAST_16]] -// CHECK: %[[LOAD_19:.*]] = tt.load %[[ADDPTR_18]], %[[BROADCAST_17]] -// CHECK: %[[SPLAT_20:.*]] = tt.splat %[[ARG6]] -// CHECK: %[[ADDPTR_21:.*]] = tt.addptr %[[SPLAT_20]], %{{.*}} -// CHECK: %[[LOAD_22:.*]] = tt.load %[[ADDPTR_21]] -// CHECK: %[[SPLAT_23:.*]] = tt.splat %[[INDEX_CAST_9]] -// CHECK: %[[CMPI_24:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_23]] -// CHECK: %[[EXPAND_DIMS_25:.*]] = tt.expand_dims %[[CMPI_24]] {axis = 1 : i32} -// CHECK: %[[BROADCAST_26:.*]] = tt.broadcast %[[EXPAND_DIMS_25]] -// CHECK: %[[LOAD_27:.*]] = tt.load %[[ARG8]], %[[BROADCAST_26]], %{{.*}} -// CHECK: %[[EXPAND_DIMS_28:.*]] = tt.expand_dims %[[ARG5]] {axis = 0 : i32} -// CHECK: %[[EXTSI_29:.*]] = arith.extsi %[[EXPAND_DIMS_28]] -// CHECK: %[[MULI_30:.*]] = arith.muli %[[EXTSI_29]], %{{.*}} -// CHECK: %[[BROADCAST_31:.*]] = tt.broadcast %[[MULI_30]] -// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG4]], %[[BROADCAST_31]] -// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG6]], %{{.*}} -// CHECK: %[[CONVERT_LAYOUT_34:.*]] = triton_gpu.convert_layout %[[LOAD_19]] -// CHECK: %[[CONVERT_LAYOUT_35:.*]] = triton_gpu.convert_layout %[[LOAD_27]] -// CHECK: %[[DOT_36:.*]] = tt.dot %[[CONVERT_LAYOUT_34]], %[[CONVERT_LAYOUT_35]], %[[ARG7]] -// CHECK: %[[ADDPTR_37:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: scf.yield %[[ADDPTR_32]], %[[LOAD_22]], %[[ADDPTR_33]], %[[DOT_36]], %[[ADDPTR_37]] -// CHECK: } - - tt.func @dep_arg_two_uses(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #mma> { - %cst = arith.constant dense<64> : tensor<32x128xi64, #blocked> - %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x128xf16, #blocked> - %c32_i32 = arith.constant 32 : i32 - %cst_1 = arith.constant dense<64> : tensor<1x32xi64, #blocked1> - %c0 = arith.constant 0 : index - %cst_2 = arith.constant 
dense<0.000000e+00> : tensor<128x128xf32, #mma> - %c32 = arith.constant 32 : index - %c100 = arith.constant 100 : index - %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %1 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %3 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %4 = tt.splat %arg0 : !tt.ptr -> tensor<128x32x!tt.ptr, #blocked1> - %5 = tt.splat %arg2 : !tt.ptr -> tensor<32x128x!tt.ptr, #blocked> - %6 = tt.addptr %arg1, %c32_i32 : !tt.ptr, i32 - %7:5 = scf.for %arg3 = %c0 to %c100 step %c32 iter_args(%arg4 = %4, %arg5 = %3, %arg6 = %6, %arg7 = %cst_2, %arg8 = %5) -> (tensor<128x32x!tt.ptr, #blocked1>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>, !tt.ptr, tensor<128x128xf32, #mma>, tensor<32x128x!tt.ptr, #blocked>) { - %8 = arith.subi %c100, %arg3 : index - %9 = arith.index_cast %8 : index to i32 - %10 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %11 = arith.cmpi slt, %2, %10 : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %12 = tt.expand_dims %11 {axis = 1 : i32} : tensor<32xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi1, #blocked> - %13 = tt.broadcast %12 : tensor<32x1xi1, #blocked> -> tensor<32x128xi1, #blocked> - %14 = tt.load %arg8, %13, %cst_0 : tensor<32x128x!tt.ptr, #blocked> - %15 = tt.splat %arg6 : !tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %16 = tt.addptr %15, %0 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %17 = tt.load %16 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent 
= #blocked1}>> - %18 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %19 = arith.cmpi slt, %1, %18 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %20 = tt.expand_dims %19 {axis = 0 : i32} : tensor<32xi1, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi1, #blocked1> - %21 = tt.expand_dims %arg5 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> - %22 = arith.extsi %21 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> - %23 = arith.muli %22, %cst_1 : tensor<1x32xi64, #blocked1> - %24 = tt.broadcast %23 : tensor<1x32xi64, #blocked1> -> tensor<128x32xi64, #blocked1> - %25 = tt.broadcast %20 : tensor<1x32xi1, #blocked1> -> tensor<128x32xi1, #blocked1> - %26 = tt.addptr %arg4, %24 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi64, #blocked1> - %27 = tt.load %26, %25 : tensor<128x32x!tt.ptr, #blocked1> - %28 = tt.expand_dims %arg5 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> - %29 = arith.extsi %28 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> - %30 = arith.muli %29, %cst_1 : tensor<1x32xi64, #blocked1> - %31 = tt.broadcast %30 : tensor<1x32xi64, #blocked1> -> tensor<128x32xi64, #blocked1> - %32 = tt.addptr %arg4, %31 : tensor<128x32x!tt.ptr, #blocked1>, tensor<128x32xi64, #blocked1> - %33 = tt.addptr %arg6, %c32_i32 : !tt.ptr, i32 - %34 = triton_gpu.convert_layout %27 : tensor<128x32xf16, #blocked1> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %35 = triton_gpu.convert_layout %14 : tensor<32x128xf16, #blocked> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %36 = tt.dot %34, %35, %arg7 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> 
tensor<128x128xf32, #mma> - %37 = tt.addptr %arg8, %cst : tensor<32x128x!tt.ptr, #blocked>, tensor<32x128xi64, #blocked> - scf.yield %32, %17, %33, %36, %37 : tensor<128x32x!tt.ptr, #blocked1>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>, !tt.ptr, tensor<128x128xf32, #mma>, tensor<32x128x!tt.ptr, #blocked> - } - tt.return %7#3 : tensor<128x128xf32, #mma> - } -} - -// ----- - -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { - -// CHECK-LABEL: tt.func @load_two_users -// CHECK: %{{.*}}:5 = scf.for %[[ARG2:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG3:.*]] = %{{.*}}, %[[ARG4:.*]] = %{{.*}}, %[[ARG5:.*]] = %{{.*}}-1_i32, %[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}) - -// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG2]], %{{.*}} -// CHECK: %[[SPLAT_22:.*]] = tt.splat %[[CMPI_21]] -// CHECK: %[[LOAD_23:.*]] = tt.load %{{.*}}, %[[SPLAT_22]] -// CHECK: %[[ADDI_24:.*]] = arith.addi %[[ARG5]], %{{.*}} -// CHECK: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} -// CHECK: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} -// CHECK: %[[CONVERT_LAYOUT_27:.*]] = triton_gpu.convert_layout %{{.*}} -// CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG7]] -// CHECK: %[[DOT_29:.*]] = tt.dot %[[CONVERT_LAYOUT_27]], 
%[[LOCAL_LOAD_28]], %{{.*}} -// CHECK: %[[TRUNCF_30:.*]] = arith.truncf %[[DOT_29]] -// CHECK: %[[CONVERT_LAYOUT_31:.*]] = triton_gpu.convert_layout %[[TRUNCF_30]] -// CHECK: %[[TRANS_32:.*]] = tt.trans %[[ARG7]] {order = array} -// CHECK: %[[LOCAL_LOAD_33:.*]] = triton_gpu.local_load %[[TRANS_32]] -// CHECK: %[[DOT_34:.*]] = tt.dot %[[CONVERT_LAYOUT_31]], %[[LOCAL_LOAD_33]], %[[ARG4]] -// CHECK: %[[ADDI_35:.*]] = arith.addi %[[ARG6]], %{{.*}} -// CHECK: %[[CMPI_36:.*]] = arith.cmpi slt, %[[ADDI_35]], %{{.*}} -// CHECK: %[[SELECT_37:.*]] = arith.select %[[CMPI_36]], %[[ADDI_35]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_38:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_37]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_23]], %[[MEMDESC_SUBVIEW_38]] -// CHECK: scf.yield %[[DOT_29]], %[[DOT_34]], %[[SELECT_26]], %[[SELECT_37]], %[[MEMDESC_SUBVIEW_38]] -// CHECK: } - - tt.func @load_two_users(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { - %c7_i32 = arith.constant 7 : i32 - %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %1 = tt.expand_dims %0 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> - %c0_i64 = arith.constant 0 : i64 - %2 = tt.addptr %arg0, %c0_i64 : !tt.ptr, i64 - %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> - %3 = tt.splat %2 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> - %4 = tt.addptr %3, %cst : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> - %5 = tt.broadcast %1 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> - %6 = tt.broadcast %4 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> - %7 = tt.addptr %6, %5 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - %8 = tt.load %7 : tensor<64x16x!tt.ptr, #blocked> - %9 = tt.make_range {end = 64 
: i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %10 = tt.expand_dims %9 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %11 = tt.addptr %arg1, %c0_i64 : !tt.ptr, i64 - %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> - %12 = tt.splat %11 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> - %13 = tt.addptr %12, %cst_0 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> - %14 = tt.broadcast %10 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> - %15 = tt.broadcast %13 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> - %16 = tt.addptr %15, %14 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %17 = tt.load %16 : tensor<128x64x!tt.ptr, #blocked1> - %c-1_i32 = arith.constant -1 : i32 - %c0_i32 = arith.constant 0 : i32 - %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> - %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %c8_i32 = arith.constant 8 : i32 - %18 = triton_gpu.local_alloc : () -> !tt.memdesc<1x64x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %19 = triton_gpu.memdesc_subview %18[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %8, %19 : tensor<64x16xf16, #blocked> -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %20:5 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2, %arg5 = %c-1_i32, %arg6 = %c0_i32, %arg7 = %19) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>, i32, i32, !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable>) : i32 { - %21 = arith.cmpi slt, %arg2, %c7_i32 : i32 - %22 = tt.splat %21 : i1 -> tensor<64x16xi1, #blocked> - %23 = tt.load %7, %22 
: tensor<64x16x!tt.ptr, #blocked> - %24 = arith.addi %arg5, %c1_i32 : i32 - %25 = arith.cmpi slt, %24, %c1_i32 : i32 - %26 = arith.select %25, %24, %c0_i32 : i32 - %27 = triton_gpu.convert_layout %17 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %28 = triton_gpu.local_load %arg7 : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %29 = tt.dot %27, %28, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> - %30 = arith.truncf %29 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> - %31 = triton_gpu.convert_layout %30 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %32 = tt.trans %arg7 {order = array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> - %33 = triton_gpu.local_load %32 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %34 = tt.dot %31, %33, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> - %35 = arith.addi %arg6, %c1_i32 : i32 - %36 = arith.cmpi slt, %35, %c1_i32 : i32 - %37 = arith.select %36, %35, %c0_i32 : i32 - %38 = triton_gpu.memdesc_subview %18[%37, %c0_i32, %c0_i32] : !tt.memdesc<1x64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %23, %38 : tensor<64x16xf16, #blocked> -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, 
mutable> - scf.yield %29, %34, %26, %37, %38 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>, i32, i32, !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> - } - triton_gpu.local_dealloc %18 : !tt.memdesc<1x64x16xf16, #shared, #triton_gpu.shared_memory, mutable> - tt.return %20#0, %20#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> - } -} - -// ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [0, 1], hasLeadingOffset = false}> -#shared1 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { - -// CHECK-LABEL: tt.func @load_two_users_incompatible_layouts -// CHECK: %{{.*}}:5 = scf.for %[[ARG2:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG3:.*]] = %{{.*}}, %[[ARG4:.*]] = %{{.*}}, %[[ARG5:.*]] = %{{.*}}-1_i32, %[[ARG6:.*]] = %{{.*}}-1_i32, %[[ARG7:.*]] = %{{.*}}) - -// CHECK: %[[CMPI_19:.*]] = arith.cmpi slt, %[[ARG2]], %{{.*}} -// CHECK: %[[SPLAT_20:.*]] = tt.splat %[[CMPI_19]] -// CHECK: %[[LOAD_21:.*]] = tt.load %{{.*}}, %[[SPLAT_20]] -// CHECK: %[[ADDI_22:.*]] = arith.addi %[[ARG5]], %{{.*}} -// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_22]], %{{.*}} -// CHECK: %[[SELECT_24:.*]] = arith.select %[[CMPI_23]], %[[ADDI_22]], %{{.*}} -// CHECK: %[[ADDI_25:.*]] = arith.addi %[[ARG6]], %{{.*}} -// CHECK: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ADDI_25]], %{{.*}} -// CHECK: %[[SELECT_27:.*]] = arith.select %[[CMPI_26]], %[[ADDI_25]], %{{.*}} 
-// CHECK: %[[CONVERT_LAYOUT_28:.*]] = triton_gpu.convert_layout %{{.*}} -// CHECK: %[[CONVERT_LAYOUT_29:.*]] = triton_gpu.convert_layout %[[ARG7]] -// CHECK: %[[DOT_30:.*]] = tt.dot %[[CONVERT_LAYOUT_28]], %[[CONVERT_LAYOUT_29]], %{{.*}} -// CHECK: %[[TRUNCF_31:.*]] = arith.truncf %[[DOT_30]] -// CHECK: %[[CONVERT_LAYOUT_32:.*]] = triton_gpu.convert_layout %[[TRUNCF_31]] -// CHECK: %[[LOCAL_ALLOC_33:.*]] = triton_gpu.local_alloc %[[ARG7]] -// CHECK: %[[TRANS_34:.*]] = tt.trans %[[LOCAL_ALLOC_33]] {order = array} -// CHECK: %[[LOCAL_LOAD_35:.*]] = triton_gpu.local_load %[[TRANS_34]] -// CHECK: %[[DOT_36:.*]] = tt.dot %[[CONVERT_LAYOUT_32]], %[[LOCAL_LOAD_35]], %[[ARG4]] -// CHECK: scf.yield %[[DOT_30]], %[[DOT_36]], %[[SELECT_24]], %[[SELECT_27]], %[[LOAD_21]] -// CHECK: } - - tt.func @load_two_users_incompatible_layouts(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { - %c7_i32 = arith.constant 7 : i32 - %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %1 = tt.expand_dims %0 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> - %c0_i64 = arith.constant 0 : i64 - %2 = tt.addptr %arg0, %c0_i64 : !tt.ptr, i64 - %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> - %3 = tt.splat %2 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> - %4 = tt.addptr %3, %cst : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> - %5 = tt.broadcast %1 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> - %6 = tt.broadcast %4 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> - %7 = tt.addptr %6, %5 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - %8 = tt.load %7 : tensor<64x16x!tt.ptr, #blocked> - %9 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = 
#blocked1}>> - %10 = tt.expand_dims %9 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %11 = tt.addptr %arg1, %c0_i64 : !tt.ptr, i64 - %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> - %12 = tt.splat %11 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> - %13 = tt.addptr %12, %cst_0 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> - %14 = tt.broadcast %10 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> - %15 = tt.broadcast %13 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> - %16 = tt.addptr %15, %14 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %17 = tt.load %16 : tensor<128x64x!tt.ptr, #blocked1> - %c-1_i32 = arith.constant -1 : i32 - %c0_i32 = arith.constant 0 : i32 - %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> - %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %c8_i32 = arith.constant 8 : i32 - %18:5 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2, %arg5 = %c-1_i32, %arg6 = %c-1_i32, %arg7 = %8) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>, i32, i32, tensor<64x16xf16, #blocked>) : i32 { - %19 = arith.cmpi slt, %arg2, %c7_i32 : i32 - %20 = tt.splat %19 : i1 -> tensor<64x16xi1, #blocked> - %21 = tt.load %7, %20 : tensor<64x16x!tt.ptr, #blocked> - %22 = arith.addi %arg5, %c1_i32 : i32 - %23 = arith.cmpi slt, %22, %c1_i32 : i32 - %24 = arith.select %23, %22, %c0_i32 : i32 - %25 = arith.addi %arg6, %c1_i32 : i32 - %26 = arith.cmpi slt, %25, %c1_i32 : i32 - %27 = arith.select %26, %25, %c0_i32 : i32 - %28 = triton_gpu.convert_layout %17 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %29 = triton_gpu.convert_layout %arg7 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent 
= #mma, kWidth = 2}>> - %30 = tt.dot %28, %29, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> - %31 = arith.truncf %30 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> - %32 = triton_gpu.convert_layout %31 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %33 = triton_gpu.local_alloc %arg7 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %34 = tt.trans %33 {order = array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> - %35 = triton_gpu.local_load %34 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %36 = tt.dot %32, %35, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> - scf.yield %30, %36, %24, %27, %21 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>, i32, i32, tensor<64x16xf16, #blocked> - } - tt.return %18#0, %18#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> - } -} - -// ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { - -// CHECK-LABEL: tt.func public @nested_loops -// CHECK: 
scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} : i32 { - -// CHECK: %[[MULI_9:.*]] = arith.muli %[[ARG4]], %{{.*}} -// CHECK: %[[SPLAT_10:.*]] = tt.splat %[[MULI_9]] -// CHECK: %[[ADDI_11:.*]] = arith.addi %[[SPLAT_10]], %{{.*}} -// CHECK: %[[EXPAND_DIMS_12:.*]] = tt.expand_dims %[[ADDI_11]] {axis = 0 : i32} -// CHECK: %[[BROADCAST_13:.*]] = tt.broadcast %[[EXPAND_DIMS_12]] -// CHECK: %[[ADDPTR_14:.*]] = tt.addptr %{{.*}}, %[[BROADCAST_13]] -// CHECK: %[[LOAD_15:.*]] = tt.load %[[ADDPTR_14]] -// CHECK: %[[SPLAT_16:.*]] = tt.splat %[[MULI_9]] -// CHECK: %[[ADDI_17:.*]] = arith.addi %[[SPLAT_16]], %{{.*}} -// CHECK: %[[EXPAND_DIMS_18:.*]] = tt.expand_dims %[[ADDI_17]] {axis = 1 : i32} -// CHECK: %[[MULI_19:.*]] = arith.muli %[[EXPAND_DIMS_18]], %{{.*}} -// CHECK: %[[EXPAND_DIMS_20:.*]] = tt.expand_dims %{{.*}} {axis = 0 : i32} -// CHECK: %[[ADDPTR_21:.*]] = tt.addptr %{{.*}}, %[[MULI_19]] -// CHECK: %[[BROADCAST_22:.*]] = tt.broadcast %[[EXPAND_DIMS_20]] -// CHECK: %[[BROADCAST_23:.*]] = tt.broadcast %[[ADDPTR_21]] -// CHECK: %[[ADDPTR_24:.*]] = tt.addptr %[[BROADCAST_23]], %[[BROADCAST_22]] -// CHECK: %[[LOAD_25:.*]] = tt.load %[[ADDPTR_24]] -// CHECK: %[[ADDPTR_26:.*]] = tt.addptr %{{.*}}, %[[MULI_19]] -// CHECK: %[[BROADCAST_27:.*]] = tt.broadcast %[[ADDPTR_26]] -// CHECK: %[[LOCAL_ALLOC_28:.*]] = triton_gpu.local_alloc -// CHECK: %[[MEMDESC_SUBVIEW_29:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_28]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_25]], %[[MEMDESC_SUBVIEW_29]] -// CHECK: %{{.*}}:4 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}-1_i32, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[MEMDESC_SUBVIEW_29]], %[[ARG9:.*]] = %[[BROADCAST_22]]) -// CHECK: %[[CMPI_31:.*]] = arith.cmpi slt, %[[ARG5]], %{{.*}} -// CHECK: %[[ADDI_32:.*]] = arith.addi %[[ARG5]], %{{.*}} -// CHECK: %[[MULI_33:.*]] = arith.muli %[[ADDI_32]], %{{.*}} -// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[MULI_33]] -// 
CHECK: %[[ADDI_35:.*]] = arith.addi %[[SPLAT_34]], %{{.*}} -// CHECK: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ADDI_35]] {axis = 0 : i32} -// CHECK: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] -// CHECK: %[[SPLAT_38:.*]] = tt.splat %[[CMPI_31]] -// CHECK: %[[ADDPTR_39:.*]] = tt.addptr %[[BROADCAST_23]], %[[BROADCAST_37]] -// CHECK: %[[LOAD_40:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_38]] -// CHECK: %[[ADDI_41:.*]] = arith.addi %[[ARG6]], %{{.*}} -// CHECK: %[[CMPI_42:.*]] = arith.cmpi slt, %[[ADDI_41]], %{{.*}} -// CHECK: %[[SELECT_43:.*]] = arith.select %[[CMPI_42]], %[[ADDI_41]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_44:.*]] = triton_gpu.local_load %[[ARG8]] -// CHECK: %[[CONVERT_LAYOUT_45:.*]] = triton_gpu.convert_layout %[[LOAD_15]] -// CHECK: %[[DOT_46:.*]] = tt.dot %[[LOCAL_LOAD_44]], %[[CONVERT_LAYOUT_45]], %{{.*}} -// CHECK: %[[ADDPTR_47:.*]] = tt.addptr %[[BROADCAST_27]], %[[ARG9]] -// CHECK: %[[CONVERT_LAYOUT_48:.*]] = triton_gpu.convert_layout %[[DOT_46]] -// CHECK: tt.store %[[ADDPTR_47]], %[[CONVERT_LAYOUT_48]] -// CHECK: %[[ADDI_49:.*]] = arith.addi %[[ARG7]], %{{.*}} -// CHECK: %[[CMPI_50:.*]] = arith.cmpi slt, %[[ADDI_49]], %{{.*}} -// CHECK: %[[SELECT_51:.*]] = arith.select %[[CMPI_50]], %[[ADDI_49]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_52:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_28]][%[[SELECT_51]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_40]], %[[MEMDESC_SUBVIEW_52]] -// CHECK: scf.yield %[[SELECT_43]], %[[SELECT_51]], %[[MEMDESC_SUBVIEW_52]], %[[BROADCAST_37]] -// CHECK: } - - tt.func public @nested_loops(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { - %c9_i32 = arith.constant 9 : i32 - %c-1_i32 = arith.constant -1 : i32 - %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> - %cst_0 = arith.constant dense<320> : tensor<32x1xi32, 
#blocked> - %c0_i32 = arith.constant 0 : i32 - %c1_i32 = arith.constant 1 : i32 - %c32_i32 = arith.constant 32 : i32 - %c10_i32 = arith.constant 10 : i32 - %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %1 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %2 = tt.expand_dims %1 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> - %3 = arith.muli %2, %cst_0 : tensor<32x1xi32, #blocked> - %4 = tt.splat %arg1 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> - %5 = tt.addptr %4, %3 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> - %6 = tt.broadcast %5 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> - %7 = tt.splat %arg0 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> - %8 = tt.splat %arg3 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> - scf.for %arg4 = %c0_i32 to %c10_i32 step %c1_i32 : i32 { - %9 = arith.muli %arg4, %c32_i32 : i32 - %10 = tt.expand_dims %0 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> - %11 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %12 = arith.addi %11, %1 : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> - %14 = arith.muli %13, %cst_0 : tensor<32x1xi32, #blocked> - %15 = tt.addptr %7, %14 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> - %16 = tt.broadcast %10 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> - %17 = tt.broadcast %15 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> - %18 = tt.addptr %17, %16 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %19 = tt.load %18 : tensor<32x32x!tt.ptr, #blocked> 
- %20 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %21 = arith.addi %20, %0 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %22 = tt.expand_dims %21 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> - %23 = tt.broadcast %22 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> - %24 = tt.addptr %6, %23 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %25 = tt.load %24 : tensor<32x32x!tt.ptr, #blocked> - %26 = tt.addptr %8, %14 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> - %27 = tt.broadcast %26 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> - %28 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> - %29 = triton_gpu.memdesc_subview %28[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %19, %29 : tensor<32x32xf32, #blocked> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> - %30:4 = scf.for %arg5 = %c0_i32 to %c10_i32 step %c1_i32 iter_args(%arg6 = %c-1_i32, %arg7 = %c0_i32, %arg8 = %29, %arg9 = %16) -> (i32, i32, !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable>, tensor<32x32xi32, #blocked>) : i32 { - %31 = arith.cmpi slt, %arg5, %c9_i32 : i32 - %32 = arith.addi %arg5, %c1_i32 : i32 - %33 = arith.muli %32, %c32_i32 : i32 - %34 = tt.splat %33 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %35 = arith.addi %34, %0 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %36 = tt.expand_dims %35 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> - %37 = tt.broadcast %36 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> - %38 = tt.splat %31 : i1 
-> tensor<32x32xi1, #blocked> - %39 = tt.addptr %17, %37 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %40 = tt.load %39, %38 : tensor<32x32x!tt.ptr, #blocked> - %41 = arith.addi %arg6, %c1_i32 : i32 - %42 = arith.cmpi slt, %41, %c1_i32 : i32 - %43 = arith.select %42, %41, %c0_i32 : i32 - %44 = triton_gpu.local_load %arg8 : !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %45 = triton_gpu.convert_layout %25 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %46 = tt.dot %44, %45, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> - %47 = tt.addptr %27, %arg9 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %48 = triton_gpu.convert_layout %46 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> - tt.store %47, %48 : tensor<32x32x!tt.ptr, #blocked> - %49 = arith.addi %arg7, %c1_i32 : i32 - %50 = arith.cmpi slt, %49, %c1_i32 : i32 - %51 = arith.select %50, %49, %c0_i32 : i32 - %52 = triton_gpu.memdesc_subview %28[%51, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %40, %52 : tensor<32x32xf32, #blocked> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> - scf.yield %43, %51, %52, %37 : i32, i32, !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable>, tensor<32x32xi32, #blocked> - } - triton_gpu.local_dealloc %28 : !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> - } - tt.return - } -} - -// ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = 
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [2, 2], order = [0, 1]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> -#shared1 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> -#shared2 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { - -// CHECK-LABEL: tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de -// CHECK: %{{.*}}:5 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}) - -// CHECK: %[[CMPI_76:.*]] = arith.cmpi slt, %[[ARG6]], %{{.*}} -// CHECK: %[[SPLAT_77:.*]] = tt.splat %[[CMPI_76]] -// CHECK: %[[LOAD_78:.*]] = tt.load %{{.*}}, %[[SPLAT_77]] -// CHECK: %[[SPLAT_79:.*]] = tt.splat %[[CMPI_76]] -// CHECK: %[[LOAD_80:.*]] = tt.load %{{.*}}, %[[SPLAT_79]] -// CHECK: %[[ADDI_81:.*]] = arith.addi %[[ARG8]], %{{.*}} -// CHECK: %[[CMPI_82:.*]] = arith.cmpi slt, %[[ADDI_81]], %{{.*}} -// CHECK: %[[SELECT_83:.*]] = arith.select %[[CMPI_82]], %[[ADDI_81]], %{{.*}} -// CHECK: %[[CONVERT_LAYOUT_84:.*]] = triton_gpu.convert_layout %{{.*}} -// CHECK: %[[TRANS_85:.*]] = tt.trans %[[ARG10]] {order = array} -// CHECK: %[[LOCAL_LOAD_86:.*]] = triton_gpu.local_load %[[TRANS_85]] -// CHECK: %[[DOT_87:.*]] = tt.dot %[[CONVERT_LAYOUT_84]], %[[LOCAL_LOAD_86]], %{{.*}} -// CHECK: %[[CONVERT_LAYOUT_88:.*]] = triton_gpu.convert_layout %[[DOT_87]] -// CHECK: %[[LOCAL_LOAD_89:.*]] = triton_gpu.local_load %[[ARG11]] -// CHECK: %[[DOT_90:.*]] = tt.dot 
%[[CONVERT_LAYOUT_88]], %[[LOCAL_LOAD_89]], %[[ARG7]] -// CHECK: %[[ADDI_91:.*]] = arith.addi %[[ARG9]], %{{.*}} -// CHECK: %[[CMPI_92:.*]] = arith.cmpi slt, %[[ADDI_91]], %{{.*}} -// CHECK: %[[SELECT_93:.*]] = arith.select %[[CMPI_92]], %[[ADDI_91]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_94:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_93]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_78]], %[[MEMDESC_SUBVIEW_94]] -// CHECK: %[[MEMDESC_SUBVIEW_95:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_93]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_80]], %[[MEMDESC_SUBVIEW_95]] -// CHECK: scf.yield %[[DOT_90]], %[[SELECT_83]], %[[SELECT_93]], %[[MEMDESC_SUBVIEW_94]], %[[MEMDESC_SUBVIEW_95]] -// CHECK: } -// CHECK: triton_gpu.local_dealloc %{{.*}} -// CHECK: triton_gpu.local_dealloc %{{.*}} -// CHECK: %[[BROADCAST_70:.*]] = tt.broadcast %{{.*}} -// CHECK: %[[BROADCAST_71:.*]] = tt.broadcast %{{.*}} -// CHECK: %[[ADDI_72:.*]] = arith.addi %[[BROADCAST_70]], %[[BROADCAST_71]] -// CHECK: %[[SPLAT_73:.*]] = tt.splat %{{.*}} -// CHECK: %[[ADDPTR_74:.*]] = tt.addptr %[[SPLAT_73]], %[[ADDI_72]] -// CHECK: %[[CONVERT_LAYOUT_75:.*]] = triton_gpu.convert_layout %{{.*}}#0 -// CHECK: tt.store %[[ADDPTR_74]], %[[CONVERT_LAYOUT_75]] - - tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} { - %c1_i32 = arith.constant 1 : i32 - %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %1 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked> - %2 = tt.expand_dims %0 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> 
tensor<1x32xi32, #blocked> - %3 = arith.muli %2, %1 : tensor<1x32xi32, #blocked> - %4 = arith.extsi %3 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> - %5 = tt.get_program_id y : i32 - %6 = arith.muli %5, %arg5 : i32 - %7 = arith.extsi %6 : i32 to i64 - %8 = arith.extsi %arg5 : i32 to i64 - %9 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %10 = tt.expand_dims %9 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> - %11 = tt.load %arg3 : !tt.ptr - %12 = arith.extsi %10 : tensor<32x1xi32, #blocked> to tensor<32x1xi64, #blocked> - %13 = tt.splat %11 : i64 -> tensor<32x1xi64, #blocked> - %14 = tt.splat %8 : i64 -> tensor<32x1xi64, #blocked> - %15 = arith.addi %13, %12 : tensor<32x1xi64, #blocked> - %16 = tt.splat %7 : i64 -> tensor<32x1xi64, #blocked> - %17 = arith.muli %15, %14 : tensor<32x1xi64, #blocked> - %18 = arith.addi %17, %16 : tensor<32x1xi64, #blocked> - %19 = tt.broadcast %4 : tensor<1x32xi64, #blocked> -> tensor<32x32xi64, #blocked> - %20 = tt.broadcast %18 : tensor<32x1xi64, #blocked> -> tensor<32x32xi64, #blocked> - %21 = arith.addi %20, %19 : tensor<32x32xi64, #blocked> - %22 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> - %23 = tt.addptr %22, %21 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi64, #blocked> - %24 = tt.load %23 : tensor<32x32x!tt.ptr, #blocked> - %25 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %26 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked> - %27 = tt.expand_dims %25 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> - %28 = arith.muli %27, %26 : tensor<1x64xi32, #blocked> - %29 = arith.extsi %28 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> - %30 = tt.broadcast %29 : tensor<1x64xi64, #blocked> -> tensor<32x64xi64, 
#blocked> - %31 = tt.broadcast %18 : tensor<32x1xi64, #blocked> -> tensor<32x64xi64, #blocked> - %32 = arith.addi %31, %30 : tensor<32x64xi64, #blocked> - %33 = tt.splat %arg1 : !tt.ptr -> tensor<32x64x!tt.ptr, #blocked> - %34 = tt.addptr %33, %32 : tensor<32x64x!tt.ptr, #blocked>, tensor<32x64xi64, #blocked> - %35 = tt.load %34 : tensor<32x64x!tt.ptr, #blocked> - %36 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %37 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked1> - %38 = tt.expand_dims %36 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %39 = arith.muli %38, %37 : tensor<1x64xi32, #blocked1> - %40 = arith.extsi %39 : tensor<1x64xi32, #blocked1> to tensor<1x64xi64, #blocked1> - %c64_i32 = arith.constant 64 : i32 - %41 = tt.get_program_id x : i32 - %42 = arith.muli %41, %c64_i32 : i32 - %43 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %44 = tt.splat %42 : i32 -> tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %45 = arith.addi %44, %43 : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %46 = tt.expand_dims %45 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1> - %47 = arith.extsi %46 : tensor<64x1xi32, #blocked1> to tensor<64x1xi64, #blocked1> - %48 = tt.splat %11 : i64 -> tensor<64x1xi64, #blocked1> - %49 = tt.splat %8 : i64 -> tensor<64x1xi64, #blocked1> - %50 = arith.addi %48, %47 : tensor<64x1xi64, #blocked1> - %51 = tt.splat %7 : i64 -> tensor<64x1xi64, #blocked1> - %52 = arith.muli %50, %49 : tensor<64x1xi64, #blocked1> - %53 = arith.addi %52, %51 : tensor<64x1xi64, #blocked1> - %54 = tt.broadcast %40 : tensor<1x64xi64, #blocked1> -> tensor<64x64xi64, #blocked1> - %55 = tt.broadcast %53 : tensor<64x1xi64, #blocked1> -> tensor<64x64xi64, 
#blocked1> - %56 = arith.addi %55, %54 : tensor<64x64xi64, #blocked1> - %57 = tt.splat %arg0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked1> - %58 = tt.addptr %57, %56 : tensor<64x64x!tt.ptr, #blocked1>, tensor<64x64xi64, #blocked1> - %59 = tt.load %58 : tensor<64x64x!tt.ptr, #blocked1> - %c-1_i32 = arith.constant -1 : i32 - %cst = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma> - %c0_i32 = arith.constant 0 : i32 - %c32_i32 = arith.constant 32 : i32 - %60 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %61 = tt.expand_dims %60 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> - %62 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked1> - %63 = arith.muli %61, %62 : tensor<1x32xi32, #blocked1> - %64 = arith.extsi %63 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> - %65 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x64xf32, #shared, #triton_gpu.shared_memory, mutable> - %66 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> - %67 = triton_gpu.memdesc_subview %65[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %35, %67 : tensor<32x64xf32, #blocked> -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> - %68 = triton_gpu.memdesc_subview %66[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %24, %68 : tensor<32x32xf32, #blocked> -> !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> - %69:5 = scf.for %arg6 = %c0_i32 to %c64_i32 step %c32_i32 iter_args(%arg7 = %cst, %arg8 = %c-1_i32, %arg9 = %c0_i32, %arg10 = %67, %arg11 = %68) -> 
(tensor<64x32xf32, #mma>, i32, i32, !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory, mutable>) : i32 { - %76 = arith.cmpi slt, %arg6, %c32_i32 : i32 - %77 = tt.splat %76 : i1 -> tensor<32x32xi1, #blocked> - %78 = tt.load %23, %77 : tensor<32x32x!tt.ptr, #blocked> - %79 = tt.splat %76 : i1 -> tensor<32x64xi1, #blocked> - %80 = tt.load %34, %79 : tensor<32x64x!tt.ptr, #blocked> - %81 = arith.addi %arg8, %c1_i32 : i32 - %82 = arith.cmpi slt, %81, %c1_i32 : i32 - %83 = arith.select %82, %81, %c0_i32 : i32 - %84 = triton_gpu.convert_layout %59 : tensor<64x64xf32, #blocked1> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %85 = tt.trans %arg10 {order = array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<64x32xf32, #shared2, #triton_gpu.shared_memory, mutable> - %86 = triton_gpu.local_load %85 : !tt.memdesc<64x32xf32, #shared2, #triton_gpu.shared_memory, mutable> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %87 = tt.dot %84, %86, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> - %88 = triton_gpu.convert_layout %87 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %89 = triton_gpu.local_load %arg11 : !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %90 = tt.dot %88, %89, %arg7 : tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> - %91 = arith.addi %arg9, %c1_i32 : i32 - %92 = arith.cmpi slt, %91, %c1_i32 : i32 - %93 = arith.select %92, %91, %c0_i32 : 
i32 - %94 = triton_gpu.memdesc_subview %65[%93, %c0_i32, %c0_i32] : !tt.memdesc<1x32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %80, %94 : tensor<32x64xf32, #blocked> -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> - %95 = triton_gpu.memdesc_subview %66[%93, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %78, %95 : tensor<32x32xf32, #blocked> -> !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> - scf.yield %90, %83, %93, %94, %95 : tensor<64x32xf32, #mma>, i32, i32, !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> - } - triton_gpu.local_dealloc %65 : !tt.memdesc<1x32x64xf32, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %66 : !tt.memdesc<1x32x32xf32, #shared1, #triton_gpu.shared_memory, mutable> - %70 = tt.broadcast %53 : tensor<64x1xi64, #blocked1> -> tensor<64x32xi64, #blocked1> - %71 = tt.broadcast %64 : tensor<1x32xi64, #blocked1> -> tensor<64x32xi64, #blocked1> - %72 = arith.addi %70, %71 : tensor<64x32xi64, #blocked1> - %73 = tt.splat %arg4 : !tt.ptr -> tensor<64x32x!tt.ptr, #blocked1> - %74 = tt.addptr %73, %72 : tensor<64x32x!tt.ptr, #blocked1>, tensor<64x32xi64, #blocked1> - %75 = triton_gpu.convert_layout %69#0 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #blocked1> - tt.store %74, %75 : tensor<64x32x!tt.ptr, #blocked1> - tt.return - } -} - -// ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, 
warpsPerCTA = [4, 1], instrShape = []}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 4, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:86", "triton_gpu.threads-per-warp" = 64 : i32} { - -// CHECK-LABEL: tt.func @indirect_load_shared_layout -// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) - -// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_21]] -// CHECK: %[[SPLAT_23:.*]] = tt.splat %[[CMPI_22]] -// CHECK: %[[ADDPTR_24:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[LOAD_25:.*]] = tt.load %[[ADDPTR_24]], %[[SPLAT_23]] -// CHECK: %[[EXPAND_DIMS_26:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} -// CHECK: %[[BROADCAST_27:.*]] = tt.broadcast %[[EXPAND_DIMS_26]] -// CHECK: %[[MULI_28:.*]] = arith.muli %{{.*}}, %[[BROADCAST_27]] -// CHECK: %[[SPLAT_29:.*]] = tt.splat %[[CMPI_22]] -// CHECK: %[[ADDPTR_30:.*]] = tt.addptr %{{.*}}, %[[MULI_28]] -// CHECK: %[[LOAD_31:.*]] = tt.load %[[ADDPTR_30]], %[[SPLAT_29]] -// CHECK: %[[CMPI_32:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] -// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_32]] -// CHECK: %[[ADDPTR_34:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_34]], %[[SPLAT_33]] -// CHECK: %[[ADDI_36:.*]] = arith.addi %[[ARG10]], %{{.*}} -// CHECK: %[[CMPI_37:.*]] = arith.cmpi slt, %[[ADDI_36]], %{{.*}} -// CHECK: %[[SELECT_38:.*]] = arith.select %[[CMPI_37]], %[[ADDI_36]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_39:.*]] = triton_gpu.local_load %[[ARG12]] -// CHECK: %[[LOCAL_LOAD_40:.*]] = triton_gpu.local_load 
%[[ARG13]] -// CHECK: %[[DOT_41:.*]] = tt.dot %[[LOCAL_LOAD_39]], %[[LOCAL_LOAD_40]], %[[ARG7]] -// CHECK: %[[ADDI_42:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_43:.*]] = arith.cmpi slt, %[[ADDI_42]], %{{.*}} -// CHECK: %[[SELECT_44:.*]] = arith.select %[[CMPI_43]], %[[ADDI_42]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_45:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_44]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_25]], %[[MEMDESC_SUBVIEW_45]] -// CHECK: %[[MEMDESC_SUBVIEW_46:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_44]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_31]], %[[MEMDESC_SUBVIEW_46]] -// CHECK: scf.yield %[[DOT_41]], %[[ADDPTR_24]], %[[ADDPTR_34]], %[[SELECT_38]], %[[SELECT_44]], %[[MEMDESC_SUBVIEW_45]], %[[MEMDESC_SUBVIEW_46]], %[[LOAD_35]] -// CHECK: } - - tt.func @indirect_load_shared_layout(%arg0: tensor<16x16xi64, #blocked> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg1: index, %arg2: tensor<16x16x!tt.ptr, #blocked1> {tt.contiguity = 2 : i32, tt.divisibility = 16 : i32}, %arg3: tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg4: tensor<16x16xi32, #blocked1> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg5: tensor<16x16x!tt.ptr, #blocked> {tt.contiguity = 16 : i32, tt.divisibility = 16 : i32}) -> tensor<16x16xf32, #mma> { - %c2 = arith.constant 2 : index - %c1 = arith.constant 1 : index - %0 = arith.cmpi sgt, %arg1, %c1 : index - %cst = arith.constant dense<1> : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %1 = tt.splat %0 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %2 = tt.addptr %arg3, %cst : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %3 = tt.load %2, %1 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %c0 = arith.constant 0 : index - %4 = arith.cmpi sgt, %arg1, %c0 : index - 
%5 = tt.splat %4 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %6 = tt.load %arg3, %5 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi64, #blocked> - %8 = tt.broadcast %7 : tensor<16x1xi64, #blocked> -> tensor<16x16xi64, #blocked> - %9 = arith.muli %arg0, %8 : tensor<16x16xi64, #blocked> - %10 = tt.splat %4 : i1 -> tensor<16x16xi1, #blocked> - %11 = tt.addptr %arg5, %9 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> - %12 = tt.load %11, %10 : tensor<16x16x!tt.ptr, #blocked> - %13 = tt.splat %4 : i1 -> tensor<16x16xi1, #blocked1> - %14 = tt.load %arg2, %13 : tensor<16x16x!tt.ptr, #blocked1> - %c0_i32 = arith.constant 0 : i32 - %c-1_i32 = arith.constant -1 : i32 - %cst_0 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %15 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %16 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %17 = triton_gpu.memdesc_subview %15[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %14, %17 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %18 = triton_gpu.memdesc_subview %16[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %12, %18 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %19:8 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args(%arg7 = %cst_0, %arg8 = %arg2, %arg9 = %2, %arg10 = %c-1_i32, %arg11 = 
%c0_i32, %arg12 = %17, %arg13 = %18, %arg14 = %3) -> (tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, i32, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) { - %20 = arith.subi %arg1, %c2 : index - %21 = arith.cmpi slt, %arg6, %20 : index - %22 = tt.splat %21 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %23 = tt.addptr %arg9, %cst : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %24 = tt.load %23, %22 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %25 = arith.subi %arg1, %c1 : index - %26 = arith.cmpi slt, %arg6, %25 : index - %27 = tt.expand_dims %arg14 {axis = 1 : i32} : tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi64, #blocked> - %28 = tt.broadcast %27 : tensor<16x1xi64, #blocked> -> tensor<16x16xi64, #blocked> - %29 = arith.muli %arg0, %28 : tensor<16x16xi64, #blocked> - %30 = tt.splat %26 : i1 -> tensor<16x16xi1, #blocked> - %31 = tt.addptr %arg5, %29 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> - %32 = tt.load %31, %30 : tensor<16x16x!tt.ptr, #blocked> - %33 = tt.splat %26 : i1 -> tensor<16x16xi1, #blocked1> - %34 = tt.addptr %arg8, %arg4 : tensor<16x16x!tt.ptr, #blocked1>, tensor<16x16xi32, #blocked1> - %35 = tt.load %34, %33 : tensor<16x16x!tt.ptr, #blocked1> - %36 = arith.addi %arg10, %c1_i32 : i32 - %37 = arith.cmpi slt, %36, %c1_i32 : i32 - %38 = arith.select %37, %36, %c0_i32 : i32 - %39 = triton_gpu.local_load %arg12 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %40 = triton_gpu.local_load %arg13 : !tt.memdesc<16x16xf16, 
#shared, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %41 = tt.dot %39, %40, %arg7 : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<16x16xf32, #mma> - %42 = arith.addi %arg11, %c1_i32 : i32 - %43 = arith.cmpi slt, %42, %c1_i32 : i32 - %44 = arith.select %43, %42, %c0_i32 : i32 - %45 = triton_gpu.memdesc_subview %15[%44, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %35, %45 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %46 = triton_gpu.memdesc_subview %16[%44, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %32, %46 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - scf.yield %41, %34, %23, %38, %44, %45, %46, %24 : tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, i32, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - } - triton_gpu.local_dealloc %15 : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %16 : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - tt.return %19#0 : tensor<16x16xf32, #mma> - } -} - -// ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, 
versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:86", "triton_gpu.threads-per-warp" = 64 : i32} { - -// CHECK-LABEL: tt.func public @kernel_yield_constant -// CHECK: %{{.*}}:4 = scf.for %[[ARG7:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}) - -// CHECK: %[[SUBI_17:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[ADDI_18:.*]] = arith.addi %[[ARG7]], %{{.*}} -// CHECK: %[[MULI_19:.*]] = arith.muli %[[ADDI_18]], %{{.*}} -// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %[[MULI_19]] -// CHECK: %[[SPLAT_21:.*]] = tt.splat %[[SUBI_20]] -// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %{{.*}}, %[[SPLAT_21]] -// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG7]], %[[SUBI_17]] -// CHECK: %[[MULI_24:.*]] = arith.muli %[[MULI_19]], %{{.*}} -// CHECK: %[[BROADCAST_25:.*]] = tt.broadcast %[[CMPI_22]] -// CHECK: %[[SPLAT_26:.*]] = tt.splat %[[CMPI_23]] -// CHECK: %[[SPLAT_27:.*]] = tt.splat %[[MULI_24]] -// CHECK: %[[ANDI_28:.*]] = arith.andi %[[SPLAT_26]], %[[BROADCAST_25]] -// CHECK: %[[ADDPTR_29:.*]] = tt.addptr %{{.*}}, %[[SPLAT_27]] -// CHECK: %[[LOAD_30:.*]] = tt.load %[[ADDPTR_29]], %[[ANDI_28]], %{{.*}} -// CHECK: %[[ADDI_31:.*]] = arith.addi %[[ARG9]], %{{.*}} -// CHECK: %[[CMPI_32:.*]] = arith.cmpi slt, %[[ADDI_31]], %{{.*}} -// CHECK: %[[SELECT_33:.*]] = arith.select %[[CMPI_32]], %[[ADDI_31]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_34:.*]] = triton_gpu.local_load %[[ARG11]] -// CHECK: %[[DOT_35:.*]] = tt.dot %{{.*}}, %[[LOCAL_LOAD_34]], %[[ARG8]] -// CHECK: %[[CONVERT_LAYOUT_36:.*]] = triton_gpu.convert_layout %[[DOT_35]] -// CHECK: tt.store %{{.*}}, %[[CONVERT_LAYOUT_36]] -// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG10]], %{{.*}} -// 
CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_39]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_30]], %[[MEMDESC_SUBVIEW_40]] -// CHECK: scf.yield %{{.*}}, %[[SELECT_33]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]] -// CHECK: } - - tt.func public @kernel_yield_constant(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { - %cst = arith.constant dense<0> : tensor<32x32xi32, #blocked> - %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %1 = tt.splat %arg4 : i32 -> tensor<32x1xi32, #blocked> - %2 = tt.expand_dims %0 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> - %3 = arith.cmpi slt, %2, %1 : tensor<32x1xi32, #blocked> - %c31_i32 = arith.constant 31 : i32 - %c32_i32 = arith.constant 32 : i32 - %4 = arith.addi %arg4, %c31_i32 : i32 - %c0_i32 = arith.constant 0 : i32 - %5 = arith.divsi %4, %c32_i32 : i32 - %6 = arith.cmpi sgt, %5, %c0_i32 : i32 - %7 = tt.broadcast %3 : tensor<32x1xi1, #blocked> -> tensor<32x32xi1, #blocked> - %8 = tt.splat %6 : i1 -> tensor<32x32xi1, #blocked> - %9 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> - %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #blocked> - %10 = arith.andi %8, %7 : tensor<32x32xi1, #blocked> - %11 = tt.addptr %9, %cst : 
tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %12 = tt.load %11, %10, %cst_0 : tensor<32x32x!tt.ptr, #blocked> - %c-1_i32 = arith.constant -1 : i32 - %cst_1 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> - %cst_2 = arith.constant dense<1.000000e+00> : tensor<32x32xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %cst_3 = arith.constant dense<2.000000e+00> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %13 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> - %14 = triton_gpu.local_alloc : () -> !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> - %15 = triton_gpu.memdesc_subview %14[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %12, %15 : tensor<32x32xf32, #blocked> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> - %16:4 = scf.for %arg7 = %c0_i32 to %5 step %c1_i32 iter_args(%arg8 = %cst_1, %arg9 = %c-1_i32, %arg10 = %c0_i32, %arg11 = %15) -> (tensor<32x32xf32, #mma>, i32, i32, !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable>) : i32 { - %17 = arith.subi %5, %c1_i32 : i32 - %18 = arith.addi %arg7, %c1_i32 : i32 - %19 = arith.muli %18, %c32_i32 : i32 - %20 = arith.subi %arg4, %19 : i32 - %21 = tt.splat %20 : i32 -> tensor<32x1xi32, #blocked> - %22 = arith.cmpi slt, %2, %21 : tensor<32x1xi32, #blocked> - %23 = arith.cmpi slt, %arg7, %17 : i32 - %24 = tt.broadcast %22 : tensor<32x1xi1, #blocked> -> tensor<32x32xi1, #blocked> - %25 = tt.splat %23 : i1 -> tensor<32x32xi1, #blocked> - %26 = arith.muli %19, %arg5 : i32 - %27 = tt.splat %26 : i32 -> tensor<32x32xi32, #blocked> - %28 = arith.andi %25, %24 : tensor<32x32xi1, #blocked> - %29 = tt.addptr %9, %27 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %30 = tt.load %29, %28, %cst_0 : tensor<32x32x!tt.ptr, #blocked> - %31 
= arith.addi %arg9, %c1_i32 : i32 - %32 = arith.cmpi slt, %31, %c1_i32 : i32 - %33 = arith.select %32, %31, %c0_i32 : i32 - %34 = triton_gpu.local_load %arg11 : !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %35 = tt.dot %cst_3, %34, %arg8 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> - %36 = triton_gpu.convert_layout %35 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> - tt.store %13, %36 : tensor<32x32x!tt.ptr, #blocked> - %37 = arith.addi %arg10, %c1_i32 : i32 - %38 = arith.cmpi slt, %37, %c1_i32 : i32 - %39 = arith.select %38, %37, %c0_i32 : i32 - %40 = triton_gpu.memdesc_subview %14[%39, %c0_i32, %c0_i32] : !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %30, %40 : tensor<32x32xf32, #blocked> -> !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> - scf.yield %cst_2, %33, %39, %40 : tensor<32x32xf32, #mma>, i32, i32, !tt.memdesc<32x32xf32, #shared, #triton_gpu.shared_memory, mutable> - } - triton_gpu.local_dealloc %14 : !tt.memdesc<1x32x32xf32, #shared, #triton_gpu.shared_memory, mutable> - tt.return - } -} - -// ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { - -// CHECK-LABEL: tt.func public @add_kernel -// CHECK: %{{.*}}:10 = scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG5:.*]] = %{{.*}}-1_i32, %[[ARG6:.*]] = %{{.*}}-1_i32, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, 
%[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) - -// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} -// CHECK: %[[ADDI_24:.*]] = arith.addi %[[ARG4]], %{{.*}} -// CHECK: %[[ADDI_25:.*]] = arith.addi %{{.*}}, %[[ADDI_24]] -// CHECK: %[[SPLAT_26:.*]] = tt.splat %[[ADDI_25]] -// CHECK: %[[ADDI_27:.*]] = arith.addi %[[SPLAT_26]], %{{.*}} -// CHECK: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} -// CHECK: %[[SPLAT_29:.*]] = tt.splat %[[CMPI_23]] -// CHECK: %[[ANDI_30:.*]] = arith.andi %[[SPLAT_29]], %[[CMPI_28]] -// CHECK: %[[ADDPTR_31:.*]] = tt.addptr %{{.*}}, %[[ADDI_27]] -// CHECK: %[[LOAD_32:.*]] = tt.load %[[ADDPTR_31]], %[[ANDI_30]] -// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_23]] -// CHECK: %[[ANDI_34:.*]] = arith.andi %[[SPLAT_33]], %[[CMPI_28]] -// CHECK: %[[ADDPTR_35:.*]] = tt.addptr %{{.*}}, %[[ADDI_27]] -// CHECK: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_35]], %[[ANDI_34]] -// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG5]], %{{.*}} -// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// CHECK: %[[ADDI_40:.*]] = arith.addi %[[ARG6]], %{{.*}} -// CHECK: %[[CMPI_41:.*]] = arith.cmpi slt, %[[ADDI_40]], %{{.*}} -// CHECK: %[[SELECT_42:.*]] = arith.select %[[CMPI_41]], %[[ADDI_40]], %{{.*}} -// CHECK: %[[ADDF_43:.*]] = arith.addf %[[ARG7]], %[[ARG9]] -// CHECK: %[[ADDPTR_44:.*]] = tt.addptr %{{.*}}, %[[ARG11]] -// CHECK: tt.store %[[ADDPTR_44]], %[[ADDF_43]], %[[ARG13]] -// CHECK: scf.yield %[[SELECT_39]], %[[SELECT_42]], %[[ARG8]], %[[LOAD_32]], %[[ARG10]], %[[LOAD_36]], %[[ARG12]], %[[ADDI_27]], %[[ARG14]], %[[CMPI_28]] -// CHECK: } - - tt.func public @add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} 
{ - %c2048_i32 = arith.constant 2048 : i32 - %c1016800_i32 = arith.constant 1016800 : i32 - %0 = tt.get_program_id x : i32 - %c1024_i32 = arith.constant 1024 : i32 - %1 = arith.muli %0, %c1016800_i32 : i32 - %2 = arith.addi %1, %c1024_i32 : i32 - %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> - %4 = tt.splat %2 : i32 -> tensor<1024xi32, #blocked> - %5 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> - %6 = arith.addi %4, %3 : tensor<1024xi32, #blocked> - %7 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - %8 = arith.cmpi slt, %6, %5 : tensor<1024xi32, #blocked> - %9 = tt.addptr %7, %6 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %10 = tt.load %9, %8 : tensor<1024x!tt.ptr, #blocked> - %11 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - %12 = tt.addptr %11, %6 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %13 = tt.load %12, %8 : tensor<1024x!tt.ptr, #blocked> - %14 = tt.splat %1 : i32 -> tensor<1024xi32, #blocked> - %15 = arith.addi %14, %3 : tensor<1024xi32, #blocked> - %16 = arith.cmpi slt, %15, %5 : tensor<1024xi32, #blocked> - %17 = tt.addptr %7, %15 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %18 = tt.load %17, %16 : tensor<1024x!tt.ptr, #blocked> - %19 = tt.addptr %11, %15 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %20 = tt.load %19, %16 : tensor<1024x!tt.ptr, #blocked> - %c1014752_i32 = arith.constant 1014752 : i32 - %c2_i32 = arith.constant 2 : i32 - %c1_i32 = arith.constant 1 : i32 - %c-1_i32 = arith.constant -1 : i32 - %c0_i32 = arith.constant 0 : i32 - %21 = tt.splat %arg2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - %22:10 = scf.for %arg4 = %c0_i32 to %c1016800_i32 step %c1024_i32 iter_args(%arg5 = %c-1_i32, %arg6 = %c-1_i32, %arg7 = %20, %arg8 = %13, %arg9 = %18, %arg10 = %10, %arg11 = %15, %arg12 = %6, %arg13 = %16, %arg14 = %8) -> (i32, i32, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, 
tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xi32, #blocked>, tensor<1024xi32, #blocked>, tensor<1024xi1, #blocked>, tensor<1024xi1, #blocked>) : i32 { - %23 = arith.cmpi slt, %arg4, %c1014752_i32 : i32 - %24 = arith.addi %arg4, %c2048_i32 : i32 - %25 = arith.addi %1, %24 : i32 - %26 = tt.splat %25 : i32 -> tensor<1024xi32, #blocked> - %27 = arith.addi %26, %3 : tensor<1024xi32, #blocked> - %28 = arith.cmpi slt, %27, %5 : tensor<1024xi32, #blocked> - %29 = tt.splat %23 : i1 -> tensor<1024xi1, #blocked> - %30 = arith.andi %29, %28 : tensor<1024xi1, #blocked> - %31 = tt.addptr %7, %27 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %32 = tt.load %31, %30 : tensor<1024x!tt.ptr, #blocked> - %33 = tt.splat %23 : i1 -> tensor<1024xi1, #blocked> - %34 = arith.andi %33, %28 : tensor<1024xi1, #blocked> - %35 = tt.addptr %11, %27 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %36 = tt.load %35, %34 : tensor<1024x!tt.ptr, #blocked> - %37 = arith.addi %arg5, %c1_i32 : i32 - %38 = arith.cmpi slt, %37, %c2_i32 : i32 - %39 = arith.select %38, %37, %c0_i32 : i32 - %40 = arith.addi %arg6, %c1_i32 : i32 - %41 = arith.cmpi slt, %40, %c2_i32 : i32 - %42 = arith.select %41, %40, %c0_i32 : i32 - %43 = arith.addf %arg7, %arg9 : tensor<1024xf32, #blocked> - %44 = tt.addptr %21, %arg11 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - tt.store %44, %43, %arg13 : tensor<1024x!tt.ptr, #blocked> - scf.yield %39, %42, %arg8, %36, %arg10, %32, %arg12, %27, %arg14, %28 : i32, i32, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xi32, #blocked>, tensor<1024xi32, #blocked>, tensor<1024xi1, #blocked>, tensor<1024xi1, #blocked> - } - tt.return - } -} - -// ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 4], warpsPerCTA = [2, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA 
= [1, 2], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -#shared1 = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { - -// CHECK-LABEL: tt.func public @nested_loops -// CHECK: scf.for %[[ARG1:.*]] = %{{.*}} to %{{.*}} step %{{.*}} : i32 { - -// CHECK: %[[LOAD_10:.*]] = tt.load %{{.*}} -// CHECK: %[[LOAD_11:.*]] = tt.load %{{.*}} -// CHECK: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc %[[LOAD_10]] -// CHECK: %[[TRANS_13:.*]] = tt.trans %[[LOCAL_ALLOC_12]] {order = array<i32: 1, 0>} -// CHECK: %[[LOCAL_LOAD_14:.*]] = triton_gpu.local_load %[[TRANS_13]] -// CHECK: %[[LOCAL_ALLOC_15:.*]] = triton_gpu.local_alloc -// CHECK: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_15]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_11]], %[[MEMDESC_SUBVIEW_16]] -// CHECK: %{{.*}}:3 = scf.for %[[ARG2:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG3:.*]] = %{{.*}}-1_i32, %[[ARG4:.*]] = %{{.*}}, %[[ARG5:.*]] = %[[MEMDESC_SUBVIEW_16]]) - -// CHECK: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG2]], %{{.*}} -// CHECK: %[[SPLAT_19:.*]] = tt.splat %[[CMPI_18]] -// CHECK: %[[LOAD_20:.*]] = tt.load %{{.*}}, %[[SPLAT_19]] -// CHECK: %[[ADDI_21:.*]] = arith.addi %[[ARG3]], %{{.*}} -// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ADDI_21]], %{{.*}} -// CHECK: %[[SELECT_23:.*]] = arith.select %[[CMPI_22]], %[[ADDI_21]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_24:.*]] = triton_gpu.local_load %[[ARG5]] -// CHECK: %[[DOT_25:.*]] = tt.dot %[[LOCAL_LOAD_24]], %[[LOCAL_LOAD_14]], %{{.*}} -// CHECK: %[[CONVERT_LAYOUT_26:.*]] = triton_gpu.convert_layout %[[DOT_25]] -// CHECK: tt.store %{{.*}}, %[[CONVERT_LAYOUT_26]] -// CHECK: %[[ADDI_27:.*]] = arith.addi %[[ARG4]],
%{{.*}} -// CHECK: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} -// CHECK: %[[SELECT_29:.*]] = arith.select %[[CMPI_28]], %[[ADDI_27]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_30:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_15]][%[[SELECT_29]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_20]], %[[MEMDESC_SUBVIEW_30]] -// CHECK: scf.yield %[[SELECT_23]], %[[SELECT_29]], %[[MEMDESC_SUBVIEW_30]] -// CHECK: } - - tt.func public @nested_loops(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { - %c-1_i32 = arith.constant -1 : i32 - %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %c2_i32 = arith.constant 2 : i32 - %c0_i32 = arith.constant 0 : i32 - %cst_0 = arith.constant dense<16> : tensor<16x1xi32, #blocked> - %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %1 = tt.expand_dims %0 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> - %2 = arith.muli %1, %cst_0 : tensor<16x1xi32, #blocked> - %3 = tt.splat %arg0 : !tt.ptr -> tensor<16x1x!tt.ptr, #blocked> - %4 = tt.addptr %3, %2 : tensor<16x1x!tt.ptr, #blocked>, tensor<16x1xi32, #blocked> - %5 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %6 = tt.expand_dims %5 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> - %7 = tt.broadcast %4 : tensor<16x1x!tt.ptr, #blocked> -> tensor<16x16x!tt.ptr, #blocked> - %8 = tt.broadcast %6 : tensor<1x16xi32, #blocked> -> tensor<16x16xi32, #blocked> - %9 = tt.addptr %7, %8 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> - scf.for %arg1 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { - %10 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> - %11 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> - %12 = 
triton_gpu.local_alloc %10 : (tensor<16x16xf32, #blocked>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> - %13 = tt.trans %12 {order = array<i32: 1, 0>} : !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory, mutable> - %14 = triton_gpu.local_load %13 : !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %15 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf32, #shared, #triton_gpu.shared_memory, mutable> - %16 = triton_gpu.memdesc_subview %15[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %11, %16 : tensor<16x16xf32, #blocked> -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> - %17:3 = scf.for %arg2 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg3 = %c-1_i32, %arg4 = %c0_i32, %arg5 = %16) -> (i32, i32, !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable>) : i32 { - %18 = arith.cmpi slt, %arg2, %c1_i32 : i32 - %19 = tt.splat %18 : i1 -> tensor<16x16xi1, #blocked> - %20 = tt.load %9, %19 : tensor<16x16x!tt.ptr, #blocked> - %21 = arith.addi %arg3, %c1_i32 : i32 - %22 = arith.cmpi slt, %21, %c1_i32 : i32 - %23 = arith.select %22, %21, %c0_i32 : i32 - %24 = triton_gpu.local_load %arg5 : !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %25 = tt.dot %24, %14, %cst : tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<16x16xf32, #mma> - %26 = triton_gpu.convert_layout %25 : tensor<16x16xf32, #mma> -> tensor<16x16xf32, #blocked> - tt.store %9, %26 : tensor<16x16x!tt.ptr, #blocked> - %27
= arith.addi %arg4, %c1_i32 : i32 - %28 = arith.cmpi slt, %27, %c1_i32 : i32 - %29 = arith.select %28, %27, %c0_i32 : i32 - %30 = triton_gpu.memdesc_subview %15[%29, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %20, %30 : tensor<16x16xf32, #blocked> -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> - scf.yield %23, %29, %30 : i32, i32, !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> - } - triton_gpu.local_dealloc %15 : !tt.memdesc<1x16x16xf32, #shared, #triton_gpu.shared_memory, mutable> - } - tt.return - } -} - -// ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = []}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 4, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { - -// CHECK-LABEL: tt.func @load_convert_layout -// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) - -// CHECK: %[[SUBI_24:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[SUBI_25:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] -// CHECK: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_26]] -// CHECK: %[[ADDPTR_28:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: 
%[[LOAD_29:.*]] = tt.load %[[ADDPTR_28]], %[[SPLAT_27]] -// CHECK: %[[EXPAND_DIMS_30:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} -// CHECK: %[[BROADCAST_31:.*]] = tt.broadcast %[[EXPAND_DIMS_30]] -// CHECK: %[[MULI_32:.*]] = arith.muli %{{.*}}, %[[BROADCAST_31]] -// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_26]] -// CHECK: %[[ADDPTR_34:.*]] = tt.addptr %{{.*}}, %[[MULI_32]] -// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_34]], %[[SPLAT_33]] -// CHECK: %[[CMPI_36:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_24]] -// CHECK: %[[SPLAT_37:.*]] = tt.splat %[[CMPI_36]] -// CHECK: %[[ANDI_38:.*]] = arith.andi %[[SPLAT_37]], %{{.*}} -// CHECK: %[[ADDPTR_39:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: %[[LOAD_40:.*]] = tt.load %[[ADDPTR_39]], %[[ANDI_38]] -// CHECK: %[[ADDI_41:.*]] = arith.addi %[[ARG10]], %{{.*}} -// CHECK: %[[CMPI_42:.*]] = arith.cmpi slt, %[[ADDI_41]], %{{.*}} -// CHECK: %[[SELECT_43:.*]] = arith.select %[[CMPI_42]], %[[ADDI_41]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_44:.*]] = triton_gpu.local_load %[[ARG12]] -// CHECK: %[[LOCAL_LOAD_45:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[DOT_46:.*]] = tt.dot %[[LOCAL_LOAD_44]], %[[LOCAL_LOAD_45]], %[[ARG7]] -// CHECK: %[[ADDI_47:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_48:.*]] = arith.cmpi slt, %[[ADDI_47]], %{{.*}} -// CHECK: %[[SELECT_49:.*]] = arith.select %[[CMPI_48]], %[[ADDI_47]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_50:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_49]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_29]], %[[MEMDESC_SUBVIEW_50]] -// CHECK: %[[MEMDESC_SUBVIEW_51:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_49]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_51]] -// CHECK: scf.yield %[[DOT_46]], %[[ADDPTR_28]], %[[ADDPTR_39]], %[[SELECT_43]], %[[SELECT_49]], %[[MEMDESC_SUBVIEW_50]], %[[MEMDESC_SUBVIEW_51]], %[[LOAD_40]] -// CHECK: } - - tt.func @load_convert_layout(%arg0: tensor<16x16xi64, #blocked> 
{tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg1: index, %arg2: tensor<16x16x!tt.ptr, #blocked1> {tt.contiguity = 2 : i32, tt.divisibility = 16 : i32}, %arg3: tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg4: tensor<16x16xi32, #blocked1> {tt.constancy = 16 : i32, tt.divisibility = 16 : i32}, %arg5: tensor<16x16x!tt.ptr, #blocked> {tt.contiguity = 16 : i32, tt.divisibility = 16 : i32}) -> tensor<16x16xf32, #mma> { - %c2 = arith.constant 2 : index - %cst = arith.constant dense<2> : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %c1 = arith.constant 1 : index - %1 = arith.cmpi sgt, %arg1, %c1 : index - %2 = arith.cmpi slt, %0, %cst : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %3 = tt.splat %1 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %cst_0 = arith.constant dense<1> : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %4 = arith.andi %3, %2 : tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %5 = tt.addptr %arg3, %cst_0 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %6 = tt.load %5, %4 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %c0 = arith.constant 0 : index - %7 = arith.cmpi sgt, %arg1, %c0 : index - %8 = tt.splat %7 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %9 = arith.andi %8, %2 : tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %10 = tt.load %arg3, %9 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %11 = tt.expand_dims %10 {axis = 1 : i32} : tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi64, #blocked> - %12 = tt.broadcast %11 : tensor<16x1xi64, #blocked> -> tensor<16x16xi64, #blocked> - 
%13 = arith.muli %arg0, %12 : tensor<16x16xi64, #blocked> - %14 = tt.splat %7 : i1 -> tensor<16x16xi1, #blocked> - %15 = tt.addptr %arg5, %13 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> - %16 = tt.load %15, %14 : tensor<16x16x!tt.ptr, #blocked> - %17 = tt.splat %7 : i1 -> tensor<16x16xi1, #blocked1> - %18 = tt.load %arg2, %17 : tensor<16x16x!tt.ptr, #blocked1> - %c0_i32 = arith.constant 0 : i32 - %c-1_i32 = arith.constant -1 : i32 - %c1_i32 = arith.constant 1 : i32 - %cst_1 = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> - %19 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %20 = triton_gpu.local_alloc : () -> !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %21 = triton_gpu.memdesc_subview %19[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %18, %21 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %22 = triton_gpu.memdesc_subview %20[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %16, %22 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %23:8 = scf.for %arg6 = %c0 to %arg1 step %c1 iter_args(%arg7 = %cst_1, %arg8 = %arg2, %arg9 = %5, %arg10 = %c-1_i32, %arg11 = %c0_i32, %arg12 = %21, %arg13 = %22, %arg14 = %6) -> (tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, i32, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) { - %24 
= arith.subi %arg1, %c2 : index - %25 = arith.cmpi slt, %arg6, %24 : index - %26 = tt.splat %25 : i1 -> tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %27 = arith.andi %26, %2 : tensor<16xi1, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %28 = tt.addptr %arg9, %cst_0 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %29 = tt.load %28, %27 : tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %30 = arith.subi %arg1, %c1 : index - %31 = arith.cmpi slt, %arg6, %30 : index - %32 = tt.expand_dims %arg14 {axis = 1 : i32} : tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi64, #blocked> - %33 = tt.broadcast %32 : tensor<16x1xi64, #blocked> -> tensor<16x16xi64, #blocked> - %34 = arith.muli %arg0, %33 : tensor<16x16xi64, #blocked> - %35 = tt.splat %31 : i1 -> tensor<16x16xi1, #blocked> - %36 = tt.addptr %arg5, %34 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi64, #blocked> - %37 = tt.load %36, %35 : tensor<16x16x!tt.ptr, #blocked> - %38 = tt.splat %31 : i1 -> tensor<16x16xi1, #blocked1> - %39 = tt.addptr %arg8, %arg4 : tensor<16x16x!tt.ptr, #blocked1>, tensor<16x16xi32, #blocked1> - %40 = tt.load %39, %38 : tensor<16x16x!tt.ptr, #blocked1> - %41 = arith.addi %arg10, %c1_i32 : i32 - %42 = arith.cmpi slt, %41, %c1_i32 : i32 - %43 = arith.select %42, %41, %c0_i32 : i32 - %44 = triton_gpu.local_load %arg12 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %45 = triton_gpu.local_load %arg13 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %46 = tt.dot %44, %45, %arg7 : tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth 
= 2}>> -> tensor<16x16xf32, #mma> - %47 = arith.addi %arg11, %c1_i32 : i32 - %48 = arith.cmpi slt, %47, %c1_i32 : i32 - %49 = arith.select %48, %47, %c0_i32 : i32 - %50 = triton_gpu.memdesc_subview %19[%49, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %40, %50 : tensor<16x16xf16, #blocked1> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %51 = triton_gpu.memdesc_subview %20[%49, %c0_i32, %c0_i32] : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %37, %51 : tensor<16x16xf16, #blocked> -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - scf.yield %46, %39, %28, %43, %49, %50, %51, %29 : tensor<16x16xf32, #mma>, tensor<16x16x!tt.ptr, #blocked1>, tensor<16x!tt.ptr, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, i32, i32, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory, mutable>, tensor<16xi64, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - } - triton_gpu.local_dealloc %19 : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_dealloc %20 : !tt.memdesc<1x16x16xf16, #shared, #triton_gpu.shared_memory, mutable> - tt.return %23#0 : tensor<16x16xf32, #mma> - } -} - -// ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 2], order = [0, 1]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 1], instrShape = [16, 8]}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { - -// CHECK-LABEL: tt.func public @matmul_indirect_pipeline -// CHECK: %{{.*}}:4 = 
scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG5:.*]] = %{{.*}}-1_i32, %[[ARG6:.*]] = %{{.*}}-1_i32, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}) - -// CHECK: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} -// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} -// CHECK: %[[SPLAT_22:.*]] = tt.splat %[[CMPI_21]] -// CHECK: %[[ADDPTR_23:.*]] = tt.addptr %{{.*}}, %[[ARG8]] -// CHECK: %[[LOAD_24:.*]] = tt.load %[[ADDPTR_23]], %[[SPLAT_22]] -// CHECK: %[[SPLAT_25:.*]] = tt.splat %[[CMPI_20]] -// CHECK: %[[LOAD_26:.*]] = tt.load %{{.*}}, %[[SPLAT_25]] -// CHECK: %[[ADDI_27:.*]] = arith.addi %[[ARG5]], %{{.*}} -// CHECK: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} -// CHECK: %[[SELECT_29:.*]] = arith.select %[[CMPI_28]], %[[ADDI_27]], %{{.*}} -// CHECK: %[[ADDI_30:.*]] = arith.addi %[[ARG6]], %{{.*}} -// CHECK: %[[CMPI_31:.*]] = arith.cmpi slt, %[[ADDI_30]], %{{.*}} -// CHECK: %[[SELECT_32:.*]] = arith.select %[[CMPI_31]], %[[ADDI_30]], %{{.*}} -// CHECK: %[[EXPAND_DIMS_33:.*]] = tt.expand_dims %[[ARG7]] {axis = 0 : i32} -// CHECK: %[[BROADCAST_34:.*]] = tt.broadcast %[[EXPAND_DIMS_33]] -// CHECK: %[[ADDF_35:.*]] = arith.addf %{{.*}}, %[[BROADCAST_34]] -// CHECK: %[[CONVERT_LAYOUT_36:.*]] = triton_gpu.convert_layout %{{.*}} -// CHECK: %[[CONVERT_LAYOUT_37:.*]] = triton_gpu.convert_layout %[[ADDF_35]] -// CHECK: %[[DOT_38:.*]] = tt.dot %[[CONVERT_LAYOUT_36]], %[[CONVERT_LAYOUT_37]], %{{.*}} -// CHECK: %[[CONVERT_LAYOUT_39:.*]] = triton_gpu.convert_layout %[[DOT_38]] -// CHECK: tt.store %{{.*}}, %[[CONVERT_LAYOUT_39]] -// CHECK: scf.yield %[[SELECT_29]], %[[SELECT_32]], %[[LOAD_24]], %[[LOAD_26]] -// CHECK: } - - tt.func public @matmul_indirect_pipeline(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { - %c-1_i32 = arith.constant -1 : i32 - %0 = tt.make_range 
{end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %1 = tt.splat %arg1 : !tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %2 = tt.addptr %1, %0 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %3 = tt.load %2 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %4 = tt.load %2 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %5 = tt.splat %arg2 : !tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %6 = tt.addptr %5, %4 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi64, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %7 = tt.load %6 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %8 = tt.expand_dims %0 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> - %9 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %10 = tt.expand_dims %9 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> - %11 = tt.broadcast %8 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> - %12 = tt.broadcast %10 : tensor<32x1xi32, #blocked> -> tensor<32x32xi32, #blocked> - %13 = arith.addi %12, %11 : tensor<32x32xi32, #blocked> - %14 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> - %15 = tt.addptr %14, %13 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %16 = tt.load %15 : tensor<32x32x!tt.ptr, #blocked> - %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %c2_i32 = arith.constant 2 : i32 - %c0_i32 = arith.constant 0 : i32 - %17 = tt.splat %arg3 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> - %18 = tt.addptr %17, %13 : 
tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %19:4 = scf.for %arg4 = %c0_i32 to %c2_i32 step %c1_i32 iter_args(%arg5 = %c-1_i32, %arg6 = %c-1_i32, %arg7 = %7, %arg8 = %3) -> (i32, i32, tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi64, #triton_gpu.slice<{dim = 0, parent = #blocked}>>) : i32 { - %20 = arith.cmpi slt, %arg4, %c0_i32 : i32 - %21 = tt.splat %20 : i1 -> tensor<32xi1, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %22 = tt.load %2, %21 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %23 = arith.cmpi slt, %arg4, %c1_i32 : i32 - %24 = tt.splat %23 : i1 -> tensor<32xi1, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %25 = tt.addptr %5, %arg8 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi64, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %26 = tt.load %25, %24 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %27 = arith.addi %arg5, %c1_i32 : i32 - %28 = arith.cmpi slt, %27, %c1_i32 : i32 - %29 = arith.select %28, %27, %c0_i32 : i32 - %30 = arith.addi %arg6, %c1_i32 : i32 - %31 = arith.cmpi slt, %30, %c1_i32 : i32 - %32 = arith.select %31, %30, %c0_i32 : i32 - %33 = tt.expand_dims %arg7 {axis = 0 : i32} : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xf32, #blocked> - %34 = tt.broadcast %33 : tensor<1x32xf32, #blocked> -> tensor<32x32xf32, #blocked> - %35 = arith.addf %16, %34 : tensor<32x32xf32, #blocked> - %36 = triton_gpu.convert_layout %16 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %37 = triton_gpu.convert_layout %35 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %38 = tt.dot %36, %37, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> 
tensor<32x32xf32, #mma> - %39 = triton_gpu.convert_layout %38 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> - tt.store %18, %39 : tensor<32x32x!tt.ptr, #blocked> - scf.yield %29, %32, %26, %22 : i32, i32, tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi64, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - } - tt.return - } -} - -// ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = []}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32, triton_gpu.target = "hip:gfx942"} { - -// CHECK-LABEL: tt.func @matmul_nested_ops -// CHECK: %{{.*}}:5 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}) - -// CHECK: %[[SUBI_19:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[ADDI_20:.*]] = arith.addi %[[ARG6]], %{{.*}} -// CHECK: %[[ADDPTR_21:.*]] = tt.addptr %[[ARG10]], %{{.*}} -// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_19]] -// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_20]], %{{.*}} -// CHECK: %[[SPLAT_24:.*]] = tt.splat %[[CMPI_22]] -// CHECK: %[[IF_25:.*]] = scf.if %[[CMPI_23]] -> (tensor<128x32x!tt.ptr, #blocked1>) { - -// CHECK: %[[ADDPTR_37:.*]] = tt.addptr %[[ADDPTR_21]], %{{.*}} -// CHECK: scf.yield %[[ADDPTR_37]] -// CHECK: } else { - -// CHECK: scf.yield %[[ADDPTR_21]] -// CHECK: } - -// CHECK: %[[LOAD_26:.*]] = tt.load %[[IF_25]], %[[SPLAT_24]] -// CHECK: %[[ADDI_27:.*]] = 
arith.addi %[[ARG8]], %{{.*}} -// CHECK: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} -// CHECK: %[[SELECT_29:.*]] = arith.select %[[CMPI_28]], %[[ADDI_27]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_30:.*]] = triton_gpu.local_load %[[ARG11]] -// CHECK: %[[CONVERT_LAYOUT_31:.*]] = triton_gpu.convert_layout %{{.*}} -// CHECK: %[[DOT_32:.*]] = tt.dot %[[LOCAL_LOAD_30]], %[[CONVERT_LAYOUT_31]], %[[ARG7]] -// CHECK: %[[ADDI_33:.*]] = arith.addi %[[ARG9]], %{{.*}} -// CHECK: %[[CMPI_34:.*]] = arith.cmpi slt, %[[ADDI_33]], %{{.*}} -// CHECK: %[[SELECT_35:.*]] = arith.select %[[CMPI_34]], %[[ADDI_33]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_36:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_35]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_26]], %[[MEMDESC_SUBVIEW_36]] -// CHECK: scf.yield %[[DOT_32]], %[[SELECT_29]], %[[SELECT_35]], %[[IF_25]], %[[MEMDESC_SUBVIEW_36]] -// CHECK: } - - tt.func @matmul_nested_ops(%arg0: index, %arg1: index, %arg2: index, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: index) -> tensor<128x128xf32, #mma> { - %c1_i32 = arith.constant 1 : i32 - %0 = arith.cmpi slt, %arg0, %arg1 : index - %1 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> - %3 = tt.broadcast %2 : tensor<1x32xi32, #blocked> -> tensor<128x32xi32, #blocked> - %4 = tt.splat %arg3 : !tt.ptr -> tensor<128x32x!tt.ptr, #blocked> - %5 = tt.addptr %4, %3 : tensor<128x32x!tt.ptr, #blocked>, tensor<128x32xi32, #blocked> - %cst = arith.constant dense<4> : tensor<128x32xi32, #blocked> - %6 = arith.cmpi slt, %arg0, %arg5 : index - %7 = tt.splat %0 : i1 -> tensor<128x32xi1, #blocked> - %8 = scf.if %6 -> (tensor<128x32x!tt.ptr, #blocked>) { - %19 = tt.addptr %5, %cst : tensor<128x32x!tt.ptr, #blocked>, tensor<128x32xi32, 
#blocked> - scf.yield %19 : tensor<128x32x!tt.ptr, #blocked> - } else { - scf.yield %5 : tensor<128x32x!tt.ptr, #blocked> - } - %9 = tt.load %8, %7 : tensor<128x32x!tt.ptr, #blocked> - %10 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %11 = tt.expand_dims %10 {axis = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1> - %12 = tt.broadcast %11 : tensor<1x128xi32, #blocked1> -> tensor<32x128xi32, #blocked1> - %13 = tt.splat %arg4 : !tt.ptr -> tensor<32x128x!tt.ptr, #blocked1> - %14 = tt.addptr %13, %12 : tensor<32x128x!tt.ptr, #blocked1>, tensor<32x128xi32, #blocked1> - %15 = tt.load %14 : tensor<32x128x!tt.ptr, #blocked1> - %c0_i32 = arith.constant 0 : i32 - %c-1_i32 = arith.constant -1 : i32 - %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> - %16 = triton_gpu.local_alloc : () -> !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %17 = triton_gpu.memdesc_subview %16[%c0_i32, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %9, %17 : tensor<128x32xf16, #blocked> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - %18:5 = scf.for %arg6 = %arg0 to %arg1 step %arg2 iter_args(%arg7 = %cst_0, %arg8 = %c-1_i32, %arg9 = %c0_i32, %arg10 = %8, %arg11 = %17) -> (tensor<128x128xf32, #mma>, i32, i32, tensor<128x32x!tt.ptr, #blocked>, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable>) { - %19 = arith.subi %arg1, %arg2 : index - %20 = arith.cmpi slt, %arg6, %19 : index - %21 = arith.addi %arg6, %arg2 : index - %22 = tt.addptr %arg10, %cst : tensor<128x32x!tt.ptr, #blocked>, tensor<128x32xi32, #blocked> - %23 = arith.cmpi slt, %21, %arg5 : index - %24 = tt.splat %20 : i1 -> tensor<128x32xi1, #blocked> - %25 = scf.if %23 -> 
(tensor<128x32x!tt.ptr, #blocked>) { - %37 = tt.addptr %22, %cst : tensor<128x32x!tt.ptr, #blocked>, tensor<128x32xi32, #blocked> - scf.yield %37 : tensor<128x32x!tt.ptr, #blocked> - } else { - scf.yield %22 : tensor<128x32x!tt.ptr, #blocked> - } - %26 = tt.load %25, %24 : tensor<128x32x!tt.ptr, #blocked> - %27 = arith.addi %arg8, %c1_i32 : i32 - %28 = arith.cmpi slt, %27, %c1_i32 : i32 - %29 = arith.select %28, %27, %c0_i32 : i32 - %30 = triton_gpu.local_load %arg11 : !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %31 = triton_gpu.convert_layout %15 : tensor<32x128xf16, #blocked1> -> tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %32 = tt.dot %30, %31, %arg7 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma> - %33 = arith.addi %arg9, %c1_i32 : i32 - %34 = arith.cmpi slt, %33, %c1_i32 : i32 - %35 = arith.select %34, %33, %c0_i32 : i32 - %36 = triton_gpu.memdesc_subview %16[%35, %c0_i32, %c0_i32] : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - triton_gpu.local_store %26, %36 : tensor<128x32xf16, #blocked> -> !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - scf.yield %32, %29, %35, %25, %36 : tensor<128x128xf32, #mma>, i32, i32, tensor<128x32x!tt.ptr, #blocked>, !tt.memdesc<128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - } - triton_gpu.local_dealloc %16 : !tt.memdesc<1x128x32xf16, #shared, #triton_gpu.shared_memory, mutable> - tt.return %18#0 : tensor<128x128xf32, #mma> - } -} - -// ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 
8], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { - -// CHECK-LABEL: tt.func @dot_prologue_epilogue -// CHECK: %{{.*}}:6 = scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG5:.*]] = %{{.*}}, %[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}) - -// CHECK: %[[CMPI_12:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} -// CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} -// CHECK: %[[IF_14:.*]] = scf.if %[[CMPI_13]] -> (tensor<64x16x!tt.ptr, #blocked>) { - -// CHECK: %[[ADDPTR_30:.*]] = tt.addptr %[[ARG6]], %{{.*}} -// CHECK: scf.yield %[[ADDPTR_30]] -// CHECK: } else { - -// CHECK: scf.yield %[[ARG6]] -// CHECK: } - -// CHECK: %[[LOAD_15:.*]] = tt.load %[[IF_14]] -// CHECK: %[[SPLAT_16:.*]] = tt.splat %[[CMPI_12]] -// CHECK: %[[ADDPTR_17:.*]] = tt.addptr %[[ARG7]], %{{.*}} -// CHECK: %[[LOAD_18:.*]] = tt.load %[[ADDPTR_17]], %[[SPLAT_16]] -// CHECK: %[[LOCAL_ALLOC_19:.*]] = triton_gpu.local_alloc %[[LOAD_15]] -// CHECK: %[[ADDI_20:.*]] = arith.addi %[[ARG8]], %{{.*}} -// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ADDI_20]], %{{.*}} -// CHECK: %[[SELECT_22:.*]] = arith.select %[[CMPI_21]], %[[ADDI_20]], %{{.*}} -// CHECK: %[[ADDI_23:.*]] = arith.addi %[[ARG9]], %{{.*}} -// CHECK: %[[CMPI_24:.*]] = arith.cmpi slt, %[[ADDI_23]], %{{.*}} -// CHECK: %[[SELECT_25:.*]] = arith.select %[[CMPI_24]], %[[ADDI_23]], %{{.*}} -// CHECK: %[[LOCAL_ALLOC_26:.*]] = 
triton_gpu.local_alloc %[[ARG10]] -// CHECK: %[[WARP_GROUP_DOT_27:.*]] = triton_nvidia_gpu.warp_group_dot %[[LOCAL_ALLOC_26]], %[[LOCAL_ALLOC_19]], %[[ARG5]] -// CHECK: %[[ADDPTR_28:.*]] = tt.addptr %[[ARG6]], %{{.*}} -// CHECK: %[[IF_29:.*]] = scf.if %[[CMPI_13]] -> (tensor<128x16xf32, #mma>) { - -// CHECK: %[[MULF_30:.*]] = arith.mulf %[[WARP_GROUP_DOT_27]], %{{.*}} -// CHECK: scf.yield %[[MULF_30]] -// CHECK: } else { - -// CHECK: scf.yield %[[WARP_GROUP_DOT_27]] -// CHECK: } - -// CHECK: scf.yield %[[IF_29]], %[[ADDPTR_28]], %[[ADDPTR_17]], %[[SELECT_22]], %[[SELECT_25]], %[[LOAD_18]] -// CHECK: } - - tt.func @dot_prologue_epilogue(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma> { - %c7_i32 = arith.constant 7 : i32 - %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %1 = tt.expand_dims %0 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %2 = tt.broadcast %1 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> - %3 = tt.splat %arg1 : !tt.ptr -> tensor<128x64x!tt.ptr, #blocked1> - %4 = tt.addptr %3, %2 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %5 = tt.load %4 : tensor<128x64x!tt.ptr, #blocked1> - %c-1_i32 = arith.constant -1 : i32 - %cst = arith.constant dense<0> : tensor<64x16xi32, #blocked> - %cst_0 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> - %c0_i32 = arith.constant 0 : i32 - %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %c8_i32 = arith.constant 8 : i32 - %6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = 
#blocked}>> -> tensor<64x1xi32, #blocked> - %8 = tt.splat %arg0 : !tt.ptr -> tensor<64x16x!tt.ptr, #blocked> - %9 = tt.broadcast %7 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> - %10 = tt.addptr %8, %9 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - %11:6 = scf.for %arg4 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg5 = %cst_1, %arg6 = %10, %arg7 = %4, %arg8 = %c-1_i32, %arg9 = %c-1_i32, %arg10 = %5) -> (tensor<128x16xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>, i32, i32, tensor<128x64xf16, #blocked1>) : i32 { - %12 = arith.cmpi slt, %arg4, %c7_i32 : i32 - %13 = tt.splat %12 : i1 -> tensor<128x64xi1, #blocked1> - %14 = tt.addptr %arg7, %cst_0 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %15 = tt.load %14, %13 : tensor<128x64x!tt.ptr, #blocked1> - %16 = arith.cmpi slt, %arg4, %arg2 : i32 - %17 = scf.if %16 -> (tensor<64x16x!tt.ptr, #blocked>) { - %30 = tt.addptr %arg6, %arg3 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - scf.yield %30 : tensor<64x16x!tt.ptr, #blocked> - } else { - scf.yield %arg6 : tensor<64x16x!tt.ptr, #blocked> - } - %18 = tt.load %17 : tensor<64x16x!tt.ptr, #blocked> - %19 = arith.addi %arg8, %c1_i32 : i32 - %20 = arith.cmpi slt, %19, %c1_i32 : i32 - %21 = arith.select %20, %19, %c0_i32 : i32 - %22 = arith.addi %arg9, %c1_i32 : i32 - %23 = arith.cmpi slt, %22, %c1_i32 : i32 - %24 = arith.select %23, %22, %c0_i32 : i32 - %25 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> - %26 = triton_gpu.local_alloc %arg10 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory> - %27 = triton_nvidia_gpu.warp_group_dot %26, %25, %arg5 : !tt.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma> - %28 = tt.addptr %arg6, %cst : tensor<64x16x!tt.ptr, 
#blocked>, tensor<64x16xi32, #blocked> - %29 = scf.if %16 -> (tensor<128x16xf32, #mma>) { - %30 = arith.mulf %27, %cst_1 : tensor<128x16xf32, #mma> - scf.yield %30 : tensor<128x16xf32, #mma> - } else { - scf.yield %27 : tensor<128x16xf32, #mma> - } - scf.yield %29, %28, %14, %21, %24, %15 : tensor<128x16xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>, i32, i32, tensor<128x64xf16, #blocked1> - } - tt.return %11#0 : tensor<128x16xf32, #mma> - } -} - -// ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { - -// CHECK-LABEL: tt.func @pipeline_downstream_dependencies -// CHECK: %{{.*}}:6 = scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG5:.*]] = %{{.*}}, %[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}) - -// CHECK: %[[CMPI_12:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} -// CHECK: %[[LOAD_13:.*]] = tt.load %[[ARG6]] -// CHECK: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_12]] -// CHECK: %[[ADDPTR_15:.*]] = tt.addptr %[[ARG7]], %{{.*}} -// CHECK: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_15]], %[[SPLAT_14]] -// CHECK: %[[LOCAL_ALLOC_17:.*]] = triton_gpu.local_alloc %[[LOAD_13]] -// CHECK: %[[ADDI_18:.*]] = arith.addi %[[ARG8]], %{{.*}} -// CHECK: %[[CMPI_19:.*]] = 
arith.cmpi slt, %[[ADDI_18]], %{{.*}} -// CHECK: %[[SELECT_20:.*]] = arith.select %[[CMPI_19]], %[[ADDI_18]], %{{.*}} -// CHECK: %[[ADDI_21:.*]] = arith.addi %[[ARG9]], %{{.*}} -// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ADDI_21]], %{{.*}} -// CHECK: %[[SELECT_23:.*]] = arith.select %[[CMPI_22]], %[[ADDI_21]], %{{.*}} -// CHECK: %[[LOCAL_ALLOC_24:.*]] = triton_gpu.local_alloc %[[ARG10]] -// CHECK: %[[WARP_GROUP_DOT_25:.*]] = triton_nvidia_gpu.warp_group_dot %[[LOCAL_ALLOC_24]], %[[LOCAL_ALLOC_17]], %[[ARG5]] -// CHECK: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} -// CHECK: %[[SELECT_27:.*]] = arith.select %[[CMPI_26]], %{{.*}}, %{{.*}} -// CHECK: %[[IF_28:.*]] = scf.if %[[CMPI_26]] -> (tensor<128x16xf32, #mma>) { - -// CHECK: %[[MULF_30:.*]] = arith.mulf %[[WARP_GROUP_DOT_25]], %{{.*}} -// CHECK: scf.yield %[[MULF_30]] -// CHECK: } else { - -// CHECK: scf.yield %[[WARP_GROUP_DOT_25]] -// CHECK: } - -// CHECK: %[[ADDPTR_29:.*]] = tt.addptr %[[ARG6]], %[[SELECT_27]] -// CHECK: scf.yield %[[IF_28]], %[[ADDPTR_29]], %[[ADDPTR_15]], %[[SELECT_20]], %[[SELECT_23]], %[[LOAD_16]] -// CHECK: } - - tt.func @pipeline_downstream_dependencies(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma> { - %c7_i32 = arith.constant 7 : i32 - %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %1 = tt.expand_dims %0 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %2 = tt.broadcast %1 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> - %3 = tt.splat %arg1 : !tt.ptr -> tensor<128x64x!tt.ptr, #blocked1> - %4 = tt.addptr %3, %2 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %5 = tt.load %4 : tensor<128x64x!tt.ptr, #blocked1> - %c-1_i32 = arith.constant -1 : i32 - %cst 
= arith.constant dense<0> : tensor<64x16xi32, #blocked> - %cst_0 = arith.constant dense<1> : tensor<64x16xi32, #blocked> - %cst_1 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> - %c0_i32 = arith.constant 0 : i32 - %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %c8_i32 = arith.constant 8 : i32 - %6 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> - %8 = tt.splat %arg0 : !tt.ptr -> tensor<64x16x!tt.ptr, #blocked> - %9 = tt.broadcast %7 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> - %10 = tt.addptr %8, %9 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - %11:6 = scf.for %arg4 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg5 = %cst_2, %arg6 = %10, %arg7 = %4, %arg8 = %c-1_i32, %arg9 = %c-1_i32, %arg10 = %5) -> (tensor<128x16xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>, i32, i32, tensor<128x64xf16, #blocked1>) : i32 { - %12 = arith.cmpi slt, %arg4, %c7_i32 : i32 - %13 = tt.splat %12 : i1 -> tensor<128x64xi1, #blocked1> - %14 = tt.addptr %arg7, %cst_1 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %15 = tt.load %14, %13 : tensor<128x64x!tt.ptr, #blocked1> - %16 = tt.load %arg6 : tensor<64x16x!tt.ptr, #blocked> - %17 = arith.addi %arg8, %c1_i32 : i32 - %18 = arith.cmpi slt, %17, %c1_i32 : i32 - %19 = arith.select %18, %17, %c0_i32 : i32 - %20 = arith.addi %arg9, %c1_i32 : i32 - %21 = arith.cmpi slt, %20, %c1_i32 : i32 - %22 = arith.select %21, %20, %c0_i32 : i32 - %23 = triton_gpu.local_alloc %16 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> - %24 = triton_gpu.local_alloc %arg10 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared1, 
#triton_gpu.shared_memory> - %25 = triton_nvidia_gpu.warp_group_dot %24, %23, %arg5 : !tt.memdesc<128x64xf16, #shared1, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma> - %26 = arith.cmpi slt, %arg4, %arg2 : i32 - %27 = arith.select %26, %cst, %cst_0 : tensor<64x16xi32, #blocked> - %28 = scf.if %26 -> (tensor<128x16xf32, #mma>) { - %30 = arith.mulf %25, %cst_2 : tensor<128x16xf32, #mma> - scf.yield %30 : tensor<128x16xf32, #mma> - } else { - scf.yield %25 : tensor<128x16xf32, #mma> - } - %29 = tt.addptr %arg6, %27 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - scf.yield %28, %29, %14, %19, %22, %15 : tensor<128x16xf32, #mma>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>, i32, i32, tensor<128x64xf16, #blocked1> - } - tt.return %11#0 : tensor<128x16xf32, #mma> - } -} - -// ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "hip:gfx942", "triton_gpu.threads-per-warp" = 64 : i32} { - -// CHECK-LABEL: tt.func public @masked_add_kernel -// CHECK: %{{.*}}:10 = scf.for %[[ARG4:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG5:.*]] = %{{.*}}-1_i32, %[[ARG6:.*]] = %{{.*}}-1_i32, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) - -// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG4]], %{{.*}} -// CHECK: %[[ADDI_24:.*]] = arith.addi %[[ARG4]], %{{.*}} -// CHECK: %[[ADDI_25:.*]] = arith.addi %{{.*}}, %[[ADDI_24]] -// CHECK: %[[SPLAT_26:.*]] = tt.splat %[[ADDI_25]] -// CHECK: %[[ADDI_27:.*]] = arith.addi %[[SPLAT_26]], %{{.*}} -// CHECK: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} -// CHECK: %[[SPLAT_29:.*]] = tt.splat %[[CMPI_23]] -// 
CHECK: %[[ANDI_30:.*]] = arith.andi %[[SPLAT_29]], %[[CMPI_28]] -// CHECK: %[[ADDPTR_31:.*]] = tt.addptr %{{.*}}, %[[ADDI_27]] -// CHECK: %[[LOAD_32:.*]] = tt.load %[[ADDPTR_31]], %[[ANDI_30]], %{{.*}} -// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_23]] -// CHECK: %[[ANDI_34:.*]] = arith.andi %[[SPLAT_33]], %[[CMPI_28]] -// CHECK: %[[ADDPTR_35:.*]] = tt.addptr %{{.*}}, %[[ADDI_27]] -// CHECK: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_35]], %[[ANDI_34]], %{{.*}} -// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG5]], %{{.*}} -// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// CHECK: %[[ADDI_40:.*]] = arith.addi %[[ARG6]], %{{.*}} -// CHECK: %[[CMPI_41:.*]] = arith.cmpi slt, %[[ADDI_40]], %{{.*}} -// CHECK: %[[SELECT_42:.*]] = arith.select %[[CMPI_41]], %[[ADDI_40]], %{{.*}} -// CHECK: %[[ADDF_43:.*]] = arith.addf %[[ARG7]], %[[ARG9]] -// CHECK: %[[ADDPTR_44:.*]] = tt.addptr %{{.*}}, %[[ARG11]] -// CHECK: tt.store %[[ADDPTR_44]], %[[ADDF_43]], %[[ARG13]] -// CHECK: scf.yield %[[SELECT_39]], %[[SELECT_42]], %[[ARG8]], %[[LOAD_32]], %[[ARG10]], %[[LOAD_36]], %[[ARG12]], %[[ADDI_27]], %[[ARG14]], %[[CMPI_28]] -// CHECK: } - - tt.func public @masked_add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { - %c2048_i32 = arith.constant 2048 : i32 - %c1016800_i32 = arith.constant 1016800 : i32 - %0 = tt.get_program_id x : i32 - %c1024_i32 = arith.constant 1024 : i32 - %1 = arith.muli %0, %c1016800_i32 : i32 - %2 = arith.addi %1, %c1024_i32 : i32 - %3 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> - %4 = tt.splat %2 : i32 -> tensor<1024xi32, #blocked> - %5 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> - %6 = arith.addi %4, %3 : tensor<1024xi32, #blocked> - 
%7 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - %cst = arith.constant dense<0xFF800000> : tensor<1024xf32, #blocked> - %8 = arith.cmpi slt, %6, %5 : tensor<1024xi32, #blocked> - %9 = tt.addptr %7, %6 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %10 = tt.load %9, %8, %cst : tensor<1024x!tt.ptr, #blocked> - %11 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - %12 = tt.addptr %11, %6 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %13 = tt.load %12, %8, %cst : tensor<1024x!tt.ptr, #blocked> - %14 = tt.splat %1 : i32 -> tensor<1024xi32, #blocked> - %15 = arith.addi %14, %3 : tensor<1024xi32, #blocked> - %16 = arith.cmpi slt, %15, %5 : tensor<1024xi32, #blocked> - %17 = tt.addptr %7, %15 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %18 = tt.load %17, %16, %cst : tensor<1024x!tt.ptr, #blocked> - %19 = tt.addptr %11, %15 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %20 = tt.load %19, %16, %cst : tensor<1024x!tt.ptr, #blocked> - %c1014752_i32 = arith.constant 1014752 : i32 - %c2_i32 = arith.constant 2 : i32 - %c1_i32 = arith.constant 1 : i32 - %c-1_i32 = arith.constant -1 : i32 - %c0_i32 = arith.constant 0 : i32 - %21 = tt.splat %arg2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - %22:10 = scf.for %arg4 = %c0_i32 to %c1016800_i32 step %c1024_i32 iter_args(%arg5 = %c-1_i32, %arg6 = %c-1_i32, %arg7 = %20, %arg8 = %13, %arg9 = %18, %arg10 = %10, %arg11 = %15, %arg12 = %6, %arg13 = %16, %arg14 = %8) -> (i32, i32, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xi32, #blocked>, tensor<1024xi32, #blocked>, tensor<1024xi1, #blocked>, tensor<1024xi1, #blocked>) : i32 { - %23 = arith.cmpi slt, %arg4, %c1014752_i32 : i32 - %24 = arith.addi %arg4, %c2048_i32 : i32 - %25 = arith.addi %1, %24 : i32 - %26 = tt.splat %25 : i32 -> tensor<1024xi32, #blocked> - %27 = arith.addi %26, %3 : tensor<1024xi32, #blocked> - 
%28 = arith.cmpi slt, %27, %5 : tensor<1024xi32, #blocked> - %29 = tt.splat %23 : i1 -> tensor<1024xi1, #blocked> - %30 = arith.andi %29, %28 : tensor<1024xi1, #blocked> - %31 = tt.addptr %7, %27 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %32 = tt.load %31, %30, %cst : tensor<1024x!tt.ptr, #blocked> - %33 = tt.splat %23 : i1 -> tensor<1024xi1, #blocked> - %34 = arith.andi %33, %28 : tensor<1024xi1, #blocked> - %35 = tt.addptr %11, %27 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %36 = tt.load %35, %34, %cst : tensor<1024x!tt.ptr, #blocked> - %37 = arith.addi %arg5, %c1_i32 : i32 - %38 = arith.cmpi slt, %37, %c2_i32 : i32 - %39 = arith.select %38, %37, %c0_i32 : i32 - %40 = arith.addi %arg6, %c1_i32 : i32 - %41 = arith.cmpi slt, %40, %c2_i32 : i32 - %42 = arith.select %41, %40, %c0_i32 : i32 - %43 = arith.addf %arg7, %arg9 : tensor<1024xf32, #blocked> - %44 = tt.addptr %21, %arg11 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - tt.store %44, %43, %arg13 : tensor<1024x!tt.ptr, #blocked> - scf.yield %39, %42, %arg8, %36, %arg10, %32, %arg12, %27, %arg14, %28 : i32, i32, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xf32, #blocked>, tensor<1024xi32, #blocked>, tensor<1024xi32, #blocked>, tensor<1024xi1, #blocked>, tensor<1024xi1, #blocked> - } - tt.return - } -} diff --git a/third_party/amd/backend/compiler.py b/third_party/amd/backend/compiler.py index 9f2d0fd91fe4..3227a5535842 100644 --- a/third_party/amd/backend/compiler.py +++ b/third_party/amd/backend/compiler.py @@ -149,19 +149,13 @@ def make_ttgir(mod, metadata, options): passes.ttgpuir.add_remove_layout_conversions(pm) amd.passes.ttgpuir.add_optimize_epilogue(pm) passes.ttgpuir.add_optimize_dot_operands(pm, True) - use_new_pipeliner = os.getenv("TRITON_HIP_USE_NEW_STREAM_PIPELINE", "0") == "1" - if amd.has_matrix_core_feature(options.arch): - if use_new_pipeliner: - num_stages = options.num_stages if 
options.num_stages != 0 else 2 - amd.passes.ttgpuir.add_stream_pipelinev2(pm, num_stages) - else: - if options.num_stages == 0: - amd.passes.ttgpuir.add_stream_pipeline(pm) + if options.num_stages == 0 and amd.has_matrix_core_feature(options.arch): + amd.passes.ttgpuir.add_stream_pipeline(pm) passes.common.add_canonicalizer(pm) passes.ttgpuir.add_optimize_dot_operands(pm, True) passes.ttgpuir.add_remove_layout_conversions(pm) passes.ttgpuir.add_reduce_data_duplication(pm) - if use_new_pipeliner or options.num_stages != 0: + if options.num_stages != 0: amd.passes.ttgpuir.add_reorder_instructions(pm) passes.common.add_cse(pm) passes.common.add_symbol_dce(pm) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp index 6de4d455a60b..f9fac1bf5b0d 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/ReorderInstructions.cpp @@ -21,88 +21,19 @@ #define GEN_PASS_CLASSES #include "TritonAMDGPUTransforms/Passes.h" -#include - using namespace mlir; static bool willIncreaseRegisterPressure(Operation *op) { if (isa(op)) return true; - if (auto cvt = dyn_cast(op)) - return isa( - cvt.getType().getEncoding()); + auto cvt = dyn_cast(op); + if (!cvt) + return false; + if (isa(cvt.getType().getEncoding())) + return true; return false; } -// Gather cone of DFG from the op's basic block. -// - Collect dfg breadth first to keep relative order and -// reverse order for insertion after. An op may be captured -// multiple times if DFG reconverges and it will be moved multiple -// times to keep dominance correctness. -// - Returns bool if this DFG leads to a load op. This -// condition is not desirable for moving ttg.local_stores -// early. 
-static bool gatherDFG(Operation *op, Block *block, - SmallVector &dfg) { - bool leadsToLoad = false; - - std::list oprs{op}; - auto checkOperands = [&](Operation *cop) { - for (auto operand : cop->getOperands()) { - if (Operation *oprOp = operand.getDefiningOp()) { - Block *oprBlk = oprOp->getBlock(); - if (block->findAncestorOpInBlock(*oprOp)) { - // only move ops that reside in same block - if (oprBlk == block) - dfg.push_back(oprOp); - oprs.push_back(oprOp); - leadsToLoad |= isa(oprOp); - } else { - // should always be in parent block - assert(oprBlk->findAncestorOpInBlock(*block->getParentOp())); - } - } - } - }; - - // BFS (filo) - while (oprs.size()) { - Operation *nop = oprs.front(); - oprs.pop_front(); - // check next op and sub-regions - nop->walk(checkOperands); - } - return leadsToLoad; -} - -// Search thru block to find earliest insertion point for move -// op. This can be either an atomic op or last usage of source pointer. -// Search ends when move op encountered. -static llvm::ilist::iterator -findEarlyInsertionPoint(Block *block, Operation *move, Value src) { - auto loc = block->begin(); - for (auto bi = block->begin(); bi != block->end(); ++bi) { - auto *op = &*bi; - if (op == move) // don't move later than current location - break; - if (src) { - // check for ops accessing src - for (auto opr : op->getOperands()) { - if (opr == src) - loc = bi; - } - } - // atomics used for syncronization? 
- op->walk([&](Operation *wop) { - if (isa(wop)) - loc = bi; - if (isa(wop)) - loc = bi; - }); - } - return loc; -} - class TritonAMDGPUReorderInstructionsPass : public TritonAMDGPUReorderInstructionsBase< TritonAMDGPUReorderInstructionsPass> { @@ -121,60 +52,36 @@ class TritonAMDGPUReorderInstructionsPass m.walk([&](Operation *op) { if (!willIncreaseRegisterPressure(op)) return; - if (!op->hasOneUse()) + auto user_begin = op->user_begin(); + auto user_end = op->user_end(); + if (std::distance(user_begin, user_end) != 1) return; - Operation *user = op->getUses().begin()->getOwner(); - if (user->getParentOfType() == + if (user_begin->getParentOfType() == op->getParentOfType()) return; - opToMove.insert({op, user}); + opToMove.insert({op, *user_begin}); }); for (auto &kv : opToMove) kv.first->moveBefore(kv.second); - opToMove.clear(); // Move LocalLoadOp and LocalAllocOp immediately after their operands. m.walk([&](Operation *op) { - if (!isa(op) || - op->getNumOperands() < 1) { + if (!isa(op)) { return; } - if (Operation *argOp = op->getOperand(0).getDefiningOp()) - moveAfter(op, argOp); + Operation *argOp = op->getOperand(0).getDefiningOp(); + if (!argOp) + return; + moveAfter(op, argOp); }); // Move transpositions just after their definition + opToMove.clear(); m.walk([&](triton::TransOp op) { Operation *argOp = op.getSrc().getDefiningOp(); if (!argOp) return; moveAfter(op, argOp); }); - SmallVector moveOps; - // Move global loads early to prefetch. - m.walk([&](triton::LoadOp op) { moveOps.push_back(op); }); - // Move local_stores early if dependence distance greater than - // one iteration. Best perf on GEMM when these precede global loads. - m.walk([&](triton::gpu::LocalStoreOp op) { moveOps.push_back(op); }); - - for (auto op : moveOps) { - // Gather use-def chain in block. 
- Block *block = op->getBlock(); - SmallVector dfg{op}; - bool leadsToLoad = gatherDFG(op, block, dfg); - if (!isa(op) || !leadsToLoad) { - Value src; - if (auto ld = dyn_cast(op)) - src = ld.getPtr(); - auto ip = findEarlyInsertionPoint(block, op, src); - // Remove ops that already precede the insertion point. This - // is done before moves happen to avoid N^2 complexity in - // `Operation::isBeforeInBlock`. - llvm::erase_if(dfg, - [&](Operation *op) { return !ip->isBeforeInBlock(op); }); - // Move ops to insertion point. - for (auto *op : dfg) - op->moveAfter(block, ip); - } - } + return; } }; From 3353b7db082364d4bda15e8fdf8ec95c67c493a8 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Wed, 24 Jul 2024 15:38:05 +0000 Subject: [PATCH 18/36] [TEST] Drop irrelevant NVIDIA specific attributes Software pipeling should be not using them. This makes it cleaner and prepares reusing the same test inputs for AMD side. --- test/TritonGPU/loop-pipeline.mlir | 36 +++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/test/TritonGPU/loop-pipeline.mlir b/test/TritonGPU/loop-pipeline.mlir index a5594b304309..80444b152616 100644 --- a/test/TritonGPU/loop-pipeline.mlir +++ b/test/TritonGPU/loop-pipeline.mlir @@ -55,7 +55,7 @@ // CHECK-DAG: %[[NEXT_B:.*]] = triton_gpu.memdesc_subview %{{.+}}[%[[EXT_IDX_3]], // CHECK-DAG: triton_gpu.async_wait {{.*}} {num = 2 : i32} // CHECK: scf.yield {{.*}}, %[[INS_IDX_3]], %[[EXT_IDX_3]], %[[NEXT_A]], %[[NEXT_B]] -module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.target" = "cuda:80"} { +module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32} { tt.func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr {tt.divisibility = 16 : i32}, %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { @@ -582,7 +582,7 @@ tt.func @dep_arg_two_uses(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, #mma = 
#triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: tt.func @load_two_users tt.func @load_two_users(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> @@ -643,7 +643,7 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [0, 1], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: tt.func @load_two_users_incompatible_layouts tt.func @load_two_users_incompatible_layouts(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> @@ -728,7 +728,7 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : // would be pipelined. 
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func public @nested_loops(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> %cst_0 = arith.constant dense<320> : tensor<32x1xi32, #blocked> @@ -790,7 +790,7 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma> 
%c64_i32 = arith.constant 64 : i32 @@ -903,7 +903,7 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : #C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> #A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> #B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> -module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func @indirect_load_shared_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, %76: index, %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, @@ -948,7 +948,7 @@ tt.func @indirect_load_shared_layout(%77: tensor<16x16xi64, #BL> {tt.divisibilit // CHECK: tt.return #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func public @kernel_yield_constant(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> %cst1 = 
arith.constant dense<1.000000e+00> : tensor<32x32xf32, #mma> @@ -1003,7 +1003,7 @@ module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : // CHECK: triton_gpu.async_copy_global_to_local {{.*}}, %[[B1BUFFER]] // CHECK: scf.for #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> -module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func public @add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { %c1024_i32 = arith.constant 1024 : i32 %c0_i32 = arith.constant 0 : i32 @@ -1072,7 +1072,7 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 2], instrShape = [16, 8]}> #shared = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32} { tt.func public @nested_loops(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> %c1_i32 = arith.constant 1 : i32 @@ -1116,7 +1116,7 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : #blocked4 = #triton_gpu.blocked<{sizePerThread = 
[16, 2, 1], threadsPerWarp = [4, 1, 8], warpsPerCTA = [1, 1, 8], order = [1, 0, 2]}> #blocked5 = #triton_gpu.blocked<{sizePerThread = [32, 1], threadsPerWarp = [4, 8], warpsPerCTA = [1, 8], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 8], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32} { tt.func public @int4_matmul_ampere( %arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32} @@ -1191,7 +1191,7 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : #A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> #B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func @load_convert_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, %76: index, %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, @@ -1235,7 +1235,7 @@ tt.func @load_convert_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i3 // CHECK-LABEL: @matmul_indirect_pipeline #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 2], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 1], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : 
i32} { tt.func public @matmul_indirect_pipeline(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> %c1_i32 = arith.constant 1 : i32 @@ -1279,7 +1279,7 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : // CHECK-NOT: local_load{{.*}}128x1 #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func public @dont_pipeline_128x1(%arg6: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> %c128_i32 = arith.constant 128 : i32 @@ -1330,7 +1330,7 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : #A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> #B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> -module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.target" = "cuda:80"} { +module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32} { tt.func @matmul_nested_ops(%lb : index, %ub : index, %step : index, %A : !tt.ptr {tt.divisibility = 16 : i32}, %B : !tt.ptr {tt.divisibility = 16 : i32}, @@ -1388,7 +1388,7 @@ tt.func @matmul_nested_ops(%lb : index, %ub : index, %step : index, #mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> 
#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: dot_prologue_epilogue // CHECK: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} tt.func @dot_prologue_epilogue(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { @@ -1460,7 +1460,7 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : #mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> -module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-NOCANON-LABEL: pipeline_downstream_dependencies // CHECK-NOCANON: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} tt.func @pipeline_downstream_dependencies(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { @@ -1528,7 +1528,7 @@ module attributes {"triton_gpu.target" = "cuda:80", "triton_gpu.num-ctas" = 1 : // CHECK: arith.select {{.*}}, %[[B]], %[[CONSTANT]] #blocked = 
#triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> -module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func public @masked_add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { %c1024_i32 = arith.constant 1024 : i32 %c0_i32 = arith.constant 0 : i32 @@ -1565,7 +1565,7 @@ module attributes {"triton_gpu.target" = "cuda:90", "triton_gpu.num-ctas" = 1 : #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 256, 16]}> #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:90", "triton_gpu.threads-per-warp" = 32 : i32} { +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: @matmul_tma // CHECK-DAG: triton_gpu.local_alloc : () -> !tt.memdesc<3x128x64xf16, #{{.+}}, #triton_gpu.shared_memory, mutable> // CHECK-DAG: triton_gpu.local_alloc : () -> !tt.memdesc<3x64x256xf16, #{{.+}}, #triton_gpu.shared_memory, mutable> From c82defc6ddc77ab83a7e0a985641eca6ccbaa807 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Wed, 24 Jul 2024 19:09:30 +0000 Subject: [PATCH 19/36] Drop unused chained load logic --- .../StreamPipelineV2.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp 
b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index bbeeaf837712..543895ec3cec 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -73,24 +73,6 @@ static void createStreamCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc, Value src = loadOp.getPtr(); Value mask = loadOp.getMask(); Value other = loadOp.getOther(); - if (!isExpensiveLoadOrStore(loadOp) && loadToInfo[loadOp].blockedEncoding) { - // For inexpensive loads that do not directly feed into dot ops - // we want to use optimal layout for the data. - ttg::BlockedEncodingAttr encoding = loadToInfo[loadOp].blockedEncoding; - auto convertBlockLayout = [&](Value src) { - auto ty = cast(src.getType()); - auto newTy = - RankedTensorType::get(ty.getShape(), ty.getElementType(), encoding); - auto cvt = - builder.create(loadOp->getLoc(), newTy, src); - return cvt.getResult(); - }; - src = convertBlockLayout(src); - if (mask) - mask = convertBlockLayout(mask); - if (other) - other = convertBlockLayout(other); - } tt::MemDescType allocTy = cast(alloc.getType()); SmallVector copyOffsets(allocTy.getRank(), zero); From 9c91b316506aaedf7559c780bd18a313ecc3dc9b Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Wed, 24 Jul 2024 19:10:12 +0000 Subject: [PATCH 20/36] Add debug print --- third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index 543895ec3cec..ef3fadfc7f80 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -710,6 +710,7 @@ static bool pipelineLoop(scf::ForOp forOp, int numStages) { if (failed(newForOp)) return false; + LDBG("Loop before expander\n" << *newForOp); return true; } From 
181e37d52b2cbdf10068119b4ab24a9ea0ff79d3 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Wed, 24 Jul 2024 22:57:23 +0000 Subject: [PATCH 21/36] Drop uncessary canonicalization and cleanup some tests --- test/TritonGPU/amd/amd-stream-pipeline.mlir | 139 +++--------------- .../StreamPipelineV2.cpp | 13 -- 2 files changed, 20 insertions(+), 132 deletions(-) diff --git a/test/TritonGPU/amd/amd-stream-pipeline.mlir b/test/TritonGPU/amd/amd-stream-pipeline.mlir index 5a08e92168d0..3f09d2fa8123 100644 --- a/test/TritonGPU/amd/amd-stream-pipeline.mlir +++ b/test/TritonGPU/amd/amd-stream-pipeline.mlir @@ -1,4 +1,4 @@ -// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline-v2=num_stages=2 | FileCheck %s +// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline-v2=num_stages=2 -canonicalize | FileCheck %s // 4 warps // matmul: 128x32 @ 32x128 -> 128x128 @@ -31,11 +31,9 @@ // CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_22]], %{{.*}} // CHECK: %[[SELECT_24:.*]] = arith.select %[[CMPI_23]], %[[ADDI_22]], %{{.*}} // CHECK: %[[LOCAL_LOAD_25:.*]] = triton_gpu.local_load %[[ARG11]] -// CHECK: %[[CONVERT_LAYOUT_26:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_25]] // CHECK: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] -// CHECK: %[[CONVERT_LAYOUT_28:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_27]] -// CHECK: %[[MULF_29:.*]] = arith.mulf %[[CONVERT_LAYOUT_28]], %{{.*}} -// CHECK: %[[DOT_30:.*]] = tt.dot %[[CONVERT_LAYOUT_26]], %[[MULF_29]], %[[ARG8]] +// CHECK: %[[MULF_29:.*]] = arith.mulf %[[LOCAL_LOAD_27]], %{{.*}} +// CHECK: %[[DOT_30:.*]] = tt.dot %[[LOCAL_LOAD_25]], %[[MULF_29]], %[[ARG8]] // CHECK: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG6]], %{{.*}} // CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG7]], %{{.*}} // CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_21]] @@ -101,6 +99,7 @@ tt.func @matmul_loop(%lb : index, %ub : index, %step : index, } // CHECK-LABEL: tt.func @matmul_loop_nested +// CHECK: scf.for // CHECK: %[[LOCAL_ALLOC_11:.*]] = 
triton_gpu.local_alloc // CHECK: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc // CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} @@ -120,10 +119,8 @@ tt.func @matmul_loop(%lb : index, %ub : index, %step : index, // CHECK: %[[CMPI_24:.*]] = arith.cmpi slt, %[[ADDI_23]], %{{.*}} // CHECK: %[[SELECT_25:.*]] = arith.select %[[CMPI_24]], %[[ADDI_23]], %{{.*}} // CHECK: %[[LOCAL_LOAD_26:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[CONVERT_LAYOUT_27:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_26]] // CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG14]] -// CHECK: %[[CONVERT_LAYOUT_29:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_28]] -// CHECK: %[[DOT_30:.*]] = tt.dot %[[CONVERT_LAYOUT_27]], %[[CONVERT_LAYOUT_29]], %[[ARG10]] +// CHECK: %[[DOT_30:.*]] = tt.dot %[[LOCAL_LOAD_26]], %[[LOCAL_LOAD_28]], %[[ARG10]] // CHECK: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG8]], %{{.*}} // CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG9]], %{{.*}} // CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_22]] @@ -189,7 +186,7 @@ tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index, } // CHECK-LABEL: tt.func @matmul_loop_single_pipeline -// CHECK: %[[LOAD_10:.*]] = tt.load %{{.*}}, %{{.*}}, %{{.*}} +// CHECK: %[[LOAD_10:.*]] = tt.load // CHECK: %[[CONVERT_LAYOUT_11:.*]] = triton_gpu.convert_layout %[[LOAD_10]] // CHECK: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc // CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} @@ -204,8 +201,7 @@ tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index, // CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ADDI_20]], %{{.*}} // CHECK: %[[SELECT_22:.*]] = arith.select %[[CMPI_21]], %[[ADDI_20]], %{{.*}} // CHECK: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG10]] -// CHECK: %[[CONVERT_LAYOUT_24:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_23]] -// CHECK: %[[DOT_25:.*]] = tt.dot %[[CONVERT_LAYOUT_11]], %[[CONVERT_LAYOUT_24]], %[[ARG7]] +// CHECK: %[[DOT_25:.*]] = tt.dot %[[CONVERT_LAYOUT_11]], 
%[[LOCAL_LOAD_23]], %[[ARG7]] // CHECK: %[[ADDPTR_26:.*]] = tt.addptr %[[ARG6]], %{{.*}} // CHECK: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_19]] // CHECK: %[[LOAD_28:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_27]], %{{.*}} @@ -291,9 +287,7 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, // CHECK: %[[SELECT_29:.*]] = arith.select %[[CMPI_28]], %[[ADDI_27]], %{{.*}} // CHECK: %[[LOCAL_LOAD_30:.*]] = triton_gpu.local_load %[[ARG12]] // CHECK: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[CONVERT_LAYOUT_32:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_30]] -// CHECK: %[[CONVERT_LAYOUT_33:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_31]] -// CHECK: %[[DOT_34:.*]] = tt.dot %[[CONVERT_LAYOUT_32]], %[[CONVERT_LAYOUT_33]], %[[ARG7]] +// CHECK: %[[DOT_34:.*]] = tt.dot %[[LOCAL_LOAD_30]], %[[LOCAL_LOAD_31]], %[[ARG7]] // CHECK: %[[ADDPTR_35:.*]] = tt.addptr %[[ARG8]], %{{.*}} // CHECK: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} // CHECK: %[[SPLAT_37:.*]] = tt.splat %[[CMPI_26]] @@ -374,9 +368,7 @@ tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32}, // CHECK: %[[SELECT_21:.*]] = arith.select %[[CMPI_20]], %[[ADDI_19]], %{{.*}} // CHECK: %[[LOCAL_LOAD_22:.*]] = triton_gpu.local_load %[[ARG13]] // CHECK: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG14]] -// CHECK: %[[CONVERT_LAYOUT_24:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_22]] -// CHECK: %[[CONVERT_LAYOUT_25:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_23]] -// CHECK: %[[DOT_26:.*]] = tt.dot %[[CONVERT_LAYOUT_24]], %[[CONVERT_LAYOUT_25]], %[[ARG7]] +// CHECK: %[[DOT_26:.*]] = tt.dot %[[LOCAL_LOAD_22]], %[[LOCAL_LOAD_23]], %[[ARG7]] // CHECK: %[[ADDPTR_27:.*]] = tt.addptr %[[ARG8]], %{{.*}} // CHECK: %[[SPLAT_28:.*]] = tt.splat %[[CMPI_18]] // CHECK: %[[LOAD_29:.*]] = tt.load %[[ADDPTR_27]], %[[SPLAT_28]] @@ -463,9 +455,7 @@ tt.func @indirect_bmm_scalar_dist_one(%77: i64 {tt.divisibility=16: i32}, // CHECK: %[[SELECT_26:.*]] = arith.select 
%[[CMPI_25]], %[[ADDI_24]], %{{.*}} // CHECK: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] // CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[CONVERT_LAYOUT_29:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_27]] -// CHECK: %[[CONVERT_LAYOUT_30:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_28]] -// CHECK: %[[DOT_31:.*]] = tt.dot %[[CONVERT_LAYOUT_29]], %[[CONVERT_LAYOUT_30]], %[[ARG7]] +// CHECK: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] // CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} // CHECK: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} // CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] @@ -1016,9 +1006,7 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 // CHECK: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} // CHECK: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] // CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[CONVERT_LAYOUT_29:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_27]] -// CHECK: %[[CONVERT_LAYOUT_30:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_28]] -// CHECK: %[[DOT_31:.*]] = tt.dot %[[CONVERT_LAYOUT_29]], %[[CONVERT_LAYOUT_30]], %[[ARG7]] +// CHECK: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] // CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} // CHECK: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} // CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] @@ -1181,35 +1169,15 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 // ----- // CHECK-LABEL: tt.func public @nested_loops -// CHECK: %[[LOAD_10:.*]] = tt.load %{{.*}} -// CHECK: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc %[[LOAD_10]] -// CHECK: %[[TRANS_12:.*]] = tt.trans %[[LOCAL_ALLOC_11]] {order = array} -// CHECK: %[[LOCAL_LOAD_13:.*]] = triton_gpu.local_load %[[TRANS_12]] -// CHECK: %[[LOCAL_ALLOC_14:.*]] = triton_gpu.local_alloc 
-// CHECK: %[[LOAD_15:.*]] = tt.load %{{.*}}, %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_14]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_16]] -// CHECK: %{{.*}}:3 = scf.for %[[ARG2:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG3:.*]] = %{{.*}}-1_i32, %[[ARG4:.*]] = %{{.*}}, %[[ARG5:.*]] = %[[MEMDESC_SUBVIEW_16]]) - -// CHECK: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG2]], %{{.*}} -// CHECK: %[[ADDI_19:.*]] = arith.addi %[[ARG3]], %{{.*}} -// CHECK: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ADDI_19]], %{{.*}} -// CHECK: %[[SELECT_21:.*]] = arith.select %[[CMPI_20]], %[[ADDI_19]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_22:.*]] = triton_gpu.local_load %[[ARG5]] -// CHECK: %[[CONVERT_LAYOUT_23:.*]] = triton_gpu.convert_layout %[[LOCAL_LOAD_22]] -// CHECK: %[[DOT_24:.*]] = tt.dot %[[CONVERT_LAYOUT_23]], %[[LOCAL_LOAD_13]], %{{.*}} -// CHECK: %[[CONVERT_LAYOUT_25:.*]] = triton_gpu.convert_layout %[[DOT_24]] -// CHECK: tt.store %{{.*}}, %[[CONVERT_LAYOUT_25]] -// CHECK: %[[SPLAT_26:.*]] = tt.splat %[[CMPI_18]] -// CHECK: %[[LOAD_27:.*]] = tt.load %{{.*}}, %[[SPLAT_26]] -// CHECK: %[[ADDI_28:.*]] = arith.addi %[[ARG4]], %{{.*}} -// CHECK: %[[CMPI_29:.*]] = arith.cmpi slt, %[[ADDI_28]], %{{.*}} -// CHECK: %[[SELECT_30:.*]] = arith.select %[[CMPI_29]], %[[ADDI_28]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_31:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_14]][%[[SELECT_30]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_27]], %[[MEMDESC_SUBVIEW_31]] -// CHECK: scf.yield %[[SELECT_21]], %[[SELECT_30]], %[[MEMDESC_SUBVIEW_31]] -// CHECK: } -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_14]] +// CHECK-NOT: triton_gpu.local_alloc +// CHECK: scf.for +// CHECK: triton_gpu.local_alloc +// CHECK: scf.for +// CHECK: triton_gpu.local_load +// CHECK: tt.dot +// CHECK: triton_gpu.local_store +// CHECK: scf.yield +// CHECK: triton_gpu.local_dealloc #blocked = 
#triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [2, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 2], instrShape = [16, 8]}> @@ -1525,73 +1493,6 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 // ----- -// Verify that uses of the ops scheduled in partucular place of the loop (like epilogue if) are correctly scheduled too. -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> -#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> -module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - // CHECK-LABEL: pipeline_downstream_dependencies - // CHECK: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} - tt.func @pipeline_downstream_dependencies(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { - %cst = arith.constant dense<0> : tensor<64x16xi32, #blocked> - %cst1 = arith.constant dense<1> : tensor<64x16xi32, #blocked> - %cst2 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> - %c0_i32 = arith.constant 0 : i32 - %cst_0 = arith.constant dense<0> : tensor<1x16xi32, #blocked> - %cst_1 = arith.constant dense<0> : 
tensor<128x1xi32, #blocked1> - %c0_i64 = arith.constant 0 : i64 - %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> - %c1_i32 = arith.constant 1 : i32 - %c8_i32 = arith.constant 8 : i32 - %2 = tt.splat %arg1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> - %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %6 = tt.broadcast %2 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> - %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> - %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %10 = tt.splat %arg0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> - %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> - %14 = tt.broadcast %10 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> - %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> - %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - // CHECK: %[[C0:.*]] = arith.constant 0 : i32 - // CHECK: scf.for %[[IND_VAR:.*]] = %[[C0]] to - // CHECK: load - // CHECK-NOT: load - // CHECK: dot - // CHECK: %[[CND:.*]] = arith.cmpi slt, %[[IND_VAR]], %[[EXT]] - // CHECK: %[[IFRET:.*]]:2 = scf.if %[[CND]] - // CHECK: arith.mulf - // CHECK: scf.yield - // CHECK: tt.addptr {{.*}}, %[[IFRET]]#1 - // CHECK: scf.yield - %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %16, %arg6 = %8) -> (tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>) : i32 { - %9 
= tt.load %arg6 : tensor<128x64x!tt.ptr, #blocked1> - %18 = tt.load %arg5 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> - %cnd = arith.cmpi slt, %arg3, %ext : i32 - %if_ret:2 = scf.if %cnd -> (tensor<128x16xf32, #mma1>, tensor<64x16xi32, #blocked>) { - %acc_zero = arith.mulf %acc, %cst_2 : tensor<128x16xf32, #mma1> - scf.yield %acc_zero, %cst : tensor<128x16xf32, #mma1>, tensor<64x16xi32, #blocked> - } else { - scf.yield %acc, %cst1 : tensor<128x16xf32, #mma1>, tensor<64x16xi32, #blocked> - } - %22 = tt.addptr %arg5, %if_ret#1 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - %23 = tt.addptr %arg6, %cst2 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - scf.yield %if_ret#0, %22, %23 : tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1> - } - tt.return %17#0 : tensor<128x16xf32, #mma1> - } -} - -// ----- - // CHECK-LABEL: @masked_add_kernel // CHECK: %[[CONSTANT:.*]] = arith.constant dense<0xFF800000> // CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index ef3fadfc7f80..4c51ff989056 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -1,7 +1,6 @@ #include "TritonAMDGPUTransforms/Passes.h" #include "mlir/IR/TypeUtilities.h" #include "mlir/Support/LLVM.h" -#include 
"mlir/Transforms/GreedyPatternRewriteDriver.h" #include "triton/Analysis/AxisInfo.h" #include "triton/Analysis/Utility.h" #include "triton/Dialect/TritonGPU/IR/Dialect.h" @@ -745,18 +744,6 @@ struct PipelinePass : public TritonAMDGPUStreamPipelineV2Base { int loopNumStages = getNumStagesOrDefault(forOp); pipelined |= pipelineLoop(forOp, loopNumStages); } - - if (pipelined) { - // Clean up arithmetic before applying the next level of pipelining to - // simplify the IR. - auto arithDialect = - getOperation().getContext()->getLoadedDialect(); - RewritePatternSet patterns(getOperation().getContext()); - arithDialect->getCanonicalizationPatterns(patterns); - if (applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)) - .failed()) - signalPassFailure(); - } } }; } // anonymous namespace From fb694d1b443224dc992da8a035f70105ae7fb5e9 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 25 Jul 2024 04:51:35 +0000 Subject: [PATCH 22/36] Various improvements --- .../StreamPipelineV2.cpp | 326 ++++++++---------- 1 file changed, 151 insertions(+), 175 deletions(-) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index 4c51ff989056..8f167f133def 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -27,8 +27,6 @@ #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") -#define int_attr(num) builder.getI64IntegerAttr(num) - using namespace mlir; namespace tt = mlir::triton; namespace ttg = mlir::triton::gpu; @@ -36,17 +34,18 @@ namespace ttg = mlir::triton::gpu; namespace { struct LoadInfo { - // Layout of the data in the shared memory. + // Shared layout is used for loads feeding into dot ops. ttg::SharedEncodingAttr sharedEncoding = nullptr; - // Blocked encoding is used for loads not used by the dot. 
+ // Blocked layout is used for loads not feeding into dot ops. ttg::BlockedEncodingAttr blockedEncoding = nullptr; + // The distance of this load's stage to its use' stage. int distToUse = 0; bool usedByDot = false; }; } // namespace -// Replace the ForOp's yield with a new one with the given operands appended. +// Replace the forOp's yield with a new one with the given operands appended. static void appendToYield(scf::ForOp forOp, ArrayRef newOperands) { // Fix up the yield op. Operation *yieldOp = forOp.getBody()->getTerminator(); @@ -71,7 +70,6 @@ static void createStreamCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc, Location loc = loadOp.getLoc(); Value src = loadOp.getPtr(); Value mask = loadOp.getMask(); - Value other = loadOp.getOther(); tt::MemDescType allocTy = cast(alloc.getType()); SmallVector copyOffsets(allocTy.getRank(), zero); @@ -85,47 +83,44 @@ static void createStreamCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc, // Extract part. SmallVector loadOffsets(allocTy.getRank(), zero); loadOffsets[0] = extractIdx; - Attribute sharedMemorySpace = + auto sharedMemorySpace = triton::gpu::SharedMemorySpaceAttr::get(forOp.getContext()); - tt::MemDescType subviewTy = tt::MemDescType::get( + auto subviewTy = tt::MemDescType::get( allocTy.getShape().drop_front(), allocTy.getElementType(), allocTy.getEncoding(), sharedMemorySpace, /*mutableMemory=*/true); auto viewLoad = builder.create(loc, subviewTy, alloc, loadOffsets); - Operation *lds_store = + auto storeOp = builder.create(loc, copy->getResult(0), viewLoad); - { - // Clean up old local caches. - SmallVector allocsToErase; - for (Operation *user : loadOp->getUsers()) { - if (auto alloc = dyn_cast(user)) { - alloc.replaceAllUsesWith(viewLoad.getResult()); - allocsToErase.push_back(alloc); - } - } - for (auto alloc : allocsToErase) { - alloc.erase(); + // Clean up old local caches. 
+ SmallVector allocsToErase; + for (Operation *user : loadOp->getUsers()) { + if (auto alloc = dyn_cast(user)) { + alloc.replaceAllUsesWith(viewLoad.getResult()); + allocsToErase.push_back(alloc); } + } + for (auto alloc : allocsToErase) + alloc.erase(); - auto sharedLoad = - builder.create(loc, loadOp.getType(), viewLoad); - auto result = sharedLoad->getResults(); + auto sharedLoad = + builder.create(loc, loadOp.getType(), viewLoad); + auto result = sharedLoad->getResults(); - // Create a select for non-zero other values. - Value other = loadOp.getOther(); - if (other && !isZeroConst(other)) { - auto select = builder.create( - loc, loadOp.getType(), mask, sharedLoad.getResult(), other); - result = select->getResults(); - } + // Create a select for non-zero other values. + Value other = loadOp.getOther(); + if (other && !isZeroConst(other)) { + auto select = builder.create( + loc, loadOp.getType(), mask, sharedLoad.getResult(), other); + result = select->getResults(); + } - loadOp->replaceAllUsesWith(result); + loadOp->replaceAllUsesWith(result); - // Prefetch load if is used by the dot. - if (loadToInfo[loadOp].usedByDot) { - schedule.insert(lds_store, numStages - 2, prefetchCluster); - schedule.insert(viewLoad, numStages - 2, prefetchCluster); - } + // Prefetch load if is used by the dot. + if (loadToInfo[loadOp].usedByDot) { + schedule.insert(storeOp, numStages - 2, prefetchCluster); + schedule.insert(viewLoad, numStages - 2, prefetchCluster); } loadOp.erase(); } @@ -191,8 +186,9 @@ getBlockedEncoding(tt::LoadOp loadOp, tt::ModuleAxisInfoAnalysis &axisInfo) { threadsPerWarp, ctaLayout); } -// Create a map from load ops to their indirection level and the -// final use of the load op (another load op, or a dot op). +// Create a map from load ops to their indirection levels and the final uses +// of the load op (another load op, or a dot op). 
+// // Indirection level is "0" for the load op directly used by the dot op, // "1" for the load op used by the load op used by the dot op, and so on. static llvm::SmallVector> @@ -201,19 +197,22 @@ loadOpsToIndirectionLevelAndUse(scf::ForOp forOp) { loadOpToIndLevelAndUse; DenseSet seen; - std::function dfs = + // Recursively visit the given op and its operands to discover all load ops + // and collect their indirection levels and uses. + std::function dfs = [&](Operation *op, int distance, Operation *use) { + // Skip previously visisted load ops. if (!seen.insert(op).second) return; + if (isa(op)) { // TODO: What if there are multiple uses at different distances? - loadOpToIndLevelAndUse.push_back(std::make_tuple(op, distance, use)); + loadOpToIndLevelAndUse.emplace_back(op, distance, use); use = op; - distance++; + ++distance; } for (Value operand : op->getOperands()) { - Value v = operand; - Operation *defOp = v.getDefiningOp(); + Operation *defOp = operand.getDefiningOp(); if (defOp && defOp->getBlock() == op->getBlock()) { dfs(defOp, distance, use); } @@ -239,6 +238,8 @@ loadOpsToIndirectionLevelAndUse(scf::ForOp forOp) { return loadOpToIndLevelAndUse; } +// Goes through all load ops to identify those that can be pipelined and assign +// layout to them. 
static llvm::MapVector assignMemoryLayouts(llvm::SmallVector> &loadOpToIndLevelAndUse, @@ -247,11 +248,11 @@ assignMemoryLayouts(llvm::SmallVector> for (auto &[op, dist, use] : loadOpToIndLevelAndUse) { if (loadToInfo.count(op)) - // TODO pawel: err, we'd need to verify that the distance is the same + // TODO We'd need to verify that the distance is the same continue; - LoadInfo loadInfo; - auto loadOp = dyn_cast(op); + LoadInfo loadInfo; + auto loadOp = cast(op); assert(!isLoadFromTensorPtr(loadOp) && "Block ptr should have been lowered before this pass."); auto ptr = loadOp.getPtr(); @@ -260,31 +261,37 @@ assignMemoryLayouts(llvm::SmallVector> vec = std::min(vec, axisInfoAnalysis.getMaskAlignment(mask)); auto tensorTy = dyn_cast(ptr.getType()); - if (!tensorTy) + if (!tensorTy) { + LDBG("Skip non-tensor load " << *loadOp); continue; + } - auto ty = cast(tensorTy.getElementType()).getPointeeType(); - unsigned width = vec * ty.getIntOrFloatBitWidth(); + auto pointeeTy = + cast(tensorTy.getElementType()).getPointeeType(); + unsigned width = vec * pointeeTy.getIntOrFloatBitWidth(); // Limit shared memory sharing to width >= 32 elements. LDBG("Load " << *loadOp << " has width " << width); - if (width < 32) + if (width < 32) { + LDBG("Skip width<32 load " << *loadOp); continue; + } if (use->hasTrait()) { - // Only use shared memory when feeding a dot op + // Only use shared memory when feeding into a dot op. loadInfo.usedByDot = true; loadInfo.sharedEncoding = getSharedEncIfAllUsersAreDotEnc(op->getResult(0)).value_or(nullptr); - } else if (auto loadOp = dyn_cast(use)) { + } else if (auto useOp = dyn_cast(use)) { // The use of this loadOp is another loadOp. If the use is not in the - // loadsToPipeline already, it means that the use is not valid for - // pipelining for some reason. We should skip this loadOp, too. Note that - // we have an assumption that distAndUse.second (i.e. the use of this - // loadOp) has already be processed in a previous loop iteration. 
This - // assumption is held by how loadOpsToIndirectionLevelAndUse recursively - // collects loadOpToIndLevelAndUse using DFS. - if (loadToInfo.count(loadOp) == 0) { + // loadToInfo already, it means that the use is not valid for pipelining + // for some reason. We should skip this loadOp, too. + // + // Note that we have an assumption that the use of this loadOp has already + // be processed in a previous loop iteration. This assumption is held by + // how loadOpsToIndirectionLevelAndUse recursively collects + // loadOpToIndLevelAndUse using DFS. + if (loadToInfo.count(useOp) == 0) { continue; } } @@ -323,53 +330,55 @@ scheduleLoads(scf::ForOp forOp, tt::CoarseSchedule &schedule, if (loadOpToIndLevelAndUse.empty()) return {}; - // Check which loads are good for pipelining, and assign them - // memory layouts. + // Check which loads are good for pipelining, and assign them memory layouts. llvm::MapVector loadToInfo = assignMemoryLayouts(loadOpToIndLevelAndUse, axisInfoAnalysis); - if (loadToInfo.empty()) return {}; + // Filter out load ops that cannot be pipelined. + int resize = 0; + for (int i = 0, e = loadOpToIndLevelAndUse.size(); i < e; ++i) { + auto [loadOp, distance, use] = loadOpToIndLevelAndUse[i]; + if (loadToInfo.count(loadOp) != 0) + loadOpToIndLevelAndUse[resize++] = loadOpToIndLevelAndUse[i]; + } + loadOpToIndLevelAndUse.resize(resize); + // Calculate the stage distance between applicable loads. int maxIndirectionLevel = -1; for (auto [loadOp, dist, use] : loadOpToIndLevelAndUse) { - if (loadToInfo.count(loadOp) == 0) - continue; maxIndirectionLevel = std::max(maxIndirectionLevel, dist); } + // The stage gap between chained loads--this allows us to "spread" loads + // with a non-one step in case the number of stages given by the user is + // large. 
unsigned stagesBetweenLoads = - ceil(numStages - 2, maxIndirectionLevel + 1); + llvm::divideCeil(numStages - 2, maxIndirectionLevel + 1); - tt::CoarseSchedule::Cluster rootUsersCluster = schedule.clusters.newAtFront(); // Put the root uses of the loads in the last stage. + tt::CoarseSchedule::Cluster rootUsersCluster = schedule.clusters.newAtFront(); for (auto &[loadOp, dist, use] : loadOpToIndLevelAndUse) { - if (loadToInfo.count(loadOp) == 0) - continue; - // Non-LoadOp(s) are the root uses of all LoadOp(s) and should be - // always present in the opInfo + // Non-LoadOp(s) are the (final) root uses of all LoadOp(s). if (!isa(use)) { schedule.insert(use, numStages - 1, rootUsersCluster); rootUsers.insert(use); } } + // Create a cluster for load ops at each indirection level. SmallVector loadsClusters; - for (int i = 0; i < maxIndirectionLevel + 1; i++) { + for (int i = 0; i <= maxIndirectionLevel; i++) { loadsClusters.push_back(schedule.clusters.newAtBack()); } // Assign stages to the loads. for (auto [loadOp, indLevel, _] : loadOpToIndLevelAndUse) { - if (loadToInfo.count(loadOp) == 0) - continue; int stage = (maxIndirectionLevel - indLevel) * stagesBetweenLoads; schedule.insert(loadOp, stage, loadsClusters[indLevel]); } - // Distance from the load to the use. + // Calculate distance from the load to the use. 
for (auto [loadOp, _, use] : loadOpToIndLevelAndUse) { - if (loadToInfo.count(loadOp) == 0) - continue; loadToInfo[loadOp].distToUse = schedule[use].first - schedule[loadOp].first; } @@ -397,7 +406,7 @@ static void scheduleDependencies(scf::ForOp forOp, tt::CoarseSchedule &schedule, static void scheduleDistanceOneDependencies(scf::ForOp forOp, tt::CoarseSchedule &schedule, int numStages) { - auto getNestedOperands = [](Operation *op) -> SmallVector { + auto getNestedOperands = [](Operation *op) { SmallVector operands; op->walk([&](Operation *nestedOp) { for (Value operand : nestedOp->getOperands()) { @@ -493,55 +502,49 @@ static Value createAlloc(scf::ForOp &forOp, Operation *loadOp, auto ty = cast(loadOp->getResultTypes()[0]); SmallVector bufferShape(ty.getShape().begin(), ty.getShape().end()); bufferShape.insert(bufferShape.begin(), distance); - Type memdescType = mlir::triton::MemDescType::get( - bufferShape, ty.getElementType(), sharedEnc, sharedMemorySpace, - /*mutableMemory*/ true); - Value alloc = builder.create( - loadOp->getLoc(), memdescType, Value()); - return alloc; + Type memdescType = tt::MemDescType::get(bufferShape, ty.getElementType(), + sharedEnc, sharedMemorySpace, + /*mutableMemory=*/true); + return builder.create(loadOp->getLoc(), memdescType, + Value()); } -// Convert load ops into their asyn version and apply multi-buffering based on -// the required number of buffers. +// Convert load ops into shared memory allocation loads and apply +// multi-buffering based on the required number of buffers. static SmallVector createStreamOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule, llvm::MapVector &loadToInfo, int numStages) { // Calculate the number of buffers needed for each load. - // TODO pawel: we could do more fine-grained allocation here and - // allocate only the number of buffers that specific loads need. - // Instead, we allocate the maximum number of buffers needed by any load. 
- int numBuffers = - llvm::max_element(llvm::make_second_range(loadToInfo), [](auto &lhs, - auto &rhs) { - return lhs.distToUse < rhs.distToUse; - })->distToUse; - - SmallVector> asyncLoads; + // TODO: Use the precise number of buffers needed by the particular load. + int numBuffers = -1; + for (auto &[_, info] : loadToInfo) + numBuffers = std::max(numBuffers, info.distToUse); + SmallVector allocs; + SmallVector> loadToAllocs; for (auto &[loadOp, info] : loadToInfo) { - // assert(info.sharedEncoding && "LoadOp shared encoding not defined."); - if (info.sharedEncoding) { - Value alloc = createAlloc(forOp, loadOp, info.sharedEncoding, numBuffers); - assert(alloc && "Failed to create alloc for the async load."); - allocs.push_back(alloc); - asyncLoads.emplace_back(loadOp, alloc); - } + if (!info.sharedEncoding) + continue; + + Value alloc = createAlloc(forOp, loadOp, info.sharedEncoding, numBuffers); + assert(alloc && "Failed to create alloc for the async load."); + allocs.push_back(alloc); + loadToAllocs.emplace_back(loadOp, alloc); } IRRewriter builder(forOp.getContext()); builder.setInsertionPoint(forOp); Location loc = forOp.getLoc(); - // Create two new counters to index into the allocs. 
Value minusOne = builder.create(loc, -1, 32); Value zero = builder.create(loc, 0, 32); Value one = builder.create(loc, 1, 32); Value insertIdx = minusOne; Value extractIdx = minusOne; - Value phase = Value(); Value numBuffersVal = builder.create(loc, numBuffers, 32); + SmallVector newOperands; newOperands.push_back(insertIdx); newOperands.push_back(extractIdx); @@ -552,14 +555,12 @@ createStreamOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule, replaceForOpWithNewSignature(builder, forOp, newOperands); forOp.erase(); forOp = newForOp; - insertIdx = newForOp.getBody()->getArgument(newOperandIndex); - extractIdx = newForOp.getBody()->getArgument(newOperandIndex + 1); - if (phase) { - phase = newForOp.getBody()->getArgument(newOperandIndex + 2); - } // Create two counters for the insert and extract indices to avoid creating // long liverange. + insertIdx = newForOp.getBody()->getArgument(newOperandIndex); + extractIdx = newForOp.getBody()->getArgument(newOperandIndex + 1); + builder.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin()); insertIdx = builder.create(loc, insertIdx, one); Value cndIns = builder.create(loc, arith::CmpIPredicate::slt, @@ -570,33 +571,26 @@ createStreamOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule, Value cndExt = builder.create(loc, arith::CmpIPredicate::slt, extractIdx, numBuffersVal); extractIdx = builder.create(loc, cndExt, extractIdx, zero); - if (phase) { - Value nextPhase = builder.create(loc, phase, one); - phase = builder.create(loc, cndExt, phase, nextPhase); - } // Create a cluster for the prefetches. It may end up being empty, but this // is OK. 
tt::CoarseSchedule::Cluster prefetchCluster = schedule.clusters.newAtBack(); - for (auto &pair : asyncLoads) { - if (auto loadOp = dyn_cast(pair.first)) { - createStreamCopy(forOp, loadOp, pair.second, insertIdx, extractIdx, - schedule, prefetchCluster, loadToInfo, numStages); + for (auto &[op, alloc] : loadToAllocs) { + if (auto loadOp = dyn_cast(op)) { + createStreamCopy(forOp, loadOp, alloc, insertIdx, extractIdx, schedule, + prefetchCluster, loadToInfo, numStages); } } SmallVector newYieldOperands = {insertIdx, extractIdx}; - if (phase) - newYieldOperands.push_back(phase); // Patch the yield with the updated counters. appendToYield(forOp, newYieldOperands); return allocs; } -static bool -preProcessLoopAndGetSchedule2(scf::ForOp &forOp, int numStages, - mlir::triton::PipeliningOption &options) { +static bool preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages, + tt::PipeliningOption &options) { // Schedule the loads and root ops (dot ops) in the loop. This will give us // a scaffold for the final schedule. DenseSet rootUsers; @@ -607,16 +601,16 @@ preProcessLoopAndGetSchedule2(scf::ForOp &forOp, int numStages, return false; LLVM_DEBUG({ - LDBG("Coarse schedule loads only:"); + LDBG("\nCoarse schedule loads only:"); coarseSchedule.dump(); }); - // Convert the loads into async loads and create the allocs. + // Convert the loads into shared memory allocations and loads from them. 
SmallVector allocs = createStreamOps(forOp, coarseSchedule, loadToInfo, numStages); LLVM_DEBUG({ - LDBG("Coarse schedule with stream loads:"); + LDBG("\nCoarse schedule with stream loads:"); coarseSchedule.dump(); }); @@ -624,19 +618,19 @@ preProcessLoopAndGetSchedule2(scf::ForOp &forOp, int numStages, scheduleDependencies(forOp, coarseSchedule, numStages); LLVM_DEBUG({ - LDBG("Coarse schedule with dependencies:"); + LDBG("\nCoarse schedule with dependencies:"); coarseSchedule.dump(); }); scheduleDistanceOneDependencies(forOp, coarseSchedule, numStages); LLVM_DEBUG({ - LDBG("Coarse schedule with dist 1:"); + LDBG("\nCoarse schedule with dist 1:"); coarseSchedule.dump(); }); scheduleRemainingToLastStage(forOp, coarseSchedule, afterPrologue, numStages); LLVM_DEBUG({ - LDBG("Final coarse schedule:"); + LDBG("\nFinal coarse schedule:"); coarseSchedule.dump(); }); @@ -647,69 +641,55 @@ preProcessLoopAndGetSchedule2(scf::ForOp &forOp, int numStages, // Fill out the pipeline options. options.getScheduleFn = - [schedule](scf::ForOp forOp, - std::vector> &s) { + [schedule](scf::ForOp, std::vector> &s) { s = std::move(schedule); }; options.peelEpilogue = false; options.predicateFn = tt::predicateOp; options.supportDynamicLoops = true; - options.annotateFn = [](Operation *op, - mlir::triton::PipeliningOption::PipelinerPart part, - unsigned iteration) {}; - // Insert a wait 0 after the loop + OpBuilder builder(forOp); builder.setInsertionPointAfter(forOp); - // Explicitly deallocate allocated tensors after the wait op + // Explicitly deallocate created allocations. for (auto alloc : allocs) builder.create(forOp.getLoc(), alloc); return true; } // Return true if the preconditions for pipelining the loop are met. -static bool preConditionInner(scf::ForOp forOp) { +static bool checkPrecondition(scf::ForOp forOp) { // Skip loop with distance > 1 for now. // TODO: relax the constraint in the expander. 
if (llvm::any_of(forOp.getBody()->getTerminator()->getOperands(), - [](Value operand) { - Operation *def = operand.getDefiningOp(); - return !def; - })) + [](Value operand) { return !operand.getDefiningOp(); })) return false; + // Don't pipeline outer loops. - if (forOp - ->walk([&](Operation *op) { - if (forOp.getOperation() == op) - return WalkResult::advance(); - if (isa(op)) - return WalkResult::interrupt(); - return WalkResult::advance(); - }) - .wasInterrupted()) - return false; - return true; + auto hasNestedLoopInside = [forOp](Operation *op) { + if (op != forOp && isa(op)) + return WalkResult::interrupt(); + return WalkResult::advance(); + }; + return !forOp->walk(hasNestedLoopInside).wasInterrupted(); } static bool pipelineLoop(scf::ForOp forOp, int numStages) { - mlir::triton::PipeliningOption options; - if (!preConditionInner(forOp)) + if (!checkPrecondition(forOp)) return false; - bool foundSchedule = false; - foundSchedule = preProcessLoopAndGetSchedule2(forOp, numStages, options); - - // TODO: add more pipelines strategy. - if (!foundSchedule) + tt::PipeliningOption options; + if (!preprocessLoopAndBuildSchedule(forOp, numStages, options)) return false; IRRewriter rewriter(forOp->getContext()); rewriter.setInsertionPoint(forOp); FailureOr newForOp = - mlir::triton::pipelineForLoop(rewriter, forOp, options); + tt::pipelineForLoop(rewriter, forOp, options); if (failed(newForOp)) return false; - LDBG("Loop before expander\n" << *newForOp); + + LDBG("Loop before expander:\n" << *newForOp); return true; } @@ -718,15 +698,6 @@ struct PipelinePass : public TritonAMDGPUStreamPipelineV2Base { PipelinePass() = default; PipelinePass(int32_t numStages) { this->numStages = numStages; } - int getNumStagesOrDefault(scf::ForOp forOp) { - // Use the attribute attached to the loop if it exists otherwise use the - // global control. 
- if (auto attr = - forOp->getAttrOfType(mlir::triton::kNumStagesAttrName)) - return attr.getInt(); - return numStages; - } - void runOnOperation() override { SmallVector loops; getOperation()->walk([&](scf::ForOp forOp) { @@ -738,12 +709,17 @@ struct PipelinePass : public TritonAMDGPUStreamPipelineV2Base { if (loops.empty()) return; - bool pipelined = false; - for (scf::ForOp forOp : loops) { - auto outerLoop = dyn_cast(forOp->getParentOp()); - int loopNumStages = getNumStagesOrDefault(forOp); - pipelined |= pipelineLoop(forOp, loopNumStages); - } + for (scf::ForOp forOp : loops) + pipelineLoop(forOp, getNumStagesOrDefault(forOp)); + } + +private: + int getNumStagesOrDefault(scf::ForOp forOp) { + // Use the attribute attached to the loop if it exists, otherwise use the + // global control. + if (auto attr = forOp->getAttrOfType(tt::kNumStagesAttrName)) + return attr.getInt(); + return numStages; } }; } // anonymous namespace From 9bbf5c9d3dd0c7474567d34b91939cb55fd27036 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 25 Jul 2024 17:05:13 +0000 Subject: [PATCH 23/36] NFC: change check prefix to AMD --- test/TritonGPU/amd/amd-stream-pipeline.mlir | 836 ++++++++++---------- 1 file changed, 418 insertions(+), 418 deletions(-) diff --git a/test/TritonGPU/amd/amd-stream-pipeline.mlir b/test/TritonGPU/amd/amd-stream-pipeline.mlir index 3f09d2fa8123..8d1f9fd0338b 100644 --- a/test/TritonGPU/amd/amd-stream-pipeline.mlir +++ b/test/TritonGPU/amd/amd-stream-pipeline.mlir @@ -1,4 +1,4 @@ -// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline-v2=num_stages=2 -canonicalize | FileCheck %s +// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline-v2=num_stages=2 -canonicalize | FileCheck %s --check-prefix=AMD // 4 warps // matmul: 128x32 @ 32x128 -> 128x128 @@ -11,47 +11,47 @@ #A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> #B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> -// CHECK-LABEL: tt.func @matmul_loop -// CHECK: 
%[[LOCAL_ALLOC_10:.*]] = triton_gpu.local_alloc -// CHECK: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc -// CHECK: %[[CMPI_12:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_12]] -// CHECK: %[[LOAD_14:.*]] = tt.load %{{.*}}, %[[SPLAT_13]] -// CHECK: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_12]] -// CHECK: %[[LOAD_16:.*]] = tt.load %{{.*}}, %[[SPLAT_15]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_17]] -// CHECK: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_16]], %[[MEMDESC_SUBVIEW_18]] -// CHECK: %{{.*}}:7 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_18]]) - -// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_20]] -// CHECK: %[[ADDI_22:.*]] = arith.addi %[[ARG9]], %{{.*}} -// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_22]], %{{.*}} -// CHECK: %[[SELECT_24:.*]] = arith.select %[[CMPI_23]], %[[ADDI_22]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_25:.*]] = triton_gpu.local_load %[[ARG11]] -// CHECK: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] -// CHECK: %[[MULF_29:.*]] = arith.mulf %[[LOCAL_LOAD_27]], %{{.*}} -// CHECK: %[[DOT_30:.*]] = tt.dot %[[LOCAL_LOAD_25]], %[[MULF_29]], %[[ARG8]] -// CHECK: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG6]], %{{.*}} -// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG7]], %{{.*}} -// CHECK: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_21]] -// CHECK: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]] -// CHECK: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_21]] -// CHECK: 
%[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} -// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG10]], %{{.*}} -// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] -// CHECK: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] -// CHECK: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_24]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] -// CHECK: } - -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_10]] -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] +// AMD-LABEL: tt.func @matmul_loop +// AMD: %[[LOCAL_ALLOC_10:.*]] = triton_gpu.local_alloc +// AMD: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_12:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_12]] +// AMD: %[[LOAD_14:.*]] = tt.load %{{.*}}, %[[SPLAT_13]] +// AMD: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_12]] +// AMD: %[[LOAD_16:.*]] = tt.load %{{.*}}, %[[SPLAT_15]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_17]] +// AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_16]], %[[MEMDESC_SUBVIEW_18]] +// AMD: %{{.*}}:7 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG12:.*]] 
= %[[MEMDESC_SUBVIEW_18]]) + +// AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_20]] +// AMD: %[[ADDI_22:.*]] = arith.addi %[[ARG9]], %{{.*}} +// AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_22]], %{{.*}} +// AMD: %[[SELECT_24:.*]] = arith.select %[[CMPI_23]], %[[ADDI_22]], %{{.*}} +// AMD: %[[LOCAL_LOAD_25:.*]] = triton_gpu.local_load %[[ARG11]] +// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] +// AMD: %[[MULF_29:.*]] = arith.mulf %[[LOCAL_LOAD_27]], %{{.*}} +// AMD: %[[DOT_30:.*]] = tt.dot %[[LOCAL_LOAD_25]], %[[MULF_29]], %[[ARG8]] +// AMD: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG6]], %{{.*}} +// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG7]], %{{.*}} +// AMD: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_21]] +// AMD: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]] +// AMD: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_21]] +// AMD: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} +// AMD: %[[ADDI_37:.*]] = arith.addi %[[ARG10]], %{{.*}} +// AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] +// AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] +// AMD: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_24]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// AMD: } + +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_10]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.target" = "hip:gfx942"} { tt.func @matmul_loop(%lb : index, %ub : index, 
%step : index, @@ -98,48 +98,48 @@ tt.func @matmul_loop(%lb : index, %ub : index, %step : index, tt.return %loop#2: tensor<128x128xf32, #C> } -// CHECK-LABEL: tt.func @matmul_loop_nested -// CHECK: scf.for -// CHECK: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc -// CHECK: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc -// CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] -// CHECK: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} -// CHECK: %[[SPLAT_16:.*]] = tt.splat %[[CMPI_13]] -// CHECK: %[[LOAD_17:.*]] = tt.load %{{.*}}, %[[SPLAT_16]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_18]] -// CHECK: %[[MEMDESC_SUBVIEW_19:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_17]], %[[MEMDESC_SUBVIEW_19]] -// CHECK: %{{.*}}:7 = scf.for %[[ARG7:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_19]]) - -// CHECK: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG7]], %[[SUBI_21]] -// CHECK: %[[ADDI_23:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_24:.*]] = arith.cmpi slt, %[[ADDI_23]], %{{.*}} -// CHECK: %[[SELECT_25:.*]] = arith.select %[[CMPI_24]], %[[ADDI_23]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_26:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG14]] -// CHECK: %[[DOT_30:.*]] = tt.dot %[[LOCAL_LOAD_26]], %[[LOCAL_LOAD_28]], %[[ARG10]] -// CHECK: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: 
%[[SPLAT_33:.*]] = tt.splat %[[CMPI_22]] -// CHECK: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]], %{{.*}} -// CHECK: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_22]] -// CHECK: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} -// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} -// CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] -// CHECK: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] -// CHECK: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_25]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] -// CHECK: } -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] -// CHECK: scf.yield %{{.*}}#2 -// CHECK: } +// AMD-LABEL: tt.func @matmul_loop_nested +// AMD: scf.for +// AMD: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc +// AMD: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] +// AMD: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} +// AMD: %[[SPLAT_16:.*]] = tt.splat %[[CMPI_13]] +// AMD: %[[LOAD_17:.*]] = tt.load %{{.*}}, %[[SPLAT_16]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_18]] +// AMD: %[[MEMDESC_SUBVIEW_19:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_17]], %[[MEMDESC_SUBVIEW_19]] +// AMD: 
%{{.*}}:7 = scf.for %[[ARG7:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_19]]) + +// AMD: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG7]], %[[SUBI_21]] +// AMD: %[[ADDI_23:.*]] = arith.addi %[[ARG11]], %{{.*}} +// AMD: %[[CMPI_24:.*]] = arith.cmpi slt, %[[ADDI_23]], %{{.*}} +// AMD: %[[SELECT_25:.*]] = arith.select %[[CMPI_24]], %[[ADDI_23]], %{{.*}} +// AMD: %[[LOCAL_LOAD_26:.*]] = triton_gpu.local_load %[[ARG13]] +// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG14]] +// AMD: %[[DOT_30:.*]] = tt.dot %[[LOCAL_LOAD_26]], %[[LOCAL_LOAD_28]], %[[ARG10]] +// AMD: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// AMD: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_22]] +// AMD: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]], %{{.*}} +// AMD: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_22]] +// AMD: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} +// AMD: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} +// AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] +// AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] +// AMD: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_25]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// AMD: } +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] +// AMD: 
triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] +// AMD: scf.yield %{{.*}}#2 +// AMD: } tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index, %A : !tt.ptr {tt.divisibility = 16 : i32}, %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C>{ @@ -185,34 +185,34 @@ tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index, tt.return %loop1#0 : tensor<128x128xf32, #C> } -// CHECK-LABEL: tt.func @matmul_loop_single_pipeline -// CHECK: %[[LOAD_10:.*]] = tt.load -// CHECK: %[[CONVERT_LAYOUT_11:.*]] = triton_gpu.convert_layout %[[LOAD_10]] -// CHECK: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc -// CHECK: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] -// CHECK: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_16]] -// CHECK: %{{.*}}:5 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %[[MEMDESC_SUBVIEW_16]]) -// CHECK: %[[SUBI_18:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_19:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_18]] -// CHECK: %[[ADDI_20:.*]] = arith.addi %[[ARG8]], %{{.*}} -// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ADDI_20]], %{{.*}} -// CHECK: %[[SELECT_22:.*]] = arith.select %[[CMPI_21]], %[[ADDI_20]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG10]] -// CHECK: %[[DOT_25:.*]] = tt.dot %[[CONVERT_LAYOUT_11]], %[[LOCAL_LOAD_23]], %[[ARG7]] -// CHECK: %[[ADDPTR_26:.*]] = tt.addptr %[[ARG6]], %{{.*}} -// CHECK: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_19]] -// CHECK: %[[LOAD_28:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_27]], %{{.*}} -// CHECK: %[[ADDI_29:.*]] = arith.addi %[[ARG9]], %{{.*}} -// CHECK: %[[CMPI_30:.*]] = 
arith.cmpi slt, %[[ADDI_29]], %{{.*}} -// CHECK: %[[SELECT_31:.*]] = arith.select %[[CMPI_30]], %[[ADDI_29]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_32:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_31]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_28]], %[[MEMDESC_SUBVIEW_32]] -// CHECK: scf.yield %[[ADDPTR_26]], %[[DOT_25]], %[[SELECT_22]], %[[SELECT_31]], %[[MEMDESC_SUBVIEW_32]] -// CHECK: } -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] +// AMD-LABEL: tt.func @matmul_loop_single_pipeline +// AMD: %[[LOAD_10:.*]] = tt.load +// AMD: %[[CONVERT_LAYOUT_11:.*]] = triton_gpu.convert_layout %[[LOAD_10]] +// AMD: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] +// AMD: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_16]] +// AMD: %{{.*}}:5 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %[[MEMDESC_SUBVIEW_16]]) +// AMD: %[[SUBI_18:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_19:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_18]] +// AMD: %[[ADDI_20:.*]] = arith.addi %[[ARG8]], %{{.*}} +// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ADDI_20]], %{{.*}} +// AMD: %[[SELECT_22:.*]] = arith.select %[[CMPI_21]], %[[ADDI_20]], %{{.*}} +// AMD: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG10]] +// AMD: %[[DOT_25:.*]] = tt.dot %[[CONVERT_LAYOUT_11]], %[[LOCAL_LOAD_23]], %[[ARG7]] +// AMD: %[[ADDPTR_26:.*]] = tt.addptr %[[ARG6]], %{{.*}} +// AMD: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_19]] +// AMD: %[[LOAD_28:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_27]], %{{.*}} +// AMD: %[[ADDI_29:.*]] = arith.addi %[[ARG9]], 
%{{.*}} +// AMD: %[[CMPI_30:.*]] = arith.cmpi slt, %[[ADDI_29]], %{{.*}} +// AMD: %[[SELECT_31:.*]] = arith.select %[[CMPI_30]], %[[ADDI_29]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_32:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_31]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_28]], %[[MEMDESC_SUBVIEW_32]] +// AMD: scf.yield %[[ADDPTR_26]], %[[DOT_25]], %[[SELECT_22]], %[[SELECT_31]], %[[MEMDESC_SUBVIEW_32]] +// AMD: } +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, %A : !tt.ptr {tt.divisibility = 16 : i32}, %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { @@ -251,65 +251,65 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, tt.return %loop#1 : tensor<128x128xf32, #C> } -// CHECK-LABEL: tt.func @indirect_bmm_scalar -// CHECK: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc -// CHECK: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc -// CHECK: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] -// CHECK: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] -// CHECK: %[[LOAD_5:.*]] = tt.load %{{.*}}, %[[CMPI_2]] -// CHECK: %[[MULI_6:.*]] = arith.muli %{{.*}}, %[[LOAD_5]] -// CHECK: %[[SPLAT_7:.*]] = tt.splat %[[MULI_6]] -// CHECK: %[[ADDPTR_8:.*]] = tt.addptr %{{.*}}, %[[SPLAT_7]] -// CHECK: %[[SPLAT_9:.*]] = tt.splat %[[CMPI_2]] -// CHECK: %[[LOAD_10:.*]] = tt.load %[[ADDPTR_8]], %[[SPLAT_9]] -// CHECK: %[[CMPI_11:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// CHECK: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} -// CHECK: %[[ADDPTR_13:.*]] = tt.addptr %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_11]] -// CHECK: %[[LOAD_15:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_14]] -// CHECK: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_13]], %[[CMPI_11]] -// CHECK: %[[MULI_17:.*]] = arith.muli %{{.*}}, %[[LOAD_16]] -// CHECK: %[[SPLAT_18:.*]] = tt.splat %[[MULI_17]] 
-// CHECK: %[[ADDPTR_19:.*]] = tt.addptr %{{.*}}, %[[SPLAT_18]] -// CHECK: %[[SPLAT_20:.*]] = tt.splat %[[CMPI_11]] -// CHECK: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_19]], %[[SPLAT_20]] -// CHECK: %[[MEMDESC_SUBVIEW_22:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_4]], %[[MEMDESC_SUBVIEW_22]] -// CHECK: %[[MEMDESC_SUBVIEW_23:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_10]], %[[MEMDESC_SUBVIEW_23]] -// CHECK: %{{.*}}:9 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[ADDPTR_12]], %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_22]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_23]], %[[ARG14:.*]] = %[[LOAD_15]], %[[ARG15:.*]] = %[[LOAD_21]]) - -// CHECK: %[[SUBI_25:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] -// CHECK: %[[ADDI_27:.*]] = arith.addi %[[ARG10]], %{{.*}} -// CHECK: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} -// CHECK: %[[SELECT_29:.*]] = arith.select %[[CMPI_28]], %[[ADDI_27]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_30:.*]] = triton_gpu.local_load %[[ARG12]] -// CHECK: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[DOT_34:.*]] = tt.dot %[[LOCAL_LOAD_30]], %[[LOCAL_LOAD_31]], %[[ARG7]] -// CHECK: %[[ADDPTR_35:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: %[[SPLAT_37:.*]] = tt.splat %[[CMPI_26]] -// CHECK: %[[LOAD_38:.*]] = tt.load %[[ADDPTR_35]], %[[SPLAT_37]] -// CHECK: %[[LOAD_39:.*]] = tt.load %[[ADDPTR_36]], %[[CMPI_26]] -// CHECK: %[[MULI_40:.*]] = arith.muli %{{.*}}, %[[LOAD_39]] -// CHECK: %[[SPLAT_41:.*]] = tt.splat %[[MULI_40]] -// CHECK: %[[ADDPTR_42:.*]] = tt.addptr %{{.*}}, %[[SPLAT_41]] -// CHECK: %[[SPLAT_43:.*]] = 
tt.splat %[[CMPI_26]] -// CHECK: %[[LOAD_44:.*]] = tt.load %[[ADDPTR_42]], %[[SPLAT_43]] -// CHECK: %[[ADDI_45:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_46:.*]] = arith.cmpi slt, %[[ADDI_45]], %{{.*}} -// CHECK: %[[SELECT_47:.*]] = arith.select %[[CMPI_46]], %[[ADDI_45]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_47]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_48]] -// CHECK: %[[MEMDESC_SUBVIEW_49:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_47]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[ARG15]], %[[MEMDESC_SUBVIEW_49]] -// CHECK: scf.yield %[[DOT_34]], %[[ADDPTR_35]], %[[ADDPTR_36]], %[[SELECT_29]], %[[SELECT_47]], %[[MEMDESC_SUBVIEW_48]], %[[MEMDESC_SUBVIEW_49]], %[[LOAD_38]], %[[LOAD_44]] -// CHECK: } - -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] +// AMD-LABEL: tt.func @indirect_bmm_scalar +// AMD: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc +// AMD: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] +// AMD: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] +// AMD: %[[LOAD_5:.*]] = tt.load %{{.*}}, %[[CMPI_2]] +// AMD: %[[MULI_6:.*]] = arith.muli %{{.*}}, %[[LOAD_5]] +// AMD: %[[SPLAT_7:.*]] = tt.splat %[[MULI_6]] +// AMD: %[[ADDPTR_8:.*]] = tt.addptr %{{.*}}, %[[SPLAT_7]] +// AMD: %[[SPLAT_9:.*]] = tt.splat %[[CMPI_2]] +// AMD: %[[LOAD_10:.*]] = tt.load %[[ADDPTR_8]], %[[SPLAT_9]] +// AMD: %[[CMPI_11:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// AMD: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} +// AMD: %[[ADDPTR_13:.*]] = tt.addptr %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_11]] +// AMD: %[[LOAD_15:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_14]] +// AMD: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_13]], %[[CMPI_11]] +// AMD: 
%[[MULI_17:.*]] = arith.muli %{{.*}}, %[[LOAD_16]] +// AMD: %[[SPLAT_18:.*]] = tt.splat %[[MULI_17]] +// AMD: %[[ADDPTR_19:.*]] = tt.addptr %{{.*}}, %[[SPLAT_18]] +// AMD: %[[SPLAT_20:.*]] = tt.splat %[[CMPI_11]] +// AMD: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_19]], %[[SPLAT_20]] +// AMD: %[[MEMDESC_SUBVIEW_22:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_4]], %[[MEMDESC_SUBVIEW_22]] +// AMD: %[[MEMDESC_SUBVIEW_23:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_10]], %[[MEMDESC_SUBVIEW_23]] +// AMD: %{{.*}}:9 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[ADDPTR_12]], %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_22]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_23]], %[[ARG14:.*]] = %[[LOAD_15]], %[[ARG15:.*]] = %[[LOAD_21]]) + +// AMD: %[[SUBI_25:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] +// AMD: %[[ADDI_27:.*]] = arith.addi %[[ARG10]], %{{.*}} +// AMD: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} +// AMD: %[[SELECT_29:.*]] = arith.select %[[CMPI_28]], %[[ADDI_27]], %{{.*}} +// AMD: %[[LOCAL_LOAD_30:.*]] = triton_gpu.local_load %[[ARG12]] +// AMD: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %[[ARG13]] +// AMD: %[[DOT_34:.*]] = tt.dot %[[LOCAL_LOAD_30]], %[[LOCAL_LOAD_31]], %[[ARG7]] +// AMD: %[[ADDPTR_35:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// AMD: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// AMD: %[[SPLAT_37:.*]] = tt.splat %[[CMPI_26]] +// AMD: %[[LOAD_38:.*]] = tt.load %[[ADDPTR_35]], %[[SPLAT_37]] +// AMD: %[[LOAD_39:.*]] = tt.load %[[ADDPTR_36]], %[[CMPI_26]] +// AMD: %[[MULI_40:.*]] = arith.muli %{{.*}}, %[[LOAD_39]] +// AMD: %[[SPLAT_41:.*]] = tt.splat %[[MULI_40]] +// AMD: %[[ADDPTR_42:.*]] = tt.addptr 
%{{.*}}, %[[SPLAT_41]] +// AMD: %[[SPLAT_43:.*]] = tt.splat %[[CMPI_26]] +// AMD: %[[LOAD_44:.*]] = tt.load %[[ADDPTR_42]], %[[SPLAT_43]] +// AMD: %[[ADDI_45:.*]] = arith.addi %[[ARG11]], %{{.*}} +// AMD: %[[CMPI_46:.*]] = arith.cmpi slt, %[[ADDI_45]], %{{.*}} +// AMD: %[[SELECT_47:.*]] = arith.select %[[CMPI_46]], %[[ADDI_45]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_47]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_48]] +// AMD: %[[MEMDESC_SUBVIEW_49:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_47]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[ARG15]], %[[MEMDESC_SUBVIEW_49]] +// AMD: scf.yield %[[DOT_34]], %[[ADDPTR_35]], %[[ADDPTR_36]], %[[SELECT_29]], %[[SELECT_47]], %[[MEMDESC_SUBVIEW_48]], %[[MEMDESC_SUBVIEW_49]], %[[LOAD_38]], %[[LOAD_44]] +// AMD: } + +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32}, %76: index, @@ -340,56 +340,56 @@ tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32}, tt.return %79#0 : tensor<16x16xf32, #C> } -// CHECK-LABEL: tt.func @indirect_bmm_scalar_dist_one -// CHECK: %[[LOAD_0:.*]] = tt.load %{{.*}} -// CHECK: %[[ADDPTR_1:.*]] = tt.addptr %{{.*}}, %{{.*}} -// CHECK: %[[LOCAL_ALLOC_2:.*]] = triton_gpu.local_alloc -// CHECK: %[[LOCAL_ALLOC_3:.*]] = triton_gpu.local_alloc -// CHECK: %[[CMPI_4:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_5:.*]] = tt.splat %[[CMPI_4]] -// CHECK: %[[LOAD_6:.*]] = tt.load %{{.*}}, %[[SPLAT_5]] -// CHECK: %[[LOAD_7:.*]] = tt.load %[[ADDPTR_1]], %[[CMPI_4]] -// CHECK: %[[MULI_8:.*]] = arith.muli %{{.*}}, %[[LOAD_0]] -// CHECK: %[[SPLAT_9:.*]] = tt.splat %[[MULI_8]] -// CHECK: %[[ADDPTR_10:.*]] = tt.addptr %{{.*}}, %[[SPLAT_9]] -// CHECK: %[[SPLAT_11:.*]] = tt.splat %[[CMPI_4]] -// CHECK: %[[LOAD_12:.*]] = tt.load %[[ADDPTR_10]], 
%[[SPLAT_11]] -// CHECK: %[[ADDPTR_13:.*]] = tt.addptr %[[ADDPTR_1]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_14:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_6]], %[[MEMDESC_SUBVIEW_14]] -// CHECK: %[[MEMDESC_SUBVIEW_15:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_12]], %[[MEMDESC_SUBVIEW_15]] -// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %[[LOAD_7]], %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_14]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_15]]) - -// CHECK: %[[SUBI_17:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_17]] -// CHECK: %[[ADDI_19:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ADDI_19]], %{{.*}} -// CHECK: %[[SELECT_21:.*]] = arith.select %[[CMPI_20]], %[[ADDI_19]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_22:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG14]] -// CHECK: %[[DOT_26:.*]] = tt.dot %[[LOCAL_LOAD_22]], %[[LOCAL_LOAD_23]], %[[ARG7]] -// CHECK: %[[ADDPTR_27:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[SPLAT_28:.*]] = tt.splat %[[CMPI_18]] -// CHECK: %[[LOAD_29:.*]] = tt.load %[[ADDPTR_27]], %[[SPLAT_28]] -// CHECK: %[[LOAD_30:.*]] = tt.load %[[ARG9]], %[[CMPI_18]] -// CHECK: %[[MULI_31:.*]] = arith.muli %{{.*}}, %[[ARG10]] -// CHECK: %[[SPLAT_32:.*]] = tt.splat %[[MULI_31]] -// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %{{.*}}, %[[SPLAT_32]] -// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_18]] -// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_34]] -// CHECK: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} -// 
CHECK: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// CHECK: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_29]], %[[MEMDESC_SUBVIEW_40]] -// CHECK: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_41]] -// CHECK: scf.yield %[[DOT_26]], %[[ADDPTR_27]], %[[ADDPTR_36]], %[[LOAD_30]], %[[SELECT_21]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] -// CHECK: } -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_2]] -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_3]] +// AMD-LABEL: tt.func @indirect_bmm_scalar_dist_one +// AMD: %[[LOAD_0:.*]] = tt.load %{{.*}} +// AMD: %[[ADDPTR_1:.*]] = tt.addptr %{{.*}}, %{{.*}} +// AMD: %[[LOCAL_ALLOC_2:.*]] = triton_gpu.local_alloc +// AMD: %[[LOCAL_ALLOC_3:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_4:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_5:.*]] = tt.splat %[[CMPI_4]] +// AMD: %[[LOAD_6:.*]] = tt.load %{{.*}}, %[[SPLAT_5]] +// AMD: %[[LOAD_7:.*]] = tt.load %[[ADDPTR_1]], %[[CMPI_4]] +// AMD: %[[MULI_8:.*]] = arith.muli %{{.*}}, %[[LOAD_0]] +// AMD: %[[SPLAT_9:.*]] = tt.splat %[[MULI_8]] +// AMD: %[[ADDPTR_10:.*]] = tt.addptr %{{.*}}, %[[SPLAT_9]] +// AMD: %[[SPLAT_11:.*]] = tt.splat %[[CMPI_4]] +// AMD: %[[LOAD_12:.*]] = tt.load %[[ADDPTR_10]], %[[SPLAT_11]] +// AMD: %[[ADDPTR_13:.*]] = tt.addptr %[[ADDPTR_1]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_14:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_6]], %[[MEMDESC_SUBVIEW_14]] +// AMD: %[[MEMDESC_SUBVIEW_15:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_12]], 
%[[MEMDESC_SUBVIEW_15]] +// AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %[[LOAD_7]], %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_14]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_15]]) + +// AMD: %[[SUBI_17:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_17]] +// AMD: %[[ADDI_19:.*]] = arith.addi %[[ARG11]], %{{.*}} +// AMD: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ADDI_19]], %{{.*}} +// AMD: %[[SELECT_21:.*]] = arith.select %[[CMPI_20]], %[[ADDI_19]], %{{.*}} +// AMD: %[[LOCAL_LOAD_22:.*]] = triton_gpu.local_load %[[ARG13]] +// AMD: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG14]] +// AMD: %[[DOT_26:.*]] = tt.dot %[[LOCAL_LOAD_22]], %[[LOCAL_LOAD_23]], %[[ARG7]] +// AMD: %[[ADDPTR_27:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// AMD: %[[SPLAT_28:.*]] = tt.splat %[[CMPI_18]] +// AMD: %[[LOAD_29:.*]] = tt.load %[[ADDPTR_27]], %[[SPLAT_28]] +// AMD: %[[LOAD_30:.*]] = tt.load %[[ARG9]], %[[CMPI_18]] +// AMD: %[[MULI_31:.*]] = arith.muli %{{.*}}, %[[ARG10]] +// AMD: %[[SPLAT_32:.*]] = tt.splat %[[MULI_31]] +// AMD: %[[ADDPTR_33:.*]] = tt.addptr %{{.*}}, %[[SPLAT_32]] +// AMD: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_18]] +// AMD: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_34]] +// AMD: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// AMD: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} +// AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_29]], %[[MEMDESC_SUBVIEW_40]] +// AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: 
triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_41]] +// AMD: scf.yield %[[DOT_26]], %[[ADDPTR_27]], %[[ADDPTR_36]], %[[LOAD_30]], %[[SELECT_21]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// AMD: } +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_2]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_3]] tt.func @indirect_bmm_scalar_dist_one(%77: i64 {tt.divisibility=16: i32}, %76: index, @@ -422,63 +422,63 @@ tt.func @indirect_bmm_scalar_dist_one(%77: i64 {tt.divisibility=16: i32}, tt.return %79#0 : tensor<16x16xf32, #C> } -// CHECK-LABEL: tt.func @indirect_bmm_vector -// CHECK: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc -// CHECK: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc -// CHECK: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] -// CHECK: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] -// CHECK: %[[CMPI_5:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// CHECK: %[[ADDPTR_6:.*]] = tt.addptr %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_7:.*]] = tt.splat %[[CMPI_2]] -// CHECK: %[[LOAD_8:.*]] = tt.load %{{.*}}, %[[SPLAT_7]] -// CHECK: %[[EXPAND_DIMS_9:.*]] = tt.expand_dims %[[LOAD_4]] {axis = 1 : i32} -// CHECK: %[[BROADCAST_10:.*]] = tt.broadcast %[[EXPAND_DIMS_9]] -// CHECK: %[[MULI_11:.*]] = arith.muli %{{.*}}, %[[BROADCAST_10]] -// CHECK: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %[[MULI_11]] -// CHECK: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_2]] -// CHECK: %[[LOAD_14:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_13]] -// CHECK: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_5]] -// CHECK: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_6]], %[[SPLAT_15]] -// CHECK: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_8]], %[[MEMDESC_SUBVIEW_17]] -// CHECK: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_18]] 
-// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_6]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[LOAD_16]]) - -// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] -// CHECK: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] -// CHECK: %[[ADDI_24:.*]] = arith.addi %[[ARG10]], %{{.*}} -// CHECK: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} -// CHECK: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] -// CHECK: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] -// CHECK: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] -// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] -// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] -// CHECK: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} -// CHECK: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] -// CHECK: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] -// CHECK: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] -// CHECK: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] -// CHECK: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] -// CHECK: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] -// CHECK: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] -// CHECK: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} -// CHECK: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} -// CHECK: 
%[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_46]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] -// CHECK: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_46]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] -// CHECK: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] -// CHECK: } -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] -// CHECK: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] +// AMD-LABEL: tt.func @indirect_bmm_vector +// AMD: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc +// AMD: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] +// AMD: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] +// AMD: %[[CMPI_5:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// AMD: %[[ADDPTR_6:.*]] = tt.addptr %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_7:.*]] = tt.splat %[[CMPI_2]] +// AMD: %[[LOAD_8:.*]] = tt.load %{{.*}}, %[[SPLAT_7]] +// AMD: %[[EXPAND_DIMS_9:.*]] = tt.expand_dims %[[LOAD_4]] {axis = 1 : i32} +// AMD: %[[BROADCAST_10:.*]] = tt.broadcast %[[EXPAND_DIMS_9]] +// AMD: %[[MULI_11:.*]] = arith.muli %{{.*}}, %[[BROADCAST_10]] +// AMD: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %[[MULI_11]] +// AMD: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_2]] +// AMD: %[[LOAD_14:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_13]] +// AMD: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_5]] +// AMD: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_6]], %[[SPLAT_15]] +// AMD: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_8]], %[[MEMDESC_SUBVIEW_17]] +// AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: 
triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_18]] +// AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_6]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[LOAD_16]]) + +// AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] +// AMD: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] +// AMD: %[[ADDI_24:.*]] = arith.addi %[[ARG10]], %{{.*}} +// AMD: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} +// AMD: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} +// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] +// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] +// AMD: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] +// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// AMD: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// AMD: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] +// AMD: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] +// AMD: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} +// AMD: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] +// AMD: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] +// AMD: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] +// AMD: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] +// AMD: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] +// AMD: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] +// AMD: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] +// AMD: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} +// AMD: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} +// AMD: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} +// AMD: 
%[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] +// AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] +// AMD: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] +// AMD: } +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] tt.func @indirect_bmm_vector(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, %76: index, @@ -511,12 +511,12 @@ tt.func @indirect_bmm_vector(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i3 tt.return %79#0 : tensor<16x16xf32, #C> } -// CHECK-LABEL: tt.func @post_load_inv -// CHECK: scf.for -// CHECK-DAG: %[[IV:.*]] = arith.index_cast -// CHECK: %[[NEXT_IV:.*]] = arith.addi %[[IV]], %c1_i32 : i32 -// CHECK: arith.index_cast -// CHECK-NOT: arith.addi %[[NEXT_IV]] +// AMD-LABEL: tt.func @post_load_inv +// AMD: scf.for +// AMD-DAG: %[[IV:.*]] = arith.index_cast +// AMD: %[[NEXT_IV:.*]] = arith.addi %[[IV]], %c1_i32 : i32 +// AMD: arith.index_cast +// AMD-NOT: arith.addi %[[NEXT_IV]] tt.func @post_load_inv(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, @@ -567,11 +567,11 @@ tt.func @post_load_inv(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, tt.return %85#0 : tensor<32x32xf32, #C> } -// CHECK-LABEL: tt.func @cross_iter_dep +// AMD-LABEL: tt.func @cross_iter_dep // TODO: enable pipelining with distance of 2 -// CHECK-NOT: triton_gpu.local_load -// CHECK: scf.for -// CHECK: scf.yield +// AMD-NOT: triton_gpu.local_load +// AMD: scf.for +// AMD: scf.yield tt.func @cross_iter_dep(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, 
%arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, @@ -624,14 +624,14 @@ tt.func @cross_iter_dep(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, tt.return %119#0 : tensor<32x32xf32, #C> } -// CHECK-LABEL: tt.func @dep_arg_two_uses -// CHECK: tt.expand_dims -// CHECK: tt.expand_dims -// CHECK: tt.expand_dims %arg5 -// CHECK-NEXT: tt.expand_dims %arg5 -// CHECK: %[[PTR0:.*]] = tt.splat %arg6 -// CHECK: %[[PTR1:.*]] = tt.addptr %[[PTR0]] -// CHECK-NEXT: tt.load %[[PTR1]] +// AMD-LABEL: tt.func @dep_arg_two_uses +// AMD: tt.expand_dims +// AMD: tt.expand_dims +// AMD: tt.expand_dims %arg5 +// AMD-NEXT: tt.expand_dims %arg5 +// AMD: %[[PTR0:.*]] = tt.splat %arg6 +// AMD: %[[PTR1:.*]] = tt.addptr %[[PTR0]] +// AMD-NEXT: tt.load %[[PTR1]] tt.func @dep_arg_two_uses(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { @@ -698,7 +698,7 @@ tt.func @dep_arg_two_uses(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { -// CHECK-LABEL: tt.func @load_two_users +// AMD-LABEL: tt.func @load_two_users tt.func @load_two_users(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> @@ -725,13 +725,13 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 %14 = tt.broadcast %11 : tensor<1x16x!tt.ptr, #blocked> -> 
tensor<64x16x!tt.ptr, #blocked> %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - // CHECK: triton_gpu.local_store - // CHECK: scf.for - // CHECK: tt.dot - // CHECK: tt.dot - // CHECK: tt.load - // CHECK: triton_gpu.local_store - // CHECK: scf.yield + // AMD: triton_gpu.local_store + // AMD: scf.for + // AMD: tt.dot + // AMD: tt.dot + // AMD: tt.load + // AMD: triton_gpu.local_store + // AMD: scf.yield %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 { %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> @@ -758,7 +758,7 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [0, 1], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { -// CHECK-LABEL: tt.func @load_two_users_incompatible_layouts +// AMD-LABEL: tt.func @load_two_users_incompatible_layouts tt.func @load_two_users_incompatible_layouts(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> @@ -785,8 +785,8 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 %14 = tt.broadcast %11 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, 
#blocked>, tensor<64x16xi32, #blocked>
-    // CHECK-NOT: triton_gpu.local_store
-    // CHECK: scf.for
+    // AMD-NOT: triton_gpu.local_store
+    // AMD: scf.for
     %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 {
       %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked>
       %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
@@ -806,13 +806,13 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1
 
 // -----
 
-// CHECK-LABEL: tt.func public @nested_loops
-// CHECK: scf.for
-// CHECK: triton_gpu.local_alloc
-// CHECK-NOT: triton_gpu.local_alloc
-// CHECK: scf.for
-// CHECK: scf.yield
-// CHECK-DIS: scf.yield
+// AMD-LABEL: tt.func public @nested_loops
+// AMD: scf.for
+// AMD: triton_gpu.local_alloc
+// AMD-NOT: triton_gpu.local_alloc
+// AMD: scf.for
+// AMD: scf.yield
+// AMD-DIS: scf.yield
 //
 // The following code has the structure:
 //
@@ -831,7 +831,7 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1
 // particular while predicating the operations scheduled to be emitted
 // in the prologue.
 //
-// We check that there is no allocation before the first occurrence of
+// We check that there is no allocation before the first occurrence of
 // scf.for because that would mean that the first load `%a = load()`
 // would be pipelined.
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> @@ -891,8 +891,8 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 // ----- -// CHECK-LABEL: tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de -// CHECK-NOT: triton_gpu.convert_layout {{.*}} : tensor<32x64xf32, #shared> -> tensor<32x64xf32, #shared1> +// AMD-LABEL: tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de +// AMD-NOT: triton_gpu.convert_layout {{.*}} : tensor<32x64xf32, #shared> -> tensor<32x64xf32, #shared1> #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [2, 2], order = [0, 1]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}> @@ -993,41 +993,41 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 } // end module // ----- -// CHECK-DIS: #[[$SHARED_LAYOUT:shared.*]] = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}> -// CHECK-LABEL: tt.func @indirect_load_shared_layout -// CHECK: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) - -// CHECK: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] -// CHECK: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} -// CHECK: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] -// CHECK: %[[ADDI_24:.*]] = arith.addi %[[ARG10]], %{{.*}} -// CHECK: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} -// CHECK: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} -// CHECK: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] -// CHECK: %[[LOCAL_LOAD_28:.*]] = 
triton_gpu.local_load %[[ARG13]] -// CHECK: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] -// CHECK: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// CHECK: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// CHECK: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] -// CHECK: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] -// CHECK: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} -// CHECK: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] -// CHECK: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] -// CHECK: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] -// CHECK: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] -// CHECK: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] -// CHECK: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] -// CHECK: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] -// CHECK: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} -// CHECK: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} -// CHECK: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} -// CHECK: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_46]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] -// CHECK: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_46]], %{{.*}}, %{{.*}}] -// CHECK: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] -// CHECK: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] -// CHECK: } +// AMD-DIS: #[[$SHARED_LAYOUT:shared.*]] = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}> +// AMD-LABEL: tt.func @indirect_load_shared_layout +// AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, 
%[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) + +// AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] +// AMD: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] +// AMD: %[[ADDI_24:.*]] = arith.addi %[[ARG10]], %{{.*}} +// AMD: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} +// AMD: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} +// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] +// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] +// AMD: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] +// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// AMD: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// AMD: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] +// AMD: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] +// AMD: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} +// AMD: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] +// AMD: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] +// AMD: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] +// AMD: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] +// AMD: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] +// AMD: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] +// AMD: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] +// AMD: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} +// AMD: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} +// AMD: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_46]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] +// AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_46]], %{{.*}}, %{{.*}}] +// AMD: 
triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] +// AMD: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] +// AMD: } #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> @@ -1071,15 +1071,15 @@ tt.func @indirect_load_shared_layout(%77: tensor<16x16xi64, #BL> {tt.divisibilit // ----- -// CHECK-LABEL: @kernel_yield_constant -// CHECK: tt.load -// CHECK: triton_gpu.memdesc_subview -// CHECK: triton_gpu.local_store -// CHECK: scf.for -// CHECK: tt.load -// CHECK: triton_gpu.memdesc_subview -// CHECK: triton_gpu.local_store -// CHECK: tt.return +// AMD-LABEL: @kernel_yield_constant +// AMD: tt.load +// AMD: triton_gpu.memdesc_subview +// AMD: triton_gpu.local_store +// AMD: scf.for +// AMD: tt.load +// AMD: triton_gpu.memdesc_subview +// AMD: triton_gpu.local_store +// AMD: tt.return #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { @@ -1122,19 +1122,19 @@ module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : // ----- -// CHECK-LABEL: tt.func public @add_kernel -// CHECK: %[[LOAD_11:.*]] = tt.load %{{.*}}, %{{.*}} -// CHECK: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} -// CHECK: %[[LOAD_13:.*]] = tt.load %[[ADDPTR_12]], %{{.*}} -// CHECK: %[[ADDI_14:.*]] = arith.addi %{{.*}}, %{{.*}} -// CHECK: %[[SPLAT_15:.*]] = tt.splat %[[ADDI_14]] -// CHECK: %[[ADDI_16:.*]] = arith.addi %[[SPLAT_15]], %{{.*}} -// CHECK: 
%[[CMPI_17:.*]] = arith.cmpi slt, %[[ADDI_16]], %{{.*}} -// CHECK: %[[ADDPTR_18:.*]] = tt.addptr %{{.*}}, %[[ADDI_16]] -// CHECK: %[[LOAD_19:.*]] = tt.load %[[ADDPTR_18]], %[[CMPI_17]] -// CHECK: %[[ADDPTR_20:.*]] = tt.addptr %{{.*}}, %[[ADDI_16]] -// CHECK: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_20]], %[[CMPI_17]] -// CHECK: scf.for +// AMD-LABEL: tt.func public @add_kernel +// AMD: %[[LOAD_11:.*]] = tt.load %{{.*}}, %{{.*}} +// AMD: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} +// AMD: %[[LOAD_13:.*]] = tt.load %[[ADDPTR_12]], %{{.*}} +// AMD: %[[ADDI_14:.*]] = arith.addi %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_15:.*]] = tt.splat %[[ADDI_14]] +// AMD: %[[ADDI_16:.*]] = arith.addi %[[SPLAT_15]], %{{.*}} +// AMD: %[[CMPI_17:.*]] = arith.cmpi slt, %[[ADDI_16]], %{{.*}} +// AMD: %[[ADDPTR_18:.*]] = tt.addptr %{{.*}}, %[[ADDI_16]] +// AMD: %[[LOAD_19:.*]] = tt.load %[[ADDPTR_18]], %[[CMPI_17]] +// AMD: %[[ADDPTR_20:.*]] = tt.addptr %{{.*}}, %[[ADDI_16]] +// AMD: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_20]], %[[CMPI_17]] +// AMD: scf.for #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}> module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { tt.func public @add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { @@ -1168,16 +1168,16 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 // ----- -// CHECK-LABEL: tt.func public @nested_loops -// CHECK-NOT: triton_gpu.local_alloc -// CHECK: scf.for -// CHECK: triton_gpu.local_alloc -// CHECK: scf.for -// CHECK: triton_gpu.local_load -// CHECK: tt.dot -// CHECK: triton_gpu.local_store -// CHECK: scf.yield -// CHECK: triton_gpu.local_dealloc +// 
AMD-LABEL: tt.func public @nested_loops
+// AMD-NOT: triton_gpu.local_alloc
+// AMD: scf.for
+// AMD: triton_gpu.local_alloc
+// AMD: scf.for
+// AMD: triton_gpu.local_load
+// AMD: tt.dot
+// AMD: triton_gpu.local_store
+// AMD: scf.yield
+// AMD: triton_gpu.local_dealloc
 
 #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [2, 1], order = [1, 0]}>
 #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 2], instrShape = [16, 8]}>
@@ -1220,8 +1220,8 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1
 // -----
 
 // This test triggered some failure in the verifier, so we only
-// included a simple check for the kernel name.
-// CHECK-LABEL: @load_convert_layout
+// included a simple check for the kernel name.
+// AMD-LABEL: @load_convert_layout
 #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
 #BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
 #ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}>
@@ -1271,8 +1271,8 @@ tt.func @load_convert_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i3
 // -----
 
 // This test captured some ICE in MatmulLoopPipeline pass, so we only
-// included a simple check for the kernel name.
-// CHECK-LABEL: @matmul_indirect_pipeline
+// included a simple check for the kernel name.
+// AMD-LABEL: @matmul_indirect_pipeline
 #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 2], order = [0, 1]}>
 #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 1], instrShape = [16, 8]}>
 module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
@@ -1315,8 +1315,8 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1
 
 // -----
 
-// CHECK-LABEL: @dont_pipeline_128x1
-// CHECK-NOT: local_load{{.*}}128x1
+// AMD-LABEL: @dont_pipeline_128x1
+// AMD-NOT: local_load{{.*}}128x1
 #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
 #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}>
 module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
@@ -1356,10 +1356,10 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1
 
 // -----
 
-// Check that the dependencies across ops of different nesting does not cause crash or
+// Check that the dependencies across ops of different nesting does not cause crash or
 // incorrect schedule that fails to pipeline.
-// CHECK-LABEL: @matmul_nested_ops -// CHECK: triton_gpu.local_load +// AMD-LABEL: @matmul_nested_ops +// AMD: triton_gpu.local_load #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> @@ -1429,8 +1429,8 @@ tt.func @matmul_nested_ops(%lb : index, %ub : index, %step : index, #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - // CHECK-LABEL: dot_prologue_epilogue - // CHECK-SAME: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} + // AMD-LABEL: dot_prologue_epilogue + // AMD-SAME: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} tt.func @dot_prologue_epilogue(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %cst = arith.constant dense<0> : tensor<64x16xi32, #blocked> %cst2 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> @@ -1453,17 +1453,17 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 %14 = tt.broadcast %10 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - // CHECK: %[[C0:.*]] = arith.constant 0 : i32 - // CHECK: scf.for %[[IND_VAR:.*]] = %[[C0]] to - // CHECK-NOT: load - // CHECK: %[[CND:.*]] = arith.cmpi slt, %[[IND_VAR]], %[[EXT]] - // CHECK: scf.if %[[CND]] - // CHECK: dot - // CHECK: scf.if 
%[[CND]] - // CHECK: arith.mulf - // CHECK: scf.yield - // CHECK-NOT: tt.addptr - // CHECK: scf.yield + // AMD: %[[C0:.*]] = arith.constant 0 : i32 + // AMD: scf.for %[[IND_VAR:.*]] = %[[C0]] to + // AMD-NOT: load + // AMD: %[[CND:.*]] = arith.cmpi slt, %[[IND_VAR]], %[[EXT]] + // AMD: scf.if %[[CND]] + // AMD: dot + // AMD: scf.if %[[CND]] + // AMD: arith.mulf + // AMD: scf.yield + // AMD-NOT: tt.addptr + // AMD: scf.yield %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %16, %arg6 = %8) -> (tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>) : i32 { %9 = tt.load %arg6 : tensor<128x64x!tt.ptr, #blocked1> %cnd = arith.cmpi slt, %arg3, %ext : i32 @@ -1493,18 +1493,18 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 // ----- -// CHECK-LABEL: @masked_add_kernel -// CHECK: %[[CONSTANT:.*]] = arith.constant dense<0xFF800000> -// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -// CHECK: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -// CHECK: scf.for -// CHECK: arith.select -// CHECK: arith.select -// CHECK: arith.addf -// CHECK: %[[A:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -// CHECK: %[[B:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// AMD-LABEL: @masked_add_kernel +// AMD: %[[CONSTANT:.*]] = arith.constant dense<0xFF800000> +// AMD: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// AMD: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// AMD: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// AMD: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// AMD: scf.for +// AMD: arith.select +// AMD: arith.select +// AMD: arith.addf +// AMD: %[[A:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// AMD: %[[B:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}> module attributes {"triton_gpu.target" 
= "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { From 02b707339af5e7cd0e524a6a8d587d0ff1dd50b0 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 25 Jul 2024 23:25:28 +0000 Subject: [PATCH 24/36] Drop debug print \n --- .../lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index 8f167f133def..5812893586c7 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -601,7 +601,7 @@ static bool preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages, return false; LLVM_DEBUG({ - LDBG("\nCoarse schedule loads only:"); + LDBG("Coarse schedule loads only:"); coarseSchedule.dump(); }); @@ -610,7 +610,7 @@ static bool preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages, createStreamOps(forOp, coarseSchedule, loadToInfo, numStages); LLVM_DEBUG({ - LDBG("\nCoarse schedule with stream loads:"); + LDBG("Coarse schedule with stream loads:"); coarseSchedule.dump(); }); @@ -618,19 +618,19 @@ static bool preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages, scheduleDependencies(forOp, coarseSchedule, numStages); LLVM_DEBUG({ - LDBG("\nCoarse schedule with dependencies:"); + LDBG("Coarse schedule with dependencies:"); coarseSchedule.dump(); }); scheduleDistanceOneDependencies(forOp, coarseSchedule, numStages); LLVM_DEBUG({ - LDBG("\nCoarse schedule with dist 1:"); + LDBG("Coarse schedule with dist 1:"); coarseSchedule.dump(); }); scheduleRemainingToLastStage(forOp, coarseSchedule, afterPrologue, numStages); LLVM_DEBUG({ - LDBG("\nFinal coarse schedule:"); + LDBG("Final coarse schedule:"); coarseSchedule.dump(); }); From c7826687faffcf8a45e5b7c2763570e6ddc30b56 Mon Sep 17 00:00:00 
2001 From: Lei Zhang Date: Thu, 25 Jul 2024 23:47:11 +0000 Subject: [PATCH 25/36] [test] NFC: split loop pipeline test to prepare sharing --- test/TritonGPU/loop-pipeline-cuda.mlir | 162 +++++++++++++++++++++++++ test/TritonGPU/loop-pipeline.mlir | 162 ------------------------- 2 files changed, 162 insertions(+), 162 deletions(-) create mode 100644 test/TritonGPU/loop-pipeline-cuda.mlir diff --git a/test/TritonGPU/loop-pipeline-cuda.mlir b/test/TritonGPU/loop-pipeline-cuda.mlir new file mode 100644 index 000000000000..cb8f04a8f52f --- /dev/null +++ b/test/TritonGPU/loop-pipeline-cuda.mlir @@ -0,0 +1,162 @@ +// RUN: triton-opt %s -split-input-file -tritongpu-pipeline=num-stages=3 -canonicalize | FileCheck %s + +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { +// CHECK-LABEL: tt.func @load_two_users + tt.func @load_two_users(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { + %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> + %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> + %c0_i64 = arith.constant 0 : i64 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = 
arith.constant 8 : i32 + %0 = tt.addptr %arg0, %c0_i64 : !tt.ptr, i64 + %1 = tt.addptr %arg1, %c0_i64 : !tt.ptr, i64 + %2 = tt.splat %1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> + %3 = tt.addptr %2, %cst_0 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %6 = tt.broadcast %3 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> + %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> + %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %9 = tt.load %8 : tensor<128x64x!tt.ptr, #blocked1> + %10 = tt.splat %0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> + %11 = tt.addptr %10, %cst : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> + %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %14 = tt.broadcast %11 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> + %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> + %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + // CHECK: triton_gpu.async_wait {{.*}} {num = 1 : i32} + // CHECK: scf.for + // CHECK: tt.dot + // CHECK: tt.dot + // CHECK: triton_gpu.async_copy_global_to_local + // CHECK: triton_gpu.async_wait {{.*}} {num = 1 : i32} + // CHECK: scf.yield + // CHECK: triton_gpu.async_wait {num = 0 : i32} + + %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : 
i32 { + %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> + %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %20 = triton_gpu.convert_layout %18 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> + %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> + %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> + %25 = tt.trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> + %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> + scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> + } + tt.return %17#0, %17#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> + } +} + +// ----- + +// CHECK-NOT: triton_gpu.convert_layout {{.*}} : tensor<32x64xf32, #shared> -> tensor<32x64xf32, #shared1> + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = 
[1, 4], order = [0, 1]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { + tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} { + %cst = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma> + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c32_i32 = arith.constant 32 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.get_program_id y : i32 + %3 = tt.load %arg3 : !tt.ptr + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %5 = tt.splat %1 : i32 -> tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %6 = arith.addi %5, %4 : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %8 = tt.splat %3 : i64 -> tensor<64x1xi64, #blocked> + %9 = arith.extsi %7 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> + %10 = arith.addi %8, %9 : tensor<64x1xi64, #blocked> + %11 = arith.extsi %arg5 : i32 to i64 + %12 = tt.splat %11 : i64 -> tensor<64x1xi64, #blocked> + %13 = arith.muli %10, %12 : tensor<64x1xi64, #blocked> + %14 = arith.muli %2, %arg5 : i32 + %15 = arith.extsi %14 : i32 to 
i64 + %16 = tt.splat %15 : i64 -> tensor<64x1xi64, #blocked> + %17 = arith.addi %13, %16 : tensor<64x1xi64, #blocked> + %18 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %19 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %20 = tt.expand_dims %18 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> + %21 = tt.expand_dims %19 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %22 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked> + %23 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked1> + %24 = arith.muli %20, %22 : tensor<1x64xi32, #blocked> + %25 = arith.muli %21, %23 : tensor<1x64xi32, #blocked1> + %26 = tt.broadcast %17 : tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> + %27 = arith.extsi %24 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> + %28 = arith.extsi %25 : tensor<1x64xi32, #blocked1> to tensor<1x64xi64, #blocked1> + %29 = tt.broadcast %27 : tensor<1x64xi64, #blocked> -> tensor<64x64xi64, #blocked> + %30 = arith.addi %26, %29 : tensor<64x64xi64, #blocked> + %31 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %32 = tt.expand_dims %31 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> + %33 = tt.splat %3 : i64 -> tensor<32x1xi64, #blocked1> + %34 = arith.extsi %32 : tensor<32x1xi32, #blocked1> to tensor<32x1xi64, #blocked1> + %35 = arith.addi %33, %34 : tensor<32x1xi64, #blocked1> + %36 = tt.splat %11 : i64 -> tensor<32x1xi64, #blocked1> + %37 = arith.muli %35, %36 : tensor<32x1xi64, #blocked1> + %38 = tt.splat %15 : i64 -> tensor<32x1xi64, #blocked1> + %39 = arith.addi %37, %38 : tensor<32x1xi64, #blocked1> + %40 = tt.broadcast %39 : 
tensor<32x1xi64, #blocked1> -> tensor<32x64xi64, #blocked1> + %41 = tt.broadcast %28 : tensor<1x64xi64, #blocked1> -> tensor<32x64xi64, #blocked1> + %42 = arith.addi %40, %41 : tensor<32x64xi64, #blocked1> + %43 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %44 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %45 = tt.expand_dims %43 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> + %46 = tt.expand_dims %44 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %47 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked1> + %48 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked> + %49 = arith.muli %45, %47 : tensor<1x32xi32, #blocked1> + %50 = arith.muli %46, %48 : tensor<1x32xi32, #blocked> + %51 = tt.broadcast %39 : tensor<32x1xi64, #blocked1> -> tensor<32x32xi64, #blocked1> + %52 = arith.extsi %49 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> + %53 = arith.extsi %50 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> + %54 = tt.broadcast %52 : tensor<1x32xi64, #blocked1> -> tensor<32x32xi64, #blocked1> + %55 = arith.addi %51, %54 : tensor<32x32xi64, #blocked1> + %56 = tt.splat %arg0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> + %57 = tt.addptr %56, %30 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi64, #blocked> + %58 = tt.splat %arg1 : !tt.ptr -> tensor<32x64x!tt.ptr, #blocked1> + %59 = tt.addptr %58, %42 : tensor<32x64x!tt.ptr, #blocked1>, tensor<32x64xi64, #blocked1> + %60 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> + %61 = tt.addptr %60, %55 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi64, #blocked1> + %62 = tt.load %57 : tensor<64x64x!tt.ptr, #blocked> + %63 = scf.for %arg6 = %c0_i32 to %c64_i32 step %c32_i32 iter_args(%arg7 = %cst) -> 
(tensor<64x32xf32, #mma>) : i32 { + %70 = tt.load %59 : tensor<32x64x!tt.ptr, #blocked1> + %71 = triton_gpu.convert_layout %62 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> + %73 = tt.trans %72 {order=array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> + %74 = triton_gpu.local_load %73 : !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %75 = tt.dot %71, %74, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> + %76 = tt.load %61 : tensor<32x32x!tt.ptr, #blocked1> + %77 = triton_gpu.convert_layout %75 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %78 = triton_gpu.convert_layout %76 : tensor<32x32xf32, #blocked1> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %79 = tt.dot %77, %78, %arg7 : tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> + scf.yield %79 : tensor<64x32xf32, #mma> + } + %64 = tt.broadcast %17 : tensor<64x1xi64, #blocked> -> tensor<64x32xi64, #blocked> + %65 = tt.broadcast %53 : tensor<1x32xi64, #blocked> -> tensor<64x32xi64, #blocked> + %66 = arith.addi %64, %65 : tensor<64x32xi64, #blocked> + %67 = tt.splat %arg4 : !tt.ptr -> tensor<64x32x!tt.ptr, #blocked> + %68 = tt.addptr %67, %66 : tensor<64x32x!tt.ptr, #blocked>, tensor<64x32xi64, #blocked> + %69 = triton_gpu.convert_layout %63 : tensor<64x32xf32, #mma> -> 
tensor<64x32xf32, #blocked> + tt.store %68, %69 : tensor<64x32x!tt.ptr, #blocked> + tt.return + } +} // end module diff --git a/test/TritonGPU/loop-pipeline.mlir b/test/TritonGPU/loop-pipeline.mlir index 80444b152616..9967fa85239c 100644 --- a/test/TritonGPU/loop-pipeline.mlir +++ b/test/TritonGPU/loop-pipeline.mlir @@ -577,67 +577,6 @@ tt.func @dep_arg_two_uses(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, // ----- -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { -// CHECK-LABEL: tt.func @load_two_users - tt.func @load_two_users(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { - %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> - %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> - %c0_i64 = arith.constant 0 : i64 - %c0_i32 = arith.constant 0 : i32 - %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> - %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %c8_i32 = arith.constant 8 : i32 - %0 = tt.addptr %arg0, %c0_i64 : !tt.ptr, i64 - %1 = tt.addptr %arg1, %c0_i64 : !tt.ptr, i64 - %2 = tt.splat %1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> - %3 = tt.addptr %2, %cst_0 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> - %4 = tt.make_range {end = 64 
: i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %6 = tt.broadcast %3 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> - %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> - %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %9 = tt.load %8 : tensor<128x64x!tt.ptr, #blocked1> - %10 = tt.splat %0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> - %11 = tt.addptr %10, %cst : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> - %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> - %14 = tt.broadcast %11 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> - %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> - %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - // CHECK: triton_gpu.async_wait {{.*}} {num = 1 : i32} - // CHECK: scf.for - // CHECK: tt.dot - // CHECK: tt.dot - // CHECK: triton_gpu.async_copy_global_to_local - // CHECK: triton_gpu.async_wait {{.*}} {num = 1 : i32} - // CHECK: scf.yield - // CHECK: triton_gpu.async_wait {num = 0 : i32} - - %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 { - %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %20 = triton_gpu.convert_layout %18 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, 
#triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> - %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> - %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> - %25 = tt.trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> - %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> - scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> - } - tt.return %17#0, %17#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> - } -} - -// ----- - #blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> @@ -781,107 +720,6 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } } // end module -// ----- - -// CHECK-NOT: triton_gpu.convert_layout {{.*}} : tensor<32x64xf32, #shared> -> tensor<32x64xf32, #shared1> - -#blocked = 
#triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [2, 2], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} { - %cst = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma> - %c64_i32 = arith.constant 64 : i32 - %c0_i32 = arith.constant 0 : i32 - %c32_i32 = arith.constant 32 : i32 - %0 = tt.get_program_id x : i32 - %1 = arith.muli %0, %c64_i32 : i32 - %2 = tt.get_program_id y : i32 - %3 = tt.load %arg3 : !tt.ptr - %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %5 = tt.splat %1 : i32 -> tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %6 = arith.addi %5, %4 : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> - %8 = tt.splat %3 : i64 -> tensor<64x1xi64, #blocked> - %9 = arith.extsi %7 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> - %10 = arith.addi %8, %9 : tensor<64x1xi64, #blocked> - %11 = 
arith.extsi %arg5 : i32 to i64 - %12 = tt.splat %11 : i64 -> tensor<64x1xi64, #blocked> - %13 = arith.muli %10, %12 : tensor<64x1xi64, #blocked> - %14 = arith.muli %2, %arg5 : i32 - %15 = arith.extsi %14 : i32 to i64 - %16 = tt.splat %15 : i64 -> tensor<64x1xi64, #blocked> - %17 = arith.addi %13, %16 : tensor<64x1xi64, #blocked> - %18 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %19 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %20 = tt.expand_dims %18 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> - %21 = tt.expand_dims %19 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %22 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked> - %23 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked1> - %24 = arith.muli %20, %22 : tensor<1x64xi32, #blocked> - %25 = arith.muli %21, %23 : tensor<1x64xi32, #blocked1> - %26 = tt.broadcast %17 : tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> - %27 = arith.extsi %24 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> - %28 = arith.extsi %25 : tensor<1x64xi32, #blocked1> to tensor<1x64xi64, #blocked1> - %29 = tt.broadcast %27 : tensor<1x64xi64, #blocked> -> tensor<64x64xi64, #blocked> - %30 = arith.addi %26, %29 : tensor<64x64xi64, #blocked> - %31 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %32 = tt.expand_dims %31 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> - %33 = tt.splat %3 : i64 -> tensor<32x1xi64, #blocked1> - %34 = arith.extsi %32 : tensor<32x1xi32, #blocked1> to tensor<32x1xi64, #blocked1> - %35 = arith.addi %33, %34 : tensor<32x1xi64, #blocked1> - %36 = tt.splat %11 : i64 -> tensor<32x1xi64, 
#blocked1> - %37 = arith.muli %35, %36 : tensor<32x1xi64, #blocked1> - %38 = tt.splat %15 : i64 -> tensor<32x1xi64, #blocked1> - %39 = arith.addi %37, %38 : tensor<32x1xi64, #blocked1> - %40 = tt.broadcast %39 : tensor<32x1xi64, #blocked1> -> tensor<32x64xi64, #blocked1> - %41 = tt.broadcast %28 : tensor<1x64xi64, #blocked1> -> tensor<32x64xi64, #blocked1> - %42 = arith.addi %40, %41 : tensor<32x64xi64, #blocked1> - %43 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %44 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %45 = tt.expand_dims %43 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> - %46 = tt.expand_dims %44 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> - %47 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked1> - %48 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked> - %49 = arith.muli %45, %47 : tensor<1x32xi32, #blocked1> - %50 = arith.muli %46, %48 : tensor<1x32xi32, #blocked> - %51 = tt.broadcast %39 : tensor<32x1xi64, #blocked1> -> tensor<32x32xi64, #blocked1> - %52 = arith.extsi %49 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> - %53 = arith.extsi %50 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> - %54 = tt.broadcast %52 : tensor<1x32xi64, #blocked1> -> tensor<32x32xi64, #blocked1> - %55 = arith.addi %51, %54 : tensor<32x32xi64, #blocked1> - %56 = tt.splat %arg0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> - %57 = tt.addptr %56, %30 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi64, #blocked> - %58 = tt.splat %arg1 : !tt.ptr -> tensor<32x64x!tt.ptr, #blocked1> - %59 = tt.addptr %58, %42 : tensor<32x64x!tt.ptr, #blocked1>, tensor<32x64xi64, #blocked1> - %60 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> - %61 = tt.addptr %60, %55 : 
tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi64, #blocked1> - %62 = tt.load %57 : tensor<64x64x!tt.ptr, #blocked> - %63 = scf.for %arg6 = %c0_i32 to %c64_i32 step %c32_i32 iter_args(%arg7 = %cst) -> (tensor<64x32xf32, #mma>) : i32 { - %70 = tt.load %59 : tensor<32x64x!tt.ptr, #blocked1> - %71 = triton_gpu.convert_layout %62 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> - %73 = tt.trans %72 {order=array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory> -> !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> - %74 = triton_gpu.local_load %73 : !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %75 = tt.dot %71, %74, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> - %76 = tt.load %61 : tensor<32x32x!tt.ptr, #blocked1> - %77 = triton_gpu.convert_layout %75 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %78 = triton_gpu.convert_layout %76 : tensor<32x32xf32, #blocked1> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %79 = tt.dot %77, %78, %arg7 : tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> - scf.yield %79 : tensor<64x32xf32, #mma> - } - %64 = tt.broadcast %17 : tensor<64x1xi64, #blocked> -> tensor<64x32xi64, #blocked> - %65 = tt.broadcast %53 : tensor<1x32xi64, #blocked> -> tensor<64x32xi64, #blocked> - %66 = arith.addi %64, %65 : tensor<64x32xi64, #blocked> - %67 = tt.splat %arg4 : !tt.ptr 
-> tensor<64x32x!tt.ptr, #blocked> - %68 = tt.addptr %67, %66 : tensor<64x32x!tt.ptr, #blocked>, tensor<64x32xi64, #blocked> - %69 = triton_gpu.convert_layout %63 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #blocked> - tt.store %68, %69 : tensor<64x32x!tt.ptr, #blocked> - tt.return - } -} // end module // ----- // CHECK: #[[$SHARED_LAYOUT:shared.*]] = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}> From 8232d1ab9026dca348c4ea636770014b30a4fcd6 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Fri, 26 Jul 2024 03:27:40 +0000 Subject: [PATCH 26/36] Merge tests back to the main file --- test/TritonGPU/amd/amd-stream-pipeline.mlir | 1538 ------------------- test/TritonGPU/loop-pipeline-hip.mlir | 162 ++ test/TritonGPU/loop-pipeline.mlir | 438 +++++- 3 files changed, 594 insertions(+), 1544 deletions(-) delete mode 100644 test/TritonGPU/amd/amd-stream-pipeline.mlir create mode 100644 test/TritonGPU/loop-pipeline-hip.mlir diff --git a/test/TritonGPU/amd/amd-stream-pipeline.mlir b/test/TritonGPU/amd/amd-stream-pipeline.mlir deleted file mode 100644 index 8d1f9fd0338b..000000000000 --- a/test/TritonGPU/amd/amd-stream-pipeline.mlir +++ /dev/null @@ -1,1538 +0,0 @@ -// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline-v2=num_stages=2 -canonicalize | FileCheck %s --check-prefix=AMD - -// 4 warps -// matmul: 128x32 @ 32x128 -> 128x128 -#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> -#ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> -#BLs0 = #triton_gpu.slice<{parent=#BL, dim=0}> -#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> -#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> -#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> -#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> - -// AMD-LABEL: 
tt.func @matmul_loop -// AMD: %[[LOCAL_ALLOC_10:.*]] = triton_gpu.local_alloc -// AMD: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc -// AMD: %[[CMPI_12:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_12]] -// AMD: %[[LOAD_14:.*]] = tt.load %{{.*}}, %[[SPLAT_13]] -// AMD: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_12]] -// AMD: %[[LOAD_16:.*]] = tt.load %{{.*}}, %[[SPLAT_15]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_17]] -// AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_16]], %[[MEMDESC_SUBVIEW_18]] -// AMD: %{{.*}}:7 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_18]]) - -// AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_20]] -// AMD: %[[ADDI_22:.*]] = arith.addi %[[ARG9]], %{{.*}} -// AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_22]], %{{.*}} -// AMD: %[[SELECT_24:.*]] = arith.select %[[CMPI_23]], %[[ADDI_22]], %{{.*}} -// AMD: %[[LOCAL_LOAD_25:.*]] = triton_gpu.local_load %[[ARG11]] -// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] -// AMD: %[[MULF_29:.*]] = arith.mulf %[[LOCAL_LOAD_27]], %{{.*}} -// AMD: %[[DOT_30:.*]] = tt.dot %[[LOCAL_LOAD_25]], %[[MULF_29]], %[[ARG8]] -// AMD: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG6]], %{{.*}} -// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG7]], %{{.*}} -// AMD: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_21]] -// AMD: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]] -// AMD: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_21]] -// AMD: %[[LOAD_36:.*]] = tt.load 
%[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} -// AMD: %[[ADDI_37:.*]] = arith.addi %[[ARG10]], %{{.*}} -// AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] -// AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] -// AMD: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_24]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] -// AMD: } - -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_10]] -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] - -module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.target" = "hip:gfx942"} { -tt.func @matmul_loop(%lb : index, %ub : index, %step : index, - %A : !tt.ptr {tt.divisibility = 16 : i32}, - %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { - // A ptrs - %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> - %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> - %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> - %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> - %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - // B ptrs - %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> - %b_tmp0 = tt.make_range {end = 128: i32, start = 0: i32} : tensor<128xi32, #BLs0> - %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> - %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> 
- %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> - - - %a_mask = arith.constant dense : tensor<128x32xi1, #AL> - %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> - %b_mask = arith.constant dense : tensor<32x128xi1, #BL> - %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> - %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> - - %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> - %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> - - %b_scale = arith.constant dense<4.> : tensor<32x128xf16, #B> - - %loop:3 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { - %a_ = tt.load %a_ptr : tensor<128x32x!tt.ptr, #AL> - %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> - %b__ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> - %b_ = triton_gpu.convert_layout %b__ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> - %b = arith.mulf %b_, %b_scale: tensor<32x128xf16, #B> - - %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> - - %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> - scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> - } - tt.return %loop#2: tensor<128x128xf32, #C> -} - -// AMD-LABEL: tt.func @matmul_loop_nested -// AMD: scf.for -// AMD: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc -// AMD: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc -// AMD: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] -// AMD: %[[LOAD_15:.*]] = tt.load 
%{{.*}}, %[[SPLAT_14]], %{{.*}} -// AMD: %[[SPLAT_16:.*]] = tt.splat %[[CMPI_13]] -// AMD: %[[LOAD_17:.*]] = tt.load %{{.*}}, %[[SPLAT_16]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_18]] -// AMD: %[[MEMDESC_SUBVIEW_19:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_17]], %[[MEMDESC_SUBVIEW_19]] -// AMD: %{{.*}}:7 = scf.for %[[ARG7:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_19]]) - -// AMD: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG7]], %[[SUBI_21]] -// AMD: %[[ADDI_23:.*]] = arith.addi %[[ARG11]], %{{.*}} -// AMD: %[[CMPI_24:.*]] = arith.cmpi slt, %[[ADDI_23]], %{{.*}} -// AMD: %[[SELECT_25:.*]] = arith.select %[[CMPI_24]], %[[ADDI_23]], %{{.*}} -// AMD: %[[LOCAL_LOAD_26:.*]] = triton_gpu.local_load %[[ARG13]] -// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG14]] -// AMD: %[[DOT_30:.*]] = tt.dot %[[LOCAL_LOAD_26]], %[[LOCAL_LOAD_28]], %[[ARG10]] -// AMD: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// AMD: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_22]] -// AMD: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]], %{{.*}} -// AMD: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_22]] -// AMD: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} -// AMD: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} -// AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview 
%[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] -// AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] -// AMD: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_25]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] -// AMD: } -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] -// AMD: scf.yield %{{.*}}#2 -// AMD: } -tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index, - %A : !tt.ptr {tt.divisibility = 16 : i32}, - %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C>{ - - %c_start = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> - %loop1:1 = scf.for %iv0 = %lb to %ub step %step iter_args(%c_init = %c_start) -> (tensor<128x128xf32, #C>) { - // A ptrs - %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> - %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> - %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> - %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> - %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - // B ptrs - %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> - %b_tmp0 = tt.make_range {end = 128: i32, start = 0: i32} : tensor<128xi32, #BLs0> - %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> - %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> - %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> - - %a_mask = arith.constant dense : tensor<128x32xi1, #AL> - %a_other = arith.constant 
dense<0.00e+00> : tensor<128x32xf16, #AL> - %b_mask = arith.constant dense : tensor<32x128xi1, #BL> - %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> - - %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> - %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> - - %loop2:3 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { - %a_ = tt.load %a_ptr, %a_mask, %a_other : tensor<128x32x!tt.ptr, #AL> - %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> - %b_ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> - %b = triton_gpu.convert_layout %b_ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> - - %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> - - %next_a_ptr = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> - scf.yield %next_a_ptr, %next_b_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> - } - - scf.yield %loop2#2 : tensor<128x128xf32, #C> - } - tt.return %loop1#0 : tensor<128x128xf32, #C> -} - -// AMD-LABEL: tt.func @matmul_loop_single_pipeline -// AMD: %[[LOAD_10:.*]] = tt.load -// AMD: %[[CONVERT_LAYOUT_11:.*]] = triton_gpu.convert_layout %[[LOAD_10]] -// AMD: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc -// AMD: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] -// AMD: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_16]] -// AMD: %{{.*}}:5 = scf.for %[[ARG5:.*]] = %{{.*}} to 
%{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %[[MEMDESC_SUBVIEW_16]]) -// AMD: %[[SUBI_18:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_19:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_18]] -// AMD: %[[ADDI_20:.*]] = arith.addi %[[ARG8]], %{{.*}} -// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ADDI_20]], %{{.*}} -// AMD: %[[SELECT_22:.*]] = arith.select %[[CMPI_21]], %[[ADDI_20]], %{{.*}} -// AMD: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG10]] -// AMD: %[[DOT_25:.*]] = tt.dot %[[CONVERT_LAYOUT_11]], %[[LOCAL_LOAD_23]], %[[ARG7]] -// AMD: %[[ADDPTR_26:.*]] = tt.addptr %[[ARG6]], %{{.*}} -// AMD: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_19]] -// AMD: %[[LOAD_28:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_27]], %{{.*}} -// AMD: %[[ADDI_29:.*]] = arith.addi %[[ARG9]], %{{.*}} -// AMD: %[[CMPI_30:.*]] = arith.cmpi slt, %[[ADDI_29]], %{{.*}} -// AMD: %[[SELECT_31:.*]] = arith.select %[[CMPI_30]], %[[ADDI_29]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_32:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_31]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_28]], %[[MEMDESC_SUBVIEW_32]] -// AMD: scf.yield %[[ADDPTR_26]], %[[DOT_25]], %[[SELECT_22]], %[[SELECT_31]], %[[MEMDESC_SUBVIEW_32]] -// AMD: } -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] -tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, - %A : !tt.ptr {tt.divisibility = 16 : i32}, - %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { - // A ptrs - %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> - %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> - %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> - %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> - %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : 
tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - // B ptrs - %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> - %b_tmp0 = tt.make_range {end = 128: i32, start = 0: i32} : tensor<128xi32, #BLs0> - %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> - %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> - %b_ptr_init = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> - - %a_mask = arith.constant dense : tensor<128x32xi1, #AL> - %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> - - %a_ = tt.load %a_ptr_init, %a_mask, %a_other : tensor<128x32x!tt.ptr, #AL> - %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> - - %b_mask = arith.constant dense : tensor<32x128xi1, #BL> - %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> - %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> - - %b_off = arith.constant dense<4> : tensor<32x128xi32, #BL> - - %loop:2 = scf.for %iv = %lb to %ub step %step iter_args(%b_ptr = %b_ptr_init, %prev_c = %c_init) -> (tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C>) { - %b_ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> - %b = triton_gpu.convert_layout %b_ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> - %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> - %next_b_ptr = tt.addptr %b_ptr, %b_off : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> - scf.yield %next_b_ptr, %c : tensor<32x128x!tt.ptr, #BL>, tensor<128x128xf32, #C> - } - tt.return %loop#1 : tensor<128x128xf32, #C> -} - -// AMD-LABEL: tt.func @indirect_bmm_scalar -// AMD: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc -// AMD: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc -// AMD: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] -// 
AMD: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] -// AMD: %[[LOAD_5:.*]] = tt.load %{{.*}}, %[[CMPI_2]] -// AMD: %[[MULI_6:.*]] = arith.muli %{{.*}}, %[[LOAD_5]] -// AMD: %[[SPLAT_7:.*]] = tt.splat %[[MULI_6]] -// AMD: %[[ADDPTR_8:.*]] = tt.addptr %{{.*}}, %[[SPLAT_7]] -// AMD: %[[SPLAT_9:.*]] = tt.splat %[[CMPI_2]] -// AMD: %[[LOAD_10:.*]] = tt.load %[[ADDPTR_8]], %[[SPLAT_9]] -// AMD: %[[CMPI_11:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// AMD: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} -// AMD: %[[ADDPTR_13:.*]] = tt.addptr %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_11]] -// AMD: %[[LOAD_15:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_14]] -// AMD: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_13]], %[[CMPI_11]] -// AMD: %[[MULI_17:.*]] = arith.muli %{{.*}}, %[[LOAD_16]] -// AMD: %[[SPLAT_18:.*]] = tt.splat %[[MULI_17]] -// AMD: %[[ADDPTR_19:.*]] = tt.addptr %{{.*}}, %[[SPLAT_18]] -// AMD: %[[SPLAT_20:.*]] = tt.splat %[[CMPI_11]] -// AMD: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_19]], %[[SPLAT_20]] -// AMD: %[[MEMDESC_SUBVIEW_22:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_4]], %[[MEMDESC_SUBVIEW_22]] -// AMD: %[[MEMDESC_SUBVIEW_23:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_10]], %[[MEMDESC_SUBVIEW_23]] -// AMD: %{{.*}}:9 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[ADDPTR_12]], %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_22]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_23]], %[[ARG14:.*]] = %[[LOAD_15]], %[[ARG15:.*]] = %[[LOAD_21]]) - -// AMD: %[[SUBI_25:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] -// AMD: %[[ADDI_27:.*]] = arith.addi %[[ARG10]], %{{.*}} -// AMD: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} 
-// AMD: %[[SELECT_29:.*]] = arith.select %[[CMPI_28]], %[[ADDI_27]], %{{.*}} -// AMD: %[[LOCAL_LOAD_30:.*]] = triton_gpu.local_load %[[ARG12]] -// AMD: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %[[ARG13]] -// AMD: %[[DOT_34:.*]] = tt.dot %[[LOCAL_LOAD_30]], %[[LOCAL_LOAD_31]], %[[ARG7]] -// AMD: %[[ADDPTR_35:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// AMD: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// AMD: %[[SPLAT_37:.*]] = tt.splat %[[CMPI_26]] -// AMD: %[[LOAD_38:.*]] = tt.load %[[ADDPTR_35]], %[[SPLAT_37]] -// AMD: %[[LOAD_39:.*]] = tt.load %[[ADDPTR_36]], %[[CMPI_26]] -// AMD: %[[MULI_40:.*]] = arith.muli %{{.*}}, %[[LOAD_39]] -// AMD: %[[SPLAT_41:.*]] = tt.splat %[[MULI_40]] -// AMD: %[[ADDPTR_42:.*]] = tt.addptr %{{.*}}, %[[SPLAT_41]] -// AMD: %[[SPLAT_43:.*]] = tt.splat %[[CMPI_26]] -// AMD: %[[LOAD_44:.*]] = tt.load %[[ADDPTR_42]], %[[SPLAT_43]] -// AMD: %[[ADDI_45:.*]] = arith.addi %[[ARG11]], %{{.*}} -// AMD: %[[CMPI_46:.*]] = arith.cmpi slt, %[[ADDI_45]], %{{.*}} -// AMD: %[[SELECT_47:.*]] = arith.select %[[CMPI_46]], %[[ADDI_45]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_47]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_48]] -// AMD: %[[MEMDESC_SUBVIEW_49:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_47]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[ARG15]], %[[MEMDESC_SUBVIEW_49]] -// AMD: scf.yield %[[DOT_34]], %[[ADDPTR_35]], %[[ADDPTR_36]], %[[SELECT_29]], %[[SELECT_47]], %[[MEMDESC_SUBVIEW_48]], %[[MEMDESC_SUBVIEW_49]], %[[LOAD_38]], %[[LOAD_44]] -// AMD: } - -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] - -tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32}, - %76: index, - %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, - %75: !tt.ptr, - %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: 
i32}, - %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ - %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> - %c4_i32 = arith.constant 4 : i32 - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i32 = arith.constant 1 : i32 - %79:3 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, !tt.ptr) { - %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> - %83 = tt.load %arg21 : !tt.ptr - %84 = arith.muli %77, %83 : i64 - %85 = tt.splat %84 : i64 -> tensor<16x16xi64, #BL> - %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> - %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> - %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> - %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> - %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> - %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> - %92 = tt.addptr %arg21, %c1_i32 : !tt.ptr, i32 - scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, !tt.ptr - } {tt.num_stages = 3 : i32} - tt.return %79#0 : tensor<16x16xf32, #C> -} - -// AMD-LABEL: tt.func @indirect_bmm_scalar_dist_one -// AMD: %[[LOAD_0:.*]] = tt.load %{{.*}} -// AMD: %[[ADDPTR_1:.*]] = tt.addptr %{{.*}}, %{{.*}} -// AMD: %[[LOCAL_ALLOC_2:.*]] = triton_gpu.local_alloc -// AMD: %[[LOCAL_ALLOC_3:.*]] = triton_gpu.local_alloc -// AMD: %[[CMPI_4:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_5:.*]] = tt.splat %[[CMPI_4]] -// AMD: %[[LOAD_6:.*]] = tt.load %{{.*}}, %[[SPLAT_5]] -// AMD: %[[LOAD_7:.*]] = tt.load %[[ADDPTR_1]], %[[CMPI_4]] -// AMD: %[[MULI_8:.*]] = arith.muli %{{.*}}, %[[LOAD_0]] -// AMD: %[[SPLAT_9:.*]] = tt.splat %[[MULI_8]] -// 
AMD: %[[ADDPTR_10:.*]] = tt.addptr %{{.*}}, %[[SPLAT_9]] -// AMD: %[[SPLAT_11:.*]] = tt.splat %[[CMPI_4]] -// AMD: %[[LOAD_12:.*]] = tt.load %[[ADDPTR_10]], %[[SPLAT_11]] -// AMD: %[[ADDPTR_13:.*]] = tt.addptr %[[ADDPTR_1]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_14:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_6]], %[[MEMDESC_SUBVIEW_14]] -// AMD: %[[MEMDESC_SUBVIEW_15:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_12]], %[[MEMDESC_SUBVIEW_15]] -// AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %[[LOAD_7]], %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_14]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_15]]) - -// AMD: %[[SUBI_17:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_17]] -// AMD: %[[ADDI_19:.*]] = arith.addi %[[ARG11]], %{{.*}} -// AMD: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ADDI_19]], %{{.*}} -// AMD: %[[SELECT_21:.*]] = arith.select %[[CMPI_20]], %[[ADDI_19]], %{{.*}} -// AMD: %[[LOCAL_LOAD_22:.*]] = triton_gpu.local_load %[[ARG13]] -// AMD: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG14]] -// AMD: %[[DOT_26:.*]] = tt.dot %[[LOCAL_LOAD_22]], %[[LOCAL_LOAD_23]], %[[ARG7]] -// AMD: %[[ADDPTR_27:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// AMD: %[[SPLAT_28:.*]] = tt.splat %[[CMPI_18]] -// AMD: %[[LOAD_29:.*]] = tt.load %[[ADDPTR_27]], %[[SPLAT_28]] -// AMD: %[[LOAD_30:.*]] = tt.load %[[ARG9]], %[[CMPI_18]] -// AMD: %[[MULI_31:.*]] = arith.muli %{{.*}}, %[[ARG10]] -// AMD: %[[SPLAT_32:.*]] = tt.splat %[[MULI_31]] -// AMD: %[[ADDPTR_33:.*]] = tt.addptr %{{.*}}, %[[SPLAT_32]] -// AMD: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_18]] -// AMD: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_34]] -// AMD: 
%[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// AMD: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} -// AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_29]], %[[MEMDESC_SUBVIEW_40]] -// AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_41]] -// AMD: scf.yield %[[DOT_26]], %[[ADDPTR_27]], %[[ADDPTR_36]], %[[LOAD_30]], %[[SELECT_21]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] -// AMD: } -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_2]] -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_3]] - -tt.func @indirect_bmm_scalar_dist_one(%77: i64 {tt.divisibility=16: i32}, - %76: index, - %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, - %75: !tt.ptr, - %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, - %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ - %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> - %c4_i32 = arith.constant 4 : i32 - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i32 = arith.constant 1 : i32 - %50 = tt.load %75 : !tt.ptr - %51 = tt.addptr %75, %c1_i32 : !tt.ptr, i32 - %79:4 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %51, %arg22 = %50) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, !tt.ptr, i64) { - %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> - %83 = tt.load %arg21 : !tt.ptr - %84 = arith.muli %77, %arg22 : i64 - %85 = tt.splat %84 : i64 -> tensor<16x16xi64, #BL> - %86 = tt.addptr %60, %85 : 
tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> - %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> - %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> - %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> - %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> - %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> - %92 = tt.addptr %arg21, %c1_i32 : !tt.ptr, i32 - scf.yield %90, %91, %92, %83 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, !tt.ptr, i64 - } - tt.return %79#0 : tensor<16x16xf32, #C> -} - -// AMD-LABEL: tt.func @indirect_bmm_vector -// AMD: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc -// AMD: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc -// AMD: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] -// AMD: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] -// AMD: %[[CMPI_5:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// AMD: %[[ADDPTR_6:.*]] = tt.addptr %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_7:.*]] = tt.splat %[[CMPI_2]] -// AMD: %[[LOAD_8:.*]] = tt.load %{{.*}}, %[[SPLAT_7]] -// AMD: %[[EXPAND_DIMS_9:.*]] = tt.expand_dims %[[LOAD_4]] {axis = 1 : i32} -// AMD: %[[BROADCAST_10:.*]] = tt.broadcast %[[EXPAND_DIMS_9]] -// AMD: %[[MULI_11:.*]] = arith.muli %{{.*}}, %[[BROADCAST_10]] -// AMD: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %[[MULI_11]] -// AMD: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_2]] -// AMD: %[[LOAD_14:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_13]] -// AMD: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_5]] -// AMD: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_6]], %[[SPLAT_15]] -// AMD: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_8]], %[[MEMDESC_SUBVIEW_17]] -// AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: 
triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_18]] -// AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_6]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[LOAD_16]]) - -// AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] -// AMD: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] -// AMD: %[[ADDI_24:.*]] = arith.addi %[[ARG10]], %{{.*}} -// AMD: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} -// AMD: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} -// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] -// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] -// AMD: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] -// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// AMD: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// AMD: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] -// AMD: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] -// AMD: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} -// AMD: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] -// AMD: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] -// AMD: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] -// AMD: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] -// AMD: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] -// AMD: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] -// AMD: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] -// AMD: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} -// AMD: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} -// AMD: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} -// AMD: 
%[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_46]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] -// AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_46]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] -// AMD: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] -// AMD: } -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] - -tt.func @indirect_bmm_vector(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, - %76: index, - %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, - %75: tensor<16x!tt.ptr, #BLs1>, - %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, - %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ - %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> - %c4_i32 = arith.constant 4 : i32 - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i32 = arith.constant 1 : i32 - %c1_i32_splat = tt.splat %c1_i32 : i32 -> tensor<16xi32, #BLs1> - %79:3 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1>) { - %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> - %83 = tt.load %arg21 : tensor<16x!tt.ptr, #BLs1> - %84 = tt.expand_dims %83 {axis=1: i32}: tensor<16xi64, #BLs1> -> tensor<16x1xi64, #BL> - %850 = tt.broadcast %84 : tensor<16x1xi64, #BL> -> tensor<16x16xi64, #BL> - %85 = arith.muli %77, %850 : tensor<16x16xi64, #BL> - %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> - %87 = tt.load %86 : 
tensor<16x16x!tt.ptr, #BL> - %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> - %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> - %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> - %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> - %92 = tt.addptr %arg21, %c1_i32_splat : tensor<16x!tt.ptr, #BLs1>, tensor<16xi32, #BLs1> - scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1> - } {tt.num_stages = 3 : i32} - tt.return %79#0 : tensor<16x16xf32, #C> -} - -// AMD-LABEL: tt.func @post_load_inv -// AMD: scf.for -// AMD-DAG: %[[IV:.*]] = arith.index_cast -// AMD: %[[NEXT_IV:.*]] = arith.addi %[[IV]], %c1_i32 : i32 -// AMD: arith.index_cast -// AMD-NOT: arith.addi %[[NEXT_IV]] -tt.func @post_load_inv(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, - %arg1: !tt.ptr {tt.divisibility = 16 : i32}, - %arg2: !tt.ptr {tt.divisibility = 16 : i32}, - %arg3: i32 {tt.divisibility = 16 : i32}, - %arg4: i32 {tt.divisibility = 16 : i32}, - %arg5: i32 {tt.divisibility = 16 : i32}, - %arg6: i32 {tt.divisibility = 16 : i32}, - %arg7: i32 {tt.divisibility = 16 : i32}, - %arg8: i32 {tt.divisibility = 16 : i32}) -> tensor<32x32xf32, #C> { - %c0_index = arith.constant 0 : index - %c1_index = arith.constant 1 : index - %c1_i32 = arith.constant 1 : i32 - %c32_i32 = arith.constant 32 : i32 - %84 = arith.constant 900 : index - %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #C> - %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #AL> - %50 = tt.splat %arg3 : i32 -> tensor<1x32xi32, #AL> - %59 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %81 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %66 = tt.splat %arg4 : i32 -> tensor<32x1xi32, #AL> - %60 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %82 = tt.splat %arg2 : !tt.ptr -> 
tensor<32x32x!tt.ptr, #AL> - %85:3 = scf.for %arg9 = %c0_index to %84 step %c1_index iter_args(%arg10 = %cst, %arg11 = %59, %arg12 = %81) -> (tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>) { - %130 = arith.index_cast %arg9 : index to i32 - %107 = arith.muli %130, %c32_i32 : i32 - %108 = arith.subi %arg5, %107 : i32 - %109 = tt.splat %108 : i32 -> tensor<1x32xi32, #AL> - %110 = arith.cmpi "slt", %50, %109 : tensor<1x32xi32, #AL> - %111 = tt.broadcast %110 : tensor<1x32xi1, #AL> -> tensor<32x32xi1, #AL> - %112 = tt.load %arg11, %111, %cst_0 : tensor<32x32x!tt.ptr, #AL> - %113 = tt.splat %108 : i32 -> tensor<32x1xi32, #AL> - %114 = arith.cmpi "slt", %66, %113 : tensor<32x1xi32, #AL> - %115 = tt.broadcast %114 : tensor<32x1xi1, #AL> -> tensor<32x32xi1, #AL> - %116 = tt.load %arg12, %115, %cst_0 : tensor<32x32x!tt.ptr, #AL> - %117 = triton_gpu.convert_layout %112 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> - %118 = triton_gpu.convert_layout %116 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 1}>> - %119 = tt.dot %117, %118, %arg10 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 1}>> -> tensor<32x32xf32, #C> - %131 = arith.index_cast %arg9 : index to i32 - %120 = arith.addi %131, %c1_i32 : i32 - %121 = arith.muli %120, %c32_i32 : i32 - %122 = tt.splat %121 : i32 -> tensor<32x32xi32, #AL> - %123 = tt.addptr %60, %122 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> - %124 = arith.muli %121, %arg7 : i32 - %125 = tt.splat %124 : i32 -> tensor<32x32xi32, #AL> - %126 = tt.addptr %82, %125 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> - scf.yield %119, %123, %126 : tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL> - } - tt.return %85#0 : tensor<32x32xf32, #C> -} - -// AMD-LABEL: tt.func 
@cross_iter_dep -// TODO: enable pipelining with distance of 2 -// AMD-NOT: triton_gpu.local_load -// AMD: scf.for -// AMD: scf.yield -tt.func @cross_iter_dep(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, - %arg1: !tt.ptr {tt.divisibility = 16 : i32}, - %arg2: !tt.ptr {tt.divisibility = 16 : i32}, - %arg3: i32 {tt.divisibility = 16 : i32}, - %arg4: i32 {tt.divisibility = 16 : i32}, - %arg5: i32 {tt.divisibility = 16 : i32}, - %arg6: i32 {tt.divisibility = 16 : i32}, - %arg7: i32 {tt.divisibility = 16 : i32}, - %arg8: i32 {tt.divisibility = 16 : i32}) -> tensor<32x32xf32, #C> { - %c0_i32 = arith.constant 0 : index - %118 = arith.constant 32 : index - %c1_i32 = arith.constant 1 : index - %c2_i32 = arith.constant 2 : i32 - %c32_i32 = arith.constant 32 : i32 - %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #C> - %cst_1 = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #AL> - %78 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %110 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %112 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %113 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %116 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %65 = tt.splat %arg3 : i32 -> tensor<1x32xi32, #AL> - %88 = tt.splat %arg4 : i32 -> tensor<32x1xi32, #AL> - %80 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #AL> - %119:5 = scf.for %arg9 = %c0_i32 to %118 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %78, %arg12 = %110, %arg13 = %113, %arg14 = %116) -> (tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>) { - %161 = arith.index_cast %arg9 : index to i32 - %141 = arith.muli %161, %c32_i32 : i32 - %142 = arith.subi %arg5, %141 : i32 - %143 = tt.splat %142 : i32 -> tensor<1x32xi32, #AL> - %144 = arith.cmpi "slt", %65, %143 : tensor<1x32xi32, #AL> - %145 = tt.broadcast %144 : tensor<1x32xi1, #AL> -> tensor<32x32xi1, #AL> - %146 = 
tt.load %arg11, %145, %cst_1 : tensor<32x32x!tt.ptr, #AL> - %147 = tt.splat %142 : i32 -> tensor<32x1xi32, #AL> - %148 = arith.cmpi "slt", %88, %147 : tensor<32x1xi32, #AL> - %149 = tt.broadcast %148 : tensor<32x1xi1, #AL> -> tensor<32x32xi1, #AL> - %150 = tt.load %arg12, %149, %cst_1 : tensor<32x32x!tt.ptr, #AL> - %151 = triton_gpu.convert_layout %146 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> - %152 = triton_gpu.convert_layout %150 : tensor<32x32xf32, #AL> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 1}>> - %153 = tt.dot %151, %152, %arg10 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 1}>> -> tensor<32x32xf32, #C> - %162 = arith.index_cast %arg9 : index to i32 - %154 = arith.addi %162, %c2_i32 : i32 - %155 = arith.muli %154, %c32_i32 : i32 - %156 = tt.splat %155 : i32 -> tensor<32x32xi32, #AL> - %157 = tt.addptr %80, %156 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> - %158 = arith.muli %155, %arg7 : i32 - %159 = tt.splat %158 : i32 -> tensor<32x32xi32, #AL> - %160 = tt.addptr %112, %159 : tensor<32x32x!tt.ptr, #AL>, tensor<32x32xi32, #AL> - scf.yield %153, %arg13, %arg14, %157, %160 : tensor<32x32xf32, #C>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL>, tensor<32x32x!tt.ptr, #AL> - } - tt.return %119#0 : tensor<32x32xf32, #C> -} - -// AMD-LABEL: tt.func @dep_arg_two_uses -// AMD: tt.expand_dims -// AMD: tt.expand_dims -// AMD: tt.expand_dims %arg5 -// AMD-NEXT: tt.expand_dims %arg5 -// AMD: %[[PTR0:.*]] = tt.splat %arg6 -// AMD: %[[PTR1:.*]] = tt.addptr %[[PTR0]] -// AMD-NEXT: tt.load %[[PTR1]] -tt.func @dep_arg_two_uses(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, - %arg1: !tt.ptr {tt.divisibility = 16 : i32}, - %arg2: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { - %23 = arith.constant 100 : index - %c64 
= arith.constant 64 : i64 - %56 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %57 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %58 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #BL}>> - %83 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %85 = tt.splat %c64 : i64 -> tensor<1x32xi64, #AL> - %86 = tt.splat %c64 : i64 -> tensor<1x32xi64, #AL> - %68 = tt.splat %arg0 : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> - %c32_index = arith.constant 32 : index - %c32_i32 = arith.index_cast %c32_index : index to i32 - %80 = tt.splat %arg2 : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> - %cst_6 = arith.constant dense<0.000000e+00> : tensor<32x128xf32, #BL> - %88 = arith.truncf %cst_6 : tensor<32x128xf32, #BL> to tensor<32x128xf16, #BL> - %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #C> - %90 = tt.splat %c64 : i64 -> tensor<32x128xi64, #BL> - %92 = tt.addptr %arg1, %c32_i32 : !tt.ptr, i32 - %c0_index = arith.constant 0 : index - %91:5 = scf.for %arg19 = %c0_index to %23 step %c32_index iter_args(%arg20 = %68, %arg21 = %83, %arg22 = %92, %arg23 = %cst, %arg24 = %80) -> (tensor<128x32x!tt.ptr, #AL>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>>, !tt.ptr, tensor<128x128xf32, #C>, tensor<32x128x!tt.ptr, #BL>) { - %1750 = arith.subi %23, %arg19 : index - %175 = arith.index_cast %1750 : index to i32 - %176 = tt.splat %175 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %177 = tt.splat %175 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #BL}>> - %178 = arith.cmpi "slt", %57, %176 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %179 = arith.cmpi "slt", %58, %177 : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #BL}>> - %180 = tt.expand_dims %178 {axis = 0 : 
i32} : tensor<32xi1, #triton_gpu.slice<{dim = 0, parent = #AL}>> -> tensor<1x32xi1, #AL> - %181 = tt.expand_dims %179 {axis = 1 : i32} : tensor<32xi1, #triton_gpu.slice<{dim = 1, parent = #BL}>> -> tensor<32x1xi1, #BL> - %182 = tt.expand_dims %arg21 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> -> tensor<1x32xi32, #AL> - %183 = tt.expand_dims %arg21 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> -> tensor<1x32xi32, #AL> - %184 = arith.extsi %182 : tensor<1x32xi32, #AL> to tensor<1x32xi64, #AL> - %185 = arith.extsi %183 : tensor<1x32xi32, #AL> to tensor<1x32xi64, #AL> - %186 = arith.muli %184, %85 : tensor<1x32xi64, #AL> - %187 = arith.muli %185, %86 : tensor<1x32xi64, #AL> - %188 = tt.broadcast %186 : tensor<1x32xi64, #AL> -> tensor<128x32xi64, #AL> - %189 = tt.broadcast %187 : tensor<1x32xi64, #AL> -> tensor<128x32xi64, #AL> - %190 = tt.addptr %arg20, %188 : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi64, #AL> - %191 = tt.addptr %arg20, %189 : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi64, #AL> - %192 = tt.broadcast %180 : tensor<1x32xi1, #AL> -> tensor<128x32xi1, #AL> - %193 = tt.load %191, %192 : tensor<128x32x!tt.ptr, #AL> - %194 = tt.splat %arg22 : !tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %195 = tt.addptr %194, %56 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #AL}>>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %196 = tt.load %195 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #AL}>> - %197 = tt.addptr %arg22, %c32_i32 : !tt.ptr, i32 - %198 = tt.broadcast %181 : tensor<32x1xi1, #BL> -> tensor<32x128xi1, #BL> - %199 = tt.load %arg24, %198, %88 : tensor<32x128x!tt.ptr, #BL> - %200 = triton_gpu.convert_layout %193 : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 2}>> - %201 = triton_gpu.convert_layout %199 : tensor<32x128xf16, #BL> -> tensor<32x128xf16, 
#triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 2}>> - %202 = tt.dot %200, %201, %arg23 : tensor<128x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth = 2}>> * tensor<32x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth = 2}>> -> tensor<128x128xf32, #C> - %203 = tt.addptr %arg24, %90 : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi64, #BL> - scf.yield %190, %196, %197, %202, %203 : tensor<128x32x!tt.ptr, #AL>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #AL}>>, !tt.ptr, tensor<128x128xf32, #C>, tensor<32x128x!tt.ptr, #BL> - } - tt.return %91#3 : tensor<128x128xf32, #C> -} -} // end module - -// ----- - -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { -// AMD-LABEL: tt.func @load_two_users - tt.func @load_two_users(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { - %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> - %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> - %c0_i64 = arith.constant 0 : i64 - %c0_i32 = arith.constant 0 : i32 - %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> - %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %c8_i32 = 
arith.constant 8 : i32 - %0 = tt.addptr %arg0, %c0_i64 : !tt.ptr, i64 - %1 = tt.addptr %arg1, %c0_i64 : !tt.ptr, i64 - %2 = tt.splat %1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> - %3 = tt.addptr %2, %cst_0 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> - %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %6 = tt.broadcast %3 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> - %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> - %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %9 = tt.load %8 : tensor<128x64x!tt.ptr, #blocked1> - %10 = tt.splat %0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> - %11 = tt.addptr %10, %cst : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> - %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> - %14 = tt.broadcast %11 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> - %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> - %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - // AMD: triton_gpu.local_store - // AMD: scf.for - // AMD: tt.dot - // AMD: tt.dot - // AMD: tt.load - // AMD: triton_gpu.local_store - // AMD: scf.yield - - %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 { - %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> 
tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %20 = triton_gpu.convert_layout %18 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> - %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> - %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %25 = tt.trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> - %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> - scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> - } - tt.return %17#0, %17#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> - } -} - -// ----- - -#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, 
order = [0, 1], hasLeadingOffset = false}> -#shared1 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { -// AMD-LABEL: tt.func @load_two_users_incompatible_layouts - tt.func @load_two_users_incompatible_layouts(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { - %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> - %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> - %c0_i64 = arith.constant 0 : i64 - %c0_i32 = arith.constant 0 : i32 - %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> - %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %c8_i32 = arith.constant 8 : i32 - %0 = tt.addptr %arg0, %c0_i64 : !tt.ptr, i64 - %1 = tt.addptr %arg1, %c0_i64 : !tt.ptr, i64 - %2 = tt.splat %1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> - %3 = tt.addptr %2, %cst_0 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> - %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %6 = tt.broadcast %3 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> - %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> - %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %9 = tt.load %8 : tensor<128x64x!tt.ptr, #blocked1> - %10 = tt.splat %0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> - %11 = tt.addptr %10, %cst : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> - %12 = 
tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> - %14 = tt.broadcast %11 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> - %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> - %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - // AMD-NOT: triton_gpu.local_store - // AMD: scf.for - %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 { - %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %20 = triton_gpu.convert_layout %18 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> - %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> - %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> - %25 = tt.trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> - %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, 
#triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> - scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> - } - tt.return %17#0, %17#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> - } -} - -// ----- - -// AMD-LABEL: tt.func public @nested_loops -// AMD: scf.for -// AMD: triton_gpu.local_alloc -// AMD-NOT: triton_gpu.local_alloc -// AMD: scf.for -// AMD: scf.yield -// AMD-DIS: scf.yield -// -// The following code has the structure: -// -// ``` -// for { -// %a = load() -// for { -// %b = load() -// dot(%a, %b) -// } -// } -// ``` -// -// Only the outer for should be pipelined. The regression this tests -// causes an assertion to fail while pipelining the outer `for`, in -// particular while predicating the operations scheduled to be emitted -// in the prologue. -// -// We check that there is no allocation before the first occurrence of -// scf.for because that would mean that the first load `%a = load()` -// would be pipelined.
-#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @nested_loops(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { - %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> - %cst_0 = arith.constant dense<320> : tensor<32x1xi32, #blocked> - %c0_i32 = arith.constant 0 : i32 - %c1_i32 = arith.constant 1 : i32 - %c32_i32 = arith.constant 32 : i32 - %c10_i32 = arith.constant 10 : i32 - %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %1 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %2 = tt.expand_dims %1 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> - %3 = arith.muli %2, %cst_0 : tensor<32x1xi32, #blocked> - %4 = tt.splat %arg1 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> - %5 = tt.addptr %4, %3 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> - %6 = tt.broadcast %5 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> - %7 = tt.splat %arg0 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> - %8 = tt.splat %arg3 : !tt.ptr -> tensor<32x1x!tt.ptr, #blocked> - scf.for %arg4 = %c0_i32 to %c10_i32 step %c1_i32 : i32 { - %9 = arith.muli %arg4, %c32_i32 : i32 - %10 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %11 = tt.splat %9 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 1, 
parent = #blocked}>> - %12 = arith.addi %10, %0 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %13 = arith.addi %11, %1 : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %14 = tt.expand_dims %12 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> - %15 = tt.broadcast %14 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> - %16 = tt.addptr %6, %15 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %17 = tt.load %16 : tensor<32x32x!tt.ptr, #blocked> - %18 = tt.expand_dims %13 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> - %19 = arith.muli %18, %cst_0 : tensor<32x1xi32, #blocked> - %20 = tt.addptr %7, %19 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> - %21 = tt.broadcast %20 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> - %22 = tt.addptr %8, %19 : tensor<32x1x!tt.ptr, #blocked>, tensor<32x1xi32, #blocked> - %23 = tt.broadcast %22 : tensor<32x1x!tt.ptr, #blocked> -> tensor<32x32x!tt.ptr, #blocked> - scf.for %arg5 = %c0_i32 to %c10_i32 step %c1_i32 : i32 { - %24 = arith.muli %arg5, %c32_i32 : i32 - %25 = tt.splat %24 : i32 -> tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %26 = arith.addi %25, %0 : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %27 = tt.expand_dims %26 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> - %28 = tt.broadcast %27 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> - %29 = tt.addptr %21, %28 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %30 = tt.load %29 : tensor<32x32x!tt.ptr, #blocked> - %31 = triton_gpu.convert_layout %30 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %32 = triton_gpu.convert_layout %17 : tensor<32x32xf32, 
#blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %33 = tt.dot %31, %32, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> - %34 = tt.addptr %23, %28 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %35 = triton_gpu.convert_layout %33 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> - tt.store %34, %35 : tensor<32x32x!tt.ptr, #blocked> - } - } - tt.return - } -} // end module - -// ----- - -// AMD-LABEL: tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de -// AMD-NOT: triton_gpu.convert_layout {{.*}} : tensor<32x64xf32, #shared> -> tensor<32x64xf32, #shared1> - -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [2, 2], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} { - %cst = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma> - %c64_i32 = 
arith.constant 64 : i32 - %c0_i32 = arith.constant 0 : i32 - %c32_i32 = arith.constant 32 : i32 - %0 = tt.get_program_id x : i32 - %1 = arith.muli %0, %c64_i32 : i32 - %2 = tt.get_program_id y : i32 - %3 = tt.load %arg3 : !tt.ptr - %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %5 = tt.splat %1 : i32 -> tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %6 = arith.addi %5, %4 : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> - %8 = tt.splat %3 : i64 -> tensor<64x1xi64, #blocked> - %9 = arith.extsi %7 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> - %10 = arith.addi %8, %9 : tensor<64x1xi64, #blocked> - %11 = arith.extsi %arg5 : i32 to i64 - %12 = tt.splat %11 : i64 -> tensor<64x1xi64, #blocked> - %13 = arith.muli %10, %12 : tensor<64x1xi64, #blocked> - %14 = arith.muli %2, %arg5 : i32 - %15 = arith.extsi %14 : i32 to i64 - %16 = tt.splat %15 : i64 -> tensor<64x1xi64, #blocked> - %17 = arith.addi %13, %16 : tensor<64x1xi64, #blocked> - %18 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %19 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %20 = tt.expand_dims %18 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> - %21 = tt.expand_dims %19 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %22 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked> - %23 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked1> - %24 = arith.muli %20, %22 : tensor<1x64xi32, #blocked> - %25 = arith.muli %21, %23 : tensor<1x64xi32, #blocked1> - %26 = tt.broadcast %17 : 
tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> - %27 = arith.extsi %24 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> - %28 = arith.extsi %25 : tensor<1x64xi32, #blocked1> to tensor<1x64xi64, #blocked1> - %29 = tt.broadcast %27 : tensor<1x64xi64, #blocked> -> tensor<64x64xi64, #blocked> - %30 = arith.addi %26, %29 : tensor<64x64xi64, #blocked> - %31 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %32 = tt.expand_dims %31 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> - %33 = tt.splat %3 : i64 -> tensor<32x1xi64, #blocked1> - %34 = arith.extsi %32 : tensor<32x1xi32, #blocked1> to tensor<32x1xi64, #blocked1> - %35 = arith.addi %33, %34 : tensor<32x1xi64, #blocked1> - %36 = tt.splat %11 : i64 -> tensor<32x1xi64, #blocked1> - %37 = arith.muli %35, %36 : tensor<32x1xi64, #blocked1> - %38 = tt.splat %15 : i64 -> tensor<32x1xi64, #blocked1> - %39 = arith.addi %37, %38 : tensor<32x1xi64, #blocked1> - %40 = tt.broadcast %39 : tensor<32x1xi64, #blocked1> -> tensor<32x64xi64, #blocked1> - %41 = tt.broadcast %28 : tensor<1x64xi64, #blocked1> -> tensor<32x64xi64, #blocked1> - %42 = arith.addi %40, %41 : tensor<32x64xi64, #blocked1> - %43 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %44 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %45 = tt.expand_dims %43 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> - %46 = tt.expand_dims %44 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> - %47 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked1> - %48 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked> - %49 = arith.muli %45, %47 : tensor<1x32xi32, 
#blocked1> - %50 = arith.muli %46, %48 : tensor<1x32xi32, #blocked> - %51 = tt.broadcast %39 : tensor<32x1xi64, #blocked1> -> tensor<32x32xi64, #blocked1> - %52 = arith.extsi %49 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> - %53 = arith.extsi %50 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> - %54 = tt.broadcast %52 : tensor<1x32xi64, #blocked1> -> tensor<32x32xi64, #blocked1> - %55 = arith.addi %51, %54 : tensor<32x32xi64, #blocked1> - %56 = tt.splat %arg0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> - %57 = tt.addptr %56, %30 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi64, #blocked> - %58 = tt.splat %arg1 : !tt.ptr -> tensor<32x64x!tt.ptr, #blocked1> - %59 = tt.addptr %58, %42 : tensor<32x64x!tt.ptr, #blocked1>, tensor<32x64xi64, #blocked1> - %60 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> - %61 = tt.addptr %60, %55 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi64, #blocked1> - %62 = tt.load %57 : tensor<64x64x!tt.ptr, #blocked> - %63 = scf.for %arg6 = %c0_i32 to %c64_i32 step %c32_i32 iter_args(%arg7 = %cst) -> (tensor<64x32xf32, #mma>) : i32 { - %70 = tt.load %59 : tensor<32x64x!tt.ptr, #blocked1> - %71 = triton_gpu.convert_layout %62 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> - %73 = tt.trans %72 {order=array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable> - %74 = triton_gpu.local_load %73 : !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %75 = tt.dot %71, %74, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = 
#mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> - %76 = tt.load %61 : tensor<32x32x!tt.ptr, #blocked1> - %77 = triton_gpu.convert_layout %75 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %78 = triton_gpu.convert_layout %76 : tensor<32x32xf32, #blocked1> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %79 = tt.dot %77, %78, %arg7 : tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> - scf.yield %79 : tensor<64x32xf32, #mma> - } - %64 = tt.broadcast %17 : tensor<64x1xi64, #blocked> -> tensor<64x32xi64, #blocked> - %65 = tt.broadcast %53 : tensor<1x32xi64, #blocked> -> tensor<64x32xi64, #blocked> - %66 = arith.addi %64, %65 : tensor<64x32xi64, #blocked> - %67 = tt.splat %arg4 : !tt.ptr -> tensor<64x32x!tt.ptr, #blocked> - %68 = tt.addptr %67, %66 : tensor<64x32x!tt.ptr, #blocked>, tensor<64x32xi64, #blocked> - %69 = triton_gpu.convert_layout %63 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #blocked> - tt.store %68, %69 : tensor<64x32x!tt.ptr, #blocked> - tt.return - } -} // end module - -// ----- -// AMD-DIS: #[[$SHARED_LAYOUT:shared.*]] = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}> -// AMD-LABEL: tt.func @indirect_load_shared_layout -// AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) - -// AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] -// AMD: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] -// AMD: %[[ADDI_24:.*]] = 
arith.addi %[[ARG10]], %{{.*}} -// AMD: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} -// AMD: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} -// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] -// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] -// AMD: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] -// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// AMD: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// AMD: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] -// AMD: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] -// AMD: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} -// AMD: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] -// AMD: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] -// AMD: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] -// AMD: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] -// AMD: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] -// AMD: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] -// AMD: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] -// AMD: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} -// AMD: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} -// AMD: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_46]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] -// AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_46]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] -// AMD: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] -// AMD: } - -#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#BL = 
#triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> -#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> -#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> -#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> -#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> -module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { -tt.func @indirect_load_shared_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, - %76: index, - %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, - %75: tensor<16x!tt.ptr, #BLs1>, - %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, - %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ - %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> - %c4_i32 = arith.constant 4 : i32 - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i32 = arith.constant 1 : i32 - %c1_i32_splat = tt.splat %c1_i32 : i32 -> tensor<16xi32, #BLs1> - %79:3 = scf.for %arg18 = %c0 to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1>) { - %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> - %83 = tt.load %arg21 : tensor<16x!tt.ptr, #BLs1> - %84 = tt.expand_dims %83 {axis=1: i32}: tensor<16xi64, #BLs1> -> tensor<16x1xi64, #BL> - %850 = tt.broadcast %84 : tensor<16x1xi64, #BL> -> tensor<16x16xi64, #BL> - %85 = arith.muli %77, %850 : tensor<16x16xi64, #BL> - %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> - %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> - %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> - %89 = 
triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> - %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> - %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> - %92 = tt.addptr %arg21, %c1_i32_splat : tensor<16x!tt.ptr, #BLs1>, tensor<16xi32, #BLs1> - scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1> - } {tt.num_stages = 3 : i32} - tt.return %79#0 : tensor<16x16xf32, #C> -} -} - - -// ----- - -// AMD-LABEL: @kernel_yield_constant -// AMD: tt.load -// AMD: triton_gpu.memdesc_subview -// AMD: triton_gpu.local_store -// AMD: scf.for -// AMD: tt.load -// AMD: triton_gpu.memdesc_subview -// AMD: triton_gpu.local_store -// AMD: tt.return -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "cuda:86", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @kernel_yield_constant(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { - %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> - %cst1 = arith.constant dense<1.000000e+00> : tensor<32x32xf32, #mma> - %c0_i32 = arith.constant 0 : i32 - %c1_i32 = arith.constant 1 : i32 - %cst_0 = arith.constant dense<0.000000e+00> : 
tensor<32x32xf32, #blocked> - %c32_i32 = arith.constant 32 : i32 - %c31_i32 = arith.constant 31 : i32 - %cst_1 = arith.constant dense<2.000000e+00> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %0 = tt.get_program_id x : i32 - %7 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %12 = arith.addi %arg4, %c31_i32 : i32 - %13 = arith.divsi %12, %c32_i32 : i32 - %14 = tt.expand_dims %7 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> - %22 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> - %34 = tt.splat %arg1 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> - %42 = scf.for %arg7 = %c0_i32 to %13 step %c1_i32 iter_args(%arg8 = %cst) -> (tensor<32x32xf32, #mma>) : i32 { - %43 = arith.muli %arg7, %c32_i32 : i32 - %44 = arith.muli %43, %arg5 : i32 - %45 = tt.splat %44 : i32 -> tensor<32x32xi32, #blocked> - %46 = tt.addptr %22, %45 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %47 = arith.subi %arg4, %43 : i32 - %48 = tt.splat %47 : i32 -> tensor<32x1xi32, #blocked> - %49 = arith.cmpi slt, %14, %48 : tensor<32x1xi32, #blocked> - %50 = tt.broadcast %49 : tensor<32x1xi1, #blocked> -> tensor<32x32xi1, #blocked> - %51 = tt.load %46, %50, %cst_0 : tensor<32x32x!tt.ptr, #blocked> - %52 = triton_gpu.convert_layout %51 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %53 = tt.dot %cst_1, %52, %arg8 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> - %54 = triton_gpu.convert_layout %53 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> - tt.store %34, %54 : tensor<32x32x!tt.ptr, #blocked> - scf.yield %cst1 : tensor<32x32xf32, #mma> - } - tt.return - } -} - - -// ----- - -// AMD-LABEL: 
tt.func public @add_kernel -// AMD: %[[LOAD_11:.*]] = tt.load %{{.*}}, %{{.*}} -// AMD: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} -// AMD: %[[LOAD_13:.*]] = tt.load %[[ADDPTR_12]], %{{.*}} -// AMD: %[[ADDI_14:.*]] = arith.addi %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_15:.*]] = tt.splat %[[ADDI_14]] -// AMD: %[[ADDI_16:.*]] = arith.addi %[[SPLAT_15]], %{{.*}} -// AMD: %[[CMPI_17:.*]] = arith.cmpi slt, %[[ADDI_16]], %{{.*}} -// AMD: %[[ADDPTR_18:.*]] = tt.addptr %{{.*}}, %[[ADDI_16]] -// AMD: %[[LOAD_19:.*]] = tt.load %[[ADDPTR_18]], %[[CMPI_17]] -// AMD: %[[ADDPTR_20:.*]] = tt.addptr %{{.*}}, %[[ADDI_16]] -// AMD: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_20]], %[[CMPI_17]] -// AMD: scf.for -#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}> -module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { - %c1024_i32 = arith.constant 1024 : i32 - %c0_i32 = arith.constant 0 : i32 - %c1016800_i32 = arith.constant 1016800 : i32 - %0 = tt.get_program_id x : i32 - %1 = arith.muli %0, %c1016800_i32 : i32 - %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> - %3 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> - %4 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - %5 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - %6 = tt.splat %arg2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - scf.for %arg4 = %c0_i32 to %c1016800_i32 step %c1024_i32 : i32 { - %7 = arith.addi %1, %arg4 : i32 - %8 = tt.splat %7 : i32 -> tensor<1024xi32, #blocked> - %9 = arith.addi %8, %2 : tensor<1024xi32, #blocked> - %10 = 
arith.cmpi slt, %9, %3 : tensor<1024xi32, #blocked> - %11 = tt.addptr %4, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %12 = tt.load %11, %10 : tensor<1024x!tt.ptr, #blocked> - %13 = tt.addptr %5, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %14 = tt.load %13, %10 : tensor<1024x!tt.ptr, #blocked> - %15 = arith.addf %12, %14 : tensor<1024xf32, #blocked> - %16 = tt.addptr %6, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - tt.store %16, %15, %10 : tensor<1024x!tt.ptr, #blocked> - } {tt.num_stages = 3 : i32} - tt.return - } -} - - -// ----- - -// AMD-LABEL: tt.func public @nested_loops -// AMD-NOT: triton_gpu.local_alloc -// AMD: scf.for -// AMD: triton_gpu.local_alloc -// AMD: scf.for -// AMD: triton_gpu.local_load -// AMD: tt.dot -// AMD: triton_gpu.local_store -// AMD: scf.yield -// AMD: triton_gpu.local_dealloc - -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [2, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 2], instrShape = [16, 8]}> -#shared = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> -#shared1 = #triton_gpu.shared<{vec = 4, perPhase = 2, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> -module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @nested_loops(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { - %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %c2_i32 = arith.constant 2 : i32 - %c0_i32 = arith.constant 0 : i32 - %cst_0 = arith.constant dense<16> : tensor<16x1xi32, #blocked> - %0 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %1 = 
tt.expand_dims %0 {axis = 1 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16x1xi32, #blocked> - %2 = arith.muli %1, %cst_0 : tensor<16x1xi32, #blocked> - %3 = tt.splat %arg0 : !tt.ptr -> tensor<16x1x!tt.ptr, #blocked> - %4 = tt.addptr %3, %2 : tensor<16x1x!tt.ptr, #blocked>, tensor<16x1xi32, #blocked> - %5 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %6 = tt.expand_dims %5 {axis = 0 : i32} : tensor<16xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x16xi32, #blocked> - %7 = tt.broadcast %4 : tensor<16x1x!tt.ptr, #blocked> -> tensor<16x16x!tt.ptr, #blocked> - %8 = tt.broadcast %6 : tensor<1x16xi32, #blocked> -> tensor<16x16xi32, #blocked> - %9 = tt.addptr %7, %8 : tensor<16x16x!tt.ptr, #blocked>, tensor<16x16xi32, #blocked> - scf.for %arg1 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { - %10 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> - %11 = triton_gpu.local_alloc %10 : (tensor<16x16xf32, #blocked>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> - %12 = tt.trans %11 {order = array} : !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory, mutable> - %13 = triton_gpu.local_load %12 : !tt.memdesc<16x16xf32, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - scf.for %arg2 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { - %14 = tt.load %9 : tensor<16x16x!tt.ptr, #blocked> - %15 = triton_gpu.convert_layout %14 : tensor<16x16xf32, #blocked> -> tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %16 = tt.dot %15, %13, %cst : tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<16x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<16x16xf32, #mma> - %17 = triton_gpu.convert_layout %16 : 
tensor<16x16xf32, #mma> -> tensor<16x16xf32, #blocked> - tt.store %9, %17 : tensor<16x16x!tt.ptr, #blocked> - } - } - tt.return - } -} - -// ----- - -// This test triggered some failure in the verifier, so we only -// included a simple AMD for the kernel name. -// AMD-LABEL: @load_convert_layout -#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> -#ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> -#BLs0 = #triton_gpu.slice<{parent=#BL, dim=0}> -#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> -#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> -#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> -#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> - -module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { -tt.func @load_convert_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, - %76: index, - %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, - %75: tensor<16x!tt.ptr, #BLs1>, - %78: tensor<16x16xi32, #AL> {tt.constancy=16: i32, tt.divisibility=16: i32}, - %60: tensor<16x16x!tt.ptr, #BL> {tt.divisibility=16: i32, tt.contiguity=16 : i32}) -> tensor<16x16xf32, #C>{ - %1 = tt.make_range {end = 16 : i32, start = 0 : i32} : tensor<16xi32, #BLs1> - %cst = arith.constant dense<0.000000e+00> : tensor<16x16xf32, #C> - %cst_0 = arith.constant dense<2> : tensor<16xi32, #BLs1> - %c4_i32 = arith.constant 4 : i32 - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c0_i64 = arith.constant 0 : i64 - %c1_i32 = arith.constant 1 : i32 - %c1_i32_splat = tt.splat %c1_i32 : i32 -> tensor<16xi32, #BLs1> - %15 = arith.cmpi slt, %1, %cst_0 : tensor<16xi32, #BLs1> - %79:3 = scf.for %arg18 = %c0 
to %76 step %c1 iter_args(%arg19 = %cst, %arg20 = %49, %arg21 = %75) -> (tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1>) { - %82 = tt.load %arg20 : tensor<16x16x!tt.ptr, #AL> - %83 = tt.load %arg21, %15 : tensor<16x!tt.ptr, #BLs1> - %84 = tt.expand_dims %83 {axis=1: i32}: tensor<16xi64, #BLs1> -> tensor<16x1xi64, #BL> - %850 = tt.broadcast %84 : tensor<16x1xi64, #BL> -> tensor<16x16xi64, #BL> - %85 = arith.muli %77, %850 : tensor<16x16xi64, #BL> - %86 = tt.addptr %60, %85 : tensor<16x16x!tt.ptr, #BL>, tensor<16x16xi64, #BL> - %87 = tt.load %86 : tensor<16x16x!tt.ptr, #BL> - %88 = triton_gpu.convert_layout %82 : tensor<16x16xf16, #AL> -> tensor<16x16xf16, #A> - %89 = triton_gpu.convert_layout %87 : tensor<16x16xf16, #BL> -> tensor<16x16xf16, #B> - %90 = tt.dot %88, %89, %arg19 : tensor<16x16xf16, #A> * tensor<16x16xf16, #B> -> tensor<16x16xf32, #C> - %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> - %92 = tt.addptr %arg21, %c1_i32_splat : tensor<16x!tt.ptr, #BLs1>, tensor<16xi32, #BLs1> - scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1> - } {tt.num_stages = 3 : i32} - tt.return %79#0 : tensor<16x16xf32, #C> -} -} - - -// ----- - -// This test captured some ICE in MatmulLoopPipeline pass, so we only -// included a simple AMD for the kernel name. 
-// AMD-LABEL: @matmul_indirect_pipeline -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 2], order = [0, 1]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 1], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @matmul_indirect_pipeline(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { - %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma> - %c1_i32 = arith.constant 1 : i32 - %c2_i32 = arith.constant 2 : i32 - %c0_i32 = arith.constant 0 : i32 - %0 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %1 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %2 = tt.expand_dims %1 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked> - %3 = tt.expand_dims %0 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> - %4 = tt.broadcast %2 : tensor<32x1xi32, #blocked> -> tensor<32x32xi32, #blocked> - %5 = tt.broadcast %3 : tensor<1x32xi32, #blocked> -> tensor<32x32xi32, #blocked> - %6 = arith.addi %4, %5 : tensor<32x32xi32, #blocked> - %7 = tt.splat %arg0 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> - %8 = tt.addptr %7, %6 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %9 = tt.load %8 : tensor<32x32x!tt.ptr, #blocked> - %10 = tt.splat %arg3 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked> - %11 = tt.addptr %10, %6 : tensor<32x32x!tt.ptr, #blocked>, tensor<32x32xi32, #blocked> - %12 = tt.splat %arg1 : 
!tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %13 = tt.addptr %12, %0 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %14 = tt.splat %arg2 : !tt.ptr -> tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - scf.for %arg4 = %c0_i32 to %c2_i32 step %c1_i32 : i32 { - %15 = tt.load %13 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %16 = tt.addptr %14, %15 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>>, tensor<32xi64, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %17 = tt.load %16 : tensor<32x!tt.ptr, #triton_gpu.slice<{dim = 0, parent = #blocked}>> - %18 = tt.expand_dims %17 {axis = 0 : i32} : tensor<32xf32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xf32, #blocked> - %19 = tt.broadcast %18 : tensor<1x32xf32, #blocked> -> tensor<32x32xf32, #blocked> - %20 = arith.addf %9, %19 : tensor<32x32xf32, #blocked> - %21 = triton_gpu.convert_layout %9 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> - %22 = triton_gpu.convert_layout %20 : tensor<32x32xf32, #blocked> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> - %23 = tt.dot %21, %22, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> - %24 = triton_gpu.convert_layout %23 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> - tt.store %11, %24 : tensor<32x32x!tt.ptr, #blocked> - } {tt.num_stages = 3 : i32} - tt.return - } -} - -// ----- - -// AMD-LABEL: @dont_pipeline_128x1 -// AMD-NOT: local_load{{.*}}128x1 -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, 
versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> -module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 32 : i32} { - tt.func public @dont_pipeline_128x1(%arg6: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { - %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> - %c128_i32 = arith.constant 128 : i32 - %c0_i32 = arith.constant 0 : i32 - %c64_i32 = arith.constant 64 : i32 - %cst_4 = arith.constant dense<-1.000000e+30> : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - - %99:1 = scf.for %arg25 = %c0_i32 to %c128_i32 step %c64_i32 iter_args(%arg31 = %cst_4) -> (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) : i32 { - %94 = tt.splat %arg6 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked> - %151 = tt.load %94 : tensor<128x1x!tt.ptr, #blocked> - %161 = triton_gpu.convert_layout %151 : tensor<128x1xi32, #blocked> -> tensor<128x1xi32, #mma> - %162 = tt.broadcast %161 : tensor<128x1xi32, #mma> -> tensor<128x64xi32, #mma> - %170 = arith.sitofp %162 : tensor<128x64xi32, #mma> to tensor<128x64xf32, #mma> - - %173 = "tt.reduce"(%170) <{axis = 1 : i32}> ({ - ^bb0(%arg33: f32, %arg34: f32): - %207 = arith.maxnumf %arg33, %arg34 : f32 - tt.reduce.return %207 : f32 - }) : (tensor<128x64xf32, #mma>) -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %175 = arith.maxnumf %arg31, %173 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - - %201 = arith.truncf %170 : tensor<128x64xf32, #mma> to tensor<128x64xf16, #mma> - %202 = triton_gpu.convert_layout %201 : tensor<128x64xf16, #mma> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> - - %192 = arith.constant dense<0.> : tensor<128x64xf32, #mma> - %203 = arith.constant dense<0.> : tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> - %204 = tt.dot %202, %203, %192 : 
tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> - - scf.yield %175 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - } - tt.return - } -} - -// ----- - -// AMD that the dependencies across ops of different nesting does not cause crash or -// incorrect schedule that fails to pipeline. -// AMD-LABEL: @matmul_nested_ops -// AMD: triton_gpu.local_load - -#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}> -#ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> -#BLs0 = #triton_gpu.slice<{parent=#BL, dim=0}> -#BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> -#C = #triton_gpu.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1]}> -#A = #triton_gpu.dot_op<{opIdx = 0, parent = #C, kWidth=2}> -#B = #triton_gpu.dot_op<{opIdx = 1, parent = #C, kWidth=2}> - -module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.target" = "hip:gfx942"} { -tt.func @matmul_nested_ops(%lb : index, %ub : index, %step : index, - %A : !tt.ptr {tt.divisibility = 16 : i32}, - %B : !tt.ptr {tt.divisibility = 16 : i32}, - %ext : index) -> tensor<128x128xf32, #C> { - // A ptrs - %a_ptr_splat = tt.splat %A : !tt.ptr -> tensor<128x32x!tt.ptr, #AL> - %a_tmp0 = tt.make_range {end = 32: i32, start = 0: i32} : tensor<32xi32, #ALs0> - %a_tmp1 = tt.expand_dims %a_tmp0 {axis = 0 : i32} : tensor<32xi32, #ALs0> -> tensor<1x32xi32, #AL> - %a_offs = tt.broadcast %a_tmp1 : tensor<1x32xi32, #AL> -> tensor<128x32xi32, #AL> - %a_ptr_init = tt.addptr %a_ptr_splat, %a_offs : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - // B ptrs - %b_ptr_splat = tt.splat %B : !tt.ptr -> tensor<32x128x!tt.ptr, #BL> - %b_tmp0 = 
tt.make_range {end = 128: i32, start = 0: i32} : tensor<128xi32, #BLs0> - %b_tmp1 = tt.expand_dims %b_tmp0 {axis = 0 : i32} : tensor<128xi32, #BLs0> -> tensor<1x128xi32, #BL> - %b_offs = tt.broadcast %b_tmp1 : tensor<1x128xi32, #BL> -> tensor<32x128xi32, #BL> - %b_ptr = tt.addptr %b_ptr_splat, %b_offs : tensor<32x128x!tt.ptr, #BL>, tensor<32x128xi32, #BL> - - %a_mask = arith.constant dense : tensor<128x32xi1, #AL> - %a_other = arith.constant dense<0.00e+00> : tensor<128x32xf16, #AL> - %b_mask = arith.constant dense : tensor<32x128xi1, #BL> - %b_other = arith.constant dense<0.00e+00> : tensor<32x128xf16, #BL> - %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf32, #C> - - %a_off = arith.constant dense<4> : tensor<128x32xi32, #AL> - - %b_ = tt.load %b_ptr, %b_mask, %b_other : tensor<32x128x!tt.ptr, #BL> - %b = triton_gpu.convert_layout %b_ : tensor<32x128xf16, #BL> -> tensor<32x128xf16, #B> - - %loop:2 = scf.for %iv = %lb to %ub step %step iter_args(%a_ptr = %a_ptr_init, %prev_c = %c_init) -> (tensor<128x32x!tt.ptr, #AL>, tensor<128x128xf32, #C>) { - %cnd = arith.cmpi slt, %iv, %ext : index - %inc_a_ptr = scf.if %cnd -> (tensor<128x32x!tt.ptr, #AL>) { - %a_ptr_ = tt.addptr %a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - scf.yield %a_ptr_ : tensor<128x32x!tt.ptr, #AL> - } else { - scf.yield %a_ptr : tensor<128x32x!tt.ptr, #AL> - } - %a_ = tt.load %inc_a_ptr : tensor<128x32x!tt.ptr, #AL> - %a = triton_gpu.convert_layout %a_ : tensor<128x32xf16, #AL> -> tensor<128x32xf16, #A> - - %c = tt.dot %a, %b, %prev_c : tensor<128x32xf16, #A> * tensor<32x128xf16, #B> -> tensor<128x128xf32, #C> - - %next_a_ptr = tt.addptr %inc_a_ptr, %a_off : tensor<128x32x!tt.ptr, #AL>, tensor<128x32xi32, #AL> - scf.yield %next_a_ptr, %c : tensor<128x32x!tt.ptr, #AL>, tensor<128x128xf32, #C> - } - tt.return %loop#1: tensor<128x128xf32, #C> -} -} - -// ----- - -// Pipeline the if ops at the beginning and the end of the loop -#blocked = 
#triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 64, 16]}> -#mma1 = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16, 16]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> -module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - // AMD-LABEL: dot_prologue_epilogue - // AMD-SAME: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} - tt.func @dot_prologue_epilogue(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { - %cst = arith.constant dense<0> : tensor<64x16xi32, #blocked> - %cst2 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> - %c0_i32 = arith.constant 0 : i32 - %cst_0 = arith.constant dense<0> : tensor<1x16xi32, #blocked> - %cst_1 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> - %c0_i64 = arith.constant 0 : i64 - %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1> - %c1_i32 = arith.constant 1 : i32 - %c8_i32 = arith.constant 8 : i32 - %2 = tt.splat %arg1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> - %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> - %6 = 
tt.broadcast %2 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> - %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> - %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %10 = tt.splat %arg0 : !tt.ptr -> tensor<1x16x!tt.ptr, #blocked> - %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> - %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> - %14 = tt.broadcast %10 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> - %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> - %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - // AMD: %[[C0:.*]] = arith.constant 0 : i32 - // AMD: scf.for %[[IND_VAR:.*]] = %[[C0]] to - // AMD-NOT: load - // AMD: %[[CND:.*]] = arith.cmpi slt, %[[IND_VAR]], %[[EXT]] - // AMD: scf.if %[[CND]] - // AMD: dot - // AMD: scf.if %[[CND]] - // AMD: arith.mulf - // AMD: scf.yield - // AMD-NOT: tt.addptr - // AMD: scf.yield - %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %16, %arg6 = %8) -> (tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>) : i32 { - %9 = tt.load %arg6 : tensor<128x64x!tt.ptr, #blocked1> - %cnd = arith.cmpi slt, %arg3, %ext : i32 - %inc_ptr = scf.if %cnd -> tensor<64x16x!tt.ptr, #blocked> { - %ptr = tt.addptr %arg5, %inc : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - scf.yield %ptr : tensor<64x16x!tt.ptr, #blocked> - } else { - scf.yield %arg5 : tensor<64x16x!tt.ptr, #blocked> - } - %18 = tt.load %inc_ptr : tensor<64x16x!tt.ptr, #blocked> - %19 = triton_gpu.local_alloc %9 : (tensor<128x64xf16, #blocked1>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %20 = triton_gpu.local_alloc 
%18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> - %acc = triton_nvidia_gpu.warp_group_dot %19, %20, %arg4 : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x16xf16, #shared1, #triton_gpu.shared_memory> -> tensor<128x16xf32, #mma1> - %acc_ = scf.if %cnd -> (tensor<128x16xf32, #mma1>) { - %acc_zero = arith.mulf %acc, %cst_2 : tensor<128x16xf32, #mma1> - scf.yield %acc_zero : tensor<128x16xf32, #mma1> - } else { - scf.yield %acc : tensor<128x16xf32, #mma1> - } - %22 = tt.addptr %arg5, %cst : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - %23 = tt.addptr %arg6, %cst2 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - scf.yield %acc_, %22, %23 : tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1> - } - tt.return %17#0 : tensor<128x16xf32, #mma1> - } -} - -// ----- - -// AMD-LABEL: @masked_add_kernel -// AMD: %[[CONSTANT:.*]] = arith.constant dense<0xFF800000> -// AMD: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -// AMD: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -// AMD: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -// AMD: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -// AMD: scf.for -// AMD: arith.select -// AMD: arith.select -// AMD: arith.addf -// AMD: %[[A:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] -// AMD: %[[B:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] - -#blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}> -module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { - tt.func public @masked_add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { - %c1024_i32 = arith.constant 
1024 : i32 - %c0_i32 = arith.constant 0 : i32 - %c1016800_i32 = arith.constant 1016800 : i32 - %cst = arith.constant dense<0xFF800000> : tensor<1024xf32, #blocked> - %0 = tt.get_program_id x : i32 - %1 = arith.muli %0, %c1016800_i32 : i32 - %2 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32, #blocked> - %3 = tt.splat %arg3 : i32 -> tensor<1024xi32, #blocked> - %4 = tt.splat %arg0 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - %5 = tt.splat %arg1 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - %6 = tt.splat %arg2 : !tt.ptr -> tensor<1024x!tt.ptr, #blocked> - scf.for %arg4 = %c0_i32 to %c1016800_i32 step %c1024_i32 : i32 { - %7 = arith.addi %1, %arg4 : i32 - %8 = tt.splat %7 : i32 -> tensor<1024xi32, #blocked> - %9 = arith.addi %8, %2 : tensor<1024xi32, #blocked> - %10 = arith.cmpi slt, %9, %3 : tensor<1024xi32, #blocked> - %11 = tt.addptr %4, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %12 = tt.load %11, %10, %cst : tensor<1024x!tt.ptr, #blocked> - %13 = tt.addptr %5, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - %14 = tt.load %13, %10, %cst : tensor<1024x!tt.ptr, #blocked> - %15 = arith.addf %12, %14 : tensor<1024xf32, #blocked> - %16 = tt.addptr %6, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> - tt.store %16, %15, %10 : tensor<1024x!tt.ptr, #blocked> - } {tt.num_stages = 3 : i32} - tt.return - } -} diff --git a/test/TritonGPU/loop-pipeline-hip.mlir b/test/TritonGPU/loop-pipeline-hip.mlir new file mode 100644 index 000000000000..323085009e13 --- /dev/null +++ b/test/TritonGPU/loop-pipeline-hip.mlir @@ -0,0 +1,162 @@ +// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline-v2=num_stages=2 -canonicalize | FileCheck %s + +#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}> +#mma = 
#triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { +// CHECK-LABEL: tt.func @load_two_users + tt.func @load_two_users(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { + %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> + %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> + %c0_i64 = arith.constant 0 : i64 + %c0_i32 = arith.constant 0 : i32 + %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> + %c1_i32 = arith.constant 1 : i32 + %c8_i32 = arith.constant 8 : i32 + %0 = tt.addptr %arg0, %c0_i64 : !tt.ptr, i64 + %1 = tt.addptr %arg1, %c0_i64 : !tt.ptr, i64 + %2 = tt.splat %1 : !tt.ptr -> tensor<128x1x!tt.ptr, #blocked1> + %3 = tt.addptr %2, %cst_0 : tensor<128x1x!tt.ptr, #blocked1>, tensor<128x1xi32, #blocked1> + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %6 = tt.broadcast %3 : tensor<128x1x!tt.ptr, #blocked1> -> tensor<128x64x!tt.ptr, #blocked1> + %7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1> + %8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> + %9 = tt.load %8 : tensor<128x64x!tt.ptr, #blocked1> + %10 = tt.splat %0 : !tt.ptr -> 
tensor<1x16x!tt.ptr, #blocked> + %11 = tt.addptr %10, %cst : tensor<1x16x!tt.ptr, #blocked>, tensor<1x16xi32, #blocked> + %12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %14 = tt.broadcast %11 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> + %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> + %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> + // CHECK: triton_gpu.local_store + // CHECK: scf.for + // CHECK: tt.dot + // CHECK: tt.dot + // CHECK: tt.load + // CHECK: triton_gpu.local_store + // CHECK: scf.yield + + %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 { + %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> + %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %20 = triton_gpu.convert_layout %18 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma> + %22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma> + %23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> + %24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> + %25 = tt.trans %24 {order=array} : !tt.memdesc<64x16xf16, #shared, 
#triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> + %26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> + %27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma> + scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> + } + tt.return %17#0, %17#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma> + } +} + +// ----- + +// CHECK-LABEL: tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de +// CHECK-NOT: triton_gpu.convert_layout {{.*}} : tensor<32x64xf32, #shared> -> tensor<32x64xf32, #shared1> + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [2, 2], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}> +#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> +module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { + tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: !tt.ptr {tt.divisibility = 16 : i32}, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} 
{ + %cst = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma> + %c64_i32 = arith.constant 64 : i32 + %c0_i32 = arith.constant 0 : i32 + %c32_i32 = arith.constant 32 : i32 + %0 = tt.get_program_id x : i32 + %1 = arith.muli %0, %c64_i32 : i32 + %2 = tt.get_program_id y : i32 + %3 = tt.load %arg3 : !tt.ptr + %4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %5 = tt.splat %1 : i32 -> tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %6 = arith.addi %5, %4 : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + %7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked> + %8 = tt.splat %3 : i64 -> tensor<64x1xi64, #blocked> + %9 = arith.extsi %7 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked> + %10 = arith.addi %8, %9 : tensor<64x1xi64, #blocked> + %11 = arith.extsi %arg5 : i32 to i64 + %12 = tt.splat %11 : i64 -> tensor<64x1xi64, #blocked> + %13 = arith.muli %10, %12 : tensor<64x1xi64, #blocked> + %14 = arith.muli %2, %arg5 : i32 + %15 = arith.extsi %14 : i32 to i64 + %16 = tt.splat %15 : i64 -> tensor<64x1xi64, #blocked> + %17 = arith.addi %13, %16 : tensor<64x1xi64, #blocked> + %18 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %19 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %20 = tt.expand_dims %18 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked> + %21 = tt.expand_dims %19 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1> + %22 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked> + %23 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked1> + %24 = arith.muli %20, %22 : tensor<1x64xi32, #blocked> + %25 = 
arith.muli %21, %23 : tensor<1x64xi32, #blocked1> + %26 = tt.broadcast %17 : tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked> + %27 = arith.extsi %24 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked> + %28 = arith.extsi %25 : tensor<1x64xi32, #blocked1> to tensor<1x64xi64, #blocked1> + %29 = tt.broadcast %27 : tensor<1x64xi64, #blocked> -> tensor<64x64xi64, #blocked> + %30 = arith.addi %26, %29 : tensor<64x64xi64, #blocked> + %31 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %32 = tt.expand_dims %31 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1> + %33 = tt.splat %3 : i64 -> tensor<32x1xi64, #blocked1> + %34 = arith.extsi %32 : tensor<32x1xi32, #blocked1> to tensor<32x1xi64, #blocked1> + %35 = arith.addi %33, %34 : tensor<32x1xi64, #blocked1> + %36 = tt.splat %11 : i64 -> tensor<32x1xi64, #blocked1> + %37 = arith.muli %35, %36 : tensor<32x1xi64, #blocked1> + %38 = tt.splat %15 : i64 -> tensor<32x1xi64, #blocked1> + %39 = arith.addi %37, %38 : tensor<32x1xi64, #blocked1> + %40 = tt.broadcast %39 : tensor<32x1xi64, #blocked1> -> tensor<32x64xi64, #blocked1> + %41 = tt.broadcast %28 : tensor<1x64xi64, #blocked1> -> tensor<32x64xi64, #blocked1> + %42 = arith.addi %40, %41 : tensor<32x64xi64, #blocked1> + %43 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> + %44 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> + %45 = tt.expand_dims %43 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1> + %46 = tt.expand_dims %44 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked> + %47 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked1> + %48 = tt.splat %arg5 : i32 -> 
tensor<1x32xi32, #blocked> + %49 = arith.muli %45, %47 : tensor<1x32xi32, #blocked1> + %50 = arith.muli %46, %48 : tensor<1x32xi32, #blocked> + %51 = tt.broadcast %39 : tensor<32x1xi64, #blocked1> -> tensor<32x32xi64, #blocked1> + %52 = arith.extsi %49 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1> + %53 = arith.extsi %50 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked> + %54 = tt.broadcast %52 : tensor<1x32xi64, #blocked1> -> tensor<32x32xi64, #blocked1> + %55 = arith.addi %51, %54 : tensor<32x32xi64, #blocked1> + %56 = tt.splat %arg0 : !tt.ptr -> tensor<64x64x!tt.ptr, #blocked> + %57 = tt.addptr %56, %30 : tensor<64x64x!tt.ptr, #blocked>, tensor<64x64xi64, #blocked> + %58 = tt.splat %arg1 : !tt.ptr -> tensor<32x64x!tt.ptr, #blocked1> + %59 = tt.addptr %58, %42 : tensor<32x64x!tt.ptr, #blocked1>, tensor<32x64xi64, #blocked1> + %60 = tt.splat %arg2 : !tt.ptr -> tensor<32x32x!tt.ptr, #blocked1> + %61 = tt.addptr %60, %55 : tensor<32x32x!tt.ptr, #blocked1>, tensor<32x32xi64, #blocked1> + %62 = tt.load %57 : tensor<64x64x!tt.ptr, #blocked> + %63 = scf.for %arg6 = %c0_i32 to %c64_i32 step %c32_i32 iter_args(%arg7 = %cst) -> (tensor<64x32xf32, #mma>) : i32 { + %70 = tt.load %59 : tensor<32x64x!tt.ptr, #blocked1> + %71 = triton_gpu.convert_layout %62 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> + %73 = tt.trans %72 {order=array} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable> + %74 = triton_gpu.local_load %73 : !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %75 = tt.dot %71, %74, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, 
kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> + %76 = tt.load %61 : tensor<32x32x!tt.ptr, #blocked1> + %77 = triton_gpu.convert_layout %75 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> + %78 = triton_gpu.convert_layout %76 : tensor<32x32xf32, #blocked1> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> + %79 = tt.dot %77, %78, %arg7 : tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma> + scf.yield %79 : tensor<64x32xf32, #mma> + } + %64 = tt.broadcast %17 : tensor<64x1xi64, #blocked> -> tensor<64x32xi64, #blocked> + %65 = tt.broadcast %53 : tensor<1x32xi64, #blocked> -> tensor<64x32xi64, #blocked> + %66 = arith.addi %64, %65 : tensor<64x32xi64, #blocked> + %67 = tt.splat %arg4 : !tt.ptr -> tensor<64x32x!tt.ptr, #blocked> + %68 = tt.addptr %67, %66 : tensor<64x32x!tt.ptr, #blocked>, tensor<64x32xi64, #blocked> + %69 = triton_gpu.convert_layout %63 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #blocked> + tt.store %68, %69 : tensor<64x32x!tt.ptr, #blocked> + tt.return + } +} // end module diff --git a/test/TritonGPU/loop-pipeline.mlir b/test/TritonGPU/loop-pipeline.mlir index 9967fa85239c..7666637cccb7 100644 --- a/test/TritonGPU/loop-pipeline.mlir +++ b/test/TritonGPU/loop-pipeline.mlir @@ -1,5 +1,6 @@ // RUN: triton-opt %s -split-input-file -tritongpu-pipeline=num-stages=3 -canonicalize | FileCheck %s // RUN: triton-opt %s -split-input-file -tritongpu-pipeline=num-stages=3 | FileCheck %s --check-prefix=CHECK-NOCANON +// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline-v2=num_stages=2 -canonicalize | FileCheck %s --check-prefix=AMD // 4 warps // matmul: 128x32 @ 32x128 -> 128x128 @@ -55,6 +56,49 @@ // CHECK-DAG: %[[NEXT_B:.*]] = 
triton_gpu.memdesc_subview %{{.+}}[%[[EXT_IDX_3]], // CHECK-DAG: triton_gpu.async_wait {{.*}} {num = 2 : i32} // CHECK: scf.yield {{.*}}, %[[INS_IDX_3]], %[[EXT_IDX_3]], %[[NEXT_A]], %[[NEXT_B]] + +// AMD-LABEL: tt.func @matmul_loop +// AMD: %[[LOCAL_ALLOC_10:.*]] = triton_gpu.local_alloc +// AMD: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_12:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_12]] +// AMD: %[[LOAD_14:.*]] = tt.load %{{.*}}, %[[SPLAT_13]] +// AMD: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_12]] +// AMD: %[[LOAD_16:.*]] = tt.load %{{.*}}, %[[SPLAT_15]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_17]] +// AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_16]], %[[MEMDESC_SUBVIEW_18]] +// AMD: %{{.*}}:7 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_18]]) + +// AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_20]] +// AMD: %[[ADDI_22:.*]] = arith.addi %[[ARG9]], %{{.*}} +// AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_22]], %{{.*}} +// AMD: %[[SELECT_24:.*]] = arith.select %[[CMPI_23]], %[[ADDI_22]], %{{.*}} +// AMD: %[[LOCAL_LOAD_25:.*]] = triton_gpu.local_load %[[ARG11]] +// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] +// AMD: %[[MULF_29:.*]] = arith.mulf %[[LOCAL_LOAD_27]], %{{.*}} +// AMD: %[[DOT_30:.*]] = tt.dot %[[LOCAL_LOAD_25]], %[[MULF_29]], %[[ARG8]] +// AMD: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG6]], %{{.*}} +// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG7]], 
%{{.*}} +// AMD: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_21]] +// AMD: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]] +// AMD: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_21]] +// AMD: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} +// AMD: %[[ADDI_37:.*]] = arith.addi %[[ARG10]], %{{.*}} +// AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] +// AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] +// AMD: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_24]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// AMD: } + +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_10]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] + module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32} { tt.func @matmul_loop(%lb : index, %ub : index, %step : index, %A : !tt.ptr {tt.divisibility = 16 : i32}, @@ -146,6 +190,50 @@ tt.func @matmul_loop(%lb : index, %ub : index, %step : index, // CHECK: triton_gpu.memdesc_subview %[[BBUFFER]][%[[CONSTANT_1]], %[[CONSTANT_0]], %[[CONSTANT_0]]] // CHECK: triton_gpu.async_copy_global_to_local // CHECK scf.yield + +// AMD-LABEL: tt.func @matmul_loop_nested +// AMD: scf.for +// AMD: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc +// AMD: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] +// AMD: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} +// AMD: %[[SPLAT_16:.*]] = tt.splat %[[CMPI_13]] +// AMD: %[[LOAD_17:.*]] = tt.load %{{.*}}, %[[SPLAT_16]], 
%{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_18]] +// AMD: %[[MEMDESC_SUBVIEW_19:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_17]], %[[MEMDESC_SUBVIEW_19]] +// AMD: %{{.*}}:7 = scf.for %[[ARG7:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_19]]) + +// AMD: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG7]], %[[SUBI_21]] +// AMD: %[[ADDI_23:.*]] = arith.addi %[[ARG11]], %{{.*}} +// AMD: %[[CMPI_24:.*]] = arith.cmpi slt, %[[ADDI_23]], %{{.*}} +// AMD: %[[SELECT_25:.*]] = arith.select %[[CMPI_24]], %[[ADDI_23]], %{{.*}} +// AMD: %[[LOCAL_LOAD_26:.*]] = triton_gpu.local_load %[[ARG13]] +// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG14]] +// AMD: %[[DOT_30:.*]] = tt.dot %[[LOCAL_LOAD_26]], %[[LOCAL_LOAD_28]], %[[ARG10]] +// AMD: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// AMD: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_22]] +// AMD: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]], %{{.*}} +// AMD: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_22]] +// AMD: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} +// AMD: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} +// AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] +// AMD: %[[MEMDESC_SUBVIEW_41:.*]] 
= triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] +// AMD: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_25]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// AMD: } +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] +// AMD: scf.yield %{{.*}}#2 +// AMD: } + tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index, %A : !tt.ptr {tt.divisibility = 16 : i32}, %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C>{ @@ -216,6 +304,36 @@ tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index, // CHECK-DAG: %[[NEXT_B:.*]] = triton_gpu.memdesc_subview %{{.+}}[%[[EXT_IDX_3]] // CHECK-DAG: triton_gpu.async_wait {{.*}} {num = 1 : i32} // CHECK: scf.yield {{.*}}, %[[INS_IDX_3]], %[[EXT_IDX_3]], %[[NEXT_B]] + +// AMD-LABEL: tt.func @matmul_loop_single_pipeline +// AMD: %[[LOAD_10:.*]] = tt.load +// AMD: %[[CONVERT_LAYOUT_11:.*]] = triton_gpu.convert_layout %[[LOAD_10]] +// AMD: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] +// AMD: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_16]] +// AMD: %{{.*}}:5 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %[[MEMDESC_SUBVIEW_16]]) +// AMD: %[[SUBI_18:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_19:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_18]] +// AMD: %[[ADDI_20:.*]] = arith.addi %[[ARG8]], %{{.*}} +// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ADDI_20]], %{{.*}} 
+// AMD: %[[SELECT_22:.*]] = arith.select %[[CMPI_21]], %[[ADDI_20]], %{{.*}} +// AMD: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG10]] +// AMD: %[[DOT_25:.*]] = tt.dot %[[CONVERT_LAYOUT_11]], %[[LOCAL_LOAD_23]], %[[ARG7]] +// AMD: %[[ADDPTR_26:.*]] = tt.addptr %[[ARG6]], %{{.*}} +// AMD: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_19]] +// AMD: %[[LOAD_28:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_27]], %{{.*}} +// AMD: %[[ADDI_29:.*]] = arith.addi %[[ARG9]], %{{.*}} +// AMD: %[[CMPI_30:.*]] = arith.cmpi slt, %[[ADDI_29]], %{{.*}} +// AMD: %[[SELECT_31:.*]] = arith.select %[[CMPI_30]], %[[ADDI_29]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_32:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_31]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_28]], %[[MEMDESC_SUBVIEW_32]] +// AMD: scf.yield %[[ADDPTR_26]], %[[DOT_25]], %[[SELECT_22]], %[[SELECT_31]], %[[MEMDESC_SUBVIEW_32]] +// AMD: } +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] + tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, %A : !tt.ptr {tt.divisibility = 16 : i32}, %B : !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { @@ -268,6 +386,67 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, // CHECK: %[[NEXT_BUFFER_0:.*]] = tt.addptr {{.*}}, %[[IND_BUFFER_2]] // CHECK: triton_gpu.async_copy_global_to_local %[[NEXT_BUFFER_0]] // CHECK: triton_gpu.async_wait {{.*}} {num = 2 : i32} + +// AMD-LABEL: tt.func @indirect_bmm_scalar +// AMD: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc +// AMD: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] +// AMD: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] +// AMD: %[[LOAD_5:.*]] = tt.load %{{.*}}, %[[CMPI_2]] +// AMD: %[[MULI_6:.*]] = arith.muli %{{.*}}, %[[LOAD_5]] +// AMD: %[[SPLAT_7:.*]] = tt.splat %[[MULI_6]] +// AMD: %[[ADDPTR_8:.*]] = tt.addptr %{{.*}}, %[[SPLAT_7]] 
+// AMD: %[[SPLAT_9:.*]] = tt.splat %[[CMPI_2]] +// AMD: %[[LOAD_10:.*]] = tt.load %[[ADDPTR_8]], %[[SPLAT_9]] +// AMD: %[[CMPI_11:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// AMD: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} +// AMD: %[[ADDPTR_13:.*]] = tt.addptr %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_11]] +// AMD: %[[LOAD_15:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_14]] +// AMD: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_13]], %[[CMPI_11]] +// AMD: %[[MULI_17:.*]] = arith.muli %{{.*}}, %[[LOAD_16]] +// AMD: %[[SPLAT_18:.*]] = tt.splat %[[MULI_17]] +// AMD: %[[ADDPTR_19:.*]] = tt.addptr %{{.*}}, %[[SPLAT_18]] +// AMD: %[[SPLAT_20:.*]] = tt.splat %[[CMPI_11]] +// AMD: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_19]], %[[SPLAT_20]] +// AMD: %[[MEMDESC_SUBVIEW_22:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_4]], %[[MEMDESC_SUBVIEW_22]] +// AMD: %[[MEMDESC_SUBVIEW_23:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_10]], %[[MEMDESC_SUBVIEW_23]] +// AMD: %{{.*}}:9 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[ADDPTR_12]], %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_22]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_23]], %[[ARG14:.*]] = %[[LOAD_15]], %[[ARG15:.*]] = %[[LOAD_21]]) + +// AMD: %[[SUBI_25:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] +// AMD: %[[ADDI_27:.*]] = arith.addi %[[ARG10]], %{{.*}} +// AMD: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} +// AMD: %[[SELECT_29:.*]] = arith.select %[[CMPI_28]], %[[ADDI_27]], %{{.*}} +// AMD: %[[LOCAL_LOAD_30:.*]] = triton_gpu.local_load %[[ARG12]] +// AMD: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %[[ARG13]] +// AMD: %[[DOT_34:.*]] = tt.dot %[[LOCAL_LOAD_30]], 
%[[LOCAL_LOAD_31]], %[[ARG7]] +// AMD: %[[ADDPTR_35:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// AMD: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// AMD: %[[SPLAT_37:.*]] = tt.splat %[[CMPI_26]] +// AMD: %[[LOAD_38:.*]] = tt.load %[[ADDPTR_35]], %[[SPLAT_37]] +// AMD: %[[LOAD_39:.*]] = tt.load %[[ADDPTR_36]], %[[CMPI_26]] +// AMD: %[[MULI_40:.*]] = arith.muli %{{.*}}, %[[LOAD_39]] +// AMD: %[[SPLAT_41:.*]] = tt.splat %[[MULI_40]] +// AMD: %[[ADDPTR_42:.*]] = tt.addptr %{{.*}}, %[[SPLAT_41]] +// AMD: %[[SPLAT_43:.*]] = tt.splat %[[CMPI_26]] +// AMD: %[[LOAD_44:.*]] = tt.load %[[ADDPTR_42]], %[[SPLAT_43]] +// AMD: %[[ADDI_45:.*]] = arith.addi %[[ARG11]], %{{.*}} +// AMD: %[[CMPI_46:.*]] = arith.cmpi slt, %[[ADDI_45]], %{{.*}} +// AMD: %[[SELECT_47:.*]] = arith.select %[[CMPI_46]], %[[ADDI_45]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_47]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_48]] +// AMD: %[[MEMDESC_SUBVIEW_49:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_47]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[ARG15]], %[[MEMDESC_SUBVIEW_49]] +// AMD: scf.yield %[[DOT_34]], %[[ADDPTR_35]], %[[ADDPTR_36]], %[[SELECT_29]], %[[SELECT_47]], %[[MEMDESC_SUBVIEW_48]], %[[MEMDESC_SUBVIEW_49]], %[[LOAD_38]], %[[LOAD_44]] +// AMD: } + +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] + tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32}, %76: index, %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, @@ -293,7 +472,7 @@ tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32}, %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> %92 = tt.addptr %arg21, %c1_i32 : !tt.ptr, i32 scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, !tt.ptr - } + } {tt.num_stages = 3 : i32} tt.return %79#0 : 
tensor<16x16xf32, #C> } @@ -313,6 +492,58 @@ tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32}, // CHECK: triton_gpu.async_copy_global_to_local %[[NEXT_BUFFER_0]] // CHECK: triton_gpu.async_wait {{.*}} {num = 2 : i32} // CHECK: scf.yield {{.*}}, {{.*}}, {{.*}}, %[[IND_BUFFER_0]] + +// AMD-LABEL: tt.func @indirect_bmm_scalar_dist_one +// AMD: %[[LOAD_0:.*]] = tt.load %{{.*}} +// AMD: %[[ADDPTR_1:.*]] = tt.addptr %{{.*}}, %{{.*}} +// AMD: %[[LOCAL_ALLOC_2:.*]] = triton_gpu.local_alloc +// AMD: %[[LOCAL_ALLOC_3:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_4:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_5:.*]] = tt.splat %[[CMPI_4]] +// AMD: %[[LOAD_6:.*]] = tt.load %{{.*}}, %[[SPLAT_5]] +// AMD: %[[LOAD_7:.*]] = tt.load %[[ADDPTR_1]], %[[CMPI_4]] +// AMD: %[[MULI_8:.*]] = arith.muli %{{.*}}, %[[LOAD_0]] +// AMD: %[[SPLAT_9:.*]] = tt.splat %[[MULI_8]] +// AMD: %[[ADDPTR_10:.*]] = tt.addptr %{{.*}}, %[[SPLAT_9]] +// AMD: %[[SPLAT_11:.*]] = tt.splat %[[CMPI_4]] +// AMD: %[[LOAD_12:.*]] = tt.load %[[ADDPTR_10]], %[[SPLAT_11]] +// AMD: %[[ADDPTR_13:.*]] = tt.addptr %[[ADDPTR_1]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_14:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_6]], %[[MEMDESC_SUBVIEW_14]] +// AMD: %[[MEMDESC_SUBVIEW_15:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_12]], %[[MEMDESC_SUBVIEW_15]] +// AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %[[LOAD_7]], %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_14]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_15]]) + +// AMD: %[[SUBI_17:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_17]] +// AMD: %[[ADDI_19:.*]] = arith.addi %[[ARG11]], %{{.*}} 
+// AMD: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ADDI_19]], %{{.*}} +// AMD: %[[SELECT_21:.*]] = arith.select %[[CMPI_20]], %[[ADDI_19]], %{{.*}} +// AMD: %[[LOCAL_LOAD_22:.*]] = triton_gpu.local_load %[[ARG13]] +// AMD: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG14]] +// AMD: %[[DOT_26:.*]] = tt.dot %[[LOCAL_LOAD_22]], %[[LOCAL_LOAD_23]], %[[ARG7]] +// AMD: %[[ADDPTR_27:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// AMD: %[[SPLAT_28:.*]] = tt.splat %[[CMPI_18]] +// AMD: %[[LOAD_29:.*]] = tt.load %[[ADDPTR_27]], %[[SPLAT_28]] +// AMD: %[[LOAD_30:.*]] = tt.load %[[ARG9]], %[[CMPI_18]] +// AMD: %[[MULI_31:.*]] = arith.muli %{{.*}}, %[[ARG10]] +// AMD: %[[SPLAT_32:.*]] = tt.splat %[[MULI_31]] +// AMD: %[[ADDPTR_33:.*]] = tt.addptr %{{.*}}, %[[SPLAT_32]] +// AMD: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_18]] +// AMD: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_34]] +// AMD: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// AMD: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} +// AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_29]], %[[MEMDESC_SUBVIEW_40]] +// AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_41]] +// AMD: scf.yield %[[DOT_26]], %[[ADDPTR_27]], %[[ADDPTR_36]], %[[LOAD_30]], %[[SELECT_21]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// AMD: } +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_2]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_3]] + tt.func @indirect_bmm_scalar_dist_one(%77: i64 {tt.divisibility=16: i32}, %76: index, %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, @@ -365,6 +596,65 @@ tt.func 
@indirect_bmm_scalar_dist_one(%77: i64 {tt.divisibility=16: i32}, // CHECK: triton_gpu.async_copy_global_to_local %[[NEXT_BUFFER_0]] // CHECK: triton_gpu.async_wait {{.*}} {num = 1 : i32} // CHECK: scf.yield + +// AMD-LABEL: tt.func @indirect_bmm_vector +// AMD: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc +// AMD: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] +// AMD: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] +// AMD: %[[CMPI_5:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// AMD: %[[ADDPTR_6:.*]] = tt.addptr %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_7:.*]] = tt.splat %[[CMPI_2]] +// AMD: %[[LOAD_8:.*]] = tt.load %{{.*}}, %[[SPLAT_7]] +// AMD: %[[EXPAND_DIMS_9:.*]] = tt.expand_dims %[[LOAD_4]] {axis = 1 : i32} +// AMD: %[[BROADCAST_10:.*]] = tt.broadcast %[[EXPAND_DIMS_9]] +// AMD: %[[MULI_11:.*]] = arith.muli %{{.*}}, %[[BROADCAST_10]] +// AMD: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %[[MULI_11]] +// AMD: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_2]] +// AMD: %[[LOAD_14:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_13]] +// AMD: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_5]] +// AMD: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_6]], %[[SPLAT_15]] +// AMD: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_8]], %[[MEMDESC_SUBVIEW_17]] +// AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_18]] +// AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_6]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[LOAD_16]]) + +// AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: 
%[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] +// AMD: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] +// AMD: %[[ADDI_24:.*]] = arith.addi %[[ARG10]], %{{.*}} +// AMD: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} +// AMD: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} +// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] +// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] +// AMD: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] +// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// AMD: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// AMD: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] +// AMD: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] +// AMD: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} +// AMD: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] +// AMD: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] +// AMD: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] +// AMD: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] +// AMD: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] +// AMD: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] +// AMD: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] +// AMD: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} +// AMD: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} +// AMD: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] +// AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] +// AMD: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], 
%[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] +// AMD: } +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] + tt.func @indirect_bmm_vector(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, %76: index, %49: tensor<16x16x!tt.ptr, #AL> {tt.divisibility=16: i32, tt.contiguity=2 : i32}, @@ -392,7 +682,7 @@ tt.func @indirect_bmm_vector(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i3 %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> %92 = tt.addptr %arg21, %c1_i32_splat : tensor<16x!tt.ptr, #BLs1>, tensor<16xi32, #BLs1> scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1> - } + } {tt.num_stages = 3 : i32} tt.return %79#0 : tensor<16x16xf32, #C> } @@ -402,6 +692,13 @@ tt.func @indirect_bmm_vector(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i3 // CHECK: %[[NEXT_IV:.*]] = arith.addi %[[IV]], %c1_i32 : i32 // CHECK: arith.index_cast // CHECK-NOT: arith.addi %[[NEXT_IV]] + +// AMD-LABEL: tt.func @post_load_inv +// AMD: scf.for +// AMD-DAG: %[[IV:.*]] = arith.index_cast +// AMD: %[[NEXT_IV:.*]] = arith.addi %[[IV]], %c1_i32 : i32 +// AMD: arith.index_cast +// AMD-NOT: arith.addi %[[NEXT_IV]] tt.func @post_load_inv(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, @@ -457,6 +754,12 @@ tt.func @post_load_inv(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, // CHECK-NOT: triton_gpu.async_commit_group // CHECK: scf.for // CHECK: scf.yield + +// AMD-LABEL: tt.func @cross_iter_dep +// TODO: enable pipelining with distance of 2 +// AMD-NOT: triton_gpu.local_load +// AMD: scf.for +// AMD: scf.yield tt.func @cross_iter_dep(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, @@ -517,6 +820,15 @@ tt.func @cross_iter_dep(%arg0: !tt.ptr 
{tt.divisibility = 16 : i32}, // CHECK: %[[PTR0:.*]] = tt.splat %arg6 // CHECK: %[[PTR1:.*]] = tt.addptr %[[PTR0]] // CHECK-NEXT: tt.load %[[PTR1]] + +// AMD-LABEL: tt.func @dep_arg_two_uses +// AMD: tt.expand_dims +// AMD: tt.expand_dims +// AMD: tt.expand_dims %arg5 +// AMD-NEXT: tt.expand_dims %arg5 +// AMD: %[[PTR0:.*]] = tt.splat %arg6 +// AMD: %[[PTR1:.*]] = tt.addptr %[[PTR0]] +// AMD-NEXT: tt.load %[[PTR1]] tt.func @dep_arg_two_uses(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { @@ -584,6 +896,7 @@ tt.func @dep_arg_two_uses(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, #shared1 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: tt.func @load_two_users_incompatible_layouts +// AMD-LABEL: tt.func @load_two_users_incompatible_layouts tt.func @load_two_users_incompatible_layouts(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> @@ -613,6 +926,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // check that the load didn't get pipelined. 
// CHECK-NOT: alloc // CHECK: scf.for + // AMD-NOT: triton_gpu.local_store + // AMD: scf.for %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 { %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> @@ -644,6 +959,13 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: triton_gpu.async_copy_global_to_local // CHECK: triton_gpu.async_commit_group // CHECK: scf.yield +// AMD-LABEL: tt.func public @nested_loops +// AMD: scf.for +// AMD: triton_gpu.local_alloc +// AMD-NOT: triton_gpu.local_alloc +// AMD: scf.for +// AMD: scf.yield +// AMD-DIS: scf.yield // // The following code has the structure: // @@ -735,6 +1057,43 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: %[[NEXT_BUFFER_0:.*]] = tt.addptr {{.*}}, %[[IND_BUFFER_4]] // CHECK: triton_gpu.async_copy_global_to_local %[[NEXT_BUFFER_0]] // CHECK: triton_gpu.async_wait {{.*}} {num = 1 : i32} + +// AMD-DIS: #[[$SHARED_LAYOUT:shared.*]] = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}> +// AMD-LABEL: tt.func @indirect_load_shared_layout +// AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) + +// AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] +// AMD: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] +// AMD: %[[ADDI_24:.*]] = arith.addi %[[ARG10]], %{{.*}} +// AMD: 
%[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} +// AMD: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} +// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] +// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] +// AMD: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] +// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// AMD: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// AMD: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] +// AMD: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] +// AMD: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} +// AMD: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] +// AMD: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] +// AMD: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] +// AMD: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] +// AMD: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] +// AMD: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] +// AMD: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] +// AMD: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} +// AMD: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} +// AMD: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_46]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] +// AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_46]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] +// AMD: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] +// AMD: } + #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], 
warpsPerCTA = [4, 1], order = [1, 0]}> #BLs1 = #triton_gpu.slice<{parent=#BL, dim=1}> @@ -769,7 +1128,7 @@ tt.func @indirect_load_shared_layout(%77: tensor<16x16xi64, #BL> {tt.divisibilit %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> %92 = tt.addptr %arg21, %c1_i32_splat : tensor<16x!tt.ptr, #BLs1>, tensor<16xi32, #BLs1> scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1> - } + } {tt.num_stages = 3 : i32} tt.return %79#0 : tensor<16x16xf32, #C> } } @@ -784,6 +1143,16 @@ tt.func @indirect_load_shared_layout(%77: tensor<16x16xi64, #BL> {tt.divisibilit // CHECK: triton_gpu.async_copy_global_to_local // CHECK: triton_gpu.memdesc_subview // CHECK: tt.return + +// AMD-LABEL: @kernel_yield_constant +// AMD: tt.load +// AMD: triton_gpu.memdesc_subview +// AMD: triton_gpu.local_store +// AMD: scf.for +// AMD: tt.load +// AMD: triton_gpu.memdesc_subview +// AMD: triton_gpu.local_store +// AMD: tt.return #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { @@ -840,6 +1209,20 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: %[[B1BUFFER:.*]] = triton_gpu.memdesc_subview %[[BBUFFER]][%[[CONSTANT_1]], %[[CONSTANT_0]]] // CHECK: triton_gpu.async_copy_global_to_local {{.*}}, %[[B1BUFFER]] // CHECK: scf.for + +// AMD-LABEL: tt.func public @add_kernel +// AMD: %[[LOAD_11:.*]] = tt.load %{{.*}}, %{{.*}} +// AMD: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} +// AMD: %[[LOAD_13:.*]] = tt.load %[[ADDPTR_12]], %{{.*}} +// AMD: %[[ADDI_14:.*]] = arith.addi %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_15:.*]] = tt.splat %[[ADDI_14]] +// AMD: %[[ADDI_16:.*]] = arith.addi %[[SPLAT_15]], %{{.*}} +// AMD: 
%[[CMPI_17:.*]] = arith.cmpi slt, %[[ADDI_16]], %{{.*}} +// AMD: %[[ADDPTR_18:.*]] = tt.addptr %{{.*}}, %[[ADDI_16]] +// AMD: %[[LOAD_19:.*]] = tt.load %[[ADDPTR_18]], %[[CMPI_17]] +// AMD: %[[ADDPTR_20:.*]] = tt.addptr %{{.*}}, %[[ADDI_16]] +// AMD: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_20]], %[[CMPI_17]] +// AMD: scf.for #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func public @add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { @@ -865,7 +1248,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %15 = arith.addf %12, %14 : tensor<1024xf32, #blocked> %16 = tt.addptr %6, %9 : tensor<1024x!tt.ptr, #blocked>, tensor<1024xi32, #blocked> tt.store %16, %15, %10 : tensor<1024x!tt.ptr, #blocked> - }{tt.num_stages = 3 : i32} + } {tt.num_stages = 3 : i32} tt.return } } @@ -906,6 +1289,17 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: %[[COMMIT_2:.*]] = triton_gpu.async_commit_group %[[ASYNC_COPY_5]] // CHECK: scf.yield %[[COMMIT_1]], %[[COMMIT_2]] // CHECK: triton_gpu.local_dealloc %[[BUFFER_1]] + +// AMD-LABEL: tt.func public @nested_loops +// AMD-NOT: triton_gpu.local_alloc +// AMD: scf.for +// AMD: triton_gpu.local_alloc +// AMD: scf.for +// AMD: triton_gpu.local_load +// AMD: tt.dot +// AMD: triton_gpu.local_store +// AMD: scf.yield +// AMD: triton_gpu.local_dealloc #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [2, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 2], instrShape = [16, 8]}> #shared = #triton_gpu.shared<{vec = 4, perPhase = 2, 
maxPhase = 4, order = [1, 0], hasLeadingOffset = false}> @@ -1020,6 +1414,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // This test triggered some failure in the verifier, so we only // included a simple check for the kernel name. // CHECK-LABEL: @load_convert_layout +// AMD-LABEL: @load_convert_layout #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> #ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> @@ -1060,7 +1455,7 @@ tt.func @load_convert_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i3 %91 = tt.addptr %arg20, %78 : tensor<16x16x!tt.ptr, #AL>, tensor<16x16xi32, #AL> %92 = tt.addptr %arg21, %c1_i32_splat : tensor<16x!tt.ptr, #BLs1>, tensor<16xi32, #BLs1> scf.yield %90, %91, %92 : tensor<16x16xf32, #C>, tensor<16x16x!tt.ptr, #AL>, tensor<16x!tt.ptr, #BLs1> - } + } {tt.num_stages = 3 : i32} tt.return %79#0 : tensor<16x16xf32, #C> } } @@ -1071,6 +1466,7 @@ tt.func @load_convert_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i3 // This test captured some ICE in MatmulLoopPipeline pass, so we only // included a simple check for the kernel name. 
// CHECK-LABEL: @matmul_indirect_pipeline +// AMD-LABEL: @matmul_indirect_pipeline #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 2], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 1], instrShape = [16, 8]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32} { @@ -1106,7 +1502,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : %23 = tt.dot %21, %22, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<32x32xf32, #mma> %24 = triton_gpu.convert_layout %23 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked> tt.store %11, %24 : tensor<32x32x!tt.ptr, #blocked> - } + } {tt.num_stages = 3 : i32} tt.return } } @@ -1115,6 +1511,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : // CHECK-LABEL: @dont_pipeline_128x1 // CHECK-NOT: local_load{{.*}}128x1 +// AMD-LABEL: @dont_pipeline_128x1 +// AMD-NOT: local_load{{.*}}128x1 #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { @@ -1158,6 +1556,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // incorrect schedule that fails to pipeline. 
// CHECK-LABEL: @matmul_nested_ops // CHECK: triton_gpu.local_load +// AMD-LABEL: @matmul_nested_ops +// AMD: triton_gpu.local_load #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> @@ -1229,6 +1629,8 @@ tt.func @matmul_nested_ops(%lb : index, %ub : index, %step : index, module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK-LABEL: dot_prologue_epilogue // CHECK: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} + // AMD-LABEL: dot_prologue_epilogue + // AMD-SAME: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} tt.func @dot_prologue_epilogue(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %cst = arith.constant dense<0> : tensor<64x16xi32, #blocked> %cst2 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> @@ -1262,6 +1664,17 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: scf.yield // CHECK-NOT: tt.addptr // CHECK: scf.yield + // AMD: %[[C0:.*]] = arith.constant 0 : i32 + // AMD: scf.for %[[IND_VAR:.*]] = %[[C0]] to + // AMD-NOT: load + // AMD: %[[CND:.*]] = arith.cmpi slt, %[[IND_VAR]], %[[EXT]] + // AMD: scf.if %[[CND]] + // AMD: dot + // AMD: scf.if %[[CND]] + // AMD: arith.mulf + // AMD: scf.yield + // AMD-NOT: tt.addptr + // AMD: scf.yield %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %16, %arg6 = %8) -> (tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>) : i32 { %9 = tt.load %arg6 : tensor<128x64x!tt.ptr, #blocked1> %cnd = arith.cmpi slt, %arg3, %ext : i32 @@ -1365,6 +1778,19 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: 
%[[B:.*]] = triton_gpu.local_load // CHECK: arith.select {{.*}}, %[[B]], %[[CONSTANT]] +// AMD-LABEL: @masked_add_kernel +// AMD: %[[CONSTANT:.*]] = arith.constant dense<0xFF800000> +// AMD: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// AMD: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// AMD: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// AMD: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// AMD: scf.for +// AMD: arith.select +// AMD: arith.select +// AMD: arith.addf +// AMD: %[[A:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] +// AMD: %[[B:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] + #blocked = #triton_gpu.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { tt.func public @masked_add_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 16 : i32}) attributes {noinline = false} { From f3e311e9d960841940f5e39fce0dc3db4f77febb Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Fri, 26 Jul 2024 05:34:32 +0000 Subject: [PATCH 27/36] Use COMMON prefix for shared check lines --- test/TritonGPU/loop-pipeline.mlir | 130 ++++++++++-------------------- 1 file changed, 44 insertions(+), 86 deletions(-) diff --git a/test/TritonGPU/loop-pipeline.mlir b/test/TritonGPU/loop-pipeline.mlir index 7666637cccb7..1246809c37af 100644 --- a/test/TritonGPU/loop-pipeline.mlir +++ b/test/TritonGPU/loop-pipeline.mlir @@ -1,6 +1,6 @@ -// RUN: triton-opt %s -split-input-file -tritongpu-pipeline=num-stages=3 -canonicalize | FileCheck %s +// RUN: triton-opt %s -split-input-file -tritongpu-pipeline=num-stages=3 -canonicalize | FileCheck %s --check-prefixes=COMMON,CHECK // RUN: triton-opt %s -split-input-file -tritongpu-pipeline=num-stages=3 | FileCheck %s --check-prefix=CHECK-NOCANON -// RUN: triton-opt %s -split-input-file 
-tritonamdgpu-stream-pipeline-v2=num_stages=2 -canonicalize | FileCheck %s --check-prefix=AMD +// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline-v2=num_stages=2 -canonicalize | FileCheck %s --check-prefixes=COMMON,AMD // 4 warps // matmul: 128x32 @ 32x128 -> 128x128 @@ -686,19 +686,12 @@ tt.func @indirect_bmm_vector(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i3 tt.return %79#0 : tensor<16x16xf32, #C> } -// CHECK-LABEL: tt.func @post_load_inv -// CHECK: scf.for -// CHECK-DAG: %[[IV:.*]] = arith.index_cast -// CHECK: %[[NEXT_IV:.*]] = arith.addi %[[IV]], %c1_i32 : i32 -// CHECK: arith.index_cast -// CHECK-NOT: arith.addi %[[NEXT_IV]] - -// AMD-LABEL: tt.func @post_load_inv -// AMD: scf.for -// AMD-DAG: %[[IV:.*]] = arith.index_cast -// AMD: %[[NEXT_IV:.*]] = arith.addi %[[IV]], %c1_i32 : i32 -// AMD: arith.index_cast -// AMD-NOT: arith.addi %[[NEXT_IV]] +// COMMON-LABEL: tt.func @post_load_inv +// COMMON: scf.for +// COMMON-DAG: %[[IV:.*]] = arith.index_cast +// COMMON: %[[NEXT_IV:.*]] = arith.addi %[[IV]], %c1_i32 : i32 +// COMMON: arith.index_cast +// COMMON-NOT: arith.addi %[[NEXT_IV]] tt.func @post_load_inv(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, @@ -749,17 +742,12 @@ tt.func @post_load_inv(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, tt.return %85#0 : tensor<32x32xf32, #C> } -// CHECK-LABEL: tt.func @cross_iter_dep +// COMMON-LABEL: tt.func @cross_iter_dep // TODO: enable pipelining with distance of 2 -// CHECK-NOT: triton_gpu.async_commit_group -// CHECK: scf.for -// CHECK: scf.yield +// COMMON-NOT: triton_gpu.async_commit_group +// COMMON: scf.for +// COMMON: scf.yield -// AMD-LABEL: tt.func @cross_iter_dep -// TODO: enable pipelining with distance of 2 -// AMD-NOT: triton_gpu.local_load -// AMD: scf.for -// AMD: scf.yield tt.func @cross_iter_dep(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: 
!tt.ptr {tt.divisibility = 16 : i32}, @@ -812,23 +800,14 @@ tt.func @cross_iter_dep(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, tt.return %119#0 : tensor<32x32xf32, #C> } -// CHECK-LABEL: tt.func @dep_arg_two_uses -// CHECK: tt.expand_dims -// CHECK: tt.expand_dims -// CHECK: tt.expand_dims %arg5 -// CHECK-NEXT: tt.expand_dims %arg5 -// CHECK: %[[PTR0:.*]] = tt.splat %arg6 -// CHECK: %[[PTR1:.*]] = tt.addptr %[[PTR0]] -// CHECK-NEXT: tt.load %[[PTR1]] - -// AMD-LABEL: tt.func @dep_arg_two_uses -// AMD: tt.expand_dims -// AMD: tt.expand_dims -// AMD: tt.expand_dims %arg5 -// AMD-NEXT: tt.expand_dims %arg5 -// AMD: %[[PTR0:.*]] = tt.splat %arg6 -// AMD: %[[PTR1:.*]] = tt.addptr %[[PTR0]] -// AMD-NEXT: tt.load %[[PTR1]] +// COMMON-LABEL: tt.func @dep_arg_two_uses +// COMMON: tt.expand_dims +// COMMON: tt.expand_dims +// COMMON: tt.expand_dims %arg5 +// COMMON-NEXT: tt.expand_dims %arg5 +// COMMON: %[[PTR0:.*]] = tt.splat %arg6 +// COMMON: %[[PTR1:.*]] = tt.addptr %[[PTR0]] +// COMMON-NEXT: tt.load %[[PTR1]] tt.func @dep_arg_two_uses(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> { @@ -895,8 +874,7 @@ tt.func @dep_arg_two_uses(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, #shared = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [0, 1], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 4, perPhase = 1, maxPhase = 2, order = [1, 0], hasLeadingOffset = false}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { -// CHECK-LABEL: tt.func @load_two_users_incompatible_layouts -// AMD-LABEL: tt.func @load_two_users_incompatible_layouts +// COMMON-LABEL: tt.func @load_two_users_incompatible_layouts tt.func @load_two_users_incompatible_layouts(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { %cst = 
arith.constant dense<0> : tensor<1x16xi32, #blocked> %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> @@ -924,10 +902,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> // check that the load didn't get pipelined. - // CHECK-NOT: alloc - // CHECK: scf.for - // AMD-NOT: triton_gpu.local_store - // AMD: scf.for + // COMMON-NOT: alloc + // COMMON: scf.for %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 { %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> @@ -959,6 +935,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: triton_gpu.async_copy_global_to_local // CHECK: triton_gpu.async_commit_group // CHECK: scf.yield + // AMD-LABEL: tt.func public @nested_loops // AMD: scf.for // AMD: triton_gpu.local_alloc @@ -966,6 +943,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // AMD: scf.for // AMD: scf.yield // AMD-DIS: scf.yield + // // The following code has the structure: // @@ -1413,8 +1391,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // This test triggered some failure in the verifier, so we only // included a simple check for the kernel name. 
-// CHECK-LABEL: @load_convert_layout -// AMD-LABEL: @load_convert_layout +// COMMON-LABEL: @load_convert_layout #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> #ALs0 = #triton_gpu.slice<{parent=#AL, dim=0}> @@ -1465,8 +1442,7 @@ tt.func @load_convert_layout(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i3 // This test captured some ICE in MatmulLoopPipeline pass, so we only // included a simple check for the kernel name. -// CHECK-LABEL: @matmul_indirect_pipeline -// AMD-LABEL: @matmul_indirect_pipeline +// COMMON-LABEL: @matmul_indirect_pipeline #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 2], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 1], instrShape = [16, 8]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : i32} { @@ -1509,10 +1485,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 : // ----- -// CHECK-LABEL: @dont_pipeline_128x1 -// CHECK-NOT: local_load{{.*}}128x1 -// AMD-LABEL: @dont_pipeline_128x1 -// AMD-NOT: local_load{{.*}}128x1 +// COMMON-LABEL: @dont_pipeline_128x1 +// COMMON-NOT: local_load{{.*}}128x1 #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { @@ -1554,10 +1528,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // Check that the dependencies across ops of different nesting does not cause crash or // incorrect schedule that fails to pipeline. 
-// CHECK-LABEL: @matmul_nested_ops -// CHECK: triton_gpu.local_load -// AMD-LABEL: @matmul_nested_ops -// AMD: triton_gpu.local_load +// COMMON-LABEL: @matmul_nested_ops +// COMMON: triton_gpu.local_load #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> @@ -1627,10 +1599,8 @@ tt.func @matmul_nested_ops(%lb : index, %ub : index, %step : index, #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = true}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK-LABEL: dot_prologue_epilogue - // CHECK: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} - // AMD-LABEL: dot_prologue_epilogue - // AMD-SAME: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} + // COMMON-LABEL: dot_prologue_epilogue + // COMMON: {{.*}}, {{.*}}, %[[EXT:.*]]: i32, {{.*}} tt.func @dot_prologue_epilogue(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %ext: i32, %inc: tensor<64x16xi32, #blocked> {tt.divisibility = 16 : i32}) -> tensor<128x16xf32, #mma1> { %cst = arith.constant dense<0> : tensor<64x16xi32, #blocked> %cst2 = arith.constant dense<0> : tensor<128x64xi32, #blocked1> @@ -1653,28 +1623,17 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : %14 = tt.broadcast %10 : tensor<1x16x!tt.ptr, #blocked> -> tensor<64x16x!tt.ptr, #blocked> %15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked> %16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr, #blocked>, tensor<64x16xi32, #blocked> - // CHECK: %[[C0:.*]] = arith.constant 0 : i32 - // CHECK: scf.for %[[IND_VAR:.*]] = %[[C0]] - // CHECK-NOT load - // CHECK: %[[CND:.*]] = arith.cmpi slt, 
%[[IND_VAR]], %[[EXT]] - // CHECK: scf.if %[[CND]] - // CHECK: dot - // CHECK: scf.if %[[CND]] - // CHECK: arith.mulf - // CHECK: scf.yield - // CHECK-NOT: tt.addptr - // CHECK: scf.yield - // AMD: %[[C0:.*]] = arith.constant 0 : i32 - // AMD: scf.for %[[IND_VAR:.*]] = %[[C0]] to - // AMD-NOT: load - // AMD: %[[CND:.*]] = arith.cmpi slt, %[[IND_VAR]], %[[EXT]] - // AMD: scf.if %[[CND]] - // AMD: dot - // AMD: scf.if %[[CND]] - // AMD: arith.mulf - // AMD: scf.yield - // AMD-NOT: tt.addptr - // AMD: scf.yield + // COMMON: %[[C0:.*]] = arith.constant 0 : i32 + // COMMON: scf.for %[[IND_VAR:.*]] = %[[C0]] + // COMMON-NOT: load + // COMMON: %[[CND:.*]] = arith.cmpi slt, %[[IND_VAR]], %[[EXT]] + // COMMON: scf.if %[[CND]] + // COMMON: dot + // COMMON: scf.if %[[CND]] + // COMMON: arith.mulf + // COMMON: scf.yield + // COMMON-NOT: tt.addptr + // COMMON: scf.yield %17:3 = scf.for %arg3 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg4 = %cst_2, %arg5 = %16, %arg6 = %8) -> (tensor<128x16xf32, #mma1>, tensor<64x16x!tt.ptr, #blocked>, tensor<128x64x!tt.ptr, #blocked1>) : i32 { %9 = tt.load %arg6 : tensor<128x64x!tt.ptr, #blocked1> %cnd = arith.cmpi slt, %arg3, %ext : i32 @@ -1822,7 +1781,6 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : } } - // ----- #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}> From a27e45b41b44a67636e7eeb1722768dc5a81e759 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Fri, 26 Jul 2024 06:47:10 +0000 Subject: [PATCH 28/36] Move one more test to cuda file --- test/TritonGPU/loop-pipeline-cuda.mlir | 37 ++++++++++++++++++++++++++ test/TritonGPU/loop-pipeline.mlir | 37 -------------------------- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/test/TritonGPU/loop-pipeline-cuda.mlir b/test/TritonGPU/loop-pipeline-cuda.mlir index cb8f04a8f52f..b6610c0a663f 100644 --- a/test/TritonGPU/loop-pipeline-cuda.mlir +++ 
b/test/TritonGPU/loop-pipeline-cuda.mlir @@ -160,3 +160,40 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.return } } // end module + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}> +#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 256, 16]}> +#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { +// CHECK-LABEL: @matmul_tma +// CHECK-DAG: triton_gpu.local_alloc : () -> !tt.memdesc<3x128x64xf16, #{{.+}}, #triton_gpu.shared_memory, mutable> +// CHECK-DAG: triton_gpu.local_alloc : () -> !tt.memdesc<3x64x256xf16, #{{.+}}, #triton_gpu.shared_memory, mutable> +// CHECK-DAG: triton_gpu.local_alloc : () -> !tt.memdesc<3xi64, #{{.+}}, #triton_gpu.shared_memory, mutable> +// CHECK-COUNT-3: triton_nvidia_gpu.init_barrier +// CHECK-COUNT-4: triton_nvidia_gpu.async_tma_copy_global_to_local +// CHECK: scf.for +// CHECK: triton_nvidia_gpu.wait_barrier +// CHECK-NOT: triton_nvidia_gpu.wait_barrier +// CHECK-COUNT-2: triton_nvidia_gpu.async_tma_copy_global_to_local +// CHECK: scf.yield + tt.func public @matmul_tma(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x256xf32, #mma> { + %c256_i32 = arith.constant 256 : i32 + %c0_i32 = arith.constant 0 : i32 + %c64_i32 = arith.constant 64 : i32 + %c1_i32 = arith.constant 1 : i32 + %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma> + %0:2 = scf.for %arg3 = %c0_i32 to %c256_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %c0_i32) -> (tensor<128x256xf32, #mma>, i32) : i32 { + %1 = tt.experimental_descriptor_load 
%arg0[%c0_i32, %arg5] : !tt.ptr -> tensor<128x64xf16, #blocked> + %2 = triton_gpu.local_alloc %1 : (tensor<128x64xf16, #blocked>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> + %3 = tt.experimental_descriptor_load %arg1[%arg5, %c0_i32] : !tt.ptr -> tensor<64x256xf16, #blocked1> + %4 = triton_gpu.local_alloc %3 : (tensor<64x256xf16, #blocked1>) -> !tt.memdesc<64x256xf16, #shared, #triton_gpu.shared_memory> + %5 = triton_nvidia_gpu.warp_group_dot %2, %4, %arg4 { inputPrecision = 0 : i32 } : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x256xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x256xf32, #mma> + %6 = arith.addi %arg5, %c64_i32 : i32 + scf.yield %5, %6 : tensor<128x256xf32, #mma>, i32 + } + tt.return %0#0 : tensor<128x256xf32, #mma> + } +} diff --git a/test/TritonGPU/loop-pipeline.mlir b/test/TritonGPU/loop-pipeline.mlir index 1246809c37af..d60b93403bd8 100644 --- a/test/TritonGPU/loop-pipeline.mlir +++ b/test/TritonGPU/loop-pipeline.mlir @@ -1780,40 +1780,3 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.return } } - -// ----- - -#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}> -#mma = #triton_gpu.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 256, 16]}> -#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = true}> -module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { -// CHECK-LABEL: @matmul_tma -// CHECK-DAG: triton_gpu.local_alloc : () -> !tt.memdesc<3x128x64xf16, #{{.+}}, #triton_gpu.shared_memory, mutable> -// CHECK-DAG: triton_gpu.local_alloc : () -> !tt.memdesc<3x64x256xf16, #{{.+}}, #triton_gpu.shared_memory, mutable> -// CHECK-DAG: 
triton_gpu.local_alloc : () -> !tt.memdesc<3xi64, #{{.+}}, #triton_gpu.shared_memory, mutable> -// CHECK-COUNT-3: triton_nvidia_gpu.init_barrier -// CHECK-COUNT-4: triton_nvidia_gpu.async_tma_copy_global_to_local -// CHECK: scf.for -// CHECK: triton_nvidia_gpu.wait_barrier -// CHECK-NOT: triton_nvidia_gpu.wait_barrier -// CHECK-COUNT-2: triton_nvidia_gpu.async_tma_copy_global_to_local -// CHECK: scf.yield - tt.func public @matmul_tma(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> tensor<128x256xf32, #mma> { - %c256_i32 = arith.constant 256 : i32 - %c0_i32 = arith.constant 0 : i32 - %c64_i32 = arith.constant 64 : i32 - %c1_i32 = arith.constant 1 : i32 - %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma> - %0:2 = scf.for %arg3 = %c0_i32 to %c256_i32 step %c1_i32 iter_args(%arg4 = %cst, %arg5 = %c0_i32) -> (tensor<128x256xf32, #mma>, i32) : i32 { - %1 = tt.experimental_descriptor_load %arg0[%c0_i32, %arg5] : !tt.ptr -> tensor<128x64xf16, #blocked> - %2 = triton_gpu.local_alloc %1 : (tensor<128x64xf16, #blocked>) -> !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> - %3 = tt.experimental_descriptor_load %arg1[%arg5, %c0_i32] : !tt.ptr -> tensor<64x256xf16, #blocked1> - %4 = triton_gpu.local_alloc %3 : (tensor<64x256xf16, #blocked1>) -> !tt.memdesc<64x256xf16, #shared, #triton_gpu.shared_memory> - %5 = triton_nvidia_gpu.warp_group_dot %2, %4, %arg4 { inputPrecision = 0 : i32 } : !tt.memdesc<128x64xf16, #shared, #triton_gpu.shared_memory> * !tt.memdesc<64x256xf16, #shared, #triton_gpu.shared_memory> -> tensor<128x256xf32, #mma> - %6 = arith.addi %arg5, %c64_i32 : i32 - scf.yield %5, %6 : tensor<128x256xf32, #mma>, i32 - } - tt.return %0#0 : tensor<128x256xf32, #mma> - } -} From b2694d255f0f10b08433e755e29a65cc2e261346 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Fri, 26 Jul 2024 18:40:59 +0000 Subject: [PATCH 29/36] Delete unused block layout --- .../StreamPipelineV2.cpp | 30 +------------------ 
1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index 5812893586c7..034f0a6bb777 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -36,8 +36,6 @@ namespace { struct LoadInfo { // Shared layout is used for loads feeding into dot ops. ttg::SharedEncodingAttr sharedEncoding = nullptr; - // Blocked layout is used for loads not feeding into dot ops. - ttg::BlockedEncodingAttr blockedEncoding = nullptr; // The distance of this load's stage to its use' stage. int distToUse = 0; bool usedByDot = false; @@ -167,25 +165,6 @@ getSharedEncIfAllUsersAreDotEnc(Value val) { return attr; } -static ttg::BlockedEncodingAttr -getBlockedEncoding(tt::LoadOp loadOp, tt::ModuleAxisInfoAnalysis &axisInfo) { - Value src = loadOp.getPtr(); - auto ty = cast(src.getType()); - auto mod = loadOp->getParentOfType(); - int numWarps = ttg::TritonGPUDialect::getNumWarps(mod); - int threadsPerWarp = ttg::TritonGPUDialect::getThreadsPerWarp(mod); - tt::AxisInfo::DimVectorT contiguity = - axisInfo.getAxisInfo(src)->getContiguity(); - SmallVector order = argSort(contiguity); - unsigned currPerThread = getNumElementsPerThread(loadOp, order, axisInfo); - SmallVector sizePerThread(order.size(), 1); - sizePerThread[order[0]] = currPerThread; - ttg::CTALayoutAttr ctaLayout = ttg::getCTALayout(ty.getEncoding()); - return ttg::BlockedEncodingAttr::get(loadOp->getContext(), ty.getShape(), - sizePerThread, order, numWarps, - threadsPerWarp, ctaLayout); -} - // Create a map from load ops to their indirection levels and the final uses // of the load op (another load op, or a dot op). 
// @@ -248,7 +227,7 @@ assignMemoryLayouts(llvm::SmallVector> for (auto &[op, dist, use] : loadOpToIndLevelAndUse) { if (loadToInfo.count(op)) - // TODO We'd need to verify that the distance is the same + // TODO: We'd need to verify that the distance is the same. continue; LoadInfo loadInfo; @@ -296,13 +275,6 @@ assignMemoryLayouts(llvm::SmallVector> } } - // If we still don't have a shared encoding, try a "generic" shared - // encoding. - if (!loadInfo.sharedEncoding) { - // Also pipeline in-register buffers. - loadInfo.blockedEncoding = getBlockedEncoding(loadOp, axisInfoAnalysis); - } - loadToInfo[op] = loadInfo; } From bb931deb8ac17b8f3fb68954dd8015a18c01eee3 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Fri, 26 Jul 2024 18:45:54 +0000 Subject: [PATCH 30/36] Add some asserts regarding num stages --- .../amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index 034f0a6bb777..2ccf3022fa8e 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -115,8 +115,9 @@ static void createStreamCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc, loadOp->replaceAllUsesWith(result); - // Prefetch load if is used by the dot. + // Prefetch load ahead of the dot stage if is used by the dot. if (loadToInfo[loadOp].usedByDot) { + assert(numStages >= 2 && "requires num_stages=2 at least"); schedule.insert(storeOp, numStages - 2, prefetchCluster); schedule.insert(viewLoad, numStages - 2, prefetchCluster); } @@ -325,6 +326,7 @@ scheduleLoads(scf::ForOp forOp, tt::CoarseSchedule &schedule, // The stage gap between chained loads--this allows us to "spread" loads // with a non-one step in case the number of stages given by the user is // large. 
+ assert(numStages >= 2 && "requires num_stages=2 at least"); unsigned stagesBetweenLoads = llvm::divideCeil(numStages - 2, maxIndirectionLevel + 1); @@ -364,7 +366,7 @@ static void scheduleDependencies(scf::ForOp forOp, tt::CoarseSchedule &schedule, SmallVector> opsInOrder = schedule.getOpsInOrder(forOp); // Schedule dependencies stage by stage. - for (int stage = 0; stage < numStages; stage++) { + for (int stage = 0; stage < numStages; ++stage) { for (auto [op, stage_, cluster] : opsInOrder) { if (stage_ != stage) continue; From 10a2660fa931ca35b9703b7527ff6a62be3f7c9c Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Fri, 26 Jul 2024 20:28:07 +0000 Subject: [PATCH 31/36] Some more debug prints --- .../TritonAMDGPUTransforms/StreamPipelineV2.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index 2ccf3022fa8e..a901090705a9 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -320,15 +320,16 @@ scheduleLoads(scf::ForOp forOp, tt::CoarseSchedule &schedule, // Calculate the stage distance between applicable loads. int maxIndirectionLevel = -1; - for (auto [loadOp, dist, use] : loadOpToIndLevelAndUse) { + for (auto [loadOp, dist, use] : loadOpToIndLevelAndUse) maxIndirectionLevel = std::max(maxIndirectionLevel, dist); - } + // The stage gap between chained loads--this allows us to "spread" loads // with a non-one step in case the number of stages given by the user is // large. assert(numStages >= 2 && "requires num_stages=2 at least"); unsigned stagesBetweenLoads = llvm::divideCeil(numStages - 2, maxIndirectionLevel + 1); + LDBG("stagesBetweenLoads = " << stagesBetweenLoads); // Put the root uses of the loads in the last stage. 
tt::CoarseSchedule::Cluster rootUsersCluster = schedule.clusters.newAtFront(); @@ -356,6 +357,14 @@ scheduleLoads(scf::ForOp forOp, tt::CoarseSchedule &schedule, loadToInfo[loadOp].distToUse = schedule[use].first - schedule[loadOp].first; } + LLVM_DEBUG({ + LDBG("Chosen loads to pipeline:"); + for (const auto &[load, info] : loadToInfo) { + LDBG(" - load: " << *load); + LDBG(" distToUse: " << info.distToUse); + LDBG(" usedByDot: " << info.usedByDot); + } + }); return loadToInfo; } @@ -494,6 +503,7 @@ createStreamOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule, int numBuffers = -1; for (auto &[_, info] : loadToInfo) numBuffers = std::max(numBuffers, info.distToUse); + LDBG("deduced shared memory buffer number = " << numBuffers); SmallVector allocs; SmallVector> loadToAllocs; @@ -663,7 +673,7 @@ static bool pipelineLoop(scf::ForOp forOp, int numStages) { if (failed(newForOp)) return false; - LDBG("Loop before expander:\n" << *newForOp); + LDBG("Loop before sending to expander:\n" << *newForOp); return true; } From 029cadb8b1da34c947a588aaacb2dbea70a11568 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Fri, 26 Jul 2024 20:54:38 +0000 Subject: [PATCH 32/36] Remove unused insertindx --- test/TritonGPU/loop-pipeline.mlir | 50 ++++++------------- .../StreamPipelineV2.cpp | 28 +++-------- 2 files changed, 21 insertions(+), 57 deletions(-) diff --git a/test/TritonGPU/loop-pipeline.mlir b/test/TritonGPU/loop-pipeline.mlir index d60b93403bd8..a8c8f493d5df 100644 --- a/test/TritonGPU/loop-pipeline.mlir +++ b/test/TritonGPU/loop-pipeline.mlir @@ -69,13 +69,10 @@ // AMD: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_17]] // AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] // AMD: triton_gpu.local_store %[[LOAD_16]], %[[MEMDESC_SUBVIEW_18]] -// AMD: %{{.*}}:7 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] 
= %{{.*}}-1_i32, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_18]]) +// AMD: %{{.*}}:6 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_18]]) // AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} // AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_20]] -// AMD: %[[ADDI_22:.*]] = arith.addi %[[ARG9]], %{{.*}} -// AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ADDI_22]], %{{.*}} -// AMD: %[[SELECT_24:.*]] = arith.select %[[CMPI_23]], %[[ADDI_22]], %{{.*}} // AMD: %[[LOCAL_LOAD_25:.*]] = triton_gpu.local_load %[[ARG11]] // AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] // AMD: %[[MULF_29:.*]] = arith.mulf %[[LOCAL_LOAD_27]], %{{.*}} @@ -93,7 +90,7 @@ // AMD: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] // AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] // AMD: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] -// AMD: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_24]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// AMD: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] // AMD: } // AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_10]] @@ -204,13 +201,10 @@ tt.func @matmul_loop(%lb : index, %ub : index, %step : index, // AMD: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_18]] // AMD: %[[MEMDESC_SUBVIEW_19:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] // AMD: triton_gpu.local_store %[[LOAD_17]], %[[MEMDESC_SUBVIEW_19]] -// AMD: %{{.*}}:7 = scf.for %[[ARG7:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = 
%{{.*}}, %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_19]]) +// AMD: %{{.*}}:6 = scf.for %[[ARG7:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_19]]) // AMD: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} // AMD: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG7]], %[[SUBI_21]] -// AMD: %[[ADDI_23:.*]] = arith.addi %[[ARG11]], %{{.*}} -// AMD: %[[CMPI_24:.*]] = arith.cmpi slt, %[[ADDI_23]], %{{.*}} -// AMD: %[[SELECT_25:.*]] = arith.select %[[CMPI_24]], %[[ADDI_23]], %{{.*}} // AMD: %[[LOCAL_LOAD_26:.*]] = triton_gpu.local_load %[[ARG13]] // AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG14]] // AMD: %[[DOT_30:.*]] = tt.dot %[[LOCAL_LOAD_26]], %[[LOCAL_LOAD_28]], %[[ARG10]] @@ -227,7 +221,7 @@ tt.func @matmul_loop(%lb : index, %ub : index, %step : index, // AMD: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] // AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_39]], %{{.*}}, %{{.*}}] // AMD: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] -// AMD: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_25]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// AMD: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] // AMD: } // AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] // AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] @@ -314,12 +308,9 @@ tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index, // AMD: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} // AMD: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] // AMD: triton_gpu.local_store %[[LOAD_15]], 
%[[MEMDESC_SUBVIEW_16]] -// AMD: %{{.*}}:5 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}-1_i32, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %[[MEMDESC_SUBVIEW_16]]) +// AMD: %{{.*}}:4 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %[[MEMDESC_SUBVIEW_16]]) // AMD: %[[SUBI_18:.*]] = arith.subi %{{.*}}, %{{.*}} // AMD: %[[CMPI_19:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_18]] -// AMD: %[[ADDI_20:.*]] = arith.addi %[[ARG8]], %{{.*}} -// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ADDI_20]], %{{.*}} -// AMD: %[[SELECT_22:.*]] = arith.select %[[CMPI_21]], %[[ADDI_20]], %{{.*}} // AMD: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG10]] // AMD: %[[DOT_25:.*]] = tt.dot %[[CONVERT_LAYOUT_11]], %[[LOCAL_LOAD_23]], %[[ARG7]] // AMD: %[[ADDPTR_26:.*]] = tt.addptr %[[ARG6]], %{{.*}} @@ -330,7 +321,7 @@ tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index, // AMD: %[[SELECT_31:.*]] = arith.select %[[CMPI_30]], %[[ADDI_29]], %{{.*}} // AMD: %[[MEMDESC_SUBVIEW_32:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_31]], %{{.*}}, %{{.*}}] // AMD: triton_gpu.local_store %[[LOAD_28]], %[[MEMDESC_SUBVIEW_32]] -// AMD: scf.yield %[[ADDPTR_26]], %[[DOT_25]], %[[SELECT_22]], %[[SELECT_31]], %[[MEMDESC_SUBVIEW_32]] +// AMD: scf.yield %[[ADDPTR_26]], %[[DOT_25]], %[[SELECT_31]], %[[MEMDESC_SUBVIEW_32]] // AMD: } // AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] @@ -414,13 +405,10 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, // AMD: triton_gpu.local_store %[[LOAD_4]], %[[MEMDESC_SUBVIEW_22]] // AMD: %[[MEMDESC_SUBVIEW_23:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] // AMD: triton_gpu.local_store %[[LOAD_10]], %[[MEMDESC_SUBVIEW_23]] -// AMD: %{{.*}}:9 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} 
step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[ADDPTR_12]], %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_22]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_23]], %[[ARG14:.*]] = %[[LOAD_15]], %[[ARG15:.*]] = %[[LOAD_21]]) +// AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[ADDPTR_12]], %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_22]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_23]], %[[ARG14:.*]] = %[[LOAD_15]], %[[ARG15:.*]] = %[[LOAD_21]]) // AMD: %[[SUBI_25:.*]] = arith.subi %{{.*}}, %{{.*}} // AMD: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] -// AMD: %[[ADDI_27:.*]] = arith.addi %[[ARG10]], %{{.*}} -// AMD: %[[CMPI_28:.*]] = arith.cmpi slt, %[[ADDI_27]], %{{.*}} -// AMD: %[[SELECT_29:.*]] = arith.select %[[CMPI_28]], %[[ADDI_27]], %{{.*}} // AMD: %[[LOCAL_LOAD_30:.*]] = triton_gpu.local_load %[[ARG12]] // AMD: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %[[ARG13]] // AMD: %[[DOT_34:.*]] = tt.dot %[[LOCAL_LOAD_30]], %[[LOCAL_LOAD_31]], %[[ARG7]] @@ -441,7 +429,7 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, // AMD: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_48]] // AMD: %[[MEMDESC_SUBVIEW_49:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_47]], %{{.*}}, %{{.*}}] // AMD: triton_gpu.local_store %[[ARG15]], %[[MEMDESC_SUBVIEW_49]] -// AMD: scf.yield %[[DOT_34]], %[[ADDPTR_35]], %[[ADDPTR_36]], %[[SELECT_29]], %[[SELECT_47]], %[[MEMDESC_SUBVIEW_48]], %[[MEMDESC_SUBVIEW_49]], %[[LOAD_38]], %[[LOAD_44]] +// AMD: scf.yield %[[DOT_34]], %[[ADDPTR_35]], %[[ADDPTR_36]], %[[SELECT_47]], %[[MEMDESC_SUBVIEW_48]], %[[MEMDESC_SUBVIEW_49]], %[[LOAD_38]], %[[LOAD_44]] // AMD: } // AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] @@ -512,13 +500,10 @@ tt.func @indirect_bmm_scalar(%77: i64 
{tt.divisibility=16: i32}, // AMD: triton_gpu.local_store %[[LOAD_6]], %[[MEMDESC_SUBVIEW_14]] // AMD: %[[MEMDESC_SUBVIEW_15:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%{{.*}}, %{{.*}}, %{{.*}}] // AMD: triton_gpu.local_store %[[LOAD_12]], %[[MEMDESC_SUBVIEW_15]] -// AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %[[LOAD_7]], %[[ARG11:.*]] = %{{.*}}-1_i32, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_14]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_15]]) +// AMD: %{{.*}}:7 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %[[LOAD_7]], %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_14]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_15]]) // AMD: %[[SUBI_17:.*]] = arith.subi %{{.*}}, %{{.*}} // AMD: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_17]] -// AMD: %[[ADDI_19:.*]] = arith.addi %[[ARG11]], %{{.*}} -// AMD: %[[CMPI_20:.*]] = arith.cmpi slt, %[[ADDI_19]], %{{.*}} -// AMD: %[[SELECT_21:.*]] = arith.select %[[CMPI_20]], %[[ADDI_19]], %{{.*}} // AMD: %[[LOCAL_LOAD_22:.*]] = triton_gpu.local_load %[[ARG13]] // AMD: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG14]] // AMD: %[[DOT_26:.*]] = tt.dot %[[LOCAL_LOAD_22]], %[[LOCAL_LOAD_23]], %[[ARG7]] @@ -539,7 +524,7 @@ tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32}, // AMD: triton_gpu.local_store %[[LOAD_29]], %[[MEMDESC_SUBVIEW_40]] // AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%[[SELECT_39]], %{{.*}}, %{{.*}}] // AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_41]] -// AMD: scf.yield %[[DOT_26]], %[[ADDPTR_27]], %[[ADDPTR_36]], %[[LOAD_30]], %[[SELECT_21]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// AMD: scf.yield %[[DOT_26]], %[[ADDPTR_27]], 
%[[ADDPTR_36]], %[[LOAD_30]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] // AMD: } // AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_2]] // AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_3]] @@ -619,15 +604,12 @@ tt.func @indirect_bmm_scalar_dist_one(%77: i64 {tt.divisibility=16: i32}, // AMD: triton_gpu.local_store %[[LOAD_8]], %[[MEMDESC_SUBVIEW_17]] // AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] // AMD: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_18]] -// AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_6]], %[[ARG10:.*]] = %{{.*}}-1_i32, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[LOAD_16]]) +// AMD: %{{.*}}:7 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_6]], %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[LOAD_16]]) // AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} // AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] // AMD: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} // AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] -// AMD: %[[ADDI_24:.*]] = arith.addi %[[ARG10]], %{{.*}} -// AMD: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} -// AMD: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} // AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] // AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] // AMD: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] @@ -650,7 +632,7 @@ tt.func @indirect_bmm_scalar_dist_one(%77: i64 {tt.divisibility=16: i32}, // AMD: triton_gpu.local_store %[[LOAD_35]], 
%[[MEMDESC_SUBVIEW_47]] // AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_46]], %{{.*}}, %{{.*}}] // AMD: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] -// AMD: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] +// AMD: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] // AMD: } // AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] // AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] @@ -1038,15 +1020,12 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // AMD-DIS: #[[$SHARED_LAYOUT:shared.*]] = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}> // AMD-LABEL: tt.func @indirect_load_shared_layout -// AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) +// AMD: %{{.*}}:7 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) // AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} // AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] // AMD: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} // AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] -// AMD: %[[ADDI_24:.*]] = arith.addi %[[ARG10]], %{{.*}} -// AMD: %[[CMPI_25:.*]] = arith.cmpi slt, %[[ADDI_24]], %{{.*}} -// AMD: %[[SELECT_26:.*]] = arith.select %[[CMPI_25]], %[[ADDI_24]], %{{.*}} // AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] // AMD: %[[LOCAL_LOAD_28:.*]] 
= triton_gpu.local_load %[[ARG13]] // AMD: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] @@ -1069,7 +1048,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] // AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_46]], %{{.*}}, %{{.*}}] // AMD: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] -// AMD: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_26]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] +// AMD: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] // AMD: } #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> @@ -1745,7 +1724,6 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // AMD: tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] // AMD: scf.for // AMD: arith.select -// AMD: arith.select // AMD: arith.addf // AMD: %[[A:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] // AMD: %[[B:.*]] = tt.load {{.*}}, %{{.*}}, %[[CONSTANT]] diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index a901090705a9..667fa7804a73 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -56,8 +56,7 @@ static void appendToYield(scf::ForOp forOp, ArrayRef newOperands) { } static void createStreamCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc, - Value insertIdx, Value extractIdx, - tt::CoarseSchedule &schedule, + Value extractIdx, tt::CoarseSchedule &schedule, tt::CoarseSchedule::Cluster prefetchCluster, llvm::MapVector &loadToInfo, int numStages) { @@ -71,7 +70,6 @@ static void createStreamCopy(scf::ForOp 
&forOp, tt::LoadOp loadOp, Value alloc, tt::MemDescType allocTy = cast(alloc.getType()); SmallVector copyOffsets(allocTy.getRank(), zero); - copyOffsets[0] = insertIdx; Operation *copy = builder.clone(*loadOp); auto [stage, cluster] = schedule[loadOp]; @@ -524,33 +522,22 @@ createStreamOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule, Value minusOne = builder.create(loc, -1, 32); Value zero = builder.create(loc, 0, 32); Value one = builder.create(loc, 1, 32); - Value insertIdx = minusOne; Value extractIdx = minusOne; Value numBuffersVal = builder.create(loc, numBuffers, 32); - SmallVector newOperands; - newOperands.push_back(insertIdx); - newOperands.push_back(extractIdx); - unsigned newOperandIndex = forOp.getBody()->getNumArguments(); // Patch the loop to add the new loop carried dependencies. scf::ForOp newForOp = - replaceForOpWithNewSignature(builder, forOp, newOperands); + replaceForOpWithNewSignature(builder, forOp, {extractIdx}); forOp.erase(); forOp = newForOp; - // Create two counters for the insert and extract indices to avoid creating - // long liverange. - insertIdx = newForOp.getBody()->getArgument(newOperandIndex); - extractIdx = newForOp.getBody()->getArgument(newOperandIndex + 1); + // Create one counter for the extract indices to avoid creating long + // live range. 
+ extractIdx = newForOp.getBody()->getArgument(newOperandIndex); builder.setInsertionPoint(newForOp.getBody(), newForOp.getBody()->begin()); - insertIdx = builder.create(loc, insertIdx, one); - Value cndIns = builder.create(loc, arith::CmpIPredicate::slt, - insertIdx, numBuffersVal); - insertIdx = builder.create(loc, cndIns, insertIdx, zero); - extractIdx = builder.create(loc, extractIdx, one); Value cndExt = builder.create(loc, arith::CmpIPredicate::slt, extractIdx, numBuffersVal); @@ -562,13 +549,12 @@ createStreamOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule, for (auto &[op, alloc] : loadToAllocs) { if (auto loadOp = dyn_cast(op)) { - createStreamCopy(forOp, loadOp, alloc, insertIdx, extractIdx, schedule, + createStreamCopy(forOp, loadOp, alloc, extractIdx, schedule, prefetchCluster, loadToInfo, numStages); } } - SmallVector newYieldOperands = {insertIdx, extractIdx}; // Patch the yield with the updated counters. - appendToYield(forOp, newYieldOperands); + appendToYield(forOp, {extractIdx}); return allocs; } From 1e3068dcec62b2e3b97d4dfd725da08d69b3e931 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Fri, 26 Jul 2024 23:45:46 +0000 Subject: [PATCH 33/36] Fix debug print regarding loop before expander --- .../lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index 667fa7804a73..fba78fcf42fc 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -650,17 +650,11 @@ static bool pipelineLoop(scf::ForOp forOp, int numStages) { tt::PipeliningOption options; if (!preprocessLoopAndBuildSchedule(forOp, numStages, options)) return false; + LDBG("Loop before sending to expander:\n" << *forOp); IRRewriter rewriter(forOp->getContext()); rewriter.setInsertionPoint(forOp); - 
FailureOr newForOp = - tt::pipelineForLoop(rewriter, forOp, options); - - if (failed(newForOp)) - return false; - - LDBG("Loop before sending to expander:\n" << *newForOp); - return true; + return succeeded(tt::pipelineForLoop(rewriter, forOp, options)); } namespace { @@ -676,9 +670,6 @@ struct PipelinePass : public TritonAMDGPUStreamPipelineV2Base { loops.push_back(forOp); }); - if (loops.empty()) - return; - for (scf::ForOp forOp : loops) pipelineLoop(forOp, getNumStagesOrDefault(forOp)); } From 98e831d5641ba1f488ebb46bef94f47ddf51ac7f Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Sat, 27 Jul 2024 00:04:19 +0000 Subject: [PATCH 34/36] Create common utility for appendToForOpYield --- .../Dialect/TritonGPU/Transforms/Utility.h | 3 +++ .../Pipeliner/MatmulLoopPipeline.cpp | 14 +------------ lib/Dialect/TritonGPU/Transforms/Utility.cpp | 10 ++++++++++ .../StreamPipelineV2.cpp | 20 +++---------------- 4 files changed, 17 insertions(+), 30 deletions(-) diff --git a/include/triton/Dialect/TritonGPU/Transforms/Utility.h b/include/triton/Dialect/TritonGPU/Transforms/Utility.h index 27a2a9dbdc66..98fae2326b42 100644 --- a/include/triton/Dialect/TritonGPU/Transforms/Utility.h +++ b/include/triton/Dialect/TritonGPU/Transforms/Utility.h @@ -140,6 +140,9 @@ scf::IfOp replaceIfOpWithNewSignature( RewriterBase &rewriter, scf::IfOp loop, TypeRange newResultTypes, SmallVectorImpl> &replacements); +// Append the given |newOperands| to the |forOp|'s yield op. 
+void appendToForOpYield(scf::ForOp forOp, ArrayRef newOperands); + Operation *cloneWithInferType(mlir::OpBuilder &rewriter, Operation *op, IRMapping &mapping); diff --git a/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp b/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp index e18d9312daa8..eb15f03bda91 100644 --- a/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp @@ -51,18 +51,6 @@ struct LoadInfo { } // namespace -// Replace the ForOp's yield with a new one with the given operands appended. -static void appendToYield(scf::ForOp forOp, ArrayRef newOperands) { - // Fix up the yield op. - Operation *yieldOp = forOp.getBody()->getTerminator(); - SmallVector operands(yieldOp->getOperands()); - operands.append(newOperands.begin(), newOperands.end()); - - OpBuilder builder(yieldOp); - builder.create(yieldOp->getLoc(), operands); - yieldOp->erase(); -} - static void createAsyncCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc, Value insertIdx, Value extractIdx, tt::CoarseSchedule &schedule, @@ -1041,7 +1029,7 @@ createAsyncOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule, if (phase) newYieldOperands.push_back(phase); // Patch the yield with the updated counters. 
- appendToYield(forOp, newYieldOperands); + appendToForOpYield(forOp, newYieldOperands); return allocs; } diff --git a/lib/Dialect/TritonGPU/Transforms/Utility.cpp b/lib/Dialect/TritonGPU/Transforms/Utility.cpp index be4e486a248b..eaf0a7e2a148 100644 --- a/lib/Dialect/TritonGPU/Transforms/Utility.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Utility.cpp @@ -627,6 +627,16 @@ scf::IfOp replaceIfOpWithNewSignature( return newIf; } +void appendToForOpYield(scf::ForOp forOp, ArrayRef newOperands) { + Operation *yieldOp = forOp.getBody()->getTerminator(); + SmallVector operands(yieldOp->getOperands()); + operands.append(newOperands.begin(), newOperands.end()); + + OpBuilder builder(yieldOp); + builder.create(yieldOp->getLoc(), operands); + yieldOp->erase(); +} + Operation *cloneWithInferType(mlir::OpBuilder &rewriter, Operation *op, IRMapping &mapping) { Operation *newOp = rewriter.clone(*op, mapping); diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index fba78fcf42fc..84b0ce6ddcfd 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -43,18 +43,6 @@ struct LoadInfo { } // namespace -// Replace the forOp's yield with a new one with the given operands appended. -static void appendToYield(scf::ForOp forOp, ArrayRef newOperands) { - // Fix up the yield op. 
- Operation *yieldOp = forOp.getBody()->getTerminator(); - SmallVector operands(yieldOp->getOperands()); - operands.append(newOperands.begin(), newOperands.end()); - - OpBuilder builder(yieldOp); - builder.create(yieldOp->getLoc(), operands); - yieldOp->erase(); -} - static void createStreamCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc, Value extractIdx, tt::CoarseSchedule &schedule, tt::CoarseSchedule::Cluster prefetchCluster, @@ -543,18 +531,16 @@ createStreamOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule, extractIdx, numBuffersVal); extractIdx = builder.create(loc, cndExt, extractIdx, zero); - // Create a cluster for the prefetches. It may end up being empty, but this - // is OK. + // Create a cluster for prefetching global reads for the dot. tt::CoarseSchedule::Cluster prefetchCluster = schedule.clusters.newAtBack(); for (auto &[op, alloc] : loadToAllocs) { - if (auto loadOp = dyn_cast(op)) { + if (auto loadOp = dyn_cast(op)) createStreamCopy(forOp, loadOp, alloc, extractIdx, schedule, prefetchCluster, loadToInfo, numStages); - } } // Patch the yield with the updated counters. 
- appendToYield(forOp, {extractIdx}); + appendToForOpYield(forOp, {extractIdx}); return allocs; } From 7f1f8c1290446df89c54532abf558286d64061c0 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Sun, 28 Jul 2024 00:10:07 +0000 Subject: [PATCH 35/36] Clean up tests a bit --- test/TritonGPU/loop-pipeline-hip.mlir | 3 +- test/TritonGPU/loop-pipeline.mlir | 566 ++++++++++++-------------- 2 files changed, 268 insertions(+), 301 deletions(-) diff --git a/test/TritonGPU/loop-pipeline-hip.mlir b/test/TritonGPU/loop-pipeline-hip.mlir index 323085009e13..657da5f31346 100644 --- a/test/TritonGPU/loop-pipeline-hip.mlir +++ b/test/TritonGPU/loop-pipeline-hip.mlir @@ -6,7 +6,7 @@ #shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}> #shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}> module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} { -// CHECK-LABEL: tt.func @load_two_users + // CHECK-LABEL: tt.func @load_two_users tt.func @load_two_users(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) { %cst = arith.constant dense<0> : tensor<1x16xi32, #blocked> %cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1> @@ -40,7 +40,6 @@ module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 // CHECK: tt.load // CHECK: triton_gpu.local_store // CHECK: scf.yield - %17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 { %18 = tt.load %16 : tensor<64x16x!tt.ptr, #blocked> %19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> diff --git 
a/test/TritonGPU/loop-pipeline.mlir b/test/TritonGPU/loop-pipeline.mlir index a8c8f493d5df..f7a1e8127cbf 100644 --- a/test/TritonGPU/loop-pipeline.mlir +++ b/test/TritonGPU/loop-pipeline.mlir @@ -58,43 +58,41 @@ // CHECK: scf.yield {{.*}}, %[[INS_IDX_3]], %[[EXT_IDX_3]], %[[NEXT_A]], %[[NEXT_B]] // AMD-LABEL: tt.func @matmul_loop -// AMD: %[[LOCAL_ALLOC_10:.*]] = triton_gpu.local_alloc -// AMD: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc -// AMD: %[[CMPI_12:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_12]] -// AMD: %[[LOAD_14:.*]] = tt.load %{{.*}}, %[[SPLAT_13]] -// AMD: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_12]] -// AMD: %[[LOAD_16:.*]] = tt.load %{{.*}}, %[[SPLAT_15]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_17]] -// AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_16]], %[[MEMDESC_SUBVIEW_18]] -// AMD: %{{.*}}:6 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_18]]) - -// AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_20]] -// AMD: %[[LOCAL_LOAD_25:.*]] = triton_gpu.local_load %[[ARG11]] -// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] -// AMD: %[[MULF_29:.*]] = arith.mulf %[[LOCAL_LOAD_27]], %{{.*}} -// AMD: %[[DOT_30:.*]] = tt.dot %[[LOCAL_LOAD_25]], %[[MULF_29]], %[[ARG8]] -// AMD: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG6]], %{{.*}} -// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG7]], %{{.*}} -// AMD: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_21]] -// AMD: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]] -// AMD: 
%[[SPLAT_35:.*]] = tt.splat %[[CMPI_21]] -// AMD: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} -// AMD: %[[ADDI_37:.*]] = arith.addi %[[ARG10]], %{{.*}} -// AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] -// AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] -// AMD: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] -// AMD: } - -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_10]] -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] +// AMD: %[[LOCAL_ALLOC_10:.*]] = triton_gpu.local_alloc +// AMD: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_12:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_12]] +// AMD: %[[LOAD_14:.*]] = tt.load %{{.*}}, %[[SPLAT_13]] +// AMD: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_12]] +// AMD: %[[LOAD_16:.*]] = tt.load %{{.*}}, %[[SPLAT_15]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_17]] +// AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_16]], %[[MEMDESC_SUBVIEW_18]] +// AMD: %{{.*}}:6 = scf.for %[[ARG5:[a-z0-9]*]] = +// AMD-SAME: iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_18]]) +// AMD: %[[SUBI_20:.*]] = arith.subi 
%{{.*}}, %{{.*}} +// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_20]] +// AMD: %[[LOCAL_LOAD_25:.*]] = triton_gpu.local_load %[[ARG11]] +// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] +// AMD: %[[MULF_29:.*]] = arith.mulf %[[LOCAL_LOAD_27]], %{{.*}} +// AMD: %[[DOT_30:.*]] = tt.dot %[[LOCAL_LOAD_25]], %[[MULF_29]], %[[ARG8]] +// AMD: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG6]], %{{.*}} +// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG7]], %{{.*}} +// AMD: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_21]] +// AMD: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]] +// AMD: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_21]] +// AMD: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} +// AMD: %[[ADDI_37:.*]] = arith.addi %[[ARG10]], %{{.*}} +// AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_10]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] +// AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] +// AMD: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_10]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 : i32} { tt.func @matmul_loop(%lb : index, %ub : index, %step : index, @@ -188,45 +186,26 @@ tt.func @matmul_loop(%lb : index, %ub : index, %step : index, // CHECK: triton_gpu.async_copy_global_to_local // CHECK scf.yield -// AMD-LABEL: tt.func @matmul_loop_nested -// AMD: scf.for -// AMD: %[[LOCAL_ALLOC_11:.*]] = triton_gpu.local_alloc -// AMD: %[[LOCAL_ALLOC_12:.*]] = 
triton_gpu.local_alloc -// AMD: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] -// AMD: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} -// AMD: %[[SPLAT_16:.*]] = tt.splat %[[CMPI_13]] -// AMD: %[[LOAD_17:.*]] = tt.load %{{.*}}, %[[SPLAT_16]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_18]] -// AMD: %[[MEMDESC_SUBVIEW_19:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_17]], %[[MEMDESC_SUBVIEW_19]] -// AMD: %{{.*}}:6 = scf.for %[[ARG7:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_19]]) - -// AMD: %[[SUBI_21:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_22:.*]] = arith.cmpi slt, %[[ARG7]], %[[SUBI_21]] -// AMD: %[[LOCAL_LOAD_26:.*]] = triton_gpu.local_load %[[ARG13]] -// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG14]] -// AMD: %[[DOT_30:.*]] = tt.dot %[[LOCAL_LOAD_26]], %[[LOCAL_LOAD_28]], %[[ARG10]] -// AMD: %[[ADDPTR_31:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// AMD: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_22]] -// AMD: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_31]], %[[SPLAT_33]], %{{.*}} -// AMD: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_22]] -// AMD: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_35]], %{{.*}} -// AMD: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} -// AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_11]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// AMD: 
triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_40]] -// AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]] -// AMD: scf.yield %[[ADDPTR_31]], %[[ADDPTR_32]], %[[DOT_30]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] -// AMD: } -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_11]] -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] -// AMD: scf.yield %{{.*}}#2 -// AMD: } +// AMD-LABEL: tt.func @matmul_loop_nested +// AMD: scf.for +// AMD-COUNT-2: triton_gpu.local_alloc +// AMD-COUNT-2: tt.load +// AMD: %[[SUBVIEW0:.*]] = triton_gpu.memdesc_subview +// AMD: triton_gpu.local_store %{{.+}}, %[[SUBVIEW0]] +// AMD: %[[SUBVIEW1:.*]] = triton_gpu.memdesc_subview +// AMD: triton_gpu.local_store %{{.+}}, %[[SUBVIEW1]] +// AMD: %[[FOR:.*]]:6 = scf.for +// AMD-COUNT-2: triton_gpu.local_load +// AMD: tt.dot +// AMD-COUNT-2: tt.addptr +// AMD-COUNT-2: tt.load +// AMD: %[[SUBVIEW0:.*]] = triton_gpu.memdesc_subview +// AMD: triton_gpu.local_store %{{.+}}, %[[SUBVIEW0]] +// AMD: %[[SUBVIEW1:.*]] = triton_gpu.memdesc_subview +// AMD: triton_gpu.local_store %{{.+}}, %[[SUBVIEW1]] +// AMD: scf.yield +// AMD-COUNT-2: triton_gpu.local_dealloc +// AMD: scf.yield %[[FOR]]#2 tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index, %A : !tt.ptr {tt.divisibility = 16 : i32}, @@ -300,30 +279,29 @@ tt.func @matmul_loop_nested(%lb : index, %ub : index, %step : index, // CHECK: scf.yield {{.*}}, %[[INS_IDX_3]], %[[EXT_IDX_3]], %[[NEXT_B]] // AMD-LABEL: tt.func @matmul_loop_single_pipeline -// AMD: %[[LOAD_10:.*]] = tt.load -// AMD: %[[CONVERT_LAYOUT_11:.*]] = triton_gpu.convert_layout %[[LOAD_10]] -// AMD: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc -// AMD: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] -// AMD: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], 
%{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_16]] -// AMD: %{{.*}}:4 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %[[MEMDESC_SUBVIEW_16]]) -// AMD: %[[SUBI_18:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_19:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_18]] -// AMD: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG10]] -// AMD: %[[DOT_25:.*]] = tt.dot %[[CONVERT_LAYOUT_11]], %[[LOCAL_LOAD_23]], %[[ARG7]] -// AMD: %[[ADDPTR_26:.*]] = tt.addptr %[[ARG6]], %{{.*}} -// AMD: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_19]] -// AMD: %[[LOAD_28:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_27]], %{{.*}} -// AMD: %[[ADDI_29:.*]] = arith.addi %[[ARG9]], %{{.*}} -// AMD: %[[CMPI_30:.*]] = arith.cmpi slt, %[[ADDI_29]], %{{.*}} -// AMD: %[[SELECT_31:.*]] = arith.select %[[CMPI_30]], %[[ADDI_29]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_32:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_31]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_28]], %[[MEMDESC_SUBVIEW_32]] -// AMD: scf.yield %[[ADDPTR_26]], %[[DOT_25]], %[[SELECT_31]], %[[MEMDESC_SUBVIEW_32]] -// AMD: } -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] +// AMD: %[[LOAD_10:.*]] = tt.load +// AMD: %[[CONVERT_LAYOUT_11:.*]] = triton_gpu.convert_layout %[[LOAD_10]] +// AMD: %[[LOCAL_ALLOC_12:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_13:.*]] = arith.cmpi slt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_13]] +// AMD: %[[LOAD_15:.*]] = tt.load %{{.*}}, %[[SPLAT_14]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_16:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_15]], %[[MEMDESC_SUBVIEW_16]] +// AMD: %{{.*}}:4 = scf.for %[[ARG5:.*]] = %{{.*}} to %{{.*}} step %{{.*}} 
iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %[[MEMDESC_SUBVIEW_16]]) +// AMD: %[[SUBI_18:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_19:.*]] = arith.cmpi slt, %[[ARG5]], %[[SUBI_18]] +// AMD: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG10]] +// AMD: %[[DOT_25:.*]] = tt.dot %[[CONVERT_LAYOUT_11]], %[[LOCAL_LOAD_23]], %[[ARG7]] +// AMD: %[[ADDPTR_26:.*]] = tt.addptr %[[ARG6]], %{{.*}} +// AMD: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_19]] +// AMD: %[[LOAD_28:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_27]], %{{.*}} +// AMD: %[[ADDI_29:.*]] = arith.addi %[[ARG9]], %{{.*}} +// AMD: %[[CMPI_30:.*]] = arith.cmpi slt, %[[ADDI_29]], %{{.*}} +// AMD: %[[SELECT_31:.*]] = arith.select %[[CMPI_30]], %[[ADDI_29]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_32:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_12]][%[[SELECT_31]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_28]], %[[MEMDESC_SUBVIEW_32]] +// AMD: scf.yield %[[ADDPTR_26]], %[[DOT_25]], %[[SELECT_31]], %[[MEMDESC_SUBVIEW_32]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_12]] tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, %A : !tt.ptr {tt.divisibility = 16 : i32}, @@ -379,61 +357,59 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index, // CHECK: triton_gpu.async_wait {{.*}} {num = 2 : i32} // AMD-LABEL: tt.func @indirect_bmm_scalar -// AMD: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc -// AMD: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc -// AMD: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] -// AMD: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] -// AMD: %[[LOAD_5:.*]] = tt.load %{{.*}}, %[[CMPI_2]] -// AMD: %[[MULI_6:.*]] = arith.muli %{{.*}}, %[[LOAD_5]] -// AMD: %[[SPLAT_7:.*]] = tt.splat %[[MULI_6]] -// AMD: %[[ADDPTR_8:.*]] = tt.addptr %{{.*}}, %[[SPLAT_7]] -// AMD: %[[SPLAT_9:.*]] = tt.splat %[[CMPI_2]] -// AMD: %[[LOAD_10:.*]] 
= tt.load %[[ADDPTR_8]], %[[SPLAT_9]] -// AMD: %[[CMPI_11:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// AMD: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} -// AMD: %[[ADDPTR_13:.*]] = tt.addptr %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_11]] -// AMD: %[[LOAD_15:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_14]] -// AMD: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_13]], %[[CMPI_11]] -// AMD: %[[MULI_17:.*]] = arith.muli %{{.*}}, %[[LOAD_16]] -// AMD: %[[SPLAT_18:.*]] = tt.splat %[[MULI_17]] -// AMD: %[[ADDPTR_19:.*]] = tt.addptr %{{.*}}, %[[SPLAT_18]] -// AMD: %[[SPLAT_20:.*]] = tt.splat %[[CMPI_11]] -// AMD: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_19]], %[[SPLAT_20]] -// AMD: %[[MEMDESC_SUBVIEW_22:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_4]], %[[MEMDESC_SUBVIEW_22]] -// AMD: %[[MEMDESC_SUBVIEW_23:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_10]], %[[MEMDESC_SUBVIEW_23]] -// AMD: %{{.*}}:8 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[ADDPTR_12]], %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_22]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_23]], %[[ARG14:.*]] = %[[LOAD_15]], %[[ARG15:.*]] = %[[LOAD_21]]) - -// AMD: %[[SUBI_25:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] -// AMD: %[[LOCAL_LOAD_30:.*]] = triton_gpu.local_load %[[ARG12]] -// AMD: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %[[ARG13]] -// AMD: %[[DOT_34:.*]] = tt.dot %[[LOCAL_LOAD_30]], %[[LOCAL_LOAD_31]], %[[ARG7]] -// AMD: %[[ADDPTR_35:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// AMD: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// AMD: %[[SPLAT_37:.*]] = tt.splat %[[CMPI_26]] -// AMD: %[[LOAD_38:.*]] = tt.load %[[ADDPTR_35]], %[[SPLAT_37]] -// AMD: %[[LOAD_39:.*]] = tt.load %[[ADDPTR_36]], 
%[[CMPI_26]] -// AMD: %[[MULI_40:.*]] = arith.muli %{{.*}}, %[[LOAD_39]] -// AMD: %[[SPLAT_41:.*]] = tt.splat %[[MULI_40]] -// AMD: %[[ADDPTR_42:.*]] = tt.addptr %{{.*}}, %[[SPLAT_41]] -// AMD: %[[SPLAT_43:.*]] = tt.splat %[[CMPI_26]] -// AMD: %[[LOAD_44:.*]] = tt.load %[[ADDPTR_42]], %[[SPLAT_43]] -// AMD: %[[ADDI_45:.*]] = arith.addi %[[ARG11]], %{{.*}} -// AMD: %[[CMPI_46:.*]] = arith.cmpi slt, %[[ADDI_45]], %{{.*}} -// AMD: %[[SELECT_47:.*]] = arith.select %[[CMPI_46]], %[[ADDI_45]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_47]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_48]] -// AMD: %[[MEMDESC_SUBVIEW_49:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_47]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[ARG15]], %[[MEMDESC_SUBVIEW_49]] -// AMD: scf.yield %[[DOT_34]], %[[ADDPTR_35]], %[[ADDPTR_36]], %[[SELECT_47]], %[[MEMDESC_SUBVIEW_48]], %[[MEMDESC_SUBVIEW_49]], %[[LOAD_38]], %[[LOAD_44]] -// AMD: } - -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] +// AMD: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc +// AMD: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] +// AMD: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] +// AMD: %[[LOAD_5:.*]] = tt.load %{{.*}}, %[[CMPI_2]] +// AMD: %[[MULI_6:.*]] = arith.muli %{{.*}}, %[[LOAD_5]] +// AMD: %[[SPLAT_7:.*]] = tt.splat %[[MULI_6]] +// AMD: %[[ADDPTR_8:.*]] = tt.addptr %{{.*}}, %[[SPLAT_7]] +// AMD: %[[SPLAT_9:.*]] = tt.splat %[[CMPI_2]] +// AMD: %[[LOAD_10:.*]] = tt.load %[[ADDPTR_8]], %[[SPLAT_9]] +// AMD: %[[CMPI_11:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// AMD: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %{{.*}} +// AMD: %[[ADDPTR_13:.*]] = tt.addptr %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_14:.*]] = tt.splat %[[CMPI_11]] +// AMD: 
%[[LOAD_15:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_14]] +// AMD: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_13]], %[[CMPI_11]] +// AMD: %[[MULI_17:.*]] = arith.muli %{{.*}}, %[[LOAD_16]] +// AMD: %[[SPLAT_18:.*]] = tt.splat %[[MULI_17]] +// AMD: %[[ADDPTR_19:.*]] = tt.addptr %{{.*}}, %[[SPLAT_18]] +// AMD: %[[SPLAT_20:.*]] = tt.splat %[[CMPI_11]] +// AMD: %[[LOAD_21:.*]] = tt.load %[[ADDPTR_19]], %[[SPLAT_20]] +// AMD: %[[MEMDESC_SUBVIEW_22:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_4]], %[[MEMDESC_SUBVIEW_22]] +// AMD: %[[MEMDESC_SUBVIEW_23:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_10]], %[[MEMDESC_SUBVIEW_23]] +// AMD: %{{.*}}:8 = scf.for %[[ARG6:[a-z0-9]*]] = +// AMD-SAME: iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %[[ADDPTR_12]], %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_22]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_23]], %[[ARG14:.*]] = %[[LOAD_15]], %[[ARG15:.*]] = %[[LOAD_21]]) +// AMD: %[[SUBI_25:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_26:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_25]] +// AMD: %[[LOCAL_LOAD_30:.*]] = triton_gpu.local_load %[[ARG12]] +// AMD: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %[[ARG13]] +// AMD: %[[DOT_34:.*]] = tt.dot %[[LOCAL_LOAD_30]], %[[LOCAL_LOAD_31]], %[[ARG7]] +// AMD: %[[ADDPTR_35:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// AMD: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// AMD: %[[SPLAT_37:.*]] = tt.splat %[[CMPI_26]] +// AMD: %[[LOAD_38:.*]] = tt.load %[[ADDPTR_35]], %[[SPLAT_37]] +// AMD: %[[LOAD_39:.*]] = tt.load %[[ADDPTR_36]], %[[CMPI_26]] +// AMD: %[[MULI_40:.*]] = arith.muli %{{.*}}, %[[LOAD_39]] +// AMD: %[[SPLAT_41:.*]] = tt.splat %[[MULI_40]] +// AMD: %[[ADDPTR_42:.*]] = tt.addptr %{{.*}}, %[[SPLAT_41]] +// AMD: %[[SPLAT_43:.*]] = tt.splat %[[CMPI_26]] +// AMD: %[[LOAD_44:.*]] = tt.load 
%[[ADDPTR_42]], %[[SPLAT_43]] +// AMD: %[[ADDI_45:.*]] = arith.addi %[[ARG11]], %{{.*}} +// AMD: %[[CMPI_46:.*]] = arith.cmpi slt, %[[ADDI_45]], %{{.*}} +// AMD: %[[SELECT_47:.*]] = arith.select %[[CMPI_46]], %[[ADDI_45]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_47]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_48]] +// AMD: %[[MEMDESC_SUBVIEW_49:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_47]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[ARG15]], %[[MEMDESC_SUBVIEW_49]] +// AMD: scf.yield %[[DOT_34]], %[[ADDPTR_35]], %[[ADDPTR_36]], %[[SELECT_47]], %[[MEMDESC_SUBVIEW_48]], %[[MEMDESC_SUBVIEW_49]], %[[LOAD_38]], %[[LOAD_44]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32}, %76: index, @@ -482,52 +458,51 @@ tt.func @indirect_bmm_scalar(%77: i64 {tt.divisibility=16: i32}, // CHECK: scf.yield {{.*}}, {{.*}}, {{.*}}, %[[IND_BUFFER_0]] // AMD-LABEL: tt.func @indirect_bmm_scalar_dist_one -// AMD: %[[LOAD_0:.*]] = tt.load %{{.*}} -// AMD: %[[ADDPTR_1:.*]] = tt.addptr %{{.*}}, %{{.*}} -// AMD: %[[LOCAL_ALLOC_2:.*]] = triton_gpu.local_alloc -// AMD: %[[LOCAL_ALLOC_3:.*]] = triton_gpu.local_alloc -// AMD: %[[CMPI_4:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_5:.*]] = tt.splat %[[CMPI_4]] -// AMD: %[[LOAD_6:.*]] = tt.load %{{.*}}, %[[SPLAT_5]] -// AMD: %[[LOAD_7:.*]] = tt.load %[[ADDPTR_1]], %[[CMPI_4]] -// AMD: %[[MULI_8:.*]] = arith.muli %{{.*}}, %[[LOAD_0]] -// AMD: %[[SPLAT_9:.*]] = tt.splat %[[MULI_8]] -// AMD: %[[ADDPTR_10:.*]] = tt.addptr %{{.*}}, %[[SPLAT_9]] -// AMD: %[[SPLAT_11:.*]] = tt.splat %[[CMPI_4]] -// AMD: %[[LOAD_12:.*]] = tt.load %[[ADDPTR_10]], %[[SPLAT_11]] -// AMD: %[[ADDPTR_13:.*]] = tt.addptr %[[ADDPTR_1]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_14:.*]] = triton_gpu.memdesc_subview 
%[[LOCAL_ALLOC_2]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_6]], %[[MEMDESC_SUBVIEW_14]] -// AMD: %[[MEMDESC_SUBVIEW_15:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_12]], %[[MEMDESC_SUBVIEW_15]] -// AMD: %{{.*}}:7 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %[[LOAD_7]], %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_14]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_15]]) - -// AMD: %[[SUBI_17:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_17]] -// AMD: %[[LOCAL_LOAD_22:.*]] = triton_gpu.local_load %[[ARG13]] -// AMD: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %[[ARG14]] -// AMD: %[[DOT_26:.*]] = tt.dot %[[LOCAL_LOAD_22]], %[[LOCAL_LOAD_23]], %[[ARG7]] -// AMD: %[[ADDPTR_27:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// AMD: %[[SPLAT_28:.*]] = tt.splat %[[CMPI_18]] -// AMD: %[[LOAD_29:.*]] = tt.load %[[ADDPTR_27]], %[[SPLAT_28]] -// AMD: %[[LOAD_30:.*]] = tt.load %[[ARG9]], %[[CMPI_18]] -// AMD: %[[MULI_31:.*]] = arith.muli %{{.*}}, %[[ARG10]] -// AMD: %[[SPLAT_32:.*]] = tt.splat %[[MULI_31]] -// AMD: %[[ADDPTR_33:.*]] = tt.addptr %{{.*}}, %[[SPLAT_32]] -// AMD: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_18]] -// AMD: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_34]] -// AMD: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// AMD: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} -// AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} -// AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%[[SELECT_39]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_29]], %[[MEMDESC_SUBVIEW_40]] -// AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%[[SELECT_39]], 
%{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_41]] -// AMD: scf.yield %[[DOT_26]], %[[ADDPTR_27]], %[[ADDPTR_36]], %[[LOAD_30]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] -// AMD: } -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_2]] -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_3]] +// AMD: %[[LOAD_0:.*]] = tt.load %{{.*}} +// AMD: %[[ADDPTR_1:.*]] = tt.addptr %{{.*}}, %{{.*}} +// AMD: %[[LOCAL_ALLOC_2:.*]] = triton_gpu.local_alloc +// AMD: %[[LOCAL_ALLOC_3:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_4:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_5:.*]] = tt.splat %[[CMPI_4]] +// AMD: %[[LOAD_6:.*]] = tt.load %{{.*}}, %[[SPLAT_5]] +// AMD: %[[LOAD_7:.*]] = tt.load %[[ADDPTR_1]], %[[CMPI_4]] +// AMD: %[[MULI_8:.*]] = arith.muli %{{.*}}, %[[LOAD_0]] +// AMD: %[[SPLAT_9:.*]] = tt.splat %[[MULI_8]] +// AMD: %[[ADDPTR_10:.*]] = tt.addptr %{{.*}}, %[[SPLAT_9]] +// AMD: %[[SPLAT_11:.*]] = tt.splat %[[CMPI_4]] +// AMD: %[[LOAD_12:.*]] = tt.load %[[ADDPTR_10]], %[[SPLAT_11]] +// AMD: %[[ADDPTR_13:.*]] = tt.addptr %[[ADDPTR_1]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_14:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_6]], %[[MEMDESC_SUBVIEW_14]] +// AMD: %[[MEMDESC_SUBVIEW_15:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_12]], %[[MEMDESC_SUBVIEW_15]] +// AMD: %{{.*}}:7 = scf.for %[[ARG6:[a-z0-9]*]] = +// AMD-SAME: iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_13]], %[[ARG10:.*]] = %[[LOAD_7]], %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_14]], %[[ARG14:.*]] = %[[MEMDESC_SUBVIEW_15]]) +// AMD: %[[SUBI_17:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_18:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_17]] +// AMD: %[[LOCAL_LOAD_22:.*]] = triton_gpu.local_load %[[ARG13]] +// AMD: %[[LOCAL_LOAD_23:.*]] = 
triton_gpu.local_load %[[ARG14]] +// AMD: %[[DOT_26:.*]] = tt.dot %[[LOCAL_LOAD_22]], %[[LOCAL_LOAD_23]], %[[ARG7]] +// AMD: %[[ADDPTR_27:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// AMD: %[[SPLAT_28:.*]] = tt.splat %[[CMPI_18]] +// AMD: %[[LOAD_29:.*]] = tt.load %[[ADDPTR_27]], %[[SPLAT_28]] +// AMD: %[[LOAD_30:.*]] = tt.load %[[ARG9]], %[[CMPI_18]] +// AMD: %[[MULI_31:.*]] = arith.muli %{{.*}}, %[[ARG10]] +// AMD: %[[SPLAT_32:.*]] = tt.splat %[[MULI_31]] +// AMD: %[[ADDPTR_33:.*]] = tt.addptr %{{.*}}, %[[SPLAT_32]] +// AMD: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_18]] +// AMD: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_34]] +// AMD: %[[ADDPTR_36:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// AMD: %[[ADDI_37:.*]] = arith.addi %[[ARG12]], %{{.*}} +// AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}} +// AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_2]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_29]], %[[MEMDESC_SUBVIEW_40]] +// AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_3]][%[[SELECT_39]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_41]] +// AMD: scf.yield %[[DOT_26]], %[[ADDPTR_27]], %[[ADDPTR_36]], %[[LOAD_30]], %[[SELECT_39]], %[[MEMDESC_SUBVIEW_40]], %[[MEMDESC_SUBVIEW_41]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_2]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_3]] tt.func @indirect_bmm_scalar_dist_one(%77: i64 {tt.divisibility=16: i32}, %76: index, @@ -583,59 +558,58 @@ tt.func @indirect_bmm_scalar_dist_one(%77: i64 {tt.divisibility=16: i32}, // CHECK: scf.yield // AMD-LABEL: tt.func @indirect_bmm_vector -// AMD: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc -// AMD: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc -// AMD: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] -// AMD: 
%[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] -// AMD: %[[CMPI_5:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} -// AMD: %[[ADDPTR_6:.*]] = tt.addptr %{{.*}}, %{{.*}} -// AMD: %[[SPLAT_7:.*]] = tt.splat %[[CMPI_2]] -// AMD: %[[LOAD_8:.*]] = tt.load %{{.*}}, %[[SPLAT_7]] -// AMD: %[[EXPAND_DIMS_9:.*]] = tt.expand_dims %[[LOAD_4]] {axis = 1 : i32} -// AMD: %[[BROADCAST_10:.*]] = tt.broadcast %[[EXPAND_DIMS_9]] -// AMD: %[[MULI_11:.*]] = arith.muli %{{.*}}, %[[BROADCAST_10]] -// AMD: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %[[MULI_11]] -// AMD: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_2]] -// AMD: %[[LOAD_14:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_13]] -// AMD: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_5]] -// AMD: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_6]], %[[SPLAT_15]] -// AMD: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_8]], %[[MEMDESC_SUBVIEW_17]] -// AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_18]] -// AMD: %{{.*}}:7 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_6]], %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[LOAD_16]]) - -// AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] -// AMD: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] -// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] -// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] -// AMD: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] -// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// AMD: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} 
-// AMD: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] -// AMD: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] -// AMD: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} -// AMD: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] -// AMD: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] -// AMD: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] -// AMD: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] -// AMD: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] -// AMD: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] -// AMD: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] -// AMD: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} -// AMD: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} -// AMD: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_46]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] -// AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_46]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] -// AMD: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] -// AMD: } -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] -// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] +// AMD: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc +// AMD: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc +// AMD: %[[CMPI_2:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_3:.*]] = tt.splat %[[CMPI_2]] +// AMD: %[[LOAD_4:.*]] = tt.load %{{.*}}, %[[SPLAT_3]] +// AMD: %[[CMPI_5:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} +// AMD: %[[ADDPTR_6:.*]] = tt.addptr %{{.*}}, %{{.*}} +// AMD: %[[SPLAT_7:.*]] = tt.splat %[[CMPI_2]] +// AMD: %[[LOAD_8:.*]] = tt.load %{{.*}}, %[[SPLAT_7]] +// AMD: %[[EXPAND_DIMS_9:.*]] = tt.expand_dims 
%[[LOAD_4]] {axis = 1 : i32} +// AMD: %[[BROADCAST_10:.*]] = tt.broadcast %[[EXPAND_DIMS_9]] +// AMD: %[[MULI_11:.*]] = arith.muli %{{.*}}, %[[BROADCAST_10]] +// AMD: %[[ADDPTR_12:.*]] = tt.addptr %{{.*}}, %[[MULI_11]] +// AMD: %[[SPLAT_13:.*]] = tt.splat %[[CMPI_2]] +// AMD: %[[LOAD_14:.*]] = tt.load %[[ADDPTR_12]], %[[SPLAT_13]] +// AMD: %[[SPLAT_15:.*]] = tt.splat %[[CMPI_5]] +// AMD: %[[LOAD_16:.*]] = tt.load %[[ADDPTR_6]], %[[SPLAT_15]] +// AMD: %[[MEMDESC_SUBVIEW_17:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_8]], %[[MEMDESC_SUBVIEW_17]] +// AMD: %[[MEMDESC_SUBVIEW_18:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%{{.*}}, %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_14]], %[[MEMDESC_SUBVIEW_18]] +// AMD: %{{.*}}:7 = scf.for %[[ARG6:[a-z0-9]*]] = +// AMD-SAME: iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %[[ADDPTR_6]], %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %[[MEMDESC_SUBVIEW_17]], %[[ARG13:.*]] = %[[MEMDESC_SUBVIEW_18]], %[[ARG14:.*]] = %[[LOAD_16]]) +// AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] +// AMD: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] +// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] +// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] +// AMD: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] +// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// AMD: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// AMD: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] +// AMD: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] +// AMD: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} +// AMD: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] +// AMD: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] +// 
AMD: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] +// AMD: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] +// AMD: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] +// AMD: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] +// AMD: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] +// AMD: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} +// AMD: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} +// AMD: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] +// AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_46]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] +// AMD: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]] +// AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]] tt.func @indirect_bmm_vector(%77: tensor<16x16xi64, #BL> {tt.divisibility=16: i32, tt.constancy=16: i32}, %76: index, @@ -919,12 +893,12 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: scf.yield // AMD-LABEL: tt.func public @nested_loops -// AMD: scf.for -// AMD: triton_gpu.local_alloc -// AMD-NOT: triton_gpu.local_alloc -// AMD: scf.for -// AMD: scf.yield -// AMD-DIS: scf.yield +// AMD: scf.for +// AMD: triton_gpu.local_alloc +// AMD-NOT: triton_gpu.local_alloc +// AMD: scf.for +// AMD: scf.yield +// AMD-DIS: scf.yield // // The following code has the structure: @@ -939,14 +913,9 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // } // ``` // -// Only the outer for should be pipelined. 
The regression this tests -// causes an assertion to fail while pipelining the outer `for`, in -// particular while predicating the operations scheduled to be emitted -// in the prologue. -// -// We check that there is no allocation before the first occurrence of -// scf.for because that would mean that the first load `%a = load()` -// would be pipelined. +// For CUDA, we pipeline the inner loop first then pipeline the outer +// loop to prefetch the async copy after the inner loop. +// For HIP, we only pipeline the inner loop for now. #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [16, 8]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { @@ -1018,38 +987,37 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: triton_gpu.async_copy_global_to_local %[[NEXT_BUFFER_0]] // CHECK: triton_gpu.async_wait {{.*}} {num = 1 : i32} -// AMD-DIS: #[[$SHARED_LAYOUT:shared.*]] = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}> +// AMD-DIS: #[[$SHARED_LAYOUT:shared.*]] = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}> // AMD-LABEL: tt.func @indirect_load_shared_layout -// AMD: %{{.*}}:7 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) - -// AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] -// AMD: %[[SUBI_22:.*]] = arith.subi %{{.*}}, %{{.*}} -// AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] -// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] -// AMD: 
%[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] -// AMD: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] -// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} -// AMD: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} -// AMD: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] -// AMD: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] -// AMD: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} -// AMD: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] -// AMD: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] -// AMD: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] -// AMD: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] -// AMD: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] -// AMD: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] -// AMD: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] -// AMD: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} -// AMD: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} -// AMD: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} -// AMD: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_46]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] -// AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_46]], %{{.*}}, %{{.*}}] -// AMD: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] -// AMD: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] -// AMD: } +// AMD: %{{.*}}:7 = scf.for %[[ARG6:[a-z0-9]*]] = +// AMD-SAME: iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}}, %[[ARG14:.*]] = %{{.*}}) +// AMD: %[[SUBI_20:.*]] = arith.subi %{{.*}}, %{{.*}} +// AMD: %[[CMPI_21:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_20]] +// AMD: %[[SUBI_22:.*]] = arith.subi %{{.*}}, 
%{{.*}} +// AMD: %[[CMPI_23:.*]] = arith.cmpi slt, %[[ARG6]], %[[SUBI_22]] +// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[ARG12]] +// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[ARG13]] +// AMD: %[[DOT_31:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[LOCAL_LOAD_28]], %[[ARG7]] +// AMD: %[[ADDPTR_32:.*]] = tt.addptr %[[ARG8]], %{{.*}} +// AMD: %[[ADDPTR_33:.*]] = tt.addptr %[[ARG9]], %{{.*}} +// AMD: %[[SPLAT_34:.*]] = tt.splat %[[CMPI_23]] +// AMD: %[[LOAD_35:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_34]] +// AMD: %[[EXPAND_DIMS_36:.*]] = tt.expand_dims %[[ARG14]] {axis = 1 : i32} +// AMD: %[[BROADCAST_37:.*]] = tt.broadcast %[[EXPAND_DIMS_36]] +// AMD: %[[MULI_38:.*]] = arith.muli %{{.*}}, %[[BROADCAST_37]] +// AMD: %[[ADDPTR_39:.*]] = tt.addptr %{{.*}}, %[[MULI_38]] +// AMD: %[[SPLAT_40:.*]] = tt.splat %[[CMPI_23]] +// AMD: %[[LOAD_41:.*]] = tt.load %[[ADDPTR_39]], %[[SPLAT_40]] +// AMD: %[[SPLAT_42:.*]] = tt.splat %[[CMPI_21]] +// AMD: %[[LOAD_43:.*]] = tt.load %[[ADDPTR_33]], %[[SPLAT_42]] +// AMD: %[[ADDI_44:.*]] = arith.addi %[[ARG11]], %{{.*}} +// AMD: %[[CMPI_45:.*]] = arith.cmpi slt, %[[ADDI_44]], %{{.*}} +// AMD: %[[SELECT_46:.*]] = arith.select %[[CMPI_45]], %[[ADDI_44]], %{{.*}} +// AMD: %[[MEMDESC_SUBVIEW_47:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_46]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_35]], %[[MEMDESC_SUBVIEW_47]] +// AMD: %[[MEMDESC_SUBVIEW_48:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_46]], %{{.*}}, %{{.*}}] +// AMD: triton_gpu.local_store %[[LOAD_41]], %[[MEMDESC_SUBVIEW_48]] +// AMD: scf.yield %[[DOT_31]], %[[ADDPTR_32]], %[[ADDPTR_33]], %[[SELECT_46]], %[[MEMDESC_SUBVIEW_47]], %[[MEMDESC_SUBVIEW_48]], %[[LOAD_43]] #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> #BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}> From 1bb58689ae4881393d71af0c582945cae2d355f0 Mon 
Sep 17 00:00:00 2001 From: Lei Zhang Date: Mon, 29 Jul 2024 02:37:01 +0000 Subject: [PATCH 36/36] Reduce the level of nestedness --- .../StreamPipelineV2.cpp | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp index 84b0ce6ddcfd..a785cfd2ffec 100644 --- a/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp +++ b/third_party/amd/lib/TritonAMDGPUTransforms/StreamPipelineV2.cpp @@ -397,27 +397,25 @@ static void scheduleDistanceOneDependencies(scf::ForOp forOp, if (stage == numStages - 1) continue; for (Value operand : getNestedOperands(&op)) { - if (auto arg = dyn_cast<BlockArgument>(operand)) { - if (arg.getArgNumber() > 0 && arg.getOwner() == op.getBlock()) { - auto yieldOp = op.getBlock()->getTerminator(); - Value v = yieldOp->getOperand(arg.getArgNumber() - 1); - Operation *defOp = v.getDefiningOp(); - if (defOp && schedule.count(defOp) == 0) { - if (isa<tt::LoadOp>(defOp)) { - // Exception: Schedule loads with a distance of 1 together - // with the current op. - schedule.insertIfAbsent(defOp, stage, cluster); - schedule.insertDepsOfOp(defOp, stage, cluster, true); - } else { - if (dist1Cluster.count(&cluster) == 0) { - dist1Cluster[&cluster] = schedule.clusters.newBefore(cluster); - } - schedule.insertIfAbsent(defOp, stage + 1, dist1Cluster[&cluster]); - schedule.insertDepsOfOp(defOp, stage + 1, dist1Cluster[&cluster], - true); - } - } + auto arg = dyn_cast<BlockArgument>(operand); + if (!arg || arg.getArgNumber() == 0 || arg.getOwner() != op.getBlock()) + continue; + auto yieldOp = op.getBlock()->getTerminator(); + Value v = yieldOp->getOperand(arg.getArgNumber() - 1); + Operation *defOp = v.getDefiningOp(); + if (!defOp || schedule.count(defOp) != 0) + continue; + if (isa<tt::LoadOp>(defOp)) { + // Exception: schedule loads with a distance of 1 together with the + // current op. 
+ schedule.insertIfAbsent(defOp, stage, cluster); + schedule.insertDepsOfOp(defOp, stage, cluster, true); + } else { + if (dist1Cluster.count(&cluster) == 0) { + dist1Cluster[&cluster] = schedule.clusters.newBefore(cluster); } + schedule.insertIfAbsent(defOp, stage + 1, dist1Cluster[&cluster]); + schedule.insertDepsOfOp(defOp, stage + 1, dist1Cluster[&cluster], true); } } }