Merged

38 commits
4ad6675  [AMD-Pipeliner] Transition stream-pipeline to new SW pipelining infra… (Jun 17, 2024)
f06e622  [AMD-Reorder] Move `tt.load`s as early as possible (Jun 17, 2024)
047c2c1  * consolidated/fixed stream-pipeliner tests (Jun 17, 2024)
989150f  * updated test (Jun 17, 2024)
5091416  * Find insertion point for loads/local_stores as early as possible (Jun 20, 2024)
d42830b  * Reorder with BFS to keep relative order. (Jun 25, 2024)
768ed95  * fixed pruning (Jun 25, 2024)
452a3fa  * updated test (Jun 26, 2024)
e344245  * invert order of loads and local_stores (Jun 27, 2024)
cd8018d  * Removed outer loop pipelining. It does not improve perf and may be … (Jul 16, 2024)
faf95cb  * cleanup tests (Jul 17, 2024)
c0ff506  * Restore old stream-pipeliner and moved new to StreamPipelineV2.cpp (Jul 22, 2024)
96c326b  * register new pass tritonamdgpu-stream-pipeline-v2 (Jul 22, 2024)
e4a89b3  * update tests (Jul 22, 2024)
ee98933  Swap to disable new pipeline by default (antiagainst, Jul 23, 2024)
c464a84  Drop unused header includes (antiagainst, Jul 23, 2024)
1ceb6c6  Drop changes to be exposed in future pull requests (antiagainst, Jul 24, 2024)
3353b7d  [TEST] Drop irrelevant NVIDIA specific attributes (antiagainst, Jul 24, 2024)
c82defc  Drop unused chained load logic (antiagainst, Jul 24, 2024)
9c91b31  Add debug print (antiagainst, Jul 24, 2024)
181e37d  Drop unnecessary canonicalization and cleanup some tests (antiagainst, Jul 24, 2024)
e4f76af  Merge remote-tracking branch 'origin/main' into sjw-pipeline-infra (antiagainst, Jul 24, 2024)
fb694d1  Various improvements (antiagainst, Jul 25, 2024)
9bbf5c9  NFC: change check prefix to AMD (antiagainst, Jul 25, 2024)
02b7073  Drop debug print \n (antiagainst, Jul 25, 2024)
c782668  [test] NFC: split loop pipeline test to prepare sharing (antiagainst, Jul 25, 2024)
8232d1a  Merge tests back to the main file (antiagainst, Jul 26, 2024)
f3e311e  Use COMMON prefix for shared check lines (antiagainst, Jul 26, 2024)
a27e45b  Move one more test to cuda file (antiagainst, Jul 26, 2024)
8be2969  Merge remote-tracking branch 'origin/main' into sjw-pipeline-infra (antiagainst, Jul 26, 2024)
b2694d2  Delete unused block layout (antiagainst, Jul 26, 2024)
bb931de  Add some asserts regarding num stages (antiagainst, Jul 26, 2024)
10a2660  Some more debug prints (antiagainst, Jul 26, 2024)
029cadb  Remove unused insertindx (antiagainst, Jul 26, 2024)
1e3068d  Fix debug print regarding loop before expander (antiagainst, Jul 26, 2024)
98e831d  Create common utility for appendToForOpYield (antiagainst, Jul 27, 2024)
7f1f8c1  Clean up tests a bit (antiagainst, Jul 28, 2024)
1bb5868  Reduce the level of nestedness (antiagainst, Jul 29, 2024)
1 change: 1 addition & 0 deletions bin/RegisterTritonDialects.h
@@ -59,6 +59,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
mlir::registerTritonAMDGPUOptimizeEpilogue();
mlir::registerTritonAMDGPUReorderInstructions();
mlir::registerTritonAMDGPUStreamPipeline();
mlir::registerTritonAMDGPUStreamPipelineV2();

// TODO: register Triton & TritonGPU passes
registry.insert<mlir::triton::TritonDialect, mlir::cf::ControlFlowDialect,
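With the registration in place, the pass is reachable from triton-opt under the tritonamdgpu-stream-pipeline-v2 flag. The RUN line of the new test added below exercises it like so (input.mlir is a placeholder file name):

triton-opt input.mlir -split-input-file -tritonamdgpu-stream-pipeline-v2=num_stages=2 -canonicalize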
3 changes: 3 additions & 0 deletions include/triton/Dialect/TritonGPU/Transforms/Utility.h
@@ -140,6 +140,9 @@ scf::IfOp replaceIfOpWithNewSignature(
RewriterBase &rewriter, scf::IfOp loop, TypeRange newResultTypes,
SmallVectorImpl<std::tuple<Value, Value>> &replacements);

// Append the given |newOperands| to the |forOp|'s yield op.
void appendToForOpYield(scf::ForOp forOp, ArrayRef<Value> newOperands);

Operation *cloneWithInferType(mlir::OpBuilder &rewriter, Operation *op,
IRMapping &mapping);

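For context, a minimal sketch of how a transform might call the new shared helper; the function and value names below are hypothetical, not part of this PR, and the caller is assumed to have already grown the loop's iter_args so that the yield's operand count matches the loop signature:

#include "mlir/Dialect/SCF/IR/SCF.h"
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"

using namespace mlir;

// Hypothetical caller: forward one extra per-iteration value through the
// loop by appending it to the terminating scf.yield.
static void yieldExtraValue(scf::ForOp forOp, Value nextValue) {
  // Rebuilds the loop's scf.yield with the old operands plus `nextValue`
  // (implementation shown in Utility.cpp below).
  appendToForOpYield(forOp, {nextValue});
}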
@@ -51,18 +51,6 @@ struct LoadInfo {

} // namespace

// Replace the ForOp's yield with a new one with the given operands appended.
static void appendToYield(scf::ForOp forOp, ArrayRef<Value> newOperands) {
// Fix up the yield op.
Operation *yieldOp = forOp.getBody()->getTerminator();
SmallVector<Value> operands(yieldOp->getOperands());
operands.append(newOperands.begin(), newOperands.end());

OpBuilder builder(yieldOp);
builder.create<scf::YieldOp>(yieldOp->getLoc(), operands);
yieldOp->erase();
}

static void createAsyncCopy(scf::ForOp &forOp, tt::LoadOp loadOp, Value alloc,
Value insertIdx, Value extractIdx,
tt::CoarseSchedule &schedule,
@@ -1041,7 +1029,7 @@ createAsyncOps(scf::ForOp &forOp, tt::CoarseSchedule &schedule,
if (phase)
newYieldOperands.push_back(phase);
// Patch the yield with the updated counters.
appendToYield(forOp, newYieldOperands);
appendToForOpYield(forOp, newYieldOperands);

return allocs;
}
10 changes: 10 additions & 0 deletions lib/Dialect/TritonGPU/Transforms/Utility.cpp
@@ -627,6 +627,16 @@ scf::IfOp replaceIfOpWithNewSignature(
return newIf;
}

void appendToForOpYield(scf::ForOp forOp, ArrayRef<Value> newOperands) {
Operation *yieldOp = forOp.getBody()->getTerminator();
SmallVector<Value> operands(yieldOp->getOperands());
operands.append(newOperands.begin(), newOperands.end());

OpBuilder builder(yieldOp);
builder.create<scf::YieldOp>(yieldOp->getLoc(), operands);
yieldOp->erase();
}

Operation *cloneWithInferType(mlir::OpBuilder &rewriter, Operation *op,
IRMapping &mapping) {
Operation *newOp = rewriter.clone(*op, mapping);
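In IR terms the rewrite is simple; a hedged before/after sketch on a hypothetical loop carrying one accumulator, reusing the pipeliner's counter names from the hunk above:

// Before: the loop yields only the accumulator.
scf.yield %acc : tensor<128x16xf32, #mma>
// After appendToForOpYield(forOp, {%insertIdx, %extractIdx}), a new
// terminator with the counters appended replaces the old one in place:
scf.yield %acc, %insertIdx, %extractIdx : tensor<128x16xf32, #mma>, i32, i32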
44 changes: 0 additions & 44 deletions test/TritonGPU/amd/amd-stream-pipeline.mlir

This file was deleted.

161 changes: 161 additions & 0 deletions test/TritonGPU/loop-pipeline-hip.mlir
@@ -0,0 +1,161 @@
// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline-v2=num_stages=2 -canonicalize | FileCheck %s

#blocked = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 4], order = [0, 1]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}>
#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1], hasLeadingOffset = false}>
#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0], hasLeadingOffset = false}>
module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
// CHECK-LABEL: tt.func @load_two_users
tt.func @load_two_users(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) {
%cst = arith.constant dense<0> : tensor<1x16xi32, #blocked>
%cst_0 = arith.constant dense<0> : tensor<128x1xi32, #blocked1>
%c0_i64 = arith.constant 0 : i64
%c0_i32 = arith.constant 0 : i32
%cst_1 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma>
%cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
%c1_i32 = arith.constant 1 : i32
%c8_i32 = arith.constant 8 : i32
%0 = tt.addptr %arg0, %c0_i64 : !tt.ptr<f16>, i64
%1 = tt.addptr %arg1, %c0_i64 : !tt.ptr<f16>, i64
%2 = tt.splat %1 : !tt.ptr<f16> -> tensor<128x1x!tt.ptr<f16>, #blocked1>
%3 = tt.addptr %2, %cst_0 : tensor<128x1x!tt.ptr<f16>, #blocked1>, tensor<128x1xi32, #blocked1>
%4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
%5 = tt.expand_dims %4 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1>
%6 = tt.broadcast %3 : tensor<128x1x!tt.ptr<f16>, #blocked1> -> tensor<128x64x!tt.ptr<f16>, #blocked1>
%7 = tt.broadcast %5 : tensor<1x64xi32, #blocked1> -> tensor<128x64xi32, #blocked1>
%8 = tt.addptr %6, %7 : tensor<128x64x!tt.ptr<f16>, #blocked1>, tensor<128x64xi32, #blocked1>
%9 = tt.load %8 : tensor<128x64x!tt.ptr<f16>, #blocked1>
%10 = tt.splat %0 : !tt.ptr<f16> -> tensor<1x16x!tt.ptr<f16>, #blocked>
%11 = tt.addptr %10, %cst : tensor<1x16x!tt.ptr<f16>, #blocked>, tensor<1x16xi32, #blocked>
%12 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
%13 = tt.expand_dims %12 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked>
%14 = tt.broadcast %11 : tensor<1x16x!tt.ptr<f16>, #blocked> -> tensor<64x16x!tt.ptr<f16>, #blocked>
%15 = tt.broadcast %13 : tensor<64x1xi32, #blocked> -> tensor<64x16xi32, #blocked>
%16 = tt.addptr %14, %15 : tensor<64x16x!tt.ptr<f16>, #blocked>, tensor<64x16xi32, #blocked>
// CHECK: triton_gpu.local_store
// CHECK: scf.for
// CHECK: tt.dot
// CHECK: tt.dot
// CHECK: tt.load
// CHECK: triton_gpu.local_store
// CHECK: scf.yield
%17:2 = scf.for %arg2 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg3 = %cst_1, %arg4 = %cst_2) -> (tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>) : i32 {
%18 = tt.load %16 : tensor<64x16x!tt.ptr<f16>, #blocked>
%19 = triton_gpu.convert_layout %9 : tensor<128x64xf16, #blocked1> -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
%20 = triton_gpu.convert_layout %18 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
%21 = tt.dot %19, %20, %cst_1 : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<64x16xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x16xf32, #mma>
%22 = arith.truncf %21 : tensor<128x16xf32, #mma> to tensor<128x16xf16, #mma>
%23 = triton_gpu.convert_layout %22 : tensor<128x16xf16, #mma> -> tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
%24 = triton_gpu.local_alloc %18 : (tensor<64x16xf16, #blocked>) -> !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable>
%25 = tt.trans %24 {order=array<i32: 1,0>} : !tt.memdesc<64x16xf16, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable>
%26 = triton_gpu.local_load %25 : !tt.memdesc<16x64xf16, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
%27 = tt.dot %23, %26, %arg4 : tensor<128x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<16x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x64xf32, #mma>
scf.yield %21, %27 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>
}
tt.return %17#0, %17#1 : tensor<128x16xf32, #mma>, tensor<128x64xf32, #mma>
}
}

// -----

// CHECK-LABEL: tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de
// CHECK-NOT: triton_gpu.convert_layout {{.*}} : tensor<32x64xf32, #shared> -> tensor<32x64xf32, #shared1>

#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [2, 2], order = [0, 1]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}>
#shared = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [0, 1], hasLeadingOffset = false}>
#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], hasLeadingOffset = false}>
module attributes {"triton_gpu.target" = "hip:gfx942", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
tt.func public @_jagged_hstu_attn_fwd_0d1d2d3d4d5de(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<i64> {tt.divisibility = 16 : i32}, %arg4: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32, tt.max_divisibility = 8 : i32}) attributes {noinline = false} {
%cst = arith.constant dense<0.000000e+00> : tensor<64x32xf32, #mma>
%c64_i32 = arith.constant 64 : i32
%c0_i32 = arith.constant 0 : i32
%c32_i32 = arith.constant 32 : i32
%0 = tt.get_program_id x : i32
%1 = arith.muli %0, %c64_i32 : i32
%2 = tt.get_program_id y : i32
%3 = tt.load %arg3 : !tt.ptr<i64>
%4 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
%5 = tt.splat %1 : i32 -> tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
%6 = arith.addi %5, %4 : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
%7 = tt.expand_dims %6 {axis = 1 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked>
%8 = tt.splat %3 : i64 -> tensor<64x1xi64, #blocked>
%9 = arith.extsi %7 : tensor<64x1xi32, #blocked> to tensor<64x1xi64, #blocked>
%10 = arith.addi %8, %9 : tensor<64x1xi64, #blocked>
%11 = arith.extsi %arg5 : i32 to i64
%12 = tt.splat %11 : i64 -> tensor<64x1xi64, #blocked>
%13 = arith.muli %10, %12 : tensor<64x1xi64, #blocked>
%14 = arith.muli %2, %arg5 : i32
%15 = arith.extsi %14 : i32 to i64
%16 = tt.splat %15 : i64 -> tensor<64x1xi64, #blocked>
%17 = arith.addi %13, %16 : tensor<64x1xi64, #blocked>
%18 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
%19 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
%20 = tt.expand_dims %18 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked>
%21 = tt.expand_dims %19 {axis = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x64xi32, #blocked1>
%22 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked>
%23 = tt.splat %arg5 : i32 -> tensor<1x64xi32, #blocked1>
%24 = arith.muli %20, %22 : tensor<1x64xi32, #blocked>
%25 = arith.muli %21, %23 : tensor<1x64xi32, #blocked1>
%26 = tt.broadcast %17 : tensor<64x1xi64, #blocked> -> tensor<64x64xi64, #blocked>
%27 = arith.extsi %24 : tensor<1x64xi32, #blocked> to tensor<1x64xi64, #blocked>
%28 = arith.extsi %25 : tensor<1x64xi32, #blocked1> to tensor<1x64xi64, #blocked1>
%29 = tt.broadcast %27 : tensor<1x64xi64, #blocked> -> tensor<64x64xi64, #blocked>
%30 = arith.addi %26, %29 : tensor<64x64xi64, #blocked>
%31 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
%32 = tt.expand_dims %31 {axis = 1 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> -> tensor<32x1xi32, #blocked1>
%33 = tt.splat %3 : i64 -> tensor<32x1xi64, #blocked1>
%34 = arith.extsi %32 : tensor<32x1xi32, #blocked1> to tensor<32x1xi64, #blocked1>
%35 = arith.addi %33, %34 : tensor<32x1xi64, #blocked1>
%36 = tt.splat %11 : i64 -> tensor<32x1xi64, #blocked1>
%37 = arith.muli %35, %36 : tensor<32x1xi64, #blocked1>
%38 = tt.splat %15 : i64 -> tensor<32x1xi64, #blocked1>
%39 = arith.addi %37, %38 : tensor<32x1xi64, #blocked1>
%40 = tt.broadcast %39 : tensor<32x1xi64, #blocked1> -> tensor<32x64xi64, #blocked1>
%41 = tt.broadcast %28 : tensor<1x64xi64, #blocked1> -> tensor<32x64xi64, #blocked1>
%42 = arith.addi %40, %41 : tensor<32x64xi64, #blocked1>
%43 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>
%44 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>>
%45 = tt.expand_dims %43 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x32xi32, #blocked1>
%46 = tt.expand_dims %44 {axis = 0 : i32} : tensor<32xi32, #triton_gpu.slice<{dim = 0, parent = #blocked}>> -> tensor<1x32xi32, #blocked>
%47 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked1>
%48 = tt.splat %arg5 : i32 -> tensor<1x32xi32, #blocked>
%49 = arith.muli %45, %47 : tensor<1x32xi32, #blocked1>
%50 = arith.muli %46, %48 : tensor<1x32xi32, #blocked>
%51 = tt.broadcast %39 : tensor<32x1xi64, #blocked1> -> tensor<32x32xi64, #blocked1>
%52 = arith.extsi %49 : tensor<1x32xi32, #blocked1> to tensor<1x32xi64, #blocked1>
%53 = arith.extsi %50 : tensor<1x32xi32, #blocked> to tensor<1x32xi64, #blocked>
%54 = tt.broadcast %52 : tensor<1x32xi64, #blocked1> -> tensor<32x32xi64, #blocked1>
%55 = arith.addi %51, %54 : tensor<32x32xi64, #blocked1>
%56 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<64x64x!tt.ptr<f32>, #blocked>
%57 = tt.addptr %56, %30 : tensor<64x64x!tt.ptr<f32>, #blocked>, tensor<64x64xi64, #blocked>
%58 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<32x64x!tt.ptr<f32>, #blocked1>
%59 = tt.addptr %58, %42 : tensor<32x64x!tt.ptr<f32>, #blocked1>, tensor<32x64xi64, #blocked1>
%60 = tt.splat %arg2 : !tt.ptr<f32> -> tensor<32x32x!tt.ptr<f32>, #blocked1>
%61 = tt.addptr %60, %55 : tensor<32x32x!tt.ptr<f32>, #blocked1>, tensor<32x32xi64, #blocked1>
%62 = tt.load %57 : tensor<64x64x!tt.ptr<f32>, #blocked>
%63 = scf.for %arg6 = %c0_i32 to %c64_i32 step %c32_i32 iter_args(%arg7 = %cst) -> (tensor<64x32xf32, #mma>) : i32 {
%70 = tt.load %59 : tensor<32x64x!tt.ptr<f32>, #blocked1>
%71 = triton_gpu.convert_layout %62 : tensor<64x64xf32, #blocked> -> tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
%72 = triton_gpu.local_alloc %70 : (tensor<32x64xf32, #blocked1>) -> !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable>
%73 = tt.trans %72 {order=array<i32: 1,0>} : !tt.memdesc<32x64xf32, #shared, #triton_gpu.shared_memory, mutable> -> !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable>
%74 = triton_gpu.local_load %73 : !tt.memdesc<64x32xf32, #shared1, #triton_gpu.shared_memory, mutable> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>
%75 = tt.dot %71, %74, %cst : tensor<64x64xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma>
%76 = tt.load %61 : tensor<32x32x!tt.ptr<f32>, #blocked1>
%77 = triton_gpu.convert_layout %75 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>
%78 = triton_gpu.convert_layout %76 : tensor<32x32xf32, #blocked1> -> tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>>
%79 = tt.dot %77, %78, %arg7 : tensor<64x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 1}>> -> tensor<64x32xf32, #mma>
scf.yield %79 : tensor<64x32xf32, #mma>
}
%64 = tt.broadcast %17 : tensor<64x1xi64, #blocked> -> tensor<64x32xi64, #blocked>
%65 = tt.broadcast %53 : tensor<1x32xi64, #blocked> -> tensor<64x32xi64, #blocked>
%66 = arith.addi %64, %65 : tensor<64x32xi64, #blocked>
%67 = tt.splat %arg4 : !tt.ptr<f32> -> tensor<64x32x!tt.ptr<f32>, #blocked>
%68 = tt.addptr %67, %66 : tensor<64x32x!tt.ptr<f32>, #blocked>, tensor<64x32xi64, #blocked>
%69 = triton_gpu.convert_layout %63 : tensor<64x32xf32, #mma> -> tensor<64x32xf32, #blocked>
tt.store %68, %69 : tensor<64x32x!tt.ptr<f32>, #blocked>
tt.return
}
} // end module