Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
b6eb13b
[AMD] Eliminate redundant barriers between back-to-back warp-pipeline…
jungpark-mlir Apr 2, 2026
c0bf9ee
[AMD] Support warp-pipeline for unrolled (flat) loops
jungpark-mlir Apr 5, 2026
6f36913
Merge branch 'triton-lang:main' into 2wp
jungpark-mlir Apr 5, 2026
bdf6076
[AMD] Refactor: extract emitPipelinePrelude/Postlude helpers
jungpark-mlir Apr 5, 2026
b5a9e4e
[AMD] Refactor: extract analyzePipelineDependencies helper
jungpark-mlir Apr 5, 2026
401e130
Format
jungpark-mlir Apr 5, 2026
482c64b
Remove unnecessary for_loop_depth assertion from warp_pipeline_stage
jungpark-mlir Apr 6, 2026
149cee3
Merge branch 'triton-lang:main' into 2wp
jungpark-mlir Apr 7, 2026
a96fbc7
Merge branch 'triton-lang:main' into 2wp
jungpark-mlir Apr 8, 2026
0c94338
Fix flat pipeline inserting redundant s_barrier when pre-existing bar…
jungpark-mlir Apr 8, 2026
cd58981
Add cross-pipeline and adjacent-stage LDS dependency analysis
jungpark-mlir Apr 14, 2026
58f429d
Collect all flat-pipeline stages for cross-pipeline dep analysis
jungpark-mlir Apr 18, 2026
f487677
format
jungpark-mlir Apr 18, 2026
35028b3
Merge branch 'triton-lang:main' into 2wp
jungpark-mlir Apr 18, 2026
6eac9b8
Simplify analyzePipelineDependencies into a single distance sweep
jungpark-mlir Apr 18, 2026
437a6e1
Tidy comments for terminology and section consistency
jungpark-mlir Apr 18, 2026
ae5d98c
Merge branch 'triton-lang:main' into 2wp
jungpark-mlir Apr 21, 2026
296a6cf
Merge branch 'main' into 2wp
jungpark-mlir Apr 22, 2026
138295b
WarpPipeliner: share helpers between createPipeline and createFlatPip…
jungpark-mlir Apr 26, 2026
4b3c2de
ConvertWarpPipeline: introduce isWarpPipelineIgnorableBarrier and get…
jungpark-mlir Apr 26, 2026
60c50fc
WarpPipeliner: add step-numbered comments to createFlatPipeline
jungpark-mlir Apr 26, 2026
611167a
address review comments
jungpark-mlir Apr 27, 2026
ce4ea7a
Merge branch 'main' into 2wp
jungpark-mlir Apr 27, 2026
5aceca5
fix test
jungpark-mlir Apr 27, 2026
cdbcad7
last few fixes
jungpark-mlir Apr 28, 2026
5ba68cb
Merge upstream main into 2wp
jungpark-mlir Apr 28, 2026
b36c134
Merge branch 'main' into 2wp
jungpark-mlir Apr 28, 2026
3a6eeee
Merge branch 'main' into 2wp
jungpark-mlir Apr 30, 2026
1346995
address review
jungpark-mlir Apr 30, 2026
bd5e919
Merge branch 'main' into 2wp
jungpark-mlir May 5, 2026
4a5cfe3
Merge branch 'main' into 2wp
jungpark-mlir May 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 134 additions & 0 deletions test/TritonGPU/amd/amd-convert-warp-pipeline-invalid.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
// RUN: triton-opt %s -split-input-file -convert-warp-pipeline="gfx-arch=gfx950" -verify-diagnostics

// validatePipelinedForBody runs upfront, before any IR mutation, so a
// malformed `pipelined_for` body fails the pass with no partial conversion.

// ==== Non-warp-pipeline scf.execute_region inside a pipelined_for body ====

module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
  // Negative case: the pipelined_for body contains one scf.execute_region
  // tagged as a warp-pipeline stage, followed by a second scf.execute_region
  // that carries no stage attribute. The diagnostic must be reported on the
  // untagged region.
  tt.func @bad_unmarked_execute_region(%n: index, %ptr: !tt.ptr<f32>) {
    // Values stored by the two regions.
    %zero = arith.constant 0.0 : f32
    %one = arith.constant 1.0 : f32
    // Loop bounds: iterate from 0 up to %n in steps of 1.
    %lb = arith.constant 0 : index
    %step = arith.constant 1 : index

    scf.for %iv = %lb to %n step %step {
      // Properly tagged stage.
      scf.execute_region {
        tt.store %ptr, %zero : !tt.ptr<f32>
        scf.yield
      } {triton.warp_pipeline.stage = "stage0"}

      // Same shape as above but without the stage attribute.
      // expected-error @+1 {{non-warp-pipeline scf.execute_region inside pipelined_for body}}
      scf.execute_region {
        tt.store %ptr, %one : !tt.ptr<f32>
        scf.yield
      }

      scf.yield
    } {triton.warp_pipeline.pipelined_for}

    tt.return
  }
}

// -----

// ==== Multiple pre-existing barriers between two stages ====

module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
  // Negative case: two amdg.async_wait ops sit back to back in the gap
  // between stage0 and stage1 of a pipelined_for body. The diagnostic must be
  // reported on the second of the two ops.
  tt.func @bad_double_barrier_between_stages(%n: index, %ptr: !tt.ptr<f32>) {
    // Values stored by the two stages.
    %zero = arith.constant 0.0 : f32
    %one = arith.constant 1.0 : f32
    // Loop bounds: iterate from 0 up to %n in steps of 1.
    %lb = arith.constant 0 : index
    %step = arith.constant 1 : index

    scf.for %iv = %lb to %n step %step {
      scf.execute_region {
        tt.store %ptr, %zero : !tt.ptr<f32>
        scf.yield
      } {triton.warp_pipeline.stage = "stage0"}

      // First op in the inter-stage gap.
      amdg.async_wait {num_inst = 0 : i32}
      // expected-error @+1 {{multiple pre-existing barriers between pipeline stages}}
      amdg.async_wait {num_inst = 0 : i32}

      scf.execute_region {
        tt.store %ptr, %one : !tt.ptr<f32>
        scf.yield
      } {triton.warp_pipeline.stage = "stage1"}

      scf.yield
    } {triton.warp_pipeline.pipelined_for}

    tt.return
  }
}

// -----

// ==== Both top-of-loop and bottom-of-loop pre-existing barriers ====

module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
  // Negative case: the pipelined_for body has an amdg.async_wait before its
  // first stage and another one after its last stage. The diagnostic must be
  // reported on the scf.for op itself.
  tt.func @bad_top_and_bottom_barriers(%n: index, %ptr: !tt.ptr<f32>) {
    // Values stored by the two stages.
    %zero = arith.constant 0.0 : f32
    %one = arith.constant 1.0 : f32
    // Loop bounds: iterate from 0 up to %n in steps of 1.
    %lb = arith.constant 0 : index
    %step = arith.constant 1 : index

    // expected-error @+1 {{both top-of-loop and bottom-of-loop pre-existing barriers}}
    scf.for %iv = %lb to %n step %step {
      // Op at the top of the loop body, ahead of stage0.
      amdg.async_wait {num_inst = 0 : i32}

      scf.execute_region {
        tt.store %ptr, %zero : !tt.ptr<f32>
        scf.yield
      } {triton.warp_pipeline.stage = "stage0"}

      scf.execute_region {
        tt.store %ptr, %one : !tt.ptr<f32>
        scf.yield
      } {triton.warp_pipeline.stage = "stage1"}

      // Op at the bottom of the loop body, after stage1.
      amdg.async_wait {num_inst = 0 : i32}

      scf.yield
    } {triton.warp_pipeline.pipelined_for}

    tt.return
  }
}

// -----

// ==== Unexpected op inside a pipelined_for body ====
//
// Anything that is not a warp-pipeline stage, an ignorable barrier/wait,
// or scf.yield must be rejected upfront.

module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
  // Negative case: an arith.addi is placed directly in the pipelined_for
  // body, between stage0 and stage1. The diagnostic must be reported on that
  // arith.addi.
  tt.func @bad_unexpected_op_in_body(%n: index, %ptr: !tt.ptr<f32>) {
    // Values stored by the two stages.
    %zero = arith.constant 0.0 : f32
    %one = arith.constant 1.0 : f32
    // Loop bounds: iterate from 0 up to %n in steps of 1.
    %lb = arith.constant 0 : index
    %step = arith.constant 1 : index

    scf.for %iv = %lb to %n step %step {
      scf.execute_region {
        tt.store %ptr, %zero : !tt.ptr<f32>
        scf.yield
      } {triton.warp_pipeline.stage = "stage0"}

      // Stray op that lives outside every stage region; its result is unused.
      // expected-error @+1 {{unexpected op inside pipelined_for body}}
      %stray = arith.addi %iv, %step : index

      scf.execute_region {
        tt.store %ptr, %one : !tt.ptr<f32>
        scf.yield
      } {triton.warp_pipeline.stage = "stage1"}

      scf.yield
    } {triton.warp_pipeline.pipelined_for}

    tt.return
  }
}
Loading
Loading