From b6eb13b14ade455c8c98d0c8ff362b3e2142d586 Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Thu, 2 Apr 2026 14:00:49 +0000
Subject: [PATCH 01/19] [AMD] Eliminate redundant barriers between back-to-back
 warp-pipelined loops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When two warp-pipelined loops execute consecutively, ConvertWarpPipeline
previously emitted a full reconverge/re-phase-shift/pre-barrier sequence
between them:

    scf.for { loop 1 }
    cond_barrier(warpLow)     ← post-loop reconverge
    ttg.barrier local         ← pre-barrier for loop 2
    cond_barrier(warpHigh)    ← pre-loop phase shift
    scf.for { loop 2 }

The post-loop reconverge and pre-loop phase shift are complementary
predicates on the same counter-based S_BARRIER, so they cancel out.
The intervening ttg.barrier local is redundant when loop 1's
wrap-around cluster barrier already includes a local fence (i.e. the
dependency analysis determined an LDS read/write hazard exists across
the wrap-around point). In that case, all pending LDS writes are
already resolved before loop 1 yields, and ModuleMembarAnalysis will
not need to insert additional barriers between the loops.

This patch adds a post-processing step to the ConvertWarpPipeline pass
(eliminateRedundantCondBarriers) that detects this pattern and erases
the three redundant ops, reducing the barrier overhead to:

    scf.for { loop 1 }
    scf.for { loop 2 }
    cond_barrier(warpLow)     ← final reconverge only

eliminateRedundantCondBarriers runs after all scf.for loops have been
converted (patternFor) but before execute_regions are inlined
(patternInline), preserving the scf.for / cond_barrier adjacency needed
for pattern matching.

Also updates the f16_gemm_warp_pipeline_gfx1250.py example to use
range() (producing scf.for) instead of static_range() (which unrolls
at the Python level) for the epilogue loop, and wraps its stages in
warp_pipeline_stage annotations so the back-to-back optimization can
apply.
---
 .../amd/amd-convert-warp-pipeline.mlir        |  91 +++++++++++++++
 .../ConvertWarpPipeline.cpp                   | 104 ++++++++++++++++++
 .../gluon/f16_gemm_warp_pipeline_gfx1250.py   |  10 +-
 3 files changed, 201 insertions(+), 4 deletions(-)

diff --git a/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir b/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
index ee1bcf22d982..0d17ebe02a83 100644
--- a/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
+++ b/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
@@ -445,3 +445,94 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
 // CHECK: rocdl.sched.barrier
 // CHECK: amdg.cond_barrier
 // CHECK: tt.return
+
+// -----
+
+// ---- Back-to-back pipelined loops: redundant cond_barriers eliminated ----
+//
+// Both loops have local_load (read) in stage0 and local_store (write) in
+// stage1, creating a wrap-around ttg.barrier local.  The post-loop
+// cond_barrier of loop 1, the pre-barrier of loop 2, and the pre-loop
+// cond_barrier of loop 2 should all be eliminated because loop 1's
+// wrap-around barrier already includes a local fence.
+//
+// Expected:
+//   ttg.barrier local          (pre-barrier for loop 1)
+//   amdg.cond_barrier          (#1 phase shift for loop 1)
+//   scf.for { loop 1 }
+//   NO amdg.cond_barrier       (#2 eliminated)
+//   NO ttg.barrier local       (pre-barrier eliminated)
+//   NO amdg.cond_barrier       (#3 eliminated)
+//   scf.for { loop 2 }
+//   amdg.cond_barrier          (#4 reconverge for loop 2)
+
+#b2b_blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
+#b2b_mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [2, 4], instrShape = [16, 16, 16], isTransposed = true}>
+#b2b_shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [1, 0]}>
+#b2b_smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @back_to_back_elimination(
+      %lb: i32, %ub: i32, %step: i32,
+      %acc: tensor<256x256xf32, #b2b_mma>,
+      %ptr: tensor<256x64x!tt.ptr<f16>, #b2b_blocked>) {
+
+    %smem = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable>
+
+    // Loop 1: stage0 reads LDS, stage1 writes LDS → wrap-around is ttg.barrier local
+    %r1:2 = scf.for %i = %lb to %ub step %step
+        iter_args(%a1 = %acc, %s1 = %smem)
+        -> (tensor<256x256xf32, #b2b_mma>, !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable>) : i32 {
+      %ld1 = scf.execute_region -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2b_mma, kWidth = 4}>> no_inline {
+        %sub = ttg.memdesc_subslice %s1[0, 0] : !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable> -> !ttg.memdesc<256x16xf16, #b2b_shared, #b2b_smem, mutable, 256x64>
+        %v = ttg.local_load %sub : !ttg.memdesc<256x16xf16, #b2b_shared, #b2b_smem, mutable, 256x64> -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2b_mma, kWidth = 4}>>
+        scf.yield %v : tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2b_mma, kWidth = 4}>>
+      } {triton.warp_pipeline.stage = "lds_load"}
+
+      %st1 = scf.execute_region -> !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable> no_inline {
+        %data = tt.load %ptr : tensor<256x64x!tt.ptr<f16>, #b2b_blocked>
+        ttg.local_store %data, %s1 : tensor<256x64xf16, #b2b_blocked> -> !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable>
+        scf.yield %s1 : !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable>
+      } {triton.warp_pipeline.stage = "global_load_and_store"}
+
+      scf.yield %a1, %st1 : tensor<256x256xf32, #b2b_mma>, !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable>
+    } {triton.warp_pipeline.pipelined_for}
+
+    // Loop 2: same structure (read + write LDS) so it is not optimized away
+    %r2:2 = scf.for %j = %lb to %ub step %step
+        iter_args(%a2 = %r1#0, %s2 = %r1#1)
+        -> (tensor<256x256xf32, #b2b_mma>, !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable>) : i32 {
+      %ld2 = scf.execute_region -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2b_mma, kWidth = 4}>> no_inline {
+        %sub2 = ttg.memdesc_subslice %s2[0, 0] : !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable> -> !ttg.memdesc<256x16xf16, #b2b_shared, #b2b_smem, mutable, 256x64>
+        %v2 = ttg.local_load %sub2 : !ttg.memdesc<256x16xf16, #b2b_shared, #b2b_smem, mutable, 256x64> -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2b_mma, kWidth = 4}>>
+        scf.yield %v2 : tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2b_mma, kWidth = 4}>>
+      } {triton.warp_pipeline.stage = "epilogue_lds_load"}
+
+      %st2 = scf.execute_region -> !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable> no_inline {
+        %data2 = tt.load %ptr : tensor<256x64x!tt.ptr<f16>, #b2b_blocked>
+        ttg.local_store %data2, %s2 : tensor<256x64xf16, #b2b_blocked> -> !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable>
+        scf.yield %s2 : !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable>
+      } {triton.warp_pipeline.stage = "epilogue_global_load_and_store"}
+
+      scf.yield %a2, %st2 : tensor<256x256xf32, #b2b_mma>, !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable>
+    } {triton.warp_pipeline.pipelined_for}
+
+    ttg.local_dealloc %smem : !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable>
+    tt.return
+  }
+}
+
+// CHECK-LABEL: tt.func @back_to_back_elimination
+// Pre-barrier and phase shift for loop 1 are kept.
+// CHECK: ttg.barrier local
+// CHECK: amdg.cond_barrier
+// CHECK: scf.for
+// Wrap-around barrier inside loop 1 (local fence from LDS dependency).
+// CHECK: ttg.barrier local
+// CHECK: scf.yield
+// Between the two loops: no cond_barriers, no ttg.barrier local.
+// CHECK-NOT: amdg.cond_barrier
+// CHECK-NOT: ttg.barrier local
+// CHECK: scf.for
+// Post-loop reconverge for loop 2 is kept.
+// CHECK: amdg.cond_barrier
+// CHECK: tt.return
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
index 6618375821b3..19035daa707a 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
@@ -346,6 +346,104 @@ class InlineWarpPipelineExecuteRegionPattern
   }
 };
 
+// Check if the wrap-around cluster barrier of a converted pipelined loop
+// includes a local memory fence (ttg.barrier local).  The wrap-around barrier
+// is the last cluster barrier emitted just before the scf.yield terminator:
+//   [s_setprio]  sched_barrier  ttg.barrier_local|s_barrier  sched_barrier  yield
+static bool hasLocalFenceAtWrapAround(scf::ForOp forOp) {
+  auto *yieldOp = forOp.getBody()->getTerminator();
+  if (!yieldOp)
+    return false;
+  Operation *op = yieldOp->getPrevNode();
+  if (!op || !isa<ROCDL::SchedBarrier>(op))
+    return false;
+  op = op->getPrevNode();
+  if (!op)
+    return false;
+  if (auto barrier = dyn_cast<triton::gpu::BarrierOp>(op))
+    return barrier.hasLocal();
+  return false;
+}
+
+// Eliminate redundant conditional barriers between consecutive warp-pipelined
+// loops.  When loop 1's wrap-around barrier already includes a local fence,
+// the phase shift naturally carries over into loop 2: the post-loop
+// reconverge and pre-loop phase shift cancel, and the intervening pre-barrier
+// is redundant because membar will not need to insert a barrier (the
+// wrap-around fence already resolved all pending LDS writes).
+//
+// Before:                              After:
+//   scf.for { loop 1 }                  scf.for { loop 1 }
+//   [s_setprio 0]                       [s_setprio 0]
+//   cond_barrier(warpLow)   ← erase    <thread ID arith> (dead, cleaned later)
+//   ttg.barrier local       ← erase    [s_setprio P]
+//   <thread ID arith>                   scf.for { loop 2 }
+//   cond_barrier(warpHigh)  ← erase
+//   [s_setprio P]
+//   scf.for { loop 2 }
+//
+static void eliminateRedundantCondBarriers(ModuleOp m) {
+  SmallVector<Operation *> toErase;
+
+  m.walk([&](triton::FuncOp funcOp) {
+    for (Block &block : funcOp.getBody()) {
+      SmallVector<triton::amdgpu::CondBarrierOp> condBarriers;
+      for (auto &op : block)
+        if (auto cb = dyn_cast<triton::amdgpu::CondBarrierOp>(&op))
+          condBarriers.push_back(cb);
+
+      for (size_t i = 0; i + 1 < condBarriers.size(); i++) {
+        auto postLoopCB = condBarriers[i];
+        auto preLoopCB = condBarriers[i + 1];
+
+        // The post-loop cond_barrier must be preceded by a scf.for
+        // (possibly with an intervening s_setprio reset).
+        Operation *prev = postLoopCB->getPrevNode();
+        if (prev && isa<ROCDL::SetPrioOp>(prev))
+          prev = prev->getPrevNode();
+        auto prevFor = dyn_cast_or_null<scf::ForOp>(prev);
+        if (!prevFor)
+          continue;
+
+        // The pre-loop cond_barrier must be followed by a scf.for
+        // (possibly with an intervening s_setprio).
+        Operation *next = preLoopCB->getNextNode();
+        if (next && isa<ROCDL::SetPrioOp>(next))
+          next = next->getNextNode();
+        if (!dyn_cast_or_null<scf::ForOp>(next))
+          continue;
+
+        if (!hasLocalFenceAtWrapAround(prevFor))
+          continue;
+
+        // Find the ttg.barrier local (pre-barrier) between the two
+        // cond_barriers.
+        triton::gpu::BarrierOp preBarrier = nullptr;
+        for (Operation *op = postLoopCB->getNextNode();
+             op && op != preLoopCB; op = op->getNextNode()) {
+          if (auto barrier = dyn_cast<triton::gpu::BarrierOp>(op)) {
+            if (barrier.hasLocal()) {
+              preBarrier = barrier;
+              break;
+            }
+          }
+        }
+        if (!preBarrier)
+          continue;
+
+        LDBG("eliminating redundant barriers between back-to-back loops");
+        toErase.push_back(postLoopCB);
+        toErase.push_back(preBarrier);
+        toErase.push_back(preLoopCB);
+        i++;
+      }
+    }
+  });
+
+  for (auto *op : llvm::reverse(toErase))
+    op->erase();
+}
+
 struct ConvertWarpPipeline
     : public mlir::triton::impl::ConvertWarpPipelineBase<ConvertWarpPipeline> {
 
@@ -381,6 +479,12 @@ struct ConvertWarpPipeline
 
     if (failed(applyPatternsGreedily(m, std::move(patternFor))))
       signalPassFailure();
+
+    // Must run after patternFor (all loops converted, barriers inserted) but
+    // before patternInline (inlining execute_regions would flatten the IR and
+    // obscure the scf.for ↔ cond_barrier adjacency we rely on).
+    eliminateRedundantCondBarriers(m);
+
     if (failed(applyPatternsGreedily(m, std::move(patternInline))))
       signalPassFailure();
   }
diff --git a/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py b/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py
index f6d1d61db7a5..faae3e4e09e1 100644
--- a/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py
+++ b/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py
@@ -92,11 +92,13 @@ def gemm_tdm_pipelined_warp_pipelined_kernel(a_ptr, b_ptr, c_ptr,  #
                                    TRANSPOSE_B)
             accumulator = issue_wmma_compute(a, b, accumulator)
 
-    for i in ttgl.static_range(NUM_BUFFERS - 1):
-        # Warp-pipeline ended, wait for the ones to be consumed here.
+    for i in range(NUM_BUFFERS - 1):
+        with ttgl.amd.warp_pipeline_stage("stage0_epilogue", priority=1):
+            consumer, a, b = lds_load(consumer, a_buffer, OPERAND_LAYOUT_A, b_buffer, OPERAND_LAYOUT_B, NUM_BUFFERS,
+                                      TRANSPOSE_B)
         ttgl.amd.gfx1250.tdm.async_wait((NUM_BUFFERS - 1 - i) * 2)
-        consumer, accumulator = issue_wmma(consumer, a_buffer, OPERAND_LAYOUT_A, b_buffer, OPERAND_LAYOUT_B,
-                                           accumulator, (NUM_BUFFERS - 2 - i) * 2, NUM_BUFFERS, TRANSPOSE_B)
+        with ttgl.amd.warp_pipeline_stage("stage1_epilogue", priority=0):
+            accumulator = issue_wmma_compute(a, b, accumulator)
 
     offs_cm = pid_m * BLOCK_M + ttgl.arange(0, BLOCK_M, layout=ttgl.SliceLayout(1, WMMA_LAYOUT))
     offs_cn = pid_n * BLOCK_N + ttgl.arange(0, BLOCK_N, layout=ttgl.SliceLayout(0, WMMA_LAYOUT))

From c0bf9ee3419789591dabee462659d8fecaafbd02 Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Sun, 5 Apr 2026 19:23:46 +0000
Subject: [PATCH 02/19] [AMD] Support warp-pipeline for unrolled (flat) loops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend the warp-pipeline infrastructure to handle loops unrolled at the
Python level (e.g. via static_range/ttgl.static_range).  Previously,
warp-pipelining only worked with scf.for loops.  Unrolled loops produce
flat sequences of border markers in the IR that were silently ignored.

Three main changes:

1. WarpPipeliner: add createFlatPipeline()
   Scans each block for triton.warp_pipeline.border markers outside
   scf.for.  Groups the operations between borders into clusters and
   wraps each in an scf.execute_region with triton.warp_pipeline.stage,
   triton.warp_pipeline.priority, and no_inline attributes — the same
   representation createPipeline() produces for loop bodies.

2. ConvertWarpPipeline: add processUnrolledPipelineRegions() + emitPipelinedFlat()
   After the existing patternFor converts scf.for loops, this new step
   scans each function block for contiguous sequences of flat
   scf.execute_region ops (with triton.warp_pipeline.stage).  For each
   sequence it emits the full barrier structure: pre-barrier, phase
   shift (cond_barrier warpHigh), linear dependency analysis for cluster
   barriers (no wrap-around since the sequence is finite), priority
   management (s_setprio), and post-sequence reconverge (cond_barrier
   warpLow).  The execute_regions are then inlined by the existing
   InlineWarpPipelineExecuteRegionPattern.

   Also extends eliminateRedundantCondBarriers() to handle the case
   where a pipelined scf.for is immediately followed by a flat pipeline
   (instead of only scf.for → scf.for).  When the first loop's
   wrap-around barrier includes a local fence, the intervening
   reconverge + pre-barrier + phase-shift are redundant and eliminated.

3. Gluon frontend: assert warp_pipeline_stage is inside a for loop
   Since the compiler now supports flat border markers, there is a risk
   that users place warp_pipeline_stage outside any loop, which has no
   meaningful pipelining semantics.  A for_loop_depth counter is added
   to GluonSemantic and incremented/decremented in code_generator's
   visit_For (covering both range and static_range).  warp_pipeline_stage
   asserts for_loop_depth > 0 in its __exit__ method, i.e. when the
   stage's `with` block closes.

The f16 GEMM example kernel is updated to use ttgl.static_range for the
epilogue loop, exercising the new flat pipeline path end-to-end.

Lit tests added for both WarpPipeliner (flat_pipeline_example) and
ConvertWarpPipeline (flat_pipeline_backend, back_to_back_for_then_flat).
---
 python/triton/compiler/code_generator.py      |   7 +
 .../experimental/gluon/language/_semantic.py  |   4 +
 .../gluon/language/amd/warp_pipeline.py       |   7 +
 .../amd/amd-convert-warp-pipeline.mlir        | 177 ++++++++++++++++
 test/TritonGPU/amd/amd-warp-pipeline.mlir     |  48 +++++
 .../ConvertWarpPipeline.cpp                   | 198 ++++++++++++++++--
 .../TritonAMDGPUTransforms/WarpPipeliner.cpp  | 101 +++++++++
 .../gluon/f16_gemm_warp_pipeline_gfx1250.py   |   2 +-
 8 files changed, 528 insertions(+), 16 deletions(-)

diff --git a/python/triton/compiler/code_generator.py b/python/triton/compiler/code_generator.py
index bd98a1951e02..4ed60506c7d9 100644
--- a/python/triton/compiler/code_generator.py
+++ b/python/triton/compiler/code_generator.py
@@ -1162,6 +1162,8 @@ def visit_For(self, node):
         IteratorClass = self.visit(node.iter.func)
         iter_args = [self.visit(arg) for arg in node.iter.args]
         iter_kwargs = dict(self.visit(keyword) for keyword in node.iter.keywords)
+        if hasattr(self.semantic, 'for_loop_depth'):
+            self.semantic.for_loop_depth += 1
         if IteratorClass == language.static_range:
             iterator = IteratorClass(*iter_args, **iter_kwargs)
             static_range = range(iterator.start.value, iterator.end.value, iterator.step.value)
@@ -1170,6 +1172,8 @@ def visit_For(self, node):
                 self.visit_compound_statement(node.body)
                 for stmt in node.orelse:
                     ast.NodeVisitor.generic_visit(self, stmt)
+            if hasattr(self.semantic, 'for_loop_depth'):
+                self.semantic.for_loop_depth -= 1
             return
         num_stages = None
         loop_unroll_factor = None
@@ -1295,6 +1299,9 @@ def visit_For(self, node):
             assert False, "Don't know what to do with else after for"
             ast.NodeVisitor.generic_visit(self, stmt)
 
+        if hasattr(self.semantic, 'for_loop_depth'):
+            self.semantic.for_loop_depth -= 1
+
     def visit_Slice(self, node):
         lower = self.visit(node.lower)
         upper = self.visit(node.upper)
diff --git a/python/triton/experimental/gluon/language/_semantic.py b/python/triton/experimental/gluon/language/_semantic.py
index d1fad3c0753e..c7add748eb82 100644
--- a/python/triton/experimental/gluon/language/_semantic.py
+++ b/python/triton/experimental/gluon/language/_semantic.py
@@ -103,6 +103,10 @@ class GluonSemantic(TritonSemantic[TensorTy]):
 
     def __init__(self, builder: GluonOpBuilder):
         self.builder = builder
+        # Tracks Python-level for-loop nesting depth (both scf.for and
+        # static_range).  Used by warp_pipeline_stage to enforce that
+        # pipeline stages are only declared inside a loop.
+        self.for_loop_depth = 0
 
     def _wrap_handle_infer_layout(self, handle, scalar_ty, shape):
         if shape == []:
diff --git a/python/triton/experimental/gluon/language/amd/warp_pipeline.py b/python/triton/experimental/gluon/language/amd/warp_pipeline.py
index e0b132e4620c..942080014744 100644
--- a/python/triton/experimental/gluon/language/amd/warp_pipeline.py
+++ b/python/triton/experimental/gluon/language/amd/warp_pipeline.py
@@ -67,6 +67,13 @@ def __exit__(self, exc_type, exc, tb):
             return False
         if self._semantic is None:
             return False
+        # Warp-pipelining is a loop optimization: stages must be declared
+        # inside a for loop (range or static_range).  Allowing stages outside
+        # a loop would produce border markers with no well-defined iteration
+        # structure, breaking the phase-shift/reconvergence contract.
+        assert getattr(self._semantic, 'for_loop_depth', 0) > 0, (
+            "warp_pipeline_stage must be used inside a for loop "
+            "(range or static_range)")
         marker = self.label if self.label is not None else "cluster"
         prio = self.priority if self.priority is not None else -1
         self._semantic.builder.create_warp_pipeline_border(marker, prio)
diff --git a/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir b/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
index 0d17ebe02a83..c3f6c708c5bc 100644
--- a/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
+++ b/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
@@ -536,3 +536,180 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
 // Post-loop reconverge for loop 2 is kept.
 // CHECK: amdg.cond_barrier
 // CHECK: tt.return
+
+// -----
+
+// ---- Flat (unrolled) pipeline: execute_regions outside scf.for ----
+//
+// Simulates the output of WarpPipeliner::createFlatPipeline —
+// 4 execute_regions from a 2-iteration × 2-stage unrolled epilogue.
+// ConvertWarpPipeline should insert pre-barrier, phase shift,
+// cluster barriers, priority, and reconverge around them.
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @flat_pipeline_backend(%ptr0: !tt.ptr<f32>, %ptr1: !tt.ptr<f32>) {
+    %v0 = arith.constant 0.0 : f32
+    %v1 = arith.constant 1.0 : f32
+    %v2 = arith.constant 2.0 : f32
+    %v3 = arith.constant 3.0 : f32
+
+    // Iteration 0, stage 0
+    scf.execute_region no_inline {
+      tt.store %ptr0, %v0 : !tt.ptr<f32>
+      scf.yield
+    } {triton.warp_pipeline.stage = "stage0_epi", triton.warp_pipeline.priority = 1 : i32}
+
+    // Iteration 0, stage 1
+    scf.execute_region no_inline {
+      tt.store %ptr1, %v1 : !tt.ptr<f32>
+      scf.yield
+    } {triton.warp_pipeline.stage = "stage1_epi", triton.warp_pipeline.priority = 0 : i32}
+
+    // Iteration 1, stage 0
+    scf.execute_region no_inline {
+      tt.store %ptr0, %v2 : !tt.ptr<f32>
+      scf.yield
+    } {triton.warp_pipeline.stage = "stage0_epi", triton.warp_pipeline.priority = 1 : i32}
+
+    // Iteration 1, stage 1
+    scf.execute_region no_inline {
+      tt.store %ptr1, %v3 : !tt.ptr<f32>
+      scf.yield
+    } {triton.warp_pipeline.stage = "stage1_epi", triton.warp_pipeline.priority = 0 : i32}
+
+    tt.return
+  }
+}
+
+// CHECK-LABEL: tt.func @flat_pipeline_backend
+// All execute_regions must be inlined.
+// CHECK-NOT: no_inline
+//
+// Pre-barrier + phase shift.
+// CHECK: ttg.barrier local
+// CHECK: %[[WARPLOW:.+]] = arith.cmpi eq
+// CHECK: %[[WARPHIGH:.+]] = arith.cmpi ne
+// CHECK: amdg.cond_barrier %[[WARPHIGH]]
+//
+// Stage 0 priority.
+// CHECK: rocdl.s.setprio 1
+// Stage 0 ops (inlined).
+// CHECK: tt.store
+//
+// Cluster barrier between stages 0 and 1.
+// CHECK: rocdl.s.setprio 0
+// CHECK: rocdl.sched.barrier
+// CHECK: rocdl.s.barrier
+// CHECK: rocdl.sched.barrier
+// Stage 1 ops.
+// CHECK: tt.store
+//
+// Cluster barrier between iteration 0 stage 1 and iteration 1 stage 0.
+// CHECK: rocdl.s.setprio 1
+// CHECK: rocdl.sched.barrier
+// CHECK: rocdl.s.barrier
+// CHECK: rocdl.sched.barrier
+// CHECK: tt.store
+//
+// Cluster barrier between iteration 1 stages.
+// CHECK: rocdl.s.setprio 0
+// CHECK: rocdl.sched.barrier
+// CHECK: rocdl.s.barrier
+// CHECK: rocdl.sched.barrier
+// CHECK: tt.store
+//
+// Post-sequence priority reset + reconverge.
+// CHECK: rocdl.s.setprio 0
+// CHECK: amdg.cond_barrier %[[WARPLOW]]
+// CHECK: tt.return
+
+// -----
+
+// ---- Back-to-back: pipelined scf.for + flat (unrolled) pipeline ----
+//
+// Loop 1 (scf.for) has local_load in stage0 and local_store in stage1,
+// sharing the same LDS allocation → the wrap-around barrier includes a
+// ttg.barrier local.  The flat pipeline follows immediately.
+//
+// The post-loop reconverge of loop 1, the pre-barrier, and the phase
+// shift of the flat pipeline should all be eliminated — same logic as
+// back-to-back scf.for loops.
+//
+// Expected:
+//   ttg.barrier local          (pre-barrier for loop 1)
+//   amdg.cond_barrier          (#1 phase shift for loop 1)
+//   scf.for { loop 1 }
+//   NO amdg.cond_barrier       (#2 eliminated)
+//   NO ttg.barrier local       (pre-barrier eliminated)
+//   NO amdg.cond_barrier       (#3 eliminated)
+//   [flat pipeline stages]
+//   amdg.cond_barrier          (#4 reconverge for flat pipeline)
+
+#b2bf_blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
+#b2bf_mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [2, 4], instrShape = [16, 16, 16], isTransposed = true}>
+#b2bf_shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [1, 0]}>
+#b2bf_smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @back_to_back_for_then_flat(
+      %lb: i32, %ub: i32, %step: i32,
+      %acc: tensor<256x256xf32, #b2bf_mma>,
+      %ptr: tensor<256x64x!tt.ptr<f16>, #b2bf_blocked>,
+      %sptr: !tt.ptr<f32>) {
+
+    %smem = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #b2bf_shared, #b2bf_smem, mutable>
+    %v0 = arith.constant 0.0 : f32
+    %v1 = arith.constant 1.0 : f32
+
+    // Loop 1: local_load + local_store → wrap-around is ttg.barrier local
+    %r1:2 = scf.for %i = %lb to %ub step %step
+        iter_args(%a1 = %acc, %s1 = %smem)
+        -> (tensor<256x256xf32, #b2bf_mma>, !ttg.memdesc<256x64xf16, #b2bf_shared, #b2bf_smem, mutable>) : i32 {
+      %ld1 = scf.execute_region -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2bf_mma, kWidth = 4}>> no_inline {
+        %sub = ttg.memdesc_subslice %s1[0, 0] : !ttg.memdesc<256x64xf16, #b2bf_shared, #b2bf_smem, mutable> -> !ttg.memdesc<256x16xf16, #b2bf_shared, #b2bf_smem, mutable, 256x64>
+        %v = ttg.local_load %sub : !ttg.memdesc<256x16xf16, #b2bf_shared, #b2bf_smem, mutable, 256x64> -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2bf_mma, kWidth = 4}>>
+        scf.yield %v : tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2bf_mma, kWidth = 4}>>
+      } {triton.warp_pipeline.stage = "lds_load"}
+
+      %st1 = scf.execute_region -> !ttg.memdesc<256x64xf16, #b2bf_shared, #b2bf_smem, mutable> no_inline {
+        %data = tt.load %ptr : tensor<256x64x!tt.ptr<f16>, #b2bf_blocked>
+        ttg.local_store %data, %s1 : tensor<256x64xf16, #b2bf_blocked> -> !ttg.memdesc<256x64xf16, #b2bf_shared, #b2bf_smem, mutable>
+        scf.yield %s1 : !ttg.memdesc<256x64xf16, #b2bf_shared, #b2bf_smem, mutable>
+      } {triton.warp_pipeline.stage = "global_load_and_store"}
+
+      scf.yield %a1, %st1 : tensor<256x256xf32, #b2bf_mma>, !ttg.memdesc<256x64xf16, #b2bf_shared, #b2bf_smem, mutable>
+    } {triton.warp_pipeline.pipelined_for}
+
+    // Flat (unrolled) pipeline: 2 stages, simple stores (no LDS dep)
+    scf.execute_region no_inline {
+      tt.store %sptr, %v0 : !tt.ptr<f32>
+      scf.yield
+    } {triton.warp_pipeline.stage = "flat_stage0"}
+
+    scf.execute_region no_inline {
+      tt.store %sptr, %v1 : !tt.ptr<f32>
+      scf.yield
+    } {triton.warp_pipeline.stage = "flat_stage1"}
+
+    ttg.local_dealloc %smem : !ttg.memdesc<256x64xf16, #b2bf_shared, #b2bf_smem, mutable>
+    tt.return
+  }
+}
+
+// CHECK-LABEL: tt.func @back_to_back_for_then_flat
+// Pre-barrier and phase shift for loop 1 are kept.
+// CHECK: ttg.barrier local
+// CHECK: amdg.cond_barrier
+// CHECK: scf.for
+// Wrap-around barrier inside loop 1 (local fence from LDS dependency).
+// CHECK: ttg.barrier local
+// CHECK: scf.yield
+// Between loop 1 and flat pipeline: no cond_barriers, no ttg.barrier local.
+// CHECK-NOT: amdg.cond_barrier
+// CHECK-NOT: ttg.barrier local
+// Flat pipeline stages (inlined after conversion).
+// CHECK: tt.store
+// CHECK: rocdl.s.barrier
+// CHECK: tt.store
+// Reconverge for flat pipeline is kept.
+// CHECK: amdg.cond_barrier
+// CHECK: tt.return
diff --git a/test/TritonGPU/amd/amd-warp-pipeline.mlir b/test/TritonGPU/amd/amd-warp-pipeline.mlir
index 1be47b58fc1e..fee2da41f878 100644
--- a/test/TritonGPU/amd/amd-warp-pipeline.mlir
+++ b/test/TritonGPU/amd/amd-warp-pipeline.mlir
@@ -143,6 +143,54 @@ tt.func public @triple_buf_two_stages(%arg0: i32, %arg1: i32, %arg2: i32, %arg3:
   tt.return
 }
 
+// -- Flat (unrolled) pipeline: borders outside scf.for ----
+//
+// Simulates a static_range epilogue that was unrolled at the Python level.
+// The border markers sit in the function body, not inside a loop.
+
+tt.func @flat_pipeline_example(%n: index) {
+  %c0  = arith.constant 0 : index
+  %c1  = arith.constant 1 : index
+
+  scf.for %i = %c0 to %n step %c1 {
+    scf.yield
+  }
+
+  // Stage 0 (ops before the first border)
+  %a  = arith.addi %c0, %c1 : index
+  %a2 = arith.muli %a, %c1 : index
+
+  rocdl.sched.barrier 0 {triton.warp_pipeline.border = "stage0_epi", triton.warp_pipeline.priority = 1 : i32}
+
+  // Stage 1
+  %b  = arith.addi %a2, %c0 : index
+  %b2 = arith.muli %b, %c1 : index
+
+  rocdl.sched.barrier 0 {triton.warp_pipeline.border = "stage1_epi", triton.warp_pipeline.priority = 0 : i32}
+
+  tt.return
+}
+
+// CHECK-LABEL: tt.func @flat_pipeline_example(
+// CHECK: scf.for
+// Flat execute_regions created from the borders:
+// CHECK: scf.execute_region
+// CHECK:   arith.addi
+// CHECK:   arith.muli
+// CHECK:   scf.yield
+// CHECK: triton.warp_pipeline.priority = 1
+// CHECK-SAME: triton.warp_pipeline.stage = "stage0_epi"
+// CHECK: scf.execute_region
+// CHECK:   arith.addi
+// CHECK:   arith.muli
+// CHECK:   scf.yield
+// CHECK: triton.warp_pipeline.priority = 0
+// CHECK-SAME: triton.warp_pipeline.stage = "stage1_epi"
+// Border markers must be erased:
+// CHECK-NOT: rocdl.sched.barrier
+// CHECK: tt.return
+
+
 // -- Negative: no border → no structuring ----
 tt.func @no_split_example(%n: index) {
   %c0  = arith.constant 0 : index
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
index 19035daa707a..34da6e3f1c33 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
@@ -80,7 +80,7 @@ static BlockInfo buildBlockInfoFromBlock(Block *block, Allocation *allocation) {
   return info;
 }
 
-static void emitClusterBarrier(PatternRewriter &r, Location loc,
+static void emitClusterBarrier(OpBuilder &r, Location loc,
                                bool needLocal) {
   ROCDL::SchedBarrier::create(r, loc, 0);
   if (needLocal)
@@ -90,7 +90,7 @@ static void emitClusterBarrier(PatternRewriter &r, Location loc,
   ROCDL::SchedBarrier::create(r, loc, 0);
 }
 
-static void emitClusterPriority(PatternRewriter &r, Location loc,
+static void emitClusterPriority(OpBuilder &r, Location loc,
                                 Operation *clusterOp, bool anyHasPriority) {
   if (auto intAttr = clusterOp->getAttrOfType<IntegerAttr>(
           "triton.warp_pipeline.priority")) {
@@ -346,6 +346,161 @@ class InlineWarpPipelineExecuteRegionPattern
   }
 };
 
+// Process a flat (non-loop) sequence of warp-pipeline execute_regions.
+// Unlike the loop case there is no wrap-around: dependencies are strictly
+// linear from the first stage to the last.
+//
+// Emitted IR:
+//   ttg.barrier local               (pre-barrier)
+//   <thread ID arith>
+//   cond_barrier(warpHigh)           (phase shift)
+//   [s_setprio P0]
+//   execute_region { stage 0 }
+//   [s_setprio P1]  sched+barrier    (cluster barrier)
+//   execute_region { stage 1 }
+//   ...
+//   [s_setprio 0]
+//   cond_barrier(warpLow)            (reconverge)
+//
+static void emitPipelinedFlat(SmallVector<scf::ExecuteRegionOp> &clusterOps,
+                              Allocation *allocation,
+                              int threadsPerPipelineGroup) {
+  Location loc = clusterOps.front().getLoc();
+  OpBuilder b(clusterOps.front().getContext());
+  int numClusters = clusterOps.size();
+
+  // 1. Pre-barrier and phase shift before the first execute_region.
+  b.setInsertionPoint(clusterOps.front());
+
+  mlir::triton::gpu::BarrierOp::create(b, loc,
+                                       triton::gpu::AddrSpace::Local);
+
+  auto i32ty = b.getIntegerType(32);
+  auto workIDX = ROCDL::ThreadIdXOp::create(b, loc, i32ty);
+  auto constZero = arith::ConstantIntOp::create(b, loc, 0, 32);
+  auto constWarpSize =
+      arith::ConstantIntOp::create(b, loc, threadsPerPipelineGroup, 32);
+  auto warpIDX = arith::DivSIOp::create(b, loc, workIDX, constWarpSize);
+  auto warpLow = arith::CmpIOp::create(b, loc, arith::CmpIPredicate::eq,
+                                       warpIDX, constZero);
+  auto warpHigh = arith::CmpIOp::create(b, loc, arith::CmpIPredicate::ne,
+                                        warpIDX, constZero);
+  mlir::triton::amdgpu::CondBarrierOp::create(b, loc, warpHigh);
+
+  // 2. Dependency analysis — linear, no wrap-around.
+  SmallVector<Block *> clusterBlocks;
+  SmallVector<bool> bars(numClusters, false); // bars[i]: local fence before i
+
+  for (auto exec : clusterOps) {
+    exec.setNoInline(false);
+    clusterBlocks.push_back(&exec->getRegion(0).front());
+  }
+
+  SmallVector<BlockInfo> clusterInfo;
+  for (auto *cb : clusterBlocks)
+    clusterInfo.push_back(buildBlockInfoFromBlock(cb, allocation));
+
+  bool anyHasPriority = llvm::any_of(
+      clusterOps,
+      [](scf::ExecuteRegionOp op) {
+        return op->hasAttr("triton.warp_pipeline.priority");
+      });
+
+  for (int offset = 0; offset < numClusters; offset++) {
+    for (int src = 0; src < numClusters; src++) {
+      const int next = src + 2 + offset;
+      const int barrierLoc = src + 1 + offset;
+      if (next >= numClusters || barrierLoc >= numClusters)
+        continue;
+
+      auto isSynced = [&]() -> bool {
+        for (int idx = src + 1; idx <= barrierLoc; idx++)
+          if (bars[idx])
+            return true;
+        return false;
+      };
+      if (isSynced())
+        continue;
+
+      const bool needFence = clusterInfo[src].isIntersected(
+          clusterInfo[next], mlir::triton::AMD::membarFilter, allocation);
+      if (needFence) {
+        bars[barrierLoc] = true;
+        LDBG("flat cluster " << src << " need fence to " << next
+                             << " placing barrier at " << barrierLoc);
+      }
+    }
+  }
+
+  // 3. Materialize cluster barriers.
+  //    Cluster 0 gets only its priority (inserted after cond_barrier above).
+  //    Clusters 1..N-1 get priority + cluster barrier.
+  emitClusterPriority(b, loc, clusterOps[0], anyHasPriority);
+
+  for (int i = 1; i < numClusters; i++) {
+    b.setInsertionPoint(clusterOps[i]);
+    emitClusterPriority(b, loc, clusterOps[i], anyHasPriority);
+    emitClusterBarrier(b, loc, /*needLocal=*/bars[i]);
+  }
+
+  // 4. Post-sequence reconverge.
+  b.setInsertionPointAfter(clusterOps.back());
+  if (anyHasPriority)
+    ROCDL::SetPrioOp::create(b, loc, 0);
+  mlir::triton::amdgpu::CondBarrierOp::create(b, loc, warpLow);
+}
+
+// Walk the module for flat warp-pipeline execute_region sequences
+// (produced by createFlatPipeline in WarpPipeliner.cpp) and emit phase-shift
+// barriers around them.
+static void processUnrolledPipelineRegions(ModuleOp m,
+                                           ModuleAllocation &moduleAllocation,
+                                           int threadsPerPipelineGroup) {
+  auto isIgnorable = [](Operation *op) {
+    return isa<ROCDL::BarrierOp, gpu::BarrierOp, triton::gpu::AsyncWaitOp,
+               triton::amdgpu::AsyncWaitOp, triton::amdgpu::AsyncTDMWait,
+               triton::amdgpu::AsyncTDMIntrinsicWait>(op);
+  };
+
+  m.walk([&](triton::FuncOp funcOp) {
+    Allocation *allocation = moduleAllocation.getFuncData(funcOp);
+    if (!allocation)
+      return;
+
+    for (Block &block : funcOp.getBody()) {
+      // Collect contiguous sequences of flat warp-pipeline execute_regions,
+      // splitting at any non-ignorable, non-pipeline op.
+      SmallVector<SmallVector<scf::ExecuteRegionOp>> sequences;
+      SmallVector<scf::ExecuteRegionOp> current;
+
+      for (auto &op : block) {
+        if (auto exec = dyn_cast<scf::ExecuteRegionOp>(&op)) {
+          if (exec->hasAttr("triton.warp_pipeline.stage") &&
+              !isa<scf::ForOp>(exec->getParentOp())) {
+            current.push_back(exec);
+            continue;
+          }
+        }
+        if (isIgnorable(&op))
+          continue;
+        if (!current.empty()) {
+          sequences.push_back(std::move(current));
+          current.clear(); // reset the moved-from vector for reuse
+        }
+      }
+      if (!current.empty())
+        sequences.push_back(std::move(current));
+
+      for (auto &seq : sequences) {
+        if (seq.size() < 2)
+          continue;
+        LDBG("processing flat pipeline with " << seq.size() << " stages");
+        emitPipelinedFlat(seq, allocation, threadsPerPipelineGroup);
+      }
+    }
+  });
+}
+
 // Check if the wrap-around cluster barrier of a converted pipelined loop
 // includes a local memory fence (ttg.barrier local).  The wrap-around barrier
 // is the last cluster barrier emitted just before the scf.yield terminator:
@@ -366,21 +521,24 @@ static bool hasLocalFenceAtWrapAround(scf::ForOp forOp) {
 }
 
 // Eliminate redundant conditional barriers between consecutive warp-pipelined
-// loops.  When loop 1's wrap-around barrier already includes a local fence,
-// the phase shift naturally carries over into loop 2: the post-loop
-// reconverge and pre-loop phase shift cancel, and the intervening pre-barrier
-// is redundant because membar will not need to insert a barrier (the
-// wrap-around fence already resolved all pending LDS writes).
+// regions.  When loop 1's wrap-around barrier already includes a local fence,
+// the phase shift naturally carries over into the next pipeline: the post-loop
+// reconverge and pre-pipeline phase shift cancel, and the intervening
+// pre-barrier is redundant because membar will not need to insert a barrier
+// (the wrap-around fence already resolved all pending LDS writes).
+//
+// The "next pipeline" can be either another scf.for or a flat (unrolled)
+// pipeline represented as a sequence of scf.execute_region ops.
 //
 // Before:                              After:
 //   scf.for { loop 1 }                  scf.for { loop 1 }
 //   [s_setprio 0]                       [s_setprio 0]
 //   cond_barrier(warpLow)   ← erase    <thread ID arith> (dead, cleaned later)
 //   ttg.barrier local       ← erase    [s_setprio P]
-//   <thread ID arith>                   scf.for { loop 2 }
+//   <thread ID arith>                   scf.for / execute_region { pipeline 2 }
 //   cond_barrier(warpHigh)  ← erase
 //   [s_setprio P]
-//   scf.for { loop 2 }
+//   scf.for / execute_region { pipeline 2 }
 //
 static void eliminateRedundantCondBarriers(ModuleOp m) {
   SmallVector<Operation *> toErase;
@@ -405,12 +563,17 @@ static void eliminateRedundantCondBarriers(ModuleOp m) {
         if (!prevFor)
           continue;
 
-        // The pre-loop cond_barrier must be followed by a scf.for
-        // (possibly with an intervening s_setprio).
+        // The pre-loop cond_barrier must be followed by a warp-pipelined
+        // scf.for or a flat pipeline execute_region (possibly with an
+        // intervening s_setprio).
         Operation *next = preLoopCB->getNextNode();
         if (next && isa<ROCDL::SetPrioOp>(next))
           next = next->getNextNode();
-        if (!dyn_cast_or_null<scf::ForOp>(next))
+        bool nextIsPipeline =
+            isa_and_nonnull<scf::ForOp>(next) ||
+            (isa_and_nonnull<scf::ExecuteRegionOp>(next) &&
+             next->hasAttr("triton.warp_pipeline.stage"));
+        if (!nextIsPipeline)
           continue;
 
         if (!hasLocalFenceAtWrapAround(prevFor))
@@ -480,9 +643,14 @@ struct ConvertWarpPipeline
     if (failed(applyPatternsGreedily(m, std::move(patternFor))))
       signalPassFailure();
 
-    // Must run after patternFor (all loops converted, barriers inserted) but
-    // before patternInline (inlining execute_regions would flatten the IR and
-    // obscure the scf.for ↔ cond_barrier adjacency we rely on).
+    // Flat (unrolled) pipeline regions are still wrapped in execute_regions
+    // with no_inline=true from WarpPipeliner.  Process them before inlining.
+    processUnrolledPipelineRegions(m, moduleAllocation,
+                                  threadsPerPipelineGroup);
+
+    // Must run after patternFor and flat processing (all regions converted,
+    // barriers inserted) but before patternInline (inlining execute_regions
+    // would flatten the IR and obscure the cond_barrier adjacency we rely on).
     eliminateRedundantCondBarriers(m);
 
     if (failed(applyPatternsGreedily(m, std::move(patternInline))))
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
index 71ab36278428..2eb0f6b0c573 100644
--- a/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
@@ -193,6 +193,101 @@ static LogicalResult createPipeline(OpBuilder &b, Location loc,
   return success();
 }
 
+// Create a pipelined region from flat (non-loop) border markers in a block.
+// This handles the case where a loop was unrolled at the Python level
+// (e.g. via static_range) but the body still has warp_pipeline_stage
+// annotations producing border markers.  The grouping logic mirrors
+// createPipeline but without a loop wrapper.
+static LogicalResult createFlatPipeline(OpBuilder &b, Block &block) {
+  auto isIgnorable = [](Operation *op) {
+    return isa<ttg::AsyncWaitOp, gpu::BarrierOp, triton::gpu::BarrierOp,
+               tt::amdgpu::AsyncTDMWait>(op);
+  };
+  auto isBorder = [](Operation *op) {
+    return op->hasAttr("triton.warp_pipeline.border");
+  };
+
+  SmallVector<Operation *> allBorders;
+  for (auto &op : block)
+    if (isBorder(&op))
+      allBorders.push_back(&op);
+
+  if (allBorders.size() < 2)
+    return failure();
+
+  Location loc = allBorders.front()->getLoc();
+  Operation *firstBorder = allBorders.front();
+  Operation *lastBorder = allBorders.back();
+
+  // Walk backwards from the first border to find the start of the first
+  // stage.  Stop at control-flow boundaries (scf.for, cond_barrier) or
+  // ignorable ops that logically belong to a previous pipeline.
+  Operation *regionStart = firstBorder;
+  for (Operation *op = firstBorder->getPrevNode(); op;
+       op = op->getPrevNode()) {
+    if (isa<scf::ForOp>(op) || isa<tt::amdgpu::CondBarrierOp>(op))
+      break;
+    if (isIgnorable(op))
+      break;
+    regionStart = op;
+  }
+
+  SmallVector<Operation *> cluster;
+  SmallVector<std::pair<StringAttr, int>> clusterMarkers;
+  SmallVector<SmallVector<Operation *>> clusters;
+
+  for (auto it = Block::iterator(regionStart); it != block.end();) {
+    Operation *op = &*it;
+    ++it;
+
+    if (isBorder(op)) {
+      StringAttr clusterStr =
+          op->getAttrOfType<StringAttr>("triton.warp_pipeline.border");
+      int priority = -1;
+      if (auto intAttr =
+              op->getAttrOfType<IntegerAttr>("triton.warp_pipeline.priority"))
+        priority = intAttr.getInt();
+      clusterMarkers.push_back({clusterStr, priority});
+
+      if (cluster.empty()) {
+        b.setInsertionPoint(op);
+        auto dummyOp = ROCDL::SchedBarrier::create(b, loc, 0);
+        dummyOp->setAttr("triton.warp_pipeline.empty_cluster",
+                         b.getUnitAttr());
+        cluster.push_back(dummyOp);
+      }
+      clusters.push_back(std::move(cluster));
+      cluster.clear();
+
+      bool isLast = (op == lastBorder);
+      op->erase();
+      if (isLast)
+        break;
+      continue;
+    }
+
+    if (isIgnorable(op)) {
+      if (!cluster.empty())
+        return failure(); // NOTE(review): IR may already be mutated here
+      continue;
+    }
+
+    cluster.push_back(op);
+  }
+
+  if (clusters.size() < 2)
+    return failure();
+
+  for (auto &&[stageOps, marker] : llvm::zip(clusters, clusterMarkers)) {
+    if (stageOps.empty())
+      continue;
+    createClusterOp(b, loc, stageOps, marker);
+  }
+
+  LDBG("[warp-pipeline] flat pipeline with " << clusters.size() << " stages");
+  return success();
+}
+
 struct TritonAMDGPUWarpPipelinePass
     : impl::TritonAMDGPUWarpPipelineBase<TritonAMDGPUWarpPipelinePass> {
   using Base::Base;
@@ -206,6 +301,12 @@ struct TritonAMDGPUWarpPipelinePass
         if (createPipeline(builder, loc, forOp).failed())
           LDBG("Failed warp-pipelining");
       });
+
+      // Process remaining border markers in flat (non-loop) code.
+      for (Block &block : funcOp.getBody()) {
+        if (createFlatPipeline(builder, block).failed())
+          LDBG("No flat warp-pipeline in block");
+      }
     }
   }
 };
diff --git a/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py b/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py
index faae3e4e09e1..cce84b23ddb3 100644
--- a/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py
+++ b/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py
@@ -92,7 +92,7 @@ def gemm_tdm_pipelined_warp_pipelined_kernel(a_ptr, b_ptr, c_ptr,  #
                                    TRANSPOSE_B)
             accumulator = issue_wmma_compute(a, b, accumulator)
 
-    for i in range(NUM_BUFFERS - 1):
+    for i in ttgl.static_range(NUM_BUFFERS - 1):
         with ttgl.amd.warp_pipeline_stage("stage0_epilogue", priority=1):
             consumer, a, b = lds_load(consumer, a_buffer, OPERAND_LAYOUT_A, b_buffer, OPERAND_LAYOUT_B, NUM_BUFFERS,
                                       TRANSPOSE_B)

From bdf607602d86f66a0ae015605b692c0910a6ae15 Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Sun, 5 Apr 2026 19:41:10 +0000
Subject: [PATCH 03/19] [AMD] Refactor: extract emitPipelinePrelude/Postlude
 helpers

Factor out the duplicated pre-barrier + phase-shift setup and the
post-pipeline reconverge logic from emitPipelinedFor and
emitPipelinedFlat into shared helpers emitPipelinePrelude and
emitPipelinePostlude.  NFC.
---
 .../ConvertWarpPipeline.cpp                   | 80 +++++++++----------
 1 file changed, 37 insertions(+), 43 deletions(-)

diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
index 34da6e3f1c33..de9f70e4c07d 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
@@ -101,6 +101,35 @@ static void emitClusterPriority(OpBuilder &r, Location loc,
   }
 }
 
+// Emit pre-barrier, thread-ID partitioning, and phase-shift cond_barrier.
+// Returns warpLow (for reconverge) and warpHigh (consumed by phase shift).
+static std::pair<Value, Value>
+emitPipelinePrelude(OpBuilder &b, Location loc, int threadsPerPipelineGroup) {
+  mlir::triton::gpu::BarrierOp::create(b, loc, triton::gpu::AddrSpace::Local);
+  // FIXME: correctly calculate numbers per the arch
+  auto i32ty = b.getIntegerType(32);
+  auto workIDX = ROCDL::ThreadIdXOp::create(b, loc, i32ty);
+  auto constZero = arith::ConstantIntOp::create(b, loc, 0, 32);
+  auto constWarpSize =
+      arith::ConstantIntOp::create(b, loc, threadsPerPipelineGroup, 32);
+  auto warpIDX = arith::DivSIOp::create(b, loc, workIDX, constWarpSize);
+  auto warpLow = arith::CmpIOp::create(b, loc, arith::CmpIPredicate::eq,
+                                       warpIDX, constZero);
+  auto warpHigh = arith::CmpIOp::create(b, loc, arith::CmpIPredicate::ne,
+                                        warpIDX, constZero);
+  mlir::triton::amdgpu::CondBarrierOp::create(b, loc, warpHigh);
+
+  return {warpLow, warpHigh};
+}
+
+// Emit s_setprio 0 (only if anyHasPriority) and the reconverge cond_barrier.
+static void emitPipelinePostlude(OpBuilder &b, Location loc,
+                                 bool anyHasPriority, Value warpLow) {
+  if (anyHasPriority)
+    ROCDL::SetPrioOp::create(b, loc, 0);
+  mlir::triton::amdgpu::CondBarrierOp::create(b, loc, warpLow);
+}
+
 class ConvertPipelinedForPattern : public OpRewritePattern<scf::ForOp> {
 public:
   ConvertPipelinedForPattern(MLIRContext *ctx, ModuleAllocation &moduleAlloc,
@@ -133,28 +162,10 @@ class ConvertPipelinedForPattern : public OpRewritePattern<scf::ForOp> {
   LogicalResult emitPipelinedFor(PatternRewriter &b, Location loc,
                                  scf::ForOp forOp, Allocation *allocation,
                                  int threadsPerPipelineGroup) const {
-    // 1. Insert conditional branch first,
+    // 1. Pre-barrier, thread partitioning, and phase shift.
     b.setInsertionPoint(forOp);
-    // Set barrier before starting the loop. This resolves any outstanding
-    // synchronization before beginning the specialized asymmetric
-    // synchronization.
-    auto preBarrier = mlir::triton::gpu::BarrierOp::create(
-        b, loc, triton::gpu::AddrSpace::Local);
-
-    // Insert condbarrier::second_half before starting the loop
-    // FIXME : correctly calculate numbers per the arch
-    auto i32ty = b.getIntegerType(32);
-    auto workIDX = ROCDL::ThreadIdXOp::create(b, loc, i32ty);
-    auto constZero = arith::ConstantIntOp::create(b, loc, 0, 32);
-    auto constWarpSize =
-        arith::ConstantIntOp::create(b, loc, threadsPerPipelineGroup, 32);
-    auto warpIDX = arith::DivSIOp::create(b, loc, workIDX, constWarpSize);
-    auto warpLow = arith::CmpIOp::create(b, loc, arith::CmpIPredicate::eq,
-                                         warpIDX, constZero);
-    auto warpHigh = arith::CmpIOp::create(b, loc, arith::CmpIPredicate::ne,
-                                          warpIDX, constZero);
-
-    mlir::triton::amdgpu::CondBarrierOp::create(b, loc, warpHigh);
+    auto [warpLow, warpHigh] =
+        emitPipelinePrelude(b, loc, threadsPerPipelineGroup);
 
     // 2. Collect existing barrier information.
     // Scanning the loop body and classifying each consecutive block of
@@ -296,11 +307,9 @@ class ConvertPipelinedForPattern : public OpRewritePattern<scf::ForOp> {
       }
     }
 
-    // Insert condbarrier and priority reset after the loop.
+    // Post-loop priority reset and reconverge.
     b.setInsertionPointAfter(forOp);
-    if (anyHasPriority)
-      ROCDL::SetPrioOp::create(b, loc, 0);
-    mlir::triton::amdgpu::CondBarrierOp::create(b, loc, warpLow);
+    emitPipelinePostlude(b, loc, anyHasPriority, warpLow);
     return success();
   }
 
@@ -371,21 +380,8 @@ static void emitPipelinedFlat(SmallVector<scf::ExecuteRegionOp> &clusterOps,
 
   // 1. Pre-barrier and phase shift before the first execute_region.
   b.setInsertionPoint(clusterOps.front());
-
-  mlir::triton::gpu::BarrierOp::create(b, loc,
-                                       triton::gpu::AddrSpace::Local);
-
-  auto i32ty = b.getIntegerType(32);
-  auto workIDX = ROCDL::ThreadIdXOp::create(b, loc, i32ty);
-  auto constZero = arith::ConstantIntOp::create(b, loc, 0, 32);
-  auto constWarpSize =
-      arith::ConstantIntOp::create(b, loc, threadsPerPipelineGroup, 32);
-  auto warpIDX = arith::DivSIOp::create(b, loc, workIDX, constWarpSize);
-  auto warpLow = arith::CmpIOp::create(b, loc, arith::CmpIPredicate::eq,
-                                       warpIDX, constZero);
-  auto warpHigh = arith::CmpIOp::create(b, loc, arith::CmpIPredicate::ne,
-                                        warpIDX, constZero);
-  mlir::triton::amdgpu::CondBarrierOp::create(b, loc, warpHigh);
+  auto [warpLow, warpHigh] =
+      emitPipelinePrelude(b, loc, threadsPerPipelineGroup);
 
   // 2. Dependency analysis — linear, no wrap-around.
   SmallVector<Block *> clusterBlocks;
@@ -445,9 +441,7 @@ static void emitPipelinedFlat(SmallVector<scf::ExecuteRegionOp> &clusterOps,
 
   // 4. Post-sequence reconverge.
   b.setInsertionPointAfter(clusterOps.back());
-  if (anyHasPriority)
-    ROCDL::SetPrioOp::create(b, loc, 0);
-  mlir::triton::amdgpu::CondBarrierOp::create(b, loc, warpLow);
+  emitPipelinePostlude(b, loc, anyHasPriority, warpLow);
 }
 
 // Walk the module for flat warp-pipeline execute_region sequences

From b5a9e4e1af26991e04292f2385250f2bcf4151a0 Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Sun, 5 Apr 2026 20:10:44 +0000
Subject: [PATCH 04/19] [AMD] Refactor: extract analyzePipelineDependencies
 helper

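Unify the duplicated pairwise dependency analysis from emitPipelinedFor
(circular/wrap-around) and emitPipelinedFlat (linear) into a single
analyzePipelineDependencies function parameterized by `bool circular`.
NFC apart from debug logging: the per-pair LDBG traces in emitPipelinedFor
("Inspecting src", "already synced", "need fence?") are dropped, and the
flat path now logs "cluster" instead of "flat cluster".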
---
 .../ConvertWarpPipeline.cpp                   | 127 +++++++++---------
 1 file changed, 61 insertions(+), 66 deletions(-)

diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
index de9f70e4c07d..4f455035d9dc 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
@@ -80,6 +80,61 @@ static BlockInfo buildBlockInfoFromBlock(Block *block, Allocation *allocation) {
   return info;
 }
 
+// Pairwise dependency analysis between pipeline clusters.
+// For each src → next pair, checks whether their memory intervals overlap.
+// If so, marks `bars[barrierLoc] = true` to indicate a fence is needed.
+//
+// When `circular` is true (loop pipelines), indices wrap around modulo
+// numClusters so that the last cluster feeds back to the first.
+// When false (flat pipelines), indices are strictly linear.
+static void analyzePipelineDependencies(ArrayRef<BlockInfo> clusterInfo,
+                                        SmallVectorImpl<bool> &bars,
+                                        Allocation *allocation,
+                                        bool circular) {
+  int numClusters = clusterInfo.size();
+  for (int offset = 0; offset < numClusters; offset++) {
+    for (int src = 0; src < numClusters; src++) {
+      int next, barrierLoc; // bars[barrierLoc]: barrier before that cluster
+      if (circular) {
+        next = (src + 2 + offset) % numClusters;
+        barrierLoc = (src + 1 + offset) % numClusters;
+      } else {
+        next = src + 2 + offset;
+        barrierLoc = src + 1 + offset;
+        if (next >= numClusters || barrierLoc >= numClusters)
+          continue;
+      }
+
+      auto isSynced = [&]() -> bool {
+        if (circular) {
+          for (int idx = (src + 1) % numClusters; idx != src;
+               idx = (idx + 1) % numClusters) {
+            if (bars[idx])
+              return true;
+            if (idx == barrierLoc)
+              break;
+          }
+        } else {
+          for (int idx = src + 1; idx <= barrierLoc; idx++)
+            if (bars[idx])
+              return true;
+        }
+        return false;
+      };
+      if (isSynced()) // a marked barrier already separates the pair
+        continue;
+
+      const bool needFence = clusterInfo[src].isIntersected(
+          clusterInfo[next], mlir::triton::AMD::membarFilter, allocation);
+      if (needFence) {
+        bars[barrierLoc] = true;
+        LDBG("cluster " << src << " need fence to " << next
+                        << " placing barrier at " << barrierLoc);
+      }
+    }
+  }
+}
+
 static void emitClusterBarrier(OpBuilder &r, Location loc,
                                bool needLocal) {
   ROCDL::SchedBarrier::create(r, loc, 0);
@@ -230,47 +285,9 @@ class ConvertPipelinedForPattern : public OpRewritePattern<scf::ForOp> {
       existingBarrierMap.erase(bottomBar);
     }
 
-    // 3. Performing pairwise dependency analysis between clusters.  For each
-    // src → next pair (with wrap-around), we check whether their memory
-    // intervals overlap.  If so, a fence/barrier must be inserted at the
-    // boundary cluster (barrierLoc).  The analysis is expressed as a
-    // circular traversal so that pipeline stages form a ring.
-    // • `bars[i] = true` marks that a new cluster barrier must be inserted
-    //   before cluster i.
-    // • Existing barriers override or satisfy required fences, so we do not
-    //   insert duplicates.
-    for (int offset = 0; offset < numClusters; offset++) {
-      for (int src = 0; src < numClusters; src++) {
-        const int next = (src + 2 + offset) % numClusters;
-        const int barrierLoc = (src + 1 + offset) % numClusters;
-        LDBG("Inspecting src:" << src << " to next:" << next);
-        // Check if any existing barrier sits between src and barrierIdx
-        auto isSynced = [&]() -> bool {
-          for (int idx = (src + 1) % numClusters; idx != src;
-               idx = (idx + 1) % numClusters) {
-            if (bars[idx])
-              return true;
-            if (idx == barrierLoc)
-              break;
-          }
-          return false;
-        };
-        // Skip if dependency is already resolved.
-        if (isSynced()) {
-          LDBG("already synced");
-          continue;
-        }
-        const bool needFence = clusterInfo[src].isIntersected(
-            clusterInfo[next], mlir::triton::AMD::membarFilter, allocation);
-        // insert fence/barrier in front of this cluster
-        LDBG("need fence?: " << needFence);
-        if (needFence) {
-          bars[barrierLoc] = true;
-          LDBG("cluster " << src << " need fence to " << next
-                          << " placing barrier at " << barrierLoc);
-        }
-      }
-    }
+    // 3. Circular dependency analysis (wrap-around for loop pipelines).
+    analyzePipelineDependencies(clusterInfo, bars, allocation,
+                                /*circular=*/true);
 
     // 4. Materializing final cluster-scope barriers.  For each cluster index:
     //  • If there is a pre-existing barrier at that location, we wrap it with
@@ -402,31 +419,9 @@ static void emitPipelinedFlat(SmallVector<scf::ExecuteRegionOp> &clusterOps,
         return op->hasAttr("triton.warp_pipeline.priority");
       });
 
-  for (int offset = 0; offset < numClusters; offset++) {
-    for (int src = 0; src < numClusters; src++) {
-      const int next = src + 2 + offset;
-      const int barrierLoc = src + 1 + offset;
-      if (next >= numClusters || barrierLoc >= numClusters)
-        continue;
-
-      auto isSynced = [&]() -> bool {
-        for (int idx = src + 1; idx <= barrierLoc; idx++)
-          if (bars[idx])
-            return true;
-        return false;
-      };
-      if (isSynced())
-        continue;
-
-      const bool needFence = clusterInfo[src].isIntersected(
-          clusterInfo[next], mlir::triton::AMD::membarFilter, allocation);
-      if (needFence) {
-        bars[barrierLoc] = true;
-        LDBG("flat cluster " << src << " need fence to " << next
-                             << " placing barrier at " << barrierLoc);
-      }
-    }
-  }
+  // Linear dependency analysis (no wrap-around for flat pipelines).
+  analyzePipelineDependencies(clusterInfo, bars, allocation,
+                              /*circular=*/false);
 
   // 3. Materialize cluster barriers.
   //    Cluster 0 gets only its priority (inserted after cond_barrier above).

From 401e13073d1825eec04008d9ac94dcf436934993 Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Sun, 5 Apr 2026 21:38:40 +0000
Subject: [PATCH 05/19] Format

---
 .../gluon/language/amd/warp_pipeline.py       |  5 ++--
 .../ConvertWarpPipeline.cpp                   | 30 ++++++++-----------
 .../TritonAMDGPUTransforms/WarpPipeliner.cpp  |  6 ++--
 .../gluon/f16_gemm_warp_pipeline_gfx1250.py   |  2 --
 4 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/python/triton/experimental/gluon/language/amd/warp_pipeline.py b/python/triton/experimental/gluon/language/amd/warp_pipeline.py
index 942080014744..63b6b3f5bce1 100644
--- a/python/triton/experimental/gluon/language/amd/warp_pipeline.py
+++ b/python/triton/experimental/gluon/language/amd/warp_pipeline.py
@@ -71,9 +71,8 @@ def __exit__(self, exc_type, exc, tb):
         # inside a for loop (range or static_range).  Allowing stages outside
         # a loop would produce border markers with no well-defined iteration
         # structure, breaking the phase-shift/reconvergence contract.
-        assert getattr(self._semantic, 'for_loop_depth', 0) > 0, (
-            "warp_pipeline_stage must be used inside a for loop "
-            "(range or static_range)")
+        assert getattr(self._semantic, 'for_loop_depth', 0) > 0, ("warp_pipeline_stage must be used inside a for loop "
+                                                                  "(range or static_range)")
         marker = self.label if self.label is not None else "cluster"
         prio = self.priority if self.priority is not None else -1
         self._semantic.builder.create_warp_pipeline_border(marker, prio)
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
index 4f455035d9dc..41023d6f53f0 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
@@ -89,8 +89,7 @@ static BlockInfo buildBlockInfoFromBlock(Block *block, Allocation *allocation) {
 // When false (flat pipelines), indices are strictly linear.
 static void analyzePipelineDependencies(ArrayRef<BlockInfo> clusterInfo,
                                         SmallVectorImpl<bool> &bars,
-                                        Allocation *allocation,
-                                        bool circular) {
+                                        Allocation *allocation, bool circular) {
   int numClusters = clusterInfo.size();
   for (int offset = 0; offset < numClusters; offset++) {
     for (int src = 0; src < numClusters; src++) {
@@ -135,8 +134,7 @@ static void analyzePipelineDependencies(ArrayRef<BlockInfo> clusterInfo,
   }
 }
 
-static void emitClusterBarrier(OpBuilder &r, Location loc,
-                               bool needLocal) {
+static void emitClusterBarrier(OpBuilder &r, Location loc, bool needLocal) {
   ROCDL::SchedBarrier::create(r, loc, 0);
   if (needLocal)
     mlir::triton::gpu::BarrierOp::create(r, loc, triton::gpu::AddrSpace::Local);
@@ -413,11 +411,9 @@ static void emitPipelinedFlat(SmallVector<scf::ExecuteRegionOp> &clusterOps,
   for (auto *cb : clusterBlocks)
     clusterInfo.push_back(buildBlockInfoFromBlock(cb, allocation));
 
-  bool anyHasPriority = llvm::any_of(
-      clusterOps,
-      [](scf::ExecuteRegionOp op) {
-        return op->hasAttr("triton.warp_pipeline.priority");
-      });
+  bool anyHasPriority = llvm::any_of(clusterOps, [](scf::ExecuteRegionOp op) {
+    return op->hasAttr("triton.warp_pipeline.priority");
+  });
 
   // Linear dependency analysis (no wrap-around for flat pipelines).
   analyzePipelineDependencies(clusterInfo, bars, allocation,
@@ -493,7 +489,8 @@ static void processUnrolledPipelineRegions(ModuleOp m,
 // Check if the wrap-around cluster barrier of a converted pipelined loop
 // includes a local memory fence (ttg.barrier local).  The wrap-around barrier
 // is the last cluster barrier emitted just before the scf.yield terminator:
-//   [s_setprio]  sched_barrier  ttg.barrier_local|s_barrier  sched_barrier  yield
+//   [s_setprio]  sched_barrier  ttg.barrier_local|s_barrier  sched_barrier
+//   yield
 static bool hasLocalFenceAtWrapAround(scf::ForOp forOp) {
   auto *yieldOp = forOp.getBody()->getTerminator();
   if (!yieldOp)
@@ -558,10 +555,9 @@ static void eliminateRedundantCondBarriers(ModuleOp m) {
         Operation *next = preLoopCB->getNextNode();
         if (next && isa<ROCDL::SetPrioOp>(next))
           next = next->getNextNode();
-        bool nextIsPipeline =
-            isa_and_nonnull<scf::ForOp>(next) ||
-            (isa_and_nonnull<scf::ExecuteRegionOp>(next) &&
-             next->hasAttr("triton.warp_pipeline.stage"));
+        bool nextIsPipeline = isa_and_nonnull<scf::ForOp>(next) ||
+                              (isa_and_nonnull<scf::ExecuteRegionOp>(next) &&
+                               next->hasAttr("triton.warp_pipeline.stage"));
         if (!nextIsPipeline)
           continue;
 
@@ -571,8 +567,8 @@ static void eliminateRedundantCondBarriers(ModuleOp m) {
         // Find the ttg.barrier local (pre-barrier) between the two
         // cond_barriers.
         triton::gpu::BarrierOp preBarrier = nullptr;
-        for (Operation *op = postLoopCB->getNextNode();
-             op && op != preLoopCB; op = op->getNextNode()) {
+        for (Operation *op = postLoopCB->getNextNode(); op && op != preLoopCB;
+             op = op->getNextNode()) {
           if (auto barrier = dyn_cast<triton::gpu::BarrierOp>(op)) {
             if (barrier.hasLocal()) {
               preBarrier = barrier;
@@ -635,7 +631,7 @@ struct ConvertWarpPipeline
     // Flat (unrolled) pipeline regions are still wrapped in execute_regions
     // with no_inline=true from WarpPipeliner.  Process them before inlining.
     processUnrolledPipelineRegions(m, moduleAllocation,
-                                  threadsPerPipelineGroup);
+                                   threadsPerPipelineGroup);
 
     // Must run after patternFor and flat processing (all regions converted,
     // barriers inserted) but before patternInline (inlining execute_regions
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
index 2eb0f6b0c573..469895b64d68 100644
--- a/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
@@ -223,8 +223,7 @@ static LogicalResult createFlatPipeline(OpBuilder &b, Block &block) {
   // stage.  Stop at control-flow boundaries (scf.for, cond_barrier) or
   // ignorable ops that logically belong to a previous pipeline.
   Operation *regionStart = firstBorder;
-  for (Operation *op = firstBorder->getPrevNode(); op;
-       op = op->getPrevNode()) {
+  for (Operation *op = firstBorder->getPrevNode(); op; op = op->getPrevNode()) {
     if (isa<scf::ForOp>(op) || isa<tt::amdgpu::CondBarrierOp>(op))
       break;
     if (isIgnorable(op))
@@ -252,8 +251,7 @@ static LogicalResult createFlatPipeline(OpBuilder &b, Block &block) {
       if (cluster.empty()) {
         b.setInsertionPoint(op);
         auto dummyOp = ROCDL::SchedBarrier::create(b, loc, 0);
-        dummyOp->setAttr("triton.warp_pipeline.empty_cluster",
-                         b.getUnitAttr());
+        dummyOp->setAttr("triton.warp_pipeline.empty_cluster", b.getUnitAttr());
         cluster.push_back(dummyOp);
       }
       clusters.push_back(std::move(cluster));
diff --git a/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py b/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py
index cce84b23ddb3..b5651e82c386 100644
--- a/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py
+++ b/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py
@@ -18,7 +18,6 @@
         create_shared_layouts,
         create_tensor_descriptors,
         issue_loads,
-        issue_wmma,
         lds_load,
         issue_wmma_compute,
     )
@@ -28,7 +27,6 @@
         create_shared_layouts,
         create_tensor_descriptors,
         issue_loads,
-        issue_wmma,
         lds_load,
         issue_wmma_compute,
     )

From 482c64b4e79a850dbd1d177f4065f0cc423442e1 Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Mon, 6 Apr 2026 21:19:44 +0000
Subject: [PATCH 06/19] Remove unnecessary for_loop_depth assertion from
 warp_pipeline_stage

---
 python/triton/compiler/code_generator.py                   | 7 -------
 python/triton/experimental/gluon/language/_semantic.py     | 4 ----
 .../experimental/gluon/language/amd/warp_pipeline.py       | 6 ------
 3 files changed, 17 deletions(-)

diff --git a/python/triton/compiler/code_generator.py b/python/triton/compiler/code_generator.py
index 4ed60506c7d9..bd98a1951e02 100644
--- a/python/triton/compiler/code_generator.py
+++ b/python/triton/compiler/code_generator.py
@@ -1162,8 +1162,6 @@ def visit_For(self, node):
         IteratorClass = self.visit(node.iter.func)
         iter_args = [self.visit(arg) for arg in node.iter.args]
         iter_kwargs = dict(self.visit(keyword) for keyword in node.iter.keywords)
-        if hasattr(self.semantic, 'for_loop_depth'):
-            self.semantic.for_loop_depth += 1
         if IteratorClass == language.static_range:
             iterator = IteratorClass(*iter_args, **iter_kwargs)
             static_range = range(iterator.start.value, iterator.end.value, iterator.step.value)
@@ -1172,8 +1170,6 @@ def visit_For(self, node):
                 self.visit_compound_statement(node.body)
                 for stmt in node.orelse:
                     ast.NodeVisitor.generic_visit(self, stmt)
-            if hasattr(self.semantic, 'for_loop_depth'):
-                self.semantic.for_loop_depth -= 1
             return
         num_stages = None
         loop_unroll_factor = None
@@ -1299,9 +1295,6 @@ def visit_For(self, node):
             assert False, "Don't know what to do with else after for"
             ast.NodeVisitor.generic_visit(self, stmt)
 
-        if hasattr(self.semantic, 'for_loop_depth'):
-            self.semantic.for_loop_depth -= 1
-
     def visit_Slice(self, node):
         lower = self.visit(node.lower)
         upper = self.visit(node.upper)
diff --git a/python/triton/experimental/gluon/language/_semantic.py b/python/triton/experimental/gluon/language/_semantic.py
index c7add748eb82..d1fad3c0753e 100644
--- a/python/triton/experimental/gluon/language/_semantic.py
+++ b/python/triton/experimental/gluon/language/_semantic.py
@@ -103,10 +103,6 @@ class GluonSemantic(TritonSemantic[TensorTy]):
 
     def __init__(self, builder: GluonOpBuilder):
         self.builder = builder
-        # Tracks Python-level for-loop nesting depth (both scf.for and
-        # static_range).  Used by warp_pipeline_stage to enforce that
-        # pipeline stages are only declared inside a loop.
-        self.for_loop_depth = 0
 
     def _wrap_handle_infer_layout(self, handle, scalar_ty, shape):
         if shape == []:
diff --git a/python/triton/experimental/gluon/language/amd/warp_pipeline.py b/python/triton/experimental/gluon/language/amd/warp_pipeline.py
index 63b6b3f5bce1..e0b132e4620c 100644
--- a/python/triton/experimental/gluon/language/amd/warp_pipeline.py
+++ b/python/triton/experimental/gluon/language/amd/warp_pipeline.py
@@ -67,12 +67,6 @@ def __exit__(self, exc_type, exc, tb):
             return False
         if self._semantic is None:
             return False
-        # Warp-pipelining is a loop optimization: stages must be declared
-        # inside a for loop (range or static_range).  Allowing stages outside
-        # a loop would produce border markers with no well-defined iteration
-        # structure, breaking the phase-shift/reconvergence contract.
-        assert getattr(self._semantic, 'for_loop_depth', 0) > 0, ("warp_pipeline_stage must be used inside a for loop "
-                                                                  "(range or static_range)")
         marker = self.label if self.label is not None else "cluster"
         prio = self.priority if self.priority is not None else -1
         self._semantic.builder.create_warp_pipeline_border(marker, prio)

From 0c94338cb5aa9e9558546500cc7e94fefa782e61 Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Wed, 8 Apr 2026 21:36:43 +0000
Subject: [PATCH 07/19] Fix flat pipeline inserting redundant s_barrier when
 pre-existing barrier exists

emitPipelinedFlat unconditionally inserted a new cluster barrier
(s_barrier) at every stage boundary, ignoring pre-existing barrier
ops (e.g., async_wait) between execute_regions. This produced two
barriers at the same boundary.

Mirror the emitPipelinedFor logic: scan between consecutive stages
for existing barrier ops and wrap them with sched_barriers instead
of inserting a new one.
---
 .../amd/amd-convert-warp-pipeline.mlir        | 71 +++++++++++++++++++
 .../ConvertWarpPipeline.cpp                   | 29 ++++++--
 2 files changed, 96 insertions(+), 4 deletions(-)

diff --git a/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir b/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
index c3f6c708c5bc..a13a6b786ef7 100644
--- a/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
+++ b/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
@@ -713,3 +713,74 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
 // Reconverge for flat pipeline is kept.
 // CHECK: amdg.cond_barrier
 // CHECK: tt.return
+
+// -----
+
+// ---- Flat pipeline with pre-existing barrier between stages ----
+//
+// When an async_wait (or similar barrier op) already exists between
+// flat pipeline stages, the pass should wrap it with sched_barriers
+// instead of inserting a redundant s_barrier.
+//
+// Stage layout: stage0 -- async_wait -- stage1 -- (nothing) -- stage2
+//
+// Expected between stage0 and stage1:
+//   sched_barrier + async_wait + sched_barrier   (wrapped, no s_barrier)
+// Expected between stage1 and stage2:
+//   sched_barrier + s_barrier + sched_barrier     (inserted, no async_wait)
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @flat_pipeline_existing_barrier(%ptr: !tt.ptr<f32>) {
+    %v0 = arith.constant 0.0 : f32
+    %v1 = arith.constant 1.0 : f32
+    %v2 = arith.constant 2.0 : f32
+
+    scf.execute_region no_inline {
+      tt.store %ptr, %v0 : !tt.ptr<f32>
+      scf.yield
+    } {triton.warp_pipeline.stage = "stage0"}
+
+    amdg.async_wait {num_inst = 0 : i32}
+
+    scf.execute_region no_inline {
+      tt.store %ptr, %v1 : !tt.ptr<f32>
+      scf.yield
+    } {triton.warp_pipeline.stage = "stage1"}
+
+    scf.execute_region no_inline {
+      tt.store %ptr, %v2 : !tt.ptr<f32>
+      scf.yield
+    } {triton.warp_pipeline.stage = "stage2"}
+
+    tt.return
+  }
+}
+
+// CHECK-LABEL: tt.func @flat_pipeline_existing_barrier
+// CHECK-NOT: no_inline
+//
+// Pre-barrier + phase shift.
+// CHECK: ttg.barrier local
+// CHECK: amdg.cond_barrier
+//
+// Stage 0 ops.
+// CHECK: tt.store
+//
+// Between stage 0 and 1: existing async_wait wrapped, no s_barrier.
+// CHECK: rocdl.sched.barrier
+// CHECK-NEXT: amdg.async_wait
+// CHECK-NEXT: rocdl.sched.barrier
+// CHECK-NOT: rocdl.s.barrier
+// Stage 1 ops.
+// CHECK: tt.store
+//
+// Between stage 1 and 2: no pre-existing barrier, so s_barrier inserted.
+// CHECK: rocdl.sched.barrier
+// CHECK-NEXT: rocdl.s.barrier
+// CHECK-NEXT: rocdl.sched.barrier
+// Stage 2 ops.
+// CHECK: tt.store
+//
+// Reconverge.
+// CHECK: amdg.cond_barrier
+// CHECK: tt.return
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
index 41023d6f53f0..d4caf2a494a2 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
@@ -421,13 +421,34 @@ static void emitPipelinedFlat(SmallVector<scf::ExecuteRegionOp> &clusterOps,
 
   // 3. Materialize cluster barriers.
   //    Cluster 0 gets only its priority (inserted after cond_barrier above).
-  //    Clusters 1..N get priority + cluster barrier.
+  //    Clusters 1..N get priority + cluster barrier, unless a pre-existing
+  //    barrier op (e.g., async_wait) already exists between the clusters —
+  //    in that case, wrap it with sched_barriers instead of adding a new one.
   emitClusterPriority(b, loc, clusterOps[0], anyHasPriority);
 
   for (int i = 1; i < numClusters; i++) {
-    b.setInsertionPoint(clusterOps[i]);
-    emitClusterPriority(b, loc, clusterOps[i], anyHasPriority);
-    emitClusterBarrier(b, loc, /*needLocal=*/bars[i]);
+    Operation *existingBarrier = nullptr;
+    for (Operation *op = clusterOps[i - 1]->getNextNode();
+         op && op != clusterOps[i].getOperation(); op = op->getNextNode()) {
+      if (isa<ROCDL::BarrierOp, gpu::BarrierOp, triton::gpu::AsyncWaitOp,
+              triton::amdgpu::AsyncWaitOp, triton::amdgpu::AsyncTDMWait,
+              triton::amdgpu::AsyncTDMIntrinsicWait>(op)) {
+        existingBarrier = op;
+        break;
+      }
+    }
+
+    if (existingBarrier) {
+      b.setInsertionPoint(existingBarrier);
+      emitClusterPriority(b, loc, clusterOps[i], anyHasPriority);
+      ROCDL::SchedBarrier::create(b, loc, 0);
+      b.setInsertionPointAfter(existingBarrier);
+      ROCDL::SchedBarrier::create(b, loc, 0);
+    } else {
+      b.setInsertionPoint(clusterOps[i]);
+      emitClusterPriority(b, loc, clusterOps[i], anyHasPriority);
+      emitClusterBarrier(b, loc, /*needLocal=*/bars[i]);
+    }
   }
 
   // 4. Post-sequence reconverge.

From cd5898120de8d70eb4d70b809e2d0d6c685c016f Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Tue, 14 Apr 2026 01:58:32 +0000
Subject: [PATCH 08/19] Add cross-pipeline and adjacent-stage LDS dependency
 analysis
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two changes to analyzePipelineDependencies and eliminateRedundantCondBarriers:

1. Adjacent-stage check: the inner loop previously started at distance 2
   (next = src + 2 + offset), so consecutive clusters sharing an LDS
   allocation never got a LOCAL barrier.  Add a preliminary loop that
   checks clusterInfo[src] against clusterInfo[src+1] and sets
   bars[src+1] when they intersect.  This prevents ModuleMembarAnalysis
   from inserting a redundant ttg.barrier local inside the pipeline.

2. Cross-pipeline analysis: when eliminating redundant cond_barriers
   between back-to-back pipelines, run analyzePipelineDependencies on
   the merged cluster sequence to verify no LDS hazard exists at the
   boundary.  If the boundary needs a barrier (adjacent or distance-2+),
   the optimization is skipped.

Lit tests:
- back_to_back_cross_dep_kept: shared-buffer RAW at boundary → kept
- back_to_back_no_dep_elimination: loop B has no LDS → eliminated
- back_to_back_dep_covered_elimination: 3-stage loop A with internal
  barrier covering the cross-pipeline dep → eliminated
- adjacent_stage_lds_dep: 3-stage pipeline verifying LOCAL barrier
  between adjacent stages with RAW dependency
---
 .../amd/amd-convert-warp-pipeline.mlir        | 362 ++++++++++++++++--
 .../ConvertWarpPipeline.cpp                   | 217 +++++++++--
 2 files changed, 512 insertions(+), 67 deletions(-)

diff --git a/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir b/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
index a13a6b786ef7..1664f5339166 100644
--- a/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
+++ b/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
@@ -448,37 +448,35 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
 
 // -----
 
-// ---- Back-to-back pipelined loops: redundant cond_barriers eliminated ----
+// ---- Back-to-back: cross-pipeline LDS dependency prevents elimination ----
 //
-// Both loops have local_load (read) in stage0 and local_store (write) in
-// stage1, creating a wrap-around ttg.barrier local.  The post-loop
-// cond_barrier of loop 1, the pre-barrier of loop 2, and the pre-loop
-// cond_barrier of loop 2 should all be eliminated because loop 1's
-// wrap-around barrier already includes a local fence.
+// Both loops access the same shared buffer (read + write).  When merged, warp0
+// at b0 (loop 2 stage0, reads smem) runs concurrently with warp1 at a1 (loop 1
+// stage1, writes smem) — a RAW hazard.  The boundary barriers must be kept.
 //
 // Expected:
 //   ttg.barrier local          (pre-barrier for loop 1)
 //   amdg.cond_barrier          (#1 phase shift for loop 1)
 //   scf.for { loop 1 }
-//   NO amdg.cond_barrier       (#2 eliminated)
-//   NO ttg.barrier local       (pre-barrier eliminated)
-//   NO amdg.cond_barrier       (#3 eliminated)
+//   amdg.cond_barrier          (#2 post-loop reconverge — kept)
+//   ttg.barrier local          (pre-barrier for loop 2 — kept)
+//   amdg.cond_barrier          (#3 phase shift for loop 2 — kept)
 //   scf.for { loop 2 }
-//   amdg.cond_barrier          (#4 reconverge for loop 2)
+//   amdg.cond_barrier          (#4 post-loop reconverge for loop 2)
 
 #b2b_blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
 #b2b_mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [2, 4], instrShape = [16, 16, 16], isTransposed = true}>
 #b2b_shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [1, 0]}>
 #b2b_smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func @back_to_back_elimination(
+  tt.func @back_to_back_cross_dep_kept(
       %lb: i32, %ub: i32, %step: i32,
       %acc: tensor<256x256xf32, #b2b_mma>,
       %ptr: tensor<256x64x!tt.ptr<f16>, #b2b_blocked>) {
 
     %smem = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable>
 
-    // Loop 1: stage0 reads LDS, stage1 writes LDS → wrap-around is ttg.barrier local
+    // Loop 1: stage0 reads LDS, stage1 writes LDS
     %r1:2 = scf.for %i = %lb to %ub step %step
         iter_args(%a1 = %acc, %s1 = %smem)
         -> (tensor<256x256xf32, #b2b_mma>, !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable>) : i32 {
@@ -497,7 +495,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
       scf.yield %a1, %st1 : tensor<256x256xf32, #b2b_mma>, !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable>
     } {triton.warp_pipeline.pipelined_for}
 
-    // Loop 2: same structure (read + write LDS) so it is not optimized away
+    // Loop 2: same structure — reads+writes the SAME buffer → cross-pipeline RAW
     %r2:2 = scf.for %j = %lb to %ub step %step
         iter_args(%a2 = %r1#0, %s2 = %r1#1)
         -> (tensor<256x256xf32, #b2b_mma>, !ttg.memdesc<256x64xf16, #b2b_shared, #b2b_smem, mutable>) : i32 {
@@ -521,19 +519,21 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
   }
 }
 
-// CHECK-LABEL: tt.func @back_to_back_elimination
-// Pre-barrier and phase shift for loop 1 are kept.
+// CHECK-LABEL: tt.func @back_to_back_cross_dep_kept
+// Pre-barrier and phase shift for loop 1.
 // CHECK: ttg.barrier local
 // CHECK: amdg.cond_barrier
 // CHECK: scf.for
-// Wrap-around barrier inside loop 1 (local fence from LDS dependency).
+// Wrap-around barrier inside loop 1.
 // CHECK: ttg.barrier local
 // CHECK: scf.yield
-// Between the two loops: no cond_barriers, no ttg.barrier local.
-// CHECK-NOT: amdg.cond_barrier
-// CHECK-NOT: ttg.barrier local
+// Cross-pipeline LDS dependency (a1 writes smem, b0 reads smem) →
+// barriers between the two loops are KEPT.
+// CHECK: amdg.cond_barrier
+// CHECK: ttg.barrier local
+// CHECK: amdg.cond_barrier
 // CHECK: scf.for
-// Post-loop reconverge for loop 2 is kept.
+// Post-loop reconverge for loop 2.
 // CHECK: amdg.cond_barrier
 // CHECK: tt.return
 
@@ -627,13 +627,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
 
 // ---- Back-to-back: pipelined scf.for + flat (unrolled) pipeline ----
 //
-// Loop 1 (scf.for) has local_load in stage0 and local_store in stage1,
-// sharing the same LDS allocation → the wrap-around barrier includes a
-// ttg.barrier local.  The flat pipeline follows immediately.
-//
-// The post-loop reconverge of loop 1, the pre-barrier, and the phase
-// shift of the flat pipeline should all be eliminated — same logic as
-// back-to-back scf.for loops.
+// Loop 1 (scf.for) followed immediately by a flat pipeline with no
+// intervening operations.  The post-loop reconverge, prelude barrier,
+// and phase shift are all eliminated — same logic as back-to-back
+// scf.for loops.
 //
 // Expected:
 //   ttg.barrier local          (pre-barrier for loop 1)
@@ -700,10 +697,11 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
 // CHECK: ttg.barrier local
 // CHECK: amdg.cond_barrier
 // CHECK: scf.for
-// Wrap-around barrier inside loop 1 (local fence from LDS dependency).
+// Wrap-around barrier inside loop 1.
 // CHECK: ttg.barrier local
 // CHECK: scf.yield
-// Between loop 1 and flat pipeline: no cond_barriers, no ttg.barrier local.
+// Between loop 1 and flat pipeline: no cond_barriers, no ttg.barrier local
+// (no intervening ops → phase carries over, prelude barrier redundant).
 // CHECK-NOT: amdg.cond_barrier
 // CHECK-NOT: ttg.barrier local
 // Flat pipeline stages (inlined after conversion).
@@ -784,3 +782,309 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
 // Reconverge.
 // CHECK: amdg.cond_barrier
 // CHECK: tt.return
+
+// -----
+
+// ---- Back-to-back: no cross-pipeline LDS dep → barriers eliminated ----
+//
+// Loop 1 reads+writes shared memory.  Loop 2 only does global ops (no LDS).
+// No cross-pipeline LDS dependency exists, so the boundary barriers are
+// safely eliminated and the phase carries over.
+//
+// Expected:
+//   ttg.barrier local          (pre-barrier for loop 1)
+//   amdg.cond_barrier          (#1 phase shift for loop 1)
+//   scf.for { loop 1 }
+//   NO amdg.cond_barrier       (eliminated)
+//   NO ttg.barrier local       (eliminated)
+//   NO amdg.cond_barrier       (eliminated)
+//   scf.for { loop 2 }
+//   amdg.cond_barrier          (#4 reconverge for loop 2)
+
+#b2bnd_blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
+#b2bnd_mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [2, 4], instrShape = [16, 16, 16], isTransposed = true}>
+#b2bnd_shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [1, 0]}>
+#b2bnd_smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @back_to_back_no_dep_elimination(
+      %lb: i32, %ub: i32, %step: i32,
+      %acc: tensor<256x256xf32, #b2bnd_mma>,
+      %ptr: tensor<256x64x!tt.ptr<f16>, #b2bnd_blocked>,
+      %gptr: !tt.ptr<f32>) {
+
+    %smem = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #b2bnd_shared, #b2bnd_smem, mutable>
+    %v0 = arith.constant 0.0 : f32
+    %v1 = arith.constant 1.0 : f32
+
+    // Loop 1: stage0 reads LDS, stage1 writes LDS
+    %r1:2 = scf.for %i = %lb to %ub step %step
+        iter_args(%a1 = %acc, %s1 = %smem)
+        -> (tensor<256x256xf32, #b2bnd_mma>, !ttg.memdesc<256x64xf16, #b2bnd_shared, #b2bnd_smem, mutable>) : i32 {
+      %ld1 = scf.execute_region -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2bnd_mma, kWidth = 4}>> no_inline {
+        %sub = ttg.memdesc_subslice %s1[0, 0] : !ttg.memdesc<256x64xf16, #b2bnd_shared, #b2bnd_smem, mutable> -> !ttg.memdesc<256x16xf16, #b2bnd_shared, #b2bnd_smem, mutable, 256x64>
+        %v = ttg.local_load %sub : !ttg.memdesc<256x16xf16, #b2bnd_shared, #b2bnd_smem, mutable, 256x64> -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2bnd_mma, kWidth = 4}>>
+        scf.yield %v : tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2bnd_mma, kWidth = 4}>>
+      } {triton.warp_pipeline.stage = "lds_load"}
+
+      %st1 = scf.execute_region -> !ttg.memdesc<256x64xf16, #b2bnd_shared, #b2bnd_smem, mutable> no_inline {
+        %data = tt.load %ptr : tensor<256x64x!tt.ptr<f16>, #b2bnd_blocked>
+        ttg.local_store %data, %s1 : tensor<256x64xf16, #b2bnd_blocked> -> !ttg.memdesc<256x64xf16, #b2bnd_shared, #b2bnd_smem, mutable>
+        scf.yield %s1 : !ttg.memdesc<256x64xf16, #b2bnd_shared, #b2bnd_smem, mutable>
+      } {triton.warp_pipeline.stage = "global_load_and_store"}
+
+      scf.yield %a1, %st1 : tensor<256x256xf32, #b2bnd_mma>, !ttg.memdesc<256x64xf16, #b2bnd_shared, #b2bnd_smem, mutable>
+    } {triton.warp_pipeline.pipelined_for}
+
+    // Loop 2: global-only ops — no LDS access at all
+    scf.for %j = %lb to %ub step %step : i32 {
+      scf.execute_region no_inline {
+        tt.store %gptr, %v0 : !tt.ptr<f32>
+        scf.yield
+      } {triton.warp_pipeline.stage = "global_store_0"}
+
+      scf.execute_region no_inline {
+        tt.store %gptr, %v1 : !tt.ptr<f32>
+        scf.yield
+      } {triton.warp_pipeline.stage = "global_store_1"}
+
+      scf.yield
+    } {triton.warp_pipeline.pipelined_for}
+
+    ttg.local_dealloc %smem : !ttg.memdesc<256x64xf16, #b2bnd_shared, #b2bnd_smem, mutable>
+    tt.return
+  }
+}
+
+// CHECK-LABEL: tt.func @back_to_back_no_dep_elimination
+// Pre-barrier and phase shift for loop 1.
+// CHECK: ttg.barrier local
+// CHECK: amdg.cond_barrier
+// CHECK: scf.for
+// Wrap-around barrier inside loop 1.
+// CHECK: ttg.barrier local
+// CHECK: scf.yield
+// No cross-pipeline LDS dep → barriers eliminated, phase carries over.
+// CHECK-NOT: amdg.cond_barrier
+// CHECK-NOT: ttg.barrier local
+// CHECK: scf.for
+// Post-loop reconverge for loop 2.
+// CHECK: amdg.cond_barrier
+// CHECK: tt.return
+
+// -----
+
+// ---- Back-to-back: cross-pipeline dep covered by loop 1's barrier ----
+//
+// Loop 1 has 3 stages: stage0 writes LDS, stage1 reads LDS, stage2 is
+// compute-only.  The circular dependency analysis places a LOCAL barrier
+// between stage1 and stage2 (covering the WAR between stage1's read and
+// stage0's write in the next iteration).
+//
+// Loop 2 has 2 stages: stage0 reads the SAME LDS buffer, stage1 is
+// compute-only.  There IS a cross-pipeline dependency (loop1.stage0 writes
+// smem that loop2.stage0 reads), but it is already covered by loop 1's
+// barrier between stage1 and stage2.
+//
+// At the boundary with no barrier: warp0 runs b0 (loop 2 stage0) while warp1
+// runs a2 (loop 1 stage2).  Since a2 has no LDS access and the LOCAL barrier
+// before a2 already flushed all prior LDS writes, b0's read is safe.
+//
+// Expected:
+//   ttg.barrier local          (pre-barrier for loop 1)
+//   amdg.cond_barrier          (phase shift for loop 1)
+//   scf.for { loop 1 — 3 stages }
+//   NO amdg.cond_barrier       (eliminated)
+//   NO ttg.barrier local       (eliminated)
+//   NO amdg.cond_barrier       (eliminated)
+//   scf.for { loop 2 — 2 stages }
+//   amdg.cond_barrier          (reconverge for loop 2)
+
+#b2bcov_blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
+#b2bcov_mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [2, 4], instrShape = [16, 16, 16], isTransposed = true}>
+#b2bcov_shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [1, 0]}>
+#b2bcov_smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @back_to_back_dep_covered_elimination(
+      %lb: i32, %ub: i32, %step: i32,
+      %acc: tensor<256x256xf32, #b2bcov_mma>,
+      %ptr: tensor<256x64x!tt.ptr<f16>, #b2bcov_blocked>,
+      %gptr: !tt.ptr<f32>) {
+
+    %smem = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #b2bcov_shared, #b2bcov_smem, mutable>
+    %v0 = arith.constant 0.0 : f32
+
+    // Loop 1: 3 stages
+    //   stage0: writes LDS (local_store)
+    //   stage1: reads LDS  (local_load) → RAW with stage0
+    //   stage2: compute-only (global store, no LDS)
+    // Circular analysis: barrier between stage1 and stage2 is LOCAL.
+    %r1:2 = scf.for %i = %lb to %ub step %step
+        iter_args(%a1 = %acc, %s1 = %smem)
+        -> (tensor<256x256xf32, #b2bcov_mma>, !ttg.memdesc<256x64xf16, #b2bcov_shared, #b2bcov_smem, mutable>) : i32 {
+      %st1 = scf.execute_region -> !ttg.memdesc<256x64xf16, #b2bcov_shared, #b2bcov_smem, mutable> no_inline {
+        %data = tt.load %ptr : tensor<256x64x!tt.ptr<f16>, #b2bcov_blocked>
+        ttg.local_store %data, %s1 : tensor<256x64xf16, #b2bcov_blocked> -> !ttg.memdesc<256x64xf16, #b2bcov_shared, #b2bcov_smem, mutable>
+        scf.yield %s1 : !ttg.memdesc<256x64xf16, #b2bcov_shared, #b2bcov_smem, mutable>
+      } {triton.warp_pipeline.stage = "global_load_and_store"}
+
+      %ld1 = scf.execute_region -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2bcov_mma, kWidth = 4}>> no_inline {
+        %sub = ttg.memdesc_subslice %s1[0, 0] : !ttg.memdesc<256x64xf16, #b2bcov_shared, #b2bcov_smem, mutable> -> !ttg.memdesc<256x16xf16, #b2bcov_shared, #b2bcov_smem, mutable, 256x64>
+        %v = ttg.local_load %sub : !ttg.memdesc<256x16xf16, #b2bcov_shared, #b2bcov_smem, mutable, 256x64> -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2bcov_mma, kWidth = 4}>>
+        scf.yield %v : tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2bcov_mma, kWidth = 4}>>
+      } {triton.warp_pipeline.stage = "lds_load"}
+
+      scf.execute_region no_inline {
+        tt.store %gptr, %v0 : !tt.ptr<f32>
+        scf.yield
+      } {triton.warp_pipeline.stage = "compute"}
+
+      scf.yield %a1, %s1 : tensor<256x256xf32, #b2bcov_mma>, !ttg.memdesc<256x64xf16, #b2bcov_shared, #b2bcov_smem, mutable>
+    } {triton.warp_pipeline.pipelined_for}
+
+    // Loop 2: stage0 reads the SAME LDS buffer, stage1 is compute-only.
+    // Cross-pipeline dep (a0 writes → b0 reads) is covered by loop 1's
+    // barrier between stage1 and stage2.
+    %r2:2 = scf.for %j = %lb to %ub step %step
+        iter_args(%a2 = %r1#0, %s2 = %r1#1)
+        -> (tensor<256x256xf32, #b2bcov_mma>, !ttg.memdesc<256x64xf16, #b2bcov_shared, #b2bcov_smem, mutable>) : i32 {
+      %ld2 = scf.execute_region -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2bcov_mma, kWidth = 4}>> no_inline {
+        %sub2 = ttg.memdesc_subslice %s2[0, 0] : !ttg.memdesc<256x64xf16, #b2bcov_shared, #b2bcov_smem, mutable> -> !ttg.memdesc<256x16xf16, #b2bcov_shared, #b2bcov_smem, mutable, 256x64>
+        %v2 = ttg.local_load %sub2 : !ttg.memdesc<256x16xf16, #b2bcov_shared, #b2bcov_smem, mutable, 256x64> -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2bcov_mma, kWidth = 4}>>
+        scf.yield %v2 : tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #b2bcov_mma, kWidth = 4}>>
+      } {triton.warp_pipeline.stage = "epilogue_lds_load"}
+
+      scf.execute_region no_inline {
+        tt.store %gptr, %v0 : !tt.ptr<f32>
+        scf.yield
+      } {triton.warp_pipeline.stage = "epilogue_compute"}
+
+      scf.yield %a2, %s2 : tensor<256x256xf32, #b2bcov_mma>, !ttg.memdesc<256x64xf16, #b2bcov_shared, #b2bcov_smem, mutable>
+    } {triton.warp_pipeline.pipelined_for}
+
+    ttg.local_dealloc %smem : !ttg.memdesc<256x64xf16, #b2bcov_shared, #b2bcov_smem, mutable>
+    tt.return
+  }
+}
+
+// CHECK-LABEL: tt.func @back_to_back_dep_covered_elimination
+// Pre-barrier and phase shift for loop 1.
+// CHECK: ttg.barrier local
+// CHECK: amdg.cond_barrier
+// CHECK: scf.for
+// Loop 1 has 3 stages; barrier between stage1→stage2 is LOCAL (covers dep).
+// CHECK: ttg.barrier local
+// CHECK: scf.yield
+// Cross-pipeline dep IS covered by loop 1's internal barrier →
+// boundary barriers eliminated, phase carries over.
+// CHECK-NOT: amdg.cond_barrier
+// CHECK-NOT: ttg.barrier local
+// CHECK: scf.for
+// Post-loop reconverge for loop 2.
+// CHECK: amdg.cond_barrier
+// CHECK: tt.return
+
+// -----
+
+// ---- Adjacent-stage LDS dependency: barrier must be LOCAL ----
+//
+// 3-stage loop pipeline where stage0 writes LDS and stage1 reads it.
+// Stage2 has no LDS access.
+//
+// The distance-2+ analysis only checks pairs separated by ≥2 clusters,
+// so it never examines (stage0, stage1) directly.  Without the adjacent-
+// stage check, the barrier between stage0 and stage1 would be emitted as
+// a plain s_barrier, and ModuleMembarAnalysis would later insert a
+// redundant ttg.barrier local inside the pipeline — breaking timing.
+//
+// With the adjacent-stage check:
+//   bars[0] (wrap-around) = false  (a2 no LDS, a0 writes — no conflict)
+//   bars[1] (a0→a1)       = true   (a0 writes, a1 reads — RAW)
+//   bars[2] (a1→a2)       = true   (a1→a0 WAR via distance-2)
+//
+// Expected inside the loop body:
+//   stage0 ops  (local_store)
+//   ttg.barrier local             (bars[1] — adjacent dep)
+//   stage1 ops  (local_load)
+//   ttg.barrier local             (bars[2] — distance-2 dep)
+//   stage2 ops  (global store)
+//   rocdl.s.barrier               (bars[0] — wrap-around, no LDS dep)
+//   scf.yield
+
+#adj_blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
+#adj_mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [2, 4], instrShape = [16, 16, 16], isTransposed = true}>
+#adj_dot = #ttg.dot_op<{opIdx = 0, parent = #adj_mma, kWidth = 4}>
+#adj_shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [1, 0]}>
+#adj_smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @adjacent_stage_lds_dep(
+      %lb: i32, %ub: i32, %step: i32,
+      %acc: tensor<256x16xf16, #adj_dot>,
+      %ptr: tensor<256x64x!tt.ptr<f16>, #adj_blocked>,
+      %gptr: !tt.ptr<f32>) {
+
+    %smem = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #adj_shared, #adj_smem, mutable>
+    %v0 = arith.constant 0.0 : f32
+
+    // The local_load result must be carried as an iter_arg so it is not
+    // DCE'd — otherwise the barrier between stage0 and stage1 would merge
+    // with the barrier between stage1 and stage2.
+    %r:3 = scf.for %i = %lb to %ub step %step
+        iter_args(%a = %acc, %s = %smem, %prev = %acc)
+        -> (tensor<256x16xf16, #adj_dot>, !ttg.memdesc<256x64xf16, #adj_shared, #adj_smem, mutable>, tensor<256x16xf16, #adj_dot>) : i32 {
+
+      // Stage 0: writes LDS
+      %st = scf.execute_region -> !ttg.memdesc<256x64xf16, #adj_shared, #adj_smem, mutable> no_inline {
+        %data = tt.load %ptr : tensor<256x64x!tt.ptr<f16>, #adj_blocked>
+        ttg.local_store %data, %s : tensor<256x64xf16, #adj_blocked> -> !ttg.memdesc<256x64xf16, #adj_shared, #adj_smem, mutable>
+        scf.yield %s : !ttg.memdesc<256x64xf16, #adj_shared, #adj_smem, mutable>
+      } {triton.warp_pipeline.stage = "global_load_and_store"}
+
+      // Stage 1: reads LDS — RAW dep with stage 0
+      %ld = scf.execute_region -> tensor<256x16xf16, #adj_dot> no_inline {
+        %sub = ttg.memdesc_subslice %s[0, 0] : !ttg.memdesc<256x64xf16, #adj_shared, #adj_smem, mutable> -> !ttg.memdesc<256x16xf16, #adj_shared, #adj_smem, mutable, 256x64>
+        %v = ttg.local_load %sub : !ttg.memdesc<256x16xf16, #adj_shared, #adj_smem, mutable, 256x64> -> tensor<256x16xf16, #adj_dot>
+        scf.yield %v : tensor<256x16xf16, #adj_dot>
+      } {triton.warp_pipeline.stage = "lds_load"}
+
+      // Stage 2: global store only — no LDS access
+      scf.execute_region no_inline {
+        tt.store %gptr, %v0 : !tt.ptr<f32>
+        scf.yield
+      } {triton.warp_pipeline.stage = "compute"}
+
+      scf.yield %a, %s, %ld : tensor<256x16xf16, #adj_dot>, !ttg.memdesc<256x64xf16, #adj_shared, #adj_smem, mutable>, tensor<256x16xf16, #adj_dot>
+    } {triton.warp_pipeline.pipelined_for}
+
+    ttg.local_dealloc %smem : !ttg.memdesc<256x64xf16, #adj_shared, #adj_smem, mutable>
+    tt.return
+  }
+}
+
+// CHECK-LABEL: tt.func @adjacent_stage_lds_dep
+// CHECK: scf.for
+//
+// Stage 0 ops (local_store).
+// CHECK: ttg.local_store
+//
+// Barrier between stage0→stage1 is LOCAL (adjacent RAW: write→read).
+// CHECK: rocdl.sched.barrier
+// CHECK-NEXT: ttg.barrier local
+// CHECK-NEXT: rocdl.sched.barrier
+//
+// Stage 1 ops (local_load).
+// CHECK: ttg.local_load
+//
+// Barrier between stage1→stage2 is LOCAL (distance-2 WAR: a1 reads, a0 writes).
+// CHECK: rocdl.sched.barrier
+// CHECK-NEXT: ttg.barrier local
+// CHECK-NEXT: rocdl.sched.barrier
+//
+// Stage 2 ops (global store).
+// CHECK: tt.store
+//
+// Wrap-around barrier is s_barrier only (a2 has no LDS, a0 writes — no dep).
+// CHECK: rocdl.sched.barrier
+// CHECK-NEXT: rocdl.s.barrier
+// CHECK-NEXT: rocdl.sched.barrier
+//
+// CHECK: scf.yield
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
index d4caf2a494a2..20a69a48cbb5 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
@@ -91,6 +91,30 @@ static void analyzePipelineDependencies(ArrayRef<BlockInfo> clusterInfo,
                                         SmallVectorImpl<bool> &bars,
                                         Allocation *allocation, bool circular) {
   int numClusters = clusterInfo.size();
+
+  // Adjacent-stage check (distance 1).  Every pair of consecutive clusters
+  // is separated by a barrier slot.  When adjacent stages have intersecting
+  // LDS accesses (per BlockInfo::isIntersected) the barrier must be LOCAL
+  // (ds_wait + s_barrier) so that ModuleMembarAnalysis — which runs later and
+  // does not understand warp pipeline phases — will not insert a redundant
+  // ttg.barrier local inside the pipelined region.
+  int adjEnd = circular ? numClusters : numClusters - 1;
+  for (int src = 0; src < adjEnd; src++) {
+    int next = circular ? (src + 1) % numClusters : src + 1;
+    int barrierLoc = next; // barrier sits right before `next`
+    if (bars[barrierLoc])
+      continue;
+    if (clusterInfo[src].isIntersected(
+            clusterInfo[next], mlir::triton::AMD::membarFilter, allocation)) {
+      bars[barrierLoc] = true;
+      LDBG("adjacent cluster " << src << " need fence to " << next
+                               << " placing barrier at " << barrierLoc);
+    }
+  }
+
+  // Distance-2+ check.  For each (src, next) pair separated by at least one
+  // intermediate barrier position, verify the dependency is already covered
+  // by an intervening LOCAL barrier; if not, mark the closest position.
   for (int offset = 0; offset < numClusters; offset++) {
     for (int src = 0; src < numClusters; src++) {
       int next, barrierLoc;
@@ -158,6 +182,10 @@ static void emitClusterPriority(OpBuilder &r, Location loc,
 // Returns warpLow (for reconverge) and warpHigh (consumed by phase shift).
 static std::pair<Value, Value>
 emitPipelinePrelude(OpBuilder &b, Location loc, int threadsPerPipelineGroup) {
+  // Flush any pending shared-memory (LDS) dependencies before entering the
+  // warp-pipelined region.  Without this barrier ModuleMembarAnalysis may
+  // later insert a barrier inside the first pipeline stage, which would
+  // break the carefully tuned pipeline timing.
   mlir::triton::gpu::BarrierOp::create(b, loc, triton::gpu::AddrSpace::Local);
 
   auto i32ty = b.getIntegerType(32);
@@ -507,32 +535,135 @@ static void processUnrolledPipelineRegions(ModuleOp m,
   });
 }
 
-// Check if the wrap-around cluster barrier of a converted pipelined loop
-// includes a local memory fence (ttg.barrier local).  The wrap-around barrier
-// is the last cluster barrier emitted just before the scf.yield terminator:
-//   [s_setprio]  sched_barrier  ttg.barrier_local|s_barrier  sched_barrier
-//   yield
-static bool hasLocalFenceAtWrapAround(scf::ForOp forOp) {
-  auto *yieldOp = forOp.getBody()->getTerminator();
+// Collect execute_region clusters and their materialized barrier flags from
+// a converted pipelined for-loop body.  After ConvertPipelinedForPattern the
+// loop body contains: [priority] [barrier] execute_region ... [barrier] yield.
+// Returns false if the loop body doesn't match the expected pattern.
+static bool collectLoopClusters(scf::ForOp forOp,
+                                SmallVectorImpl<Block *> &blocks,
+                                SmallVectorImpl<bool> &bars) {
+  Operation *yieldOp = forOp.getBody()->getTerminator();
   if (!yieldOp)
     return false;
-  Operation *op = yieldOp->getPrevNode();
-  if (!op || !isa<ROCDL::SchedBarrier>(op))
+  for (auto &op : *forOp.getBody()) {
+    if (auto exec = dyn_cast<scf::ExecuteRegionOp>(op)) {
+      if (!exec->hasAttr("triton.warp_pipeline.stage"))
+        continue;
+      blocks.push_back(&exec->getRegion(0).front());
+      bars.push_back(false);
+    }
+  }
+  if (blocks.empty())
     return false;
-  op = op->getPrevNode();
-  if (!op)
+
+  int K = blocks.size();
+  // Walk backwards from yield to find the wrap-around barrier (barrier[0]).
+  // Pattern: [s_setprio] sched_barrier (ttg.barrier_local|s_barrier)
+  // sched_barrier yield
+  Operation *op = yieldOp->getPrevNode();
+  if (op && isa<ROCDL::SchedBarrier>(op)) {
+    op = op->getPrevNode();
+    if (auto barrier = dyn_cast_or_null<triton::gpu::BarrierOp>(op))
+      bars[0] = barrier.hasLocal();
+  }
+
+  // Walk the loop body to find barriers between clusters (barrier[1..K-1]).
+  // After materialization each barrier sits just before its cluster's
+  // execute_region: [s_setprio] sched_barrier (barrier) sched_barrier exec.
+  for (int i = 1; i < K; i++) {
+    auto *exec = blocks[i]->getParentOp();
+    for (Operation *scan = exec->getPrevNode(); scan;
+         scan = scan->getPrevNode()) {
+      if (isa<ROCDL::SchedBarrier, ROCDL::SetPrioOp>(scan))
+        continue;
+      if (auto barrier = dyn_cast<triton::gpu::BarrierOp>(scan))
+        bars[i] = barrier.hasLocal();
+      break;
+    }
+  }
+  return true;
+}
+
+// Collect execute_region clusters from the next pipeline (flat or loop).
+// For a flat pipeline: collect the contiguous execute_regions starting at
+// `startOp`.  For a loop: collect from the for-loop body.
+static bool collectNextPipelineClusters(Operation *startOp,
+                                        SmallVectorImpl<Block *> &blocks) {
+  if (auto forOp = dyn_cast<scf::ForOp>(startOp)) {
+    for (auto &op : *forOp.getBody())
+      if (auto exec = dyn_cast<scf::ExecuteRegionOp>(op))
+        if (exec->hasAttr("triton.warp_pipeline.stage"))
+          blocks.push_back(&exec->getRegion(0).front());
+  } else if (auto exec = dyn_cast<scf::ExecuteRegionOp>(startOp)) {
+    blocks.push_back(&exec->getRegion(0).front());
+    for (Operation *op = startOp->getNextNode(); op; op = op->getNextNode()) {
+      auto nextExec = dyn_cast<scf::ExecuteRegionOp>(op);
+      if (nextExec && nextExec->hasAttr("triton.warp_pipeline.stage"))
+        blocks.push_back(&nextExec->getRegion(0).front());
+      else
+        break;
+    }
+  }
+  return !blocks.empty();
+}
+
+// Check whether merging two pipelines creates a cross-pipeline LDS dependency
+// at the boundary.  Concatenates the cluster infos and barrier flags from both
+// pipelines (with `false` at the boundary) and runs analyzePipelineDependencies
+// on the merged linear sequence.  The analysis now covers both adjacent
+// (distance-1) and longer-range pairs, so a single call is sufficient.
+//
+// Returns true if the boundary position is dependency-free and can be
+// eliminated.
+static bool isCrossPipelineSafe(ArrayRef<Block *> loopBlocks,
+                                ArrayRef<bool> loopBars,
+                                ArrayRef<Block *> nextBlocks,
+                                Allocation *allocation) {
+  int K = loopBlocks.size();
+  int M = nextBlocks.size();
+
+  SmallVector<BlockInfo> mergedInfo;
+  for (auto *b : loopBlocks)
+    mergedInfo.push_back(buildBlockInfoFromBlock(b, allocation));
+  for (auto *b : nextBlocks)
+    mergedInfo.push_back(buildBlockInfoFromBlock(b, allocation));
+
+  // Merged barrier flags: [loopBars..., false (boundary), false...]
+  // The boundary position is at index K.
+  SmallVector<bool> mergedBars;
+  for (bool b : loopBars)
+    mergedBars.push_back(b);
+  mergedBars.push_back(false); // boundary position
+  for (int i = 1; i < M; i++)
+    mergedBars.push_back(false);
+
+  analyzePipelineDependencies(mergedInfo, mergedBars, allocation,
+                              /*circular=*/false);
+
+  if (mergedBars[K]) {
+    LDBG("cross-pipeline LDS dependency at boundary");
     return false;
-  if (auto barrier = dyn_cast<triton::gpu::BarrierOp>(op))
-    return barrier.hasLocal();
-  return false;
+  }
+  return true;
 }
 
 // Eliminate redundant conditional barriers between consecutive warp-pipelined
-// regions.  When loop 1's wrap-around barrier already includes a local fence,
-// the phase shift naturally carries over into the next pipeline: the post-loop
-// reconverge and pre-pipeline phase shift cancel, and the intervening
-// pre-barrier is redundant because membar will not need to insert a barrier
-// (the wrap-around fence already resolved all pending LDS writes).
+// regions.  When two pipelines are back-to-back with no intervening
+// operations, the post-loop reconverge (cond_barrier warpLow) and the
+// pre-pipeline phase shift (cond_barrier warpHigh) cancel out — the phase
+// from the first pipeline naturally carries over.
+//
+// The prelude's ttg.barrier local (see emitPipelinePrelude) exists to flush
+// pending LDS state so ModuleMembarAnalysis won't insert barriers inside
+// pipeline stages.  When the post-loop cond_barrier is immediately followed
+// by this barrier and cross-pipeline dependency analysis confirms no LDS
+// hazard at the boundary, the barrier is also redundant.
+//
+// When the two pipelines merge, the phase offset causes stages from different
+// pipelines to execute concurrently (e.g., warp0 runs b0 while warp1 runs
+// a_{K-1}).  The cross-pipeline analysis checks all pairs (a_i, b_j) for LDS
+// conflicts, accounting for barriers already placed by each pipeline's own
+// dependency analysis.
 //
 // The "next pipeline" can be either another scf.for or a flat (unrolled)
 // pipeline represented as a sequence of scf.execute_region ops.
@@ -547,10 +678,15 @@ static bool hasLocalFenceAtWrapAround(scf::ForOp forOp) {
 //   [s_setprio P]
 //   scf.for / execute_region { pipeline 2 }
 //
-static void eliminateRedundantCondBarriers(ModuleOp m) {
+static void eliminateRedundantCondBarriers(ModuleOp m,
+                                           ModuleAllocation &moduleAllocation) {
   SmallVector<Operation *> toErase;
 
   m.walk([&](triton::FuncOp funcOp) {
+    Allocation *allocation = moduleAllocation.getFuncData(funcOp);
+    if (!allocation)
+      return;
+
     for (Block &block : funcOp.getBody()) {
       SmallVector<triton::amdgpu::CondBarrierOp> condBarriers;
       for (auto &op : block)
@@ -566,9 +702,9 @@ static void eliminateRedundantCondBarriers(ModuleOp m) {
         Operation *prev = postLoopCB->getPrevNode();
         if (prev && isa<ROCDL::SetPrioOp>(prev))
           prev = prev->getPrevNode();
-        auto prevFor = dyn_cast_or_null<scf::ForOp>(prev);
-        if (!prevFor)
+        if (!isa_and_nonnull<scf::ForOp>(prev))
           continue;
+        auto prevFor = cast<scf::ForOp>(prev);
 
         // The pre-loop cond_barrier must be followed by a warp-pipelined
         // scf.for or a flat pipeline execute_region (possibly with an
@@ -582,25 +718,30 @@ static void eliminateRedundantCondBarriers(ModuleOp m) {
         if (!nextIsPipeline)
           continue;
 
-        if (!hasLocalFenceAtWrapAround(prevFor))
+        // The post-loop cond_barrier must be immediately followed by the
+        // prelude's ttg.barrier local — this proves no operations were
+        // inserted between the two pipelines.
+        auto preBarrier =
+            dyn_cast_or_null<triton::gpu::BarrierOp>(postLoopCB->getNextNode());
+        if (!preBarrier || !preBarrier.hasLocal())
           continue;
 
-        // Find the ttg.barrier local (pre-barrier) between the two
-        // cond_barriers.
-        triton::gpu::BarrierOp preBarrier = nullptr;
-        for (Operation *op = postLoopCB->getNextNode(); op && op != preLoopCB;
-             op = op->getNextNode()) {
-          if (auto barrier = dyn_cast<triton::gpu::BarrierOp>(op)) {
-            if (barrier.hasLocal()) {
-              preBarrier = barrier;
-              break;
-            }
-          }
-        }
-        if (!preBarrier)
+        // Cross-pipeline LDS dependency analysis.  When the phase carries
+        // over, stages from different pipelines execute concurrently at the
+        // boundary.  We must verify that no uncovered LDS conflict exists.
+        SmallVector<Block *> loopBlocks, nextBlocks;
+        SmallVector<bool> loopBars;
+        if (!collectLoopClusters(prevFor, loopBlocks, loopBars))
+          continue;
+        if (!collectNextPipelineClusters(next, nextBlocks))
           continue;
+        if (!isCrossPipelineSafe(loopBlocks, loopBars, nextBlocks,
+                                 allocation)) {
+          LDBG("cross-pipeline LDS dependency at boundary — keeping barriers");
+          continue;
+        }
 
-        LDBG("eliminating redundant barriers between back-to-back loops");
+        LDBG("eliminating redundant barriers between back-to-back pipelines");
         toErase.push_back(postLoopCB);
         toErase.push_back(preBarrier);
         toErase.push_back(preLoopCB);
@@ -657,7 +798,7 @@ struct ConvertWarpPipeline
     // Must run after patternFor and flat processing (all regions converted,
     // barriers inserted) but before patternInline (inlining execute_regions
     // would flatten the IR and obscure the cond_barrier adjacency we rely on).
-    eliminateRedundantCondBarriers(m);
+    eliminateRedundantCondBarriers(m, moduleAllocation);
 
     if (failed(applyPatternsGreedily(m, std::move(patternInline))))
       signalPassFailure();

From 58f429df1b5fa4c82f41cf8f7eb9ee0329f2fa2f Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Sat, 18 Apr 2026 14:14:54 +0000
Subject: [PATCH 09/19] Collect all flat-pipeline stages for cross-pipeline dep
 analysis

collectNextPipelineClusters stopped at the first intervening
sched_barrier / cluster barrier for flat pipelines, so only b_0 was ever
visible to isCrossPipelineSafe.  A cross-pipeline LDS dep involving a
later flat stage (b_1, b_2, ...) was missed and the boundary cond_barrier
/ prelude ttg.barrier local / phase-shift cond_barrier triplet could be
wrongly eliminated.

Turn collectNextPipelineClusters into a dispatcher over the existing
collectLoopClusters and a new collectFlatClusters.  The latter walks past
intra-pipeline glue (sched_barrier, s_setprio, cluster barriers,
pre-existing async waits) so every flat stage is collected.
Also thread B's materialized barrier flags into isCrossPipelineSafe so
the merged analysis sees B's actual internal LOCAL barriers instead of
relying on re-discovery from all-false placeholders.

Add a lit test (@cross_pipeline_dep_in_b1) that fails without the fix.
---
 .../amd/amd-convert-warp-pipeline.mlir        |  96 +++++++++++
 .../ConvertWarpPipeline.cpp                   | 149 ++++++++++++------
 2 files changed, 199 insertions(+), 46 deletions(-)

diff --git a/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir b/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
index 1664f5339166..8923762dc8e2 100644
--- a/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
+++ b/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
@@ -1088,3 +1088,99 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
 // CHECK-NEXT: rocdl.sched.barrier
 //
 // CHECK: scf.yield
+
+// -----
+
+// ---- Back-to-back: cross-pipeline dep in a later flat stage (b_1) ----
+//
+// This test exercises `collectNextPipelineClusters` when the next pipeline is
+// a flat (unrolled) sequence of more than one stage.  Before the fix, only
+// the first stage (b_0) was collected, so a cross-pipeline dependency
+// involving a later stage (b_1, b_2, …) was missed and the boundary barriers
+// were wrongly eliminated.
+//
+// Layout:
+//   Loop A (2 stages): a_0 tt.store          (no LDS)
+//                      a_1 ttg.local_store   (WRITES LDS)
+//   Flat B (2 stages): b_0 tt.store          (no LDS)
+//                      b_1 ttg.local_store   (WRITES the same LDS buffer)
+//
+// A's circular analysis places its single LOCAL barrier at bars[0] (the
+// wrap-around) because the only intersecting pair is (a_1, a_1) via the
+// distance-2 self-check.  Its internal barrier between stages (bars[1]) is
+// therefore s_barrier and does NOT cover the cross-pipeline dep.
+//
+// Cross-pipeline dep: (a_1, b_1) WAW at merged distance 2, barrierLoc = K = 2
+// (the boundary).  With all other barrier slots non-LOCAL in the merged
+// sequence, the analysis must flag the boundary and preserve the post-loop
+// cond_barrier, prelude ttg.barrier local, and phase-shift cond_barrier.
+//
+// Before the fix the boundary barriers would have been removed (false
+// negative) because only b_0 was collected, making b_1 invisible to the
+// cross-pipeline analysis.
+
+#crossb_blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
+#crossb_shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [1, 0]}>
+#crossb_smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @cross_pipeline_dep_in_b1(
+      %lb: i32, %ub: i32, %step: i32,
+      %ptr: tensor<256x64x!tt.ptr<f16>, #crossb_blocked>,
+      %gptr: !tt.ptr<f32>) {
+
+    %smem = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #crossb_shared, #crossb_smem, mutable>
+    %v0 = arith.constant 0.0 : f32
+    %v1 = arith.constant 1.0 : f32
+
+    // Loop A: stage 0 no LDS, stage 1 writes %smem.
+    scf.for %i = %lb to %ub step %step : i32 {
+      scf.execute_region no_inline {
+        tt.store %gptr, %v0 : !tt.ptr<f32>
+        scf.yield
+      } {triton.warp_pipeline.stage = "a_compute"}
+
+      scf.execute_region no_inline {
+        %data = tt.load %ptr : tensor<256x64x!tt.ptr<f16>, #crossb_blocked>
+        ttg.local_store %data, %smem : tensor<256x64xf16, #crossb_blocked> -> !ttg.memdesc<256x64xf16, #crossb_shared, #crossb_smem, mutable>
+        scf.yield
+      } {triton.warp_pipeline.stage = "a_store"}
+    } {triton.warp_pipeline.pipelined_for}
+
+    // Flat B: b_0 no LDS (masks the bug), b_1 writes the same %smem (dep).
+    scf.execute_region no_inline {
+      tt.store %gptr, %v1 : !tt.ptr<f32>
+      scf.yield
+    } {triton.warp_pipeline.stage = "b_nolds"}
+
+    scf.execute_region no_inline {
+      %data = tt.load %ptr : tensor<256x64x!tt.ptr<f16>, #crossb_blocked>
+      ttg.local_store %data, %smem : tensor<256x64xf16, #crossb_blocked> -> !ttg.memdesc<256x64xf16, #crossb_shared, #crossb_smem, mutable>
+      scf.yield
+    } {triton.warp_pipeline.stage = "b_lds"}
+
+    ttg.local_dealloc %smem : !ttg.memdesc<256x64xf16, #crossb_shared, #crossb_smem, mutable>
+    tt.return
+  }
+}
+
+// CHECK-LABEL: tt.func @cross_pipeline_dep_in_b1
+// Pre-barrier and phase shift for loop A.
+// CHECK: ttg.barrier local
+// CHECK: amdg.cond_barrier
+// CHECK: scf.for
+// Loop body: a_0 (tt.store), internal s_barrier, a_1 (local_store), wrap-around LOCAL.
+// CHECK: tt.store
+// CHECK: ttg.local_store
+// CHECK: ttg.barrier local
+// Boundary barriers between loop A and flat B are KEPT because (a_1, b_1)
+// is a cross-pipeline WAW dep on %smem and A's internal barriers do NOT
+// cover the path a_1 → boundary → b_0 → b_1.
+// CHECK: amdg.cond_barrier
+// CHECK: ttg.barrier local
+// CHECK: amdg.cond_barrier
+// Flat B stages: b_0 (tt.store), internal s_barrier, b_1 (local_store).
+// CHECK: tt.store
+// CHECK: ttg.local_store
+// Reconverge cond_barrier for flat B.
+// CHECK: amdg.cond_barrier
+// CHECK: tt.return
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
index 20a69a48cbb5..a8d3a89ccb3f 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
@@ -535,9 +535,38 @@ static void processUnrolledPipelineRegions(ModuleOp m,
   });
 }
 
+// Return true if `op` is intra-pipeline glue between two clusters — the
+// sequence emitted by emitClusterBarrier/emitClusterPriority and any
+// pre-existing barrier op that emitPipelinedFlat wraps with sched_barriers.
+static bool isIntraPipelineGlue(Operation *op) {
+  return isa<ROCDL::SchedBarrier, ROCDL::SetPrioOp, ROCDL::SBarrierOp,
+             ROCDL::BarrierOp, gpu::BarrierOp, triton::gpu::BarrierOp,
+             triton::gpu::AsyncWaitOp, triton::amdgpu::AsyncWaitOp,
+             triton::amdgpu::AsyncTDMWait,
+             triton::amdgpu::AsyncTDMIntrinsicWait>(op);
+}
+
+// Walk backward from `exec`, skipping only `sched_barrier` / `s_setprio`,
+// and report whether the first other op is a LOCAL `triton::gpu::BarrierOp`.
+// Any other op there (s_barrier, async_wait, …) — or reaching the start of
+// the block — yields false: no LDS-dependency coverage is assumed.
+static bool hasLocalBarrierBefore(Operation *exec) {
+  for (Operation *scan = exec->getPrevNode(); scan;
+       scan = scan->getPrevNode()) {
+    if (isa<ROCDL::SchedBarrier, ROCDL::SetPrioOp>(scan))
+      continue;
+    if (auto barrier = dyn_cast<triton::gpu::BarrierOp>(scan))
+      return barrier.hasLocal();
+    return false;
+  }
+  return false;
+}
+
 // Collect execute_region clusters and their materialized barrier flags from
 // a converted pipelined for-loop body.  After ConvertPipelinedForPattern the
 // loop body contains: [priority] [barrier] execute_region ... [barrier] yield.
+// bars[0] corresponds to the wrap-around barrier (before yield); bars[i] for
+// i > 0 is the barrier immediately preceding cluster i.
 // Returns false if the loop body doesn't match the expected pattern.
 static bool collectLoopClusters(scf::ForOp forOp,
                                 SmallVectorImpl<Block *> &blocks,
@@ -557,9 +586,9 @@ static bool collectLoopClusters(scf::ForOp forOp,
     return false;
 
   int K = blocks.size();
-  // Walk backwards from yield to find the wrap-around barrier (barrier[0]).
+  // bars[0]: wrap-around barrier immediately before the yield.
   // Pattern: [s_setprio] sched_barrier (ttg.barrier_local|s_barrier)
-  // sched_barrier yield
+  //          sched_barrier yield
   Operation *op = yieldOp->getPrevNode();
   if (op && isa<ROCDL::SchedBarrier>(op)) {
     op = op->getPrevNode();
@@ -567,57 +596,76 @@ static bool collectLoopClusters(scf::ForOp forOp,
       bars[0] = barrier.hasLocal();
   }
 
-  // Walk the loop body to find barriers between clusters (barrier[1..K-1]).
-  // After materialization each barrier sits just before its cluster's
-  // execute_region: [s_setprio] sched_barrier (barrier) sched_barrier exec.
-  for (int i = 1; i < K; i++) {
-    auto *exec = blocks[i]->getParentOp();
-    for (Operation *scan = exec->getPrevNode(); scan;
-         scan = scan->getPrevNode()) {
-      if (isa<ROCDL::SchedBarrier, ROCDL::SetPrioOp>(scan))
-        continue;
-      if (auto barrier = dyn_cast<triton::gpu::BarrierOp>(scan))
-        bars[i] = barrier.hasLocal();
-      break;
+  // bars[1..K-1]: barrier immediately preceding each cluster's execute_region.
+  for (int i = 1; i < K; i++)
+    bars[i] = hasLocalBarrierBefore(blocks[i]->getParentOp());
+  return true;
+}
+
+// Collect execute_region clusters and their preceding barrier flags from a
+// flat (unrolled) pipeline starting at `firstExec`.  After emitPipelinedFlat
+// the sequence looks like:
+//   exec { b_0 } [s_setprio] sched_barrier (barrier) sched_barrier exec { b_1 } ...
+// bars[0] is always false (no barrier before the first cluster); bars[i] for
+// i > 0 is the barrier between b_{i-1} and b_i.
+static bool collectFlatClusters(scf::ExecuteRegionOp firstExec,
+                                SmallVectorImpl<Block *> &blocks,
+                                SmallVectorImpl<bool> &bars) {
+  if (!firstExec->hasAttr("triton.warp_pipeline.stage"))
+    return false;
+  blocks.push_back(&firstExec->getRegion(0).front());
+  bars.push_back(false);
+
+  for (Operation *op = firstExec->getNextNode(); op; op = op->getNextNode()) {
+    if (auto exec = dyn_cast<scf::ExecuteRegionOp>(op);
+        exec && exec->hasAttr("triton.warp_pipeline.stage")) {
+      blocks.push_back(&exec->getRegion(0).front());
+      bars.push_back(hasLocalBarrierBefore(op));
+      continue;
     }
+    // Walk past cluster barriers / priority / pre-existing barriers that
+    // emitPipelinedFlat may have wrapped with sched_barriers.  Anything
+    // else (e.g. cond_barrier postlude, unrelated ops) terminates the
+    // flat sequence.
+    if (isIntraPipelineGlue(op))
+      continue;
+    break;
   }
   return true;
 }
 
-// Collect execute_region clusters from the next pipeline (flat or loop).
-// For a flat pipeline: collect the contiguous execute_regions starting at
-// `startOp`.  For a loop: collect from the for-loop body.
+// Dispatch to collectLoopClusters / collectFlatClusters based on the kind of
+// the next pipeline.  The resulting bars follow the same convention as
+// collectLoopClusters: bars[0] is either a wrap-around (loop) or false (flat);
+// bars[i>0] is the barrier preceding cluster i.
 static bool collectNextPipelineClusters(Operation *startOp,
-                                        SmallVectorImpl<Block *> &blocks) {
-  if (auto forOp = dyn_cast<scf::ForOp>(startOp)) {
-    for (auto &op : *forOp.getBody())
-      if (auto exec = dyn_cast<scf::ExecuteRegionOp>(op))
-        if (exec->hasAttr("triton.warp_pipeline.stage"))
-          blocks.push_back(&exec->getRegion(0).front());
-  } else if (auto exec = dyn_cast<scf::ExecuteRegionOp>(startOp)) {
-    blocks.push_back(&exec->getRegion(0).front());
-    for (Operation *op = startOp->getNextNode(); op; op = op->getNextNode()) {
-      auto nextExec = dyn_cast<scf::ExecuteRegionOp>(op);
-      if (nextExec && nextExec->hasAttr("triton.warp_pipeline.stage"))
-        blocks.push_back(&nextExec->getRegion(0).front());
-      else
-        break;
-    }
-  }
-  return !blocks.empty();
+                                        SmallVectorImpl<Block *> &blocks,
+                                        SmallVectorImpl<bool> &bars) {
+  if (auto forOp = dyn_cast<scf::ForOp>(startOp))
+    return collectLoopClusters(forOp, blocks, bars);
+  if (auto exec = dyn_cast<scf::ExecuteRegionOp>(startOp))
+    return collectFlatClusters(exec, blocks, bars);
+  return false;
 }
 
 // Check whether merging two pipelines creates a cross-pipeline LDS dependency
 // at the boundary.  Concatenates the cluster infos and barrier flags from both
-// pipelines (with `false` at the boundary) and runs analyzePipelineDependencies
-// on the merged linear sequence.  The analysis now covers both adjacent
-// (distance-1) and longer-range pairs, so a single call is sufficient.
+// pipelines and runs analyzePipelineDependencies on the merged linear
+// sequence.
+//
+// Note on concurrency vs memory ordering: with a one-stage phase offset, the
+// only cross-warp concurrent pair at the boundary is (a_{K-1}, b_0) — all
+// other pairs execute sequentially within the same warp.  However, within a
+// warp, LDS write→read ordering still requires a LOCAL barrier (ds_wait)
+// between producer and consumer.  The distance-2+ check verifies that such a
+// barrier exists somewhere along the path.
 //
-// Returns true if the boundary position is dependency-free and can be
-// eliminated.
+// Returns true if the boundary position stays dependency-free after analysis
+// (i.e. safe to eliminate).
 static bool isCrossPipelineSafe(ArrayRef<Block *> loopBlocks,
                                 ArrayRef<bool> loopBars,
                                 ArrayRef<Block *> nextBlocks,
+                                ArrayRef<bool> nextBars,
                                 Allocation *allocation) {
   int K = loopBlocks.size();
   int M = nextBlocks.size();
@@ -628,14 +676,23 @@ static bool isCrossPipelineSafe(ArrayRef<Block *> loopBlocks,
   for (auto *b : nextBlocks)
     mergedInfo.push_back(buildBlockInfoFromBlock(b, allocation));
 
-  // Merged barrier flags: [loopBars..., false (boundary), false...]
-  // The boundary position is at index K.
+  // Merged layout: [a_0..a_{K-1}, b_0..b_{M-1}]
+  // mergedBars[i] = LOCAL barrier immediately before cluster i.
+  //   i < K     : A's internal barriers (loopBars[i]).  loopBars[0]
+  //               corresponds to A's wrap-around inside the loop body and is
+  //               never consulted by the linear distance checks (src ≥ 0).
+  //   i == K    : boundary — initialized false; this is what we decide.
+  //   i > K     : B's internal barriers (nextBars[i - K]).  nextBars[0] is
+  //               skipped: for flat B it is always false, for loop B it is
+  //               B's own wrap-around (inside B's loop body) which is
+  //               covered by B's own circular analysis.
   SmallVector<bool> mergedBars;
+  mergedBars.reserve(K + M);
   for (bool b : loopBars)
     mergedBars.push_back(b);
-  mergedBars.push_back(false); // boundary position
+  mergedBars.push_back(false); // boundary
   for (int i = 1; i < M; i++)
-    mergedBars.push_back(false);
+    mergedBars.push_back(nextBars[i]);
 
   analyzePipelineDependencies(mergedInfo, mergedBars, allocation,
                               /*circular=*/false);
@@ -730,12 +787,12 @@ static void eliminateRedundantCondBarriers(ModuleOp m,
         // over, stages from different pipelines execute concurrently at the
         // boundary.  We must verify that no uncovered LDS conflict exists.
         SmallVector<Block *> loopBlocks, nextBlocks;
-        SmallVector<bool> loopBars;
+        SmallVector<bool> loopBars, nextBars;
         if (!collectLoopClusters(prevFor, loopBlocks, loopBars))
           continue;
-        if (!collectNextPipelineClusters(next, nextBlocks))
+        if (!collectNextPipelineClusters(next, nextBlocks, nextBars))
           continue;
-        if (!isCrossPipelineSafe(loopBlocks, loopBars, nextBlocks,
+        if (!isCrossPipelineSafe(loopBlocks, loopBars, nextBlocks, nextBars,
                                  allocation)) {
           LDBG("cross-pipeline LDS dependency at boundary — keeping barriers");
           continue;

From f4876776484bd01f2ae6219fb425b6f7dfa78e9d Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Sat, 18 Apr 2026 14:36:13 +0000
Subject: [PATCH 10/19] Reflow collectFlatClusters comment for clang-format

---
 third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
index a8d3a89ccb3f..51c4157eab21 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
@@ -605,7 +605,8 @@ static bool collectLoopClusters(scf::ForOp forOp,
 // Collect execute_region clusters and their preceding barrier flags from a
 // flat (unrolled) pipeline starting at `firstExec`.  After emitPipelinedFlat
 // the sequence looks like:
-//   exec { b_0 } [s_setprio] sched_barrier (barrier) sched_barrier exec { b_1 } ...
+//   exec { b_0 } [s_setprio] sched_barrier (barrier) sched_barrier exec { b_1 }
+//   ...
 // bars[0] is always false (no barrier before the first cluster); bars[i] for
 // i > 0 is the barrier between b_{i-1} and b_i.
 static bool collectFlatClusters(scf::ExecuteRegionOp firstExec,

From 6eac9b843a37fb722393742a63c86f7f6dbd2807 Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Sat, 18 Apr 2026 20:31:00 +0000
Subject: [PATCH 11/19] Simplify analyzePipelineDependencies into a single
 distance sweep

Fold the adjacent (distance-1) and longer-distance phases of the
warp-pipeline LDS dependency analysis into one loop that sweeps `dist`
from 1 to maxDist.  A single `wrap()` helper handles modular arithmetic,
and a single `isCovered()` lambda replaces the two near-identical
`isSynced` bodies for circular and linear modes.

Also drop the redundant final iteration in circular mode (the old
`offset == N - 1` step has distance N + 1, which wraps to `dist == 1`,
so it only re-walked the already-handled adjacent pairs).

Behavior is preserved: apart from that dropped redundant iteration, the
same `(src, dst)` pairs are visited in the same order, `barrierLoc`
resolves to the same slot ((dist == 1) ? dst : wrap(dst - 1)), and the
coverage walk inspects the same `(src, barrierLoc]` range.  The resulting
`bars` pattern is unchanged.

Add a thorough doc block describing the pipeline layout, the goal,
the placement choice, the coverage check semantics, and the iteration
order, since the conventions are easy to miss.
---
 .../ConvertWarpPipeline.cpp                   | 146 ++++++++++--------
 1 file changed, 79 insertions(+), 67 deletions(-)

diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
index 51c4157eab21..a1072b0971e5 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
@@ -80,80 +80,92 @@ static BlockInfo buildBlockInfoFromBlock(Block *block, Allocation *allocation) {
   return info;
 }
 
-// Pairwise dependency analysis between pipeline clusters.
-// For each src → next pair, checks whether their memory intervals overlap.
-// If so, marks `bars[barrierLoc] = true` to indicate a fence is needed.
+// Pairwise LDS-dependency analysis between pipeline clusters.
 //
-// When `circular` is true (loop pipelines), indices wrap around modulo
-// numClusters so that the last cluster feeds back to the first.
-// When false (flat pipelines), indices are strictly linear.
+// PIPELINE LAYOUT
+// ---------------
+//   cluster:   c0    c1    c2    ...    c_{N-1}
+//   bars:    b0    b1    b2    b3   ...        b_{N-1}        (b_i sits
+//                                                              before c_i)
+//
+//   * `circular = true`  (loop pipelines): N is the number of stages and b0
+//     is the wrap-around barrier inside the loop body — sitting between
+//     c_{N-1} of one iteration and c0 of the next.
+//   * `circular = false` (flat pipelines): b0 has no physical slot (no
+//     barrier exists before the first cluster), and the schedule never
+//     wraps around.
+//
+// GOAL
+// ----
+//   For every ordered pair (src, dst) whose LDS effects intersect, guarantee
+//   that the schedule has at least one LOCAL (ds_wait + s_barrier) barrier
+//   somewhere on the path src → dst.  If no existing slot on the path is
+//   LOCAL, mark one as LOCAL.
+//
+// PLACEMENT CHOICE
+// ----------------
+//   When forced to place a LOCAL barrier we pick:
+//     dist == 1 → bars[dst]      (the only slot between src and dst)
+//     dist >  1 → bars[dst - 1]  (the second-rightmost slot on the path)
+//   The `dst - 1` choice is somewhat arbitrary — any slot in (src, dst] is
+//   correct for memory ordering — and is preserved here to match upstream
+//   behavior and existing tests.
+//
+// COVERAGE CHECK
+// --------------
+//   A pair is "covered" if any slot in (src, barrierLoc] is already LOCAL.
+//   Note that bars[dst] is intentionally NOT consulted when dist > 1; this
+//   mirrors the placement choice (we never look at, nor place into, the
+//   slot owned by the adjacent (dst-1, dst) pair).
+//
+// ITERATION ORDER
+// ---------------
+//   We sweep `dist` from 1 up to `maxDist`:
+//     * circular: maxDist = N.  dist == N is the self-loop (src == dst),
+//       which captures iter-i write vs iter-(i+1) read across the
+//       wrap-around when only one cluster touches the buffer.
+//     * flat:     maxDist = N - 1.  No wrap.
+//   Walking by increasing distance ensures the shorter-range LOCAL
+//   barriers we just placed are visible when checking longer-range pairs,
+//   skipping many redundant placements.
 static void analyzePipelineDependencies(ArrayRef<BlockInfo> clusterInfo,
                                         SmallVectorImpl<bool> &bars,
                                         Allocation *allocation, bool circular) {
-  int numClusters = clusterInfo.size();
-
-  // Adjacent-stage check (distance 1).  Every pair of consecutive clusters
-  // is separated by a barrier slot.  When adjacent stages share an LDS
-  // allocation the barrier must be LOCAL (ds_wait + s_barrier) so that
-  // ModuleMembarAnalysis — which runs later and does not understand warp
-  // pipeline phases — will not insert a redundant ttg.barrier local inside
-  // the pipelined region.
-  int adjEnd = circular ? numClusters : numClusters - 1;
-  for (int src = 0; src < adjEnd; src++) {
-    int next = circular ? (src + 1) % numClusters : src + 1;
-    int barrierLoc = next; // barrier sits right before `next`
-    if (bars[barrierLoc])
-      continue;
-    if (clusterInfo[src].isIntersected(
-            clusterInfo[next], mlir::triton::AMD::membarFilter, allocation)) {
-      bars[barrierLoc] = true;
-      LDBG("adjacent cluster " << src << " need fence to " << next
-                               << " placing barrier at " << barrierLoc);
+  const int N = clusterInfo.size();
+  const int maxDist = circular ? N : N - 1;
+
+  // Modular wrap; a no-op in linear mode where indices stay in range.
+  auto wrap = [&](int i) -> int { return circular ? (i % N + N) % N : i; };
+
+  // Returns true if any barrier slot in (src, stop] is already LOCAL.
+  // The walk starts at `src + 1` and advances one slot at a time, wrapping
+  // modulo N in circular mode; it terminates as soon as it finds a LOCAL
+  // slot or reaches `stop`.
+  auto isCovered = [&](int src, int stop) -> bool {
+    for (int i = src + 1;; i++) {
+      const int idx = wrap(i);
+      if (bars[idx])
+        return true;
+      if (idx == stop)
+        return false;
     }
-  }
-
-  // Distance-2+ check.  For each (src, next) pair separated by at least one
-  // intermediate barrier position, verify the dependency is already covered
-  // by an intervening LOCAL barrier; if not, mark the closest position.
-  for (int offset = 0; offset < numClusters; offset++) {
-    for (int src = 0; src < numClusters; src++) {
-      int next, barrierLoc;
-      if (circular) {
-        next = (src + 2 + offset) % numClusters;
-        barrierLoc = (src + 1 + offset) % numClusters;
-      } else {
-        next = src + 2 + offset;
-        barrierLoc = src + 1 + offset;
-        if (next >= numClusters || barrierLoc >= numClusters)
-          continue;
-      }
+  };
 
-      auto isSynced = [&]() -> bool {
-        if (circular) {
-          for (int idx = (src + 1) % numClusters; idx != src;
-               idx = (idx + 1) % numClusters) {
-            if (bars[idx])
-              return true;
-            if (idx == barrierLoc)
-              break;
-          }
-        } else {
-          for (int idx = src + 1; idx <= barrierLoc; idx++)
-            if (bars[idx])
-              return true;
-        }
-        return false;
-      };
-      if (isSynced())
+  for (int dist = 1; dist <= maxDist; dist++) {
+    // In linear mode, src + dist must stay in range.  In circular mode all
+    // src values are valid and dst wraps modulo N.
+    const int srcEnd = circular ? N : N - dist;
+    for (int src = 0; src < srcEnd; src++) {
+      const int dst = wrap(src + dist);
+      const int barrierLoc = (dist == 1) ? dst : wrap(dst - 1);
+      if (isCovered(src, barrierLoc))
         continue;
-
-      const bool needFence = clusterInfo[src].isIntersected(
-          clusterInfo[next], mlir::triton::AMD::membarFilter, allocation);
-      if (needFence) {
-        bars[barrierLoc] = true;
-        LDBG("cluster " << src << " need fence to " << next
-                        << " placing barrier at " << barrierLoc);
-      }
+      if (!clusterInfo[src].isIntersected(
+              clusterInfo[dst], mlir::triton::AMD::membarFilter, allocation))
+        continue;
+      bars[barrierLoc] = true;
+      LDBG("cluster " << src << " need fence to " << dst
+                      << " placing barrier at " << barrierLoc);
     }
   }
 }

From 437a6e1c5b41123819d2300f5b82aadc22c9617d Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Sat, 18 Apr 2026 22:08:48 +0000
Subject: [PATCH 12/19] Tidy comments for terminology and section consistency

- Drop the stale "distance-2+ check" reference in isCrossPipelineSafe;
  reword to match the unified sweep over `dist` in
  analyzePipelineDependencies.
- Make emitPipelinedFor and emitPipelinedFlat use parallel section
  numbering (1..5) and parallel "Circular ..." / "Linear ..." headings
  for the dependency-analysis step.
- Remove a duplicate analysis comment inside emitPipelinedFlat.
---
 .../ConvertWarpPipeline.cpp                   | 50 +++++++++++--------
 1 file changed, 29 insertions(+), 21 deletions(-)

diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
index a1072b0971e5..656801b84e95 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
@@ -82,18 +82,24 @@ static BlockInfo buildBlockInfoFromBlock(Block *block, Allocation *allocation) {
 
 // Pairwise LDS-dependency analysis between pipeline clusters.
 //
-// PIPELINE LAYOUT
-// ---------------
+// `circular` selects the index topology used by the analysis:
+//   * true  — the schedule wraps modulo N.  Used by loop pipelines (scf.for)
+//             where the wrap-around represents iter-i feeding iter-(i+1).
+//   * false — the schedule is straight-line, indices stay in [0, N).  Used
+//             by flat (unrolled) pipelines.
+// The rest of this comment uses "circular" / "linear" exclusively, since
+// the analysis only cares about topology and not about the source IR kind.
+//
+// LAYOUT
+// ------
 //   cluster:   c0    c1    c2    ...    c_{N-1}
 //   bars:    b0    b1    b2    b3   ...        b_{N-1}        (b_i sits
 //                                                              before c_i)
 //
-//   * `circular = true`  (loop pipelines): N is the number of stages and b0
-//     is the wrap-around barrier inside the loop body — sitting between
-//     c_{N-1} of one iteration and c0 of the next.
-//   * `circular = false` (flat pipelines): b0 has no physical slot (no
-//     barrier exists before the first cluster), and the schedule never
-//     wraps around.
+//   * circular: b0 is the wrap-around barrier inside the loop body —
+//     sitting between c_{N-1} of one iteration and c0 of the next.
+//   * linear:   b0 has no physical slot (no barrier exists before the first
+//     cluster), and the schedule never wraps around.
 //
 // GOAL
 // ----
@@ -124,7 +130,7 @@ static BlockInfo buildBlockInfoFromBlock(Block *block, Allocation *allocation) {
 //     * circular: maxDist = N.  dist == N is the self-loop (src == dst),
 //       which captures iter-i write vs iter-(i+1) read across the
 //       wrap-around when only one cluster touches the buffer.
-//     * flat:     maxDist = N - 1.  No wrap.
+//     * linear:   maxDist = N - 1.  No wrap.
 //   Walking by increasing distance ensures the shorter-range LOCAL
 //   barriers we just placed are visible when checking longer-range pairs,
 //   skipping many redundant placements.
@@ -362,7 +368,7 @@ class ConvertPipelinedForPattern : public OpRewritePattern<scf::ForOp> {
       }
     }
 
-    // Post-loop priority reset and reconverge.
+    // 5. Post-loop priority reset and reconverge.
     b.setInsertionPointAfter(forOp);
     emitPipelinePostlude(b, loc, anyHasPriority, warpLow);
     return success();
@@ -438,7 +444,7 @@ static void emitPipelinedFlat(SmallVector<scf::ExecuteRegionOp> &clusterOps,
   auto [warpLow, warpHigh] =
       emitPipelinePrelude(b, loc, threadsPerPipelineGroup);
 
-  // 2. Dependency analysis — linear, no wrap-around.
+  // 2. Collect cluster info.
   SmallVector<Block *> clusterBlocks;
   SmallVector<bool> bars(numClusters, false);
 
@@ -455,11 +461,11 @@ static void emitPipelinedFlat(SmallVector<scf::ExecuteRegionOp> &clusterOps,
     return op->hasAttr("triton.warp_pipeline.priority");
   });
 
-  // Linear dependency analysis (no wrap-around for flat pipelines).
+  // 3. Linear dependency analysis (no wrap-around for flat pipelines).
   analyzePipelineDependencies(clusterInfo, bars, allocation,
                               /*circular=*/false);
 
-  // 3. Materialize cluster barriers.
+  // 4. Materialize cluster barriers.
   //    Cluster 0 gets only its priority (inserted after cond_barrier above).
   //    Clusters 1..N get priority + cluster barrier, unless a pre-existing
   //    barrier op (e.g., async_wait) already exists between the clusters —
@@ -491,7 +497,7 @@ static void emitPipelinedFlat(SmallVector<scf::ExecuteRegionOp> &clusterOps,
     }
   }
 
-  // 4. Post-sequence reconverge.
+  // 5. Post-sequence reconverge.
   b.setInsertionPointAfter(clusterOps.back());
   emitPipelinePostlude(b, loc, anyHasPriority, warpLow);
 }
@@ -663,15 +669,16 @@ static bool collectNextPipelineClusters(Operation *startOp,
 
 // Check whether merging two pipelines creates a cross-pipeline LDS dependency
 // at the boundary.  Concatenates the cluster infos and barrier flags from both
-// pipelines and runs analyzePipelineDependencies on the merged linear
+// pipelines and runs analyzePipelineDependencies in linear mode on the merged
 // sequence.
 //
-// Note on concurrency vs memory ordering: with a one-stage phase offset, the
-// only cross-warp concurrent pair at the boundary is (a_{K-1}, b_0) — all
+// Note on concurrency vs memory ordering: with a one-stage phase offset the
+// only cross-warp concurrent pair at the boundary is (a_{K-1}, b_0); all
 // other pairs execute sequentially within the same warp.  However, within a
-// warp, LDS write→read ordering still requires a LOCAL barrier (ds_wait)
-// between producer and consumer.  The distance-2+ check verifies that such a
-// barrier exists somewhere along the path.
+// warp LDS write→read ordering still requires a LOCAL barrier (ds_wait)
+// between producer and consumer, so the merged analysis must check every
+// (a_i, b_j) pair, not just the concurrent one.  The single-distance sweep
+// inside analyzePipelineDependencies covers both cases uniformly.
 //
 // Returns true if the boundary position stays dependency-free after analysis
 // (i.e. safe to eliminate).
@@ -693,7 +700,8 @@ static bool isCrossPipelineSafe(ArrayRef<Block *> loopBlocks,
   // mergedBars[i] = LOCAL barrier immediately before cluster i.
   //   i < K     : A's internal barriers (loopBars[i]).  loopBars[0]
   //               corresponds to A's wrap-around inside the loop body and is
-  //               never consulted by the linear distance checks (src ≥ 0).
+  //               never consulted in linear mode (analyzePipelineDependencies
+  //               only reads bars[idx] for idx > src ≥ 0).
   //   i == K    : boundary — initialized false; this is what we decide.
   //   i > K     : B's internal barriers (nextBars[i - K]).  nextBars[0] is
   //               skipped: for flat B it is always false, for loop B it is

From 138295b100e41e75dc6ffea967b816d1463d338e Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Sun, 26 Apr 2026 20:21:52 +0000
Subject: [PATCH 13/19] WarpPipeliner: share helpers between createPipeline and
 createFlatPipeline

---
 .../TritonAMDGPUTransforms/WarpPipeliner.cpp  | 105 +++++++++---------
 1 file changed, 50 insertions(+), 55 deletions(-)

diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
index 469895b64d68..5cdc5ae5a8f0 100644
--- a/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
@@ -25,6 +25,46 @@ namespace mlir {
 #define GEN_PASS_DEF_TRITONAMDGPUWARPPIPELINE
 #include "TritonAMDGPUTransforms/Passes.h.inc"
 
+// Ops that may appear between pipeline stages but never inside one.  Pre-
+// existing memory-fence/wait ops at cluster boundaries are tolerated so that
+// prefetch patterns continue to work; encountering one mid-cluster is treated
+// as a pattern mismatch by the callers.
+static bool isPipelineIgnorable(Operation *op) {
+  return isa<ttg::AsyncWaitOp, gpu::BarrierOp, triton::gpu::BarrierOp,
+             tt::amdgpu::AsyncTDMWait>(op);
+}
+
+// True if `op` carries the cluster-end marker emitted by the frontend.
+static bool isPipelineBorder(Operation *op) {
+  return op->hasAttr("triton.warp_pipeline.border");
+}
+
+// Read (cluster-name, priority) from a border marker op.  Priority defaults
+// to -1 when the marker doesn't carry the optional priority attribute.
+static std::pair<StringAttr, int> readBorderMarker(Operation *op) {
+  StringAttr clusterStr =
+      op->getAttrOfType<StringAttr>("triton.warp_pipeline.border");
+  int priority = -1;
+  if (auto intAttr =
+          op->getAttrOfType<IntegerAttr>("triton.warp_pipeline.priority"))
+    priority = intAttr.getInt();
+  return {clusterStr, priority};
+}
+
+// If `cluster` is empty, materialize a dummy SchedBarrier so the cluster is
+// non-empty.  This lets users deliberately request a pipeline bubble by
+// emitting two consecutive border markers with no body between them.
+static void addDummyOpIfEmptyCluster(OpBuilder &b, Location loc,
+                                     Operation *insertBefore,
+                                     SmallVectorImpl<Operation *> &cluster) {
+  if (!cluster.empty())
+    return;
+  b.setInsertionPoint(insertBefore);
+  auto dummyOp = ROCDL::SchedBarrier::create(b, loc, 0);
+  dummyOp->setAttr("triton.warp_pipeline.empty_cluster", b.getUnitAttr());
+  cluster.push_back(dummyOp);
+}
+
 // Create a scf.execute_region op representing a pipeline cluster.
 static void createClusterOp(OpBuilder &b, Location loc,
                             SmallVector<Operation *> &ops,
@@ -118,43 +158,18 @@ static LogicalResult createPipeline(OpBuilder &b, Location loc,
   SmallVector<SmallVector<Operation *>> clusters;
   auto ctx = forOp.getContext();
 
-  // ops cannot be located within a cluster
-  // barrier/wait still require border op
-  auto isIgnorable = [](Operation *op) {
-    return isa<ttg::AsyncWaitOp, gpu::BarrierOp, triton::gpu::BarrierOp,
-               tt::amdgpu::AsyncTDMWait>(op);
-  };
-
-  auto isBorder = [](Operation *op) {
-    return op->hasAttr("triton.warp_pipeline.border");
-  };
-
   // One pass over the body; collect clusters split by explicit borders.
   for (Operation &opRef : llvm::make_early_inc_range(blk)) {
     Operation *op = &opRef;
-    if (isBorder(op)) { // Wrap-up one cluster at a border.
-      StringAttr clusterStr =
-          op->getAttrOfType<StringAttr>("triton.warp_pipeline.border");
-      int priority = -1;
-      if (auto intAttr =
-              op->getAttrOfType<IntegerAttr>("triton.warp_pipeline.priority")) {
-        priority = intAttr.getInt();
-      }
-      clusterMarkers.push_back({clusterStr, priority});
-      if (cluster.empty()) {
-        // This allows user to deliberately insert a pipeline bubble with a
-        // cluster only contains a dummy operation.
-        b.setInsertionPoint(op);
-        auto dummyOp = ROCDL::SchedBarrier::create(b, loc, 0);
-        dummyOp->setAttr("triton.warp_pipeline.empty_cluster", b.getUnitAttr());
-        cluster.push_back(dummyOp);
-      }
+    if (isPipelineBorder(op)) { // Wrap-up one cluster at a border.
+      clusterMarkers.push_back(readBorderMarker(op));
+      addDummyOpIfEmptyCluster(b, loc, op, cluster);
       clusters.push_back(std::move(cluster));
       cluster.clear();
       op->erase(); // remove the marker
       continue;
     }
-    if (isIgnorable(op)) {
+    if (isPipelineIgnorable(op)) {
       // Ignorable ops may appear before or after a stage, but not inside it.
       // If encountered while building an execute_region, reject warp-pipeline.
       if (!cluster.empty())
@@ -199,17 +214,9 @@ static LogicalResult createPipeline(OpBuilder &b, Location loc,
 // annotations producing border markers.  The grouping logic mirrors
 // createPipeline but without a loop wrapper.
 static LogicalResult createFlatPipeline(OpBuilder &b, Block &block) {
-  auto isIgnorable = [](Operation *op) {
-    return isa<ttg::AsyncWaitOp, gpu::BarrierOp, triton::gpu::BarrierOp,
-               tt::amdgpu::AsyncTDMWait>(op);
-  };
-  auto isBorder = [](Operation *op) {
-    return op->hasAttr("triton.warp_pipeline.border");
-  };
-
   SmallVector<Operation *> allBorders;
   for (auto &op : block)
-    if (isBorder(&op))
+    if (isPipelineBorder(&op))
       allBorders.push_back(&op);
 
   if (allBorders.size() < 2)
@@ -226,7 +233,7 @@ static LogicalResult createFlatPipeline(OpBuilder &b, Block &block) {
   for (Operation *op = firstBorder->getPrevNode(); op; op = op->getPrevNode()) {
     if (isa<scf::ForOp>(op) || isa<tt::amdgpu::CondBarrierOp>(op))
       break;
-    if (isIgnorable(op))
+    if (isPipelineIgnorable(op))
       break;
     regionStart = op;
   }
@@ -239,21 +246,9 @@ static LogicalResult createFlatPipeline(OpBuilder &b, Block &block) {
     Operation *op = &*it;
     ++it;
 
-    if (isBorder(op)) {
-      StringAttr clusterStr =
-          op->getAttrOfType<StringAttr>("triton.warp_pipeline.border");
-      int priority = -1;
-      if (auto intAttr =
-              op->getAttrOfType<IntegerAttr>("triton.warp_pipeline.priority"))
-        priority = intAttr.getInt();
-      clusterMarkers.push_back({clusterStr, priority});
-
-      if (cluster.empty()) {
-        b.setInsertionPoint(op);
-        auto dummyOp = ROCDL::SchedBarrier::create(b, loc, 0);
-        dummyOp->setAttr("triton.warp_pipeline.empty_cluster", b.getUnitAttr());
-        cluster.push_back(dummyOp);
-      }
+    if (isPipelineBorder(op)) {
+      clusterMarkers.push_back(readBorderMarker(op));
+      addDummyOpIfEmptyCluster(b, loc, op, cluster);
       clusters.push_back(std::move(cluster));
       cluster.clear();
 
@@ -264,7 +259,7 @@ static LogicalResult createFlatPipeline(OpBuilder &b, Block &block) {
       continue;
     }
 
-    if (isIgnorable(op)) {
+    if (isPipelineIgnorable(op)) {
       if (!cluster.empty())
         return failure();
       continue;

From 4b3c2dee91d7369b89ef108319307111c4236d99 Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Sun, 26 Apr 2026 20:26:53 +0000
Subject: [PATCH 14/19] ConvertWarpPipeline: introduce
 isWarpPipelineIgnorableBarrier and getPipelineStage helpers

---
 .../ConvertWarpPipeline.cpp                   | 61 +++++++++++--------
 1 file changed, 34 insertions(+), 27 deletions(-)

diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
index 656801b84e95..a1ce910b3559 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
@@ -80,6 +80,28 @@ static BlockInfo buildBlockInfoFromBlock(Block *block, Allocation *allocation) {
   return info;
 }
 
+// Pre-existing barrier/wait ops that may legally appear at cluster
+// boundaries (between stages or before/after a pipeline).  Mirrors
+// isPipelineIgnorable in WarpPipeliner.cpp plus the ROCDL-lowered forms that
+// can appear after intermediate passes.
+static bool isWarpPipelineIgnorableBarrier(Operation *op) {
+  return isa<ROCDL::BarrierOp, gpu::BarrierOp, triton::gpu::AsyncWaitOp,
+             triton::amdgpu::AsyncWaitOp, triton::amdgpu::AsyncTDMWait,
+             triton::amdgpu::AsyncTDMIntrinsicWait>(op);
+}
+
+// True if `exec` is a stage created by the warp-pipeline frontend.
+static bool isPipelineStage(scf::ExecuteRegionOp exec) {
+  return exec && exec->hasAttr("triton.warp_pipeline.stage");
+}
+
+// dyn_cast<scf::ExecuteRegionOp> + warp_pipeline.stage marker check.
+// Returns null when `op` is not a pipeline stage.
+static scf::ExecuteRegionOp getPipelineStage(Operation *op) {
+  auto exec = dyn_cast_or_null<scf::ExecuteRegionOp>(op);
+  return isPipelineStage(exec) ? exec : nullptr;
+}
+
 // Pairwise LDS-dependency analysis between pipeline clusters.
 //
 // `circular` selects the index topology used by the analysis:
@@ -281,15 +303,13 @@ class ConvertPipelinedForPattern : public OpRewritePattern<scf::ForOp> {
     for (auto &op : *forOp.getBody()) {
       if (auto exeOp = dyn_cast<scf::ExecuteRegionOp>(op)) {
         // Fail conversion with executeRegion from unkown source.
-        if (exeOp->getAttr("triton.warp_pipeline.stage") == nullptr)
+        if (!isPipelineStage(exeOp))
           return failure();
         exeOp.setNoInline(false);
         clusterOps.push_back(&op);
         clusterBlocks.push_back(&exeOp->getRegion(0).front());
         bars.push_back(false);
-      } else if (isa<ROCDL::BarrierOp, gpu::BarrierOp, triton::gpu::AsyncWaitOp,
-                     triton::amdgpu::AsyncWaitOp, triton::amdgpu::AsyncTDMWait,
-                     triton::amdgpu::AsyncTDMIntrinsicWait>(op)) {
+      } else if (isWarpPipelineIgnorableBarrier(&op)) {
         int currCluster = clusterBlocks.size();
         // Reject if multiple barriers appear without an intervening cluster.
         // This is functionally valid but may cause unpredictable timing. Users
@@ -390,7 +410,7 @@ class InlineWarpPipelineExecuteRegionPattern
       return rewriter.notifyMatchFailure(exec, "explicit no_inline");
 
     // Only inline the stages created by the warp-pipeline frontend.
-    if (!exec->getAttr("triton.warp_pipeline.stage"))
+    if (!isPipelineStage(exec))
       return rewriter.notifyMatchFailure(exec, "not a warp-pipeline stage");
 
     // Make sure this pattern is applied after transforming pipelined forOp
@@ -476,9 +496,7 @@ static void emitPipelinedFlat(SmallVector<scf::ExecuteRegionOp> &clusterOps,
     Operation *existingBarrier = nullptr;
     for (Operation *op = clusterOps[i - 1]->getNextNode();
          op && op != clusterOps[i].getOperation(); op = op->getNextNode()) {
-      if (isa<ROCDL::BarrierOp, gpu::BarrierOp, triton::gpu::AsyncWaitOp,
-              triton::amdgpu::AsyncWaitOp, triton::amdgpu::AsyncTDMWait,
-              triton::amdgpu::AsyncTDMIntrinsicWait>(op)) {
+      if (isWarpPipelineIgnorableBarrier(op)) {
         existingBarrier = op;
         break;
       }
@@ -508,12 +526,6 @@ static void emitPipelinedFlat(SmallVector<scf::ExecuteRegionOp> &clusterOps,
 static void processUnrolledPipelineRegions(ModuleOp m,
                                            ModuleAllocation &moduleAllocation,
                                            int threadsPerPipelineGroup) {
-  auto isIgnorable = [](Operation *op) {
-    return isa<ROCDL::BarrierOp, gpu::BarrierOp, triton::gpu::AsyncWaitOp,
-               triton::amdgpu::AsyncWaitOp, triton::amdgpu::AsyncTDMWait,
-               triton::amdgpu::AsyncTDMIntrinsicWait>(op);
-  };
-
   m.walk([&](triton::FuncOp funcOp) {
     Allocation *allocation = moduleAllocation.getFuncData(funcOp);
     if (!allocation)
@@ -526,14 +538,13 @@ static void processUnrolledPipelineRegions(ModuleOp m,
       SmallVector<scf::ExecuteRegionOp> current;
 
       for (auto &op : block) {
-        if (auto exec = dyn_cast<scf::ExecuteRegionOp>(&op)) {
-          if (exec->hasAttr("triton.warp_pipeline.stage") &&
-              !isa<scf::ForOp>(exec->getParentOp())) {
+        if (auto exec = getPipelineStage(&op)) {
+          if (!isa<scf::ForOp>(exec->getParentOp())) {
             current.push_back(exec);
             continue;
           }
         }
-        if (isIgnorable(&op))
+        if (isWarpPipelineIgnorableBarrier(&op))
           continue;
         if (!current.empty()) {
           sequences.push_back(std::move(current));
@@ -593,9 +604,7 @@ static bool collectLoopClusters(scf::ForOp forOp,
   if (!yieldOp)
     return false;
   for (auto &op : *forOp.getBody()) {
-    if (auto exec = dyn_cast<scf::ExecuteRegionOp>(op)) {
-      if (!exec->hasAttr("triton.warp_pipeline.stage"))
-        continue;
+    if (auto exec = getPipelineStage(&op)) {
       blocks.push_back(&exec->getRegion(0).front());
       bars.push_back(false);
     }
@@ -630,14 +639,13 @@ static bool collectLoopClusters(scf::ForOp forOp,
 static bool collectFlatClusters(scf::ExecuteRegionOp firstExec,
                                 SmallVectorImpl<Block *> &blocks,
                                 SmallVectorImpl<bool> &bars) {
-  if (!firstExec->hasAttr("triton.warp_pipeline.stage"))
+  if (!isPipelineStage(firstExec))
     return false;
   blocks.push_back(&firstExec->getRegion(0).front());
   bars.push_back(false);
 
   for (Operation *op = firstExec->getNextNode(); op; op = op->getNextNode()) {
-    if (auto exec = dyn_cast<scf::ExecuteRegionOp>(op);
-        exec && exec->hasAttr("triton.warp_pipeline.stage")) {
+    if (auto exec = getPipelineStage(op)) {
       blocks.push_back(&exec->getRegion(0).front());
       bars.push_back(hasLocalBarrierBefore(op));
       continue;
@@ -790,9 +798,8 @@ static void eliminateRedundantCondBarriers(ModuleOp m,
         Operation *next = preLoopCB->getNextNode();
         if (next && isa<ROCDL::SetPrioOp>(next))
           next = next->getNextNode();
-        bool nextIsPipeline = isa_and_nonnull<scf::ForOp>(next) ||
-                              (isa_and_nonnull<scf::ExecuteRegionOp>(next) &&
-                               next->hasAttr("triton.warp_pipeline.stage"));
+        bool nextIsPipeline =
+            isa_and_nonnull<scf::ForOp>(next) || getPipelineStage(next);
         if (!nextIsPipeline)
           continue;
 

From 60c50fcc30b31f9caaf9083abdd384e1986f3e44 Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Sun, 26 Apr 2026 20:27:56 +0000
Subject: [PATCH 15/19] WarpPipeliner: add step-numbered comments to
 createFlatPipeline

---
 .../lib/TritonAMDGPUTransforms/WarpPipeliner.cpp   | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
index 5cdc5ae5a8f0..a1015c90f5bb 100644
--- a/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
@@ -214,6 +214,8 @@ static LogicalResult createPipeline(OpBuilder &b, Location loc,
 // annotations producing border markers.  The grouping logic mirrors
 // createPipeline but without a loop wrapper.
 static LogicalResult createFlatPipeline(OpBuilder &b, Block &block) {
+  // 1. Find all border markers in this block.  Need at least two to form
+  //    a pipeline (one per stage boundary).
   SmallVector<Operation *> allBorders;
   for (auto &op : block)
     if (isPipelineBorder(&op))
@@ -226,9 +228,10 @@ static LogicalResult createFlatPipeline(OpBuilder &b, Block &block) {
   Operation *firstBorder = allBorders.front();
   Operation *lastBorder = allBorders.back();
 
-  // Walk backwards from the first border to find the start of the first
-  // stage.  Stop at control-flow boundaries (scf.for, cond_barrier) or
-  // ignorable ops that logically belong to a previous pipeline.
+  // 2. Locate the start of the first stage.  Unlike createPipeline, the flat
+  //    sequence has no loop body to anchor against, so walk backwards from
+  //    the first border, stopping at control-flow boundaries (scf.for,
+  //    cond_barrier) or ignorable ops belonging to a previous pipeline.
   Operation *regionStart = firstBorder;
   for (Operation *op = firstBorder->getPrevNode(); op; op = op->getPrevNode()) {
     if (isa<scf::ForOp>(op) || isa<tt::amdgpu::CondBarrierOp>(op))
@@ -238,6 +241,9 @@ static LogicalResult createFlatPipeline(OpBuilder &b, Block &block) {
     regionStart = op;
   }
 
+  // 3. Sweep forward from regionStart, splitting ops into clusters at each
+  //    border.  Mirrors createPipeline's main loop, but bounded by lastBorder
+  //    instead of scf.yield.
   SmallVector<Operation *> cluster;
   SmallVector<std::pair<StringAttr, int>> clusterMarkers;
   SmallVector<SmallVector<Operation *>> clusters;
@@ -268,6 +274,8 @@ static LogicalResult createFlatPipeline(OpBuilder &b, Block &block) {
     cluster.push_back(op);
   }
 
+  // 4. Materialize each cluster as an execute_region.  Bail out if fewer than
+  //    two real clusters survived (e.g., dummies-only).
   if (clusters.size() < 2)
     return failure();
 

From 611167a1fec8dab7620cf5e7baf4f7e52a1dabb3 Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Mon, 27 Apr 2026 17:27:46 +0000
Subject: [PATCH 16/19] address review comments

---
 .../amd-convert-warp-pipeline-invalid.mlir    | 134 ++++++++
 .../amd/amd-convert-warp-pipeline.mlir        | 278 ++++++++++++++---
 .../amd/amd-warp-pipeline-invalid.mlir        | 117 +++++++
 test/TritonGPU/amd/amd-warp-pipeline.mlir     |  21 +-
 .../ConvertWarpPipeline.cpp                   | 290 ++++++++++++------
 .../TritonAMDGPUTransforms/WarpPipeliner.cpp  | 180 ++++++++---
 6 files changed, 839 insertions(+), 181 deletions(-)
 create mode 100644 test/TritonGPU/amd/amd-convert-warp-pipeline-invalid.mlir
 create mode 100644 test/TritonGPU/amd/amd-warp-pipeline-invalid.mlir

diff --git a/test/TritonGPU/amd/amd-convert-warp-pipeline-invalid.mlir b/test/TritonGPU/amd/amd-convert-warp-pipeline-invalid.mlir
new file mode 100644
index 000000000000..81db83cd436b
--- /dev/null
+++ b/test/TritonGPU/amd/amd-convert-warp-pipeline-invalid.mlir
@@ -0,0 +1,134 @@
+// RUN: triton-opt %s -split-input-file -convert-warp-pipeline="arch=gfx950" -verify-diagnostics
+
+// validatePipelinedForBody runs upfront, before any IR mutation, so a
+// malformed `pipelined_for` body fails the pass with no partial conversion.
+
+// ==== Non-warp-pipeline scf.execute_region inside a pipelined_for body ====
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @bad_unmarked_execute_region(%n: index, %ptr: !tt.ptr<f32>) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %v0 = arith.constant 0.0 : f32
+    %v1 = arith.constant 1.0 : f32
+
+    scf.for %i = %c0 to %n step %c1 {
+      scf.execute_region {
+        tt.store %ptr, %v0 : !tt.ptr<f32>
+        scf.yield
+      } {triton.warp_pipeline.stage = "stage0"}
+
+      // expected-error @+1 {{non-warp-pipeline scf.execute_region inside pipelined_for body}}
+      scf.execute_region {
+        tt.store %ptr, %v1 : !tt.ptr<f32>
+        scf.yield
+      }
+
+      scf.yield
+    } {triton.warp_pipeline.pipelined_for}
+
+    tt.return
+  }
+}
+
+// -----
+
+// ==== Multiple pre-existing barriers between two stages ====
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @bad_double_barrier_between_stages(%n: index, %ptr: !tt.ptr<f32>) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %v0 = arith.constant 0.0 : f32
+    %v1 = arith.constant 1.0 : f32
+
+    scf.for %i = %c0 to %n step %c1 {
+      scf.execute_region {
+        tt.store %ptr, %v0 : !tt.ptr<f32>
+        scf.yield
+      } {triton.warp_pipeline.stage = "stage0"}
+
+      amdg.async_wait {num_inst = 0 : i32}
+      // expected-error @+1 {{multiple pre-existing barriers between pipeline stages}}
+      amdg.async_wait {num_inst = 0 : i32}
+
+      scf.execute_region {
+        tt.store %ptr, %v1 : !tt.ptr<f32>
+        scf.yield
+      } {triton.warp_pipeline.stage = "stage1"}
+
+      scf.yield
+    } {triton.warp_pipeline.pipelined_for}
+
+    tt.return
+  }
+}
+
+// -----
+
+// ==== Both top-of-loop and bottom-of-loop pre-existing barriers ====
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @bad_top_and_bottom_barriers(%n: index, %ptr: !tt.ptr<f32>) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %v0 = arith.constant 0.0 : f32
+    %v1 = arith.constant 1.0 : f32
+
+    // expected-error @+1 {{both top-of-loop and bottom-of-loop pre-existing barriers}}
+    scf.for %i = %c0 to %n step %c1 {
+      amdg.async_wait {num_inst = 0 : i32}
+
+      scf.execute_region {
+        tt.store %ptr, %v0 : !tt.ptr<f32>
+        scf.yield
+      } {triton.warp_pipeline.stage = "stage0"}
+
+      scf.execute_region {
+        tt.store %ptr, %v1 : !tt.ptr<f32>
+        scf.yield
+      } {triton.warp_pipeline.stage = "stage1"}
+
+      amdg.async_wait {num_inst = 0 : i32}
+
+      scf.yield
+    } {triton.warp_pipeline.pipelined_for}
+
+    tt.return
+  }
+}
+
+// -----
+
+// ==== Unexpected op inside a pipelined_for body ====
+//
+// Anything that is not a warp-pipeline stage, an ignorable barrier/wait,
+// or scf.yield must be rejected upfront.
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @bad_unexpected_op_in_body(%n: index, %ptr: !tt.ptr<f32>) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %v0 = arith.constant 0.0 : f32
+    %v1 = arith.constant 1.0 : f32
+
+    scf.for %i = %c0 to %n step %c1 {
+      scf.execute_region {
+        tt.store %ptr, %v0 : !tt.ptr<f32>
+        scf.yield
+      } {triton.warp_pipeline.stage = "stage0"}
+
+      // expected-error @+1 {{unexpected op inside pipelined_for body}}
+      %x = arith.addi %i, %c1 : index
+
+      scf.execute_region {
+        tt.store %ptr, %v1 : !tt.ptr<f32>
+        scf.yield
+      } {triton.warp_pipeline.stage = "stage1"}
+
+      scf.yield
+    } {triton.warp_pipeline.pipelined_for}
+
+    tt.return
+  }
+}
diff --git a/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir b/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
index 8923762dc8e2..dc633d40946c 100644
--- a/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
+++ b/test/TritonGPU/amd/amd-convert-warp-pipeline.mlir
@@ -448,19 +448,25 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
 
 // -----
 
-// ---- Back-to-back: cross-pipeline LDS dependency prevents elimination ----
+// ---- Back-to-back: cross-pipeline LDS dep covered by A's wrap-around ----
 //
-// Both loops access the same shared buffer (read + write).  When merged,
-// warp0 at b0 (reads smem) runs concurrently with warp1 at a1 (writes smem)
-// — a RAW hazard.  The boundary barriers must be kept.
+// Both loops access the same shared buffer (read + write).  Loop 1's
+// stage1 writes smem and loop 2's stage0 reads it — a cross-pipeline RAW.
+//
+// Loop 1's wrap-around barrier (bars[0]) is LOCAL because of the in-loop
+// RAW between stage1 (write) and the next iteration's stage0 (read).
+// That barrier physically sits at the bottom of loop 1's body and is the
+// most recent LDS sync after the loop exits, so it already covers the
+// (a1, b0) cross-pipeline dep at the boundary.  The boundary barriers
+// can therefore be eliminated.
 //
 // Expected:
 //   ttg.barrier local          (pre-barrier for loop 1)
 //   amdg.cond_barrier          (#1 phase shift for loop 1)
 //   scf.for { loop 1 }
-//   amdg.cond_barrier          (#2 post-loop reconverge — kept)
-//   ttg.barrier local          (pre-barrier for loop 2 — kept)
-//   amdg.cond_barrier          (#3 phase shift for loop 2 — kept)
+//   NO amdg.cond_barrier       (#2 eliminated — wrap-around covers)
+//   NO ttg.barrier local       (prelude eliminated)
+//   NO amdg.cond_barrier       (#3 eliminated)
 //   scf.for { loop 2 }
 //   amdg.cond_barrier          (#4 post-loop reconverge for loop 2)
 
@@ -469,7 +475,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
 #b2b_shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [1, 0]}>
 #b2b_smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
-  tt.func @back_to_back_cross_dep_kept(
+  tt.func @back_to_back_wrap_around_covers_dep(
       %lb: i32, %ub: i32, %step: i32,
       %acc: tensor<256x256xf32, #b2b_mma>,
       %ptr: tensor<256x64x!tt.ptr<f16>, #b2b_blocked>) {
@@ -519,19 +525,18 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
   }
 }
 
-// CHECK-LABEL: tt.func @back_to_back_cross_dep_kept
+// CHECK-LABEL: tt.func @back_to_back_wrap_around_covers_dep
 // Pre-barrier and phase shift for loop 1.
 // CHECK: ttg.barrier local
 // CHECK: amdg.cond_barrier
 // CHECK: scf.for
-// Wrap-around barrier inside loop 1.
+// Wrap-around barrier inside loop 1 (LOCAL — covers cross-pipeline dep).
 // CHECK: ttg.barrier local
 // CHECK: scf.yield
-// Cross-pipeline LDS dependency (a1 writes smem, b0 reads smem) →
-// barriers between the two loops are KEPT.
-// CHECK: amdg.cond_barrier
-// CHECK: ttg.barrier local
-// CHECK: amdg.cond_barrier
+// Boundary barriers are eliminated: A's wrap-around already provides the
+// LDS sync needed for loop 2's first read; phase carries over.
+// CHECK-NOT: amdg.cond_barrier
+// CHECK-NOT: ttg.barrier local
 // CHECK: scf.for
 // Post-loop reconverge for loop 2.
 // CHECK: amdg.cond_barrier
@@ -1100,50 +1105,59 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
 // were wrongly eliminated.
 //
 // Layout:
-//   Loop A (2 stages): a_0 tt.store          (no LDS)
-//                      a_1 ttg.local_store   (WRITES LDS)
-//   Flat B (2 stages): b_0 tt.store          (no LDS)
-//                      b_1 ttg.local_store   (WRITES the same LDS buffer)
+//   Loop A (2 stages): a_0 tt.store         (no LDS)
+//                      a_1 ttg.local_load   (READS LDS)
+//   Flat B (2 stages): b_0 tt.store         (no LDS)
+//                      b_1 ttg.local_store  (WRITES the same LDS buffer)
 //
-// A's circular analysis places its single LOCAL barrier at bars[0] (the
-// wrap-around) because the only intersecting pair is (a_1, a_1) via the
-// distance-2 self-check.  Its internal barrier between stages (bars[1]) is
-// therefore s_barrier and does NOT cover the cross-pipeline dep.
+// A's circular analysis finds no intersecting pair (a_1's read does not
+// conflict with itself or with a_0), so all of A's bars are non-LOCAL.
+// In particular the wrap-around bars[0] is FALSE, so it cannot seed
+// coverage for the merged boundary slot.
 //
-// Cross-pipeline dep: (a_1, b_1) WAW at merged distance 2, barrierLoc = K = 2
-// (the boundary).  With all other barrier slots non-LOCAL in the merged
-// sequence, the analysis must flag the boundary and preserve the post-loop
+// Cross-pipeline dep: (a_1, b_1) WAR at merged distance 2, barrierLoc = K = 2
+// (the boundary).  No other slot on the path from a_1 to b_1 is LOCAL, so
+// the analysis must flag the boundary and preserve the post-loop
 // cond_barrier, prelude ttg.barrier local, and phase-shift cond_barrier.
 //
-// Before the fix the boundary barriers would have been removed (false
-// negative) because only b_0 was collected, making b_1 invisible to the
-// cross-pipeline analysis.
+// Before the collectNextPipelineClusters fix, the boundary barriers would
+// have been removed (false negative) because only b_0 was collected, making
+// b_1 invisible to the cross-pipeline analysis.
 
 #crossb_blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
+#crossb_mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [2, 4], instrShape = [16, 16, 16], isTransposed = true}>
 #crossb_shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [1, 0]}>
 #crossb_smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
   tt.func @cross_pipeline_dep_in_b1(
       %lb: i32, %ub: i32, %step: i32,
+      %acc: tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #crossb_mma, kWidth = 4}>>,
       %ptr: tensor<256x64x!tt.ptr<f16>, #crossb_blocked>,
-      %gptr: !tt.ptr<f32>) {
+      %gptr: !tt.ptr<f32>,
+      %dst: tensor<256x16x!tt.ptr<f16>, #ttg.dot_op<{opIdx = 0, parent = #crossb_mma, kWidth = 4}>>) {
 
     %smem = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #crossb_shared, #crossb_smem, mutable>
     %v0 = arith.constant 0.0 : f32
     %v1 = arith.constant 1.0 : f32
 
-    // Loop A: stage 0 no LDS, stage 1 writes %smem.
-    scf.for %i = %lb to %ub step %step : i32 {
+    // Loop A: stage 0 no LDS, stage 1 reads %smem.  The loaded value is
+    // threaded through iter_args + used after the loop so the execute_region
+    // (and its ttg.local_load) survives DCE before the redundant-barrier pass.
+    %final = scf.for %i = %lb to %ub step %step
+        iter_args(%cur = %acc)
+        -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #crossb_mma, kWidth = 4}>> : i32 {
       scf.execute_region no_inline {
         tt.store %gptr, %v0 : !tt.ptr<f32>
         scf.yield
       } {triton.warp_pipeline.stage = "a_compute"}
 
-      scf.execute_region no_inline {
-        %data = tt.load %ptr : tensor<256x64x!tt.ptr<f16>, #crossb_blocked>
-        ttg.local_store %data, %smem : tensor<256x64xf16, #crossb_blocked> -> !ttg.memdesc<256x64xf16, #crossb_shared, #crossb_smem, mutable>
-        scf.yield
-      } {triton.warp_pipeline.stage = "a_store"}
+      %ld = scf.execute_region -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #crossb_mma, kWidth = 4}>> no_inline {
+        %sub = ttg.memdesc_subslice %smem[0, 0] : !ttg.memdesc<256x64xf16, #crossb_shared, #crossb_smem, mutable> -> !ttg.memdesc<256x16xf16, #crossb_shared, #crossb_smem, mutable, 256x64>
+        %v = ttg.local_load %sub : !ttg.memdesc<256x16xf16, #crossb_shared, #crossb_smem, mutable, 256x64> -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #crossb_mma, kWidth = 4}>>
+        scf.yield %v : tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #crossb_mma, kWidth = 4}>>
+      } {triton.warp_pipeline.stage = "a_load"}
+
+      scf.yield %ld : tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #crossb_mma, kWidth = 4}>>
     } {triton.warp_pipeline.pipelined_for}
 
     // Flat B: b_0 no LDS (masks the bug), b_1 writes the same %smem (dep).
@@ -1158,6 +1172,11 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
       scf.yield
     } {triton.warp_pipeline.stage = "b_lds"}
 
+    // Use %final after flat B so the loop's iter_arg result is observed and
+    // the local_load execute_region survives DCE — without breaking the
+    // back-to-back boundary between loop A and flat B.
+    tt.store %dst, %final : tensor<256x16x!tt.ptr<f16>, #ttg.dot_op<{opIdx = 0, parent = #crossb_mma, kWidth = 4}>>
+
     ttg.local_dealloc %smem : !ttg.memdesc<256x64xf16, #crossb_shared, #crossb_smem, mutable>
     tt.return
   }
@@ -1168,13 +1187,13 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
 // CHECK: ttg.barrier local
 // CHECK: amdg.cond_barrier
 // CHECK: scf.for
-// Loop body: a_0 (tt.store), internal s_barrier, a_1 (local_store), wrap-around LOCAL.
+// Loop body: a_0 (tt.store), internal s_barrier, a_1 (local_load).
 // CHECK: tt.store
-// CHECK: ttg.local_store
-// CHECK: ttg.barrier local
+// CHECK: rocdl.s.barrier
+// CHECK: ttg.local_load
 // Boundary barriers between loop A and flat B are KEPT because (a_1, b_1)
-// is a cross-pipeline WAW dep on %smem and A's internal barriers do NOT
-// cover the path a_1 → boundary → b_0 → b_1.
+// is a cross-pipeline WAR dep on %smem and no LOCAL barrier on the path
+// a_1 → boundary → b_0 → b_1 covers it (A's wrap-around is not LOCAL).
 // CHECK: amdg.cond_barrier
 // CHECK: ttg.barrier local
 // CHECK: amdg.cond_barrier
@@ -1184,3 +1203,178 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
 // Reconverge cond_barrier for flat B.
 // CHECK: amdg.cond_barrier
 // CHECK: tt.return
+
+// -----
+
+// ---- Back-to-back: cross-pipeline dep where placement falls inside A ----
+//
+// Companion to @cross_pipeline_dep_in_b1.  Where that test puts the
+// uncovered cross-pipeline pair at `a_1` -> `b_1` (so the placement falls at
+// boundary slot K), this one engineers a pair at distance == K from `a_0`
+// to `b_0` so the placement falls at slot K-1 — *inside* A's body.
+// isCrossPipelineSafe must still flag this as unsafe: the explicit
+// cross-pipeline-pair sweep walks (src, barrierLoc] for coverage and finds
+// no LOCAL slot in A (loopBars[1..K-1] are all false).
+//
+// Layout:
+//   Loop A (2 stages): a_0 ttg.local_load   (READS LDS)
+//                      a_1 tt.store         (no LDS)
+//   Flat B (2 stages): b_0 ttg.local_store  (WRITES the same LDS buffer)
+//                      b_1 tt.store         (no LDS)
+//
+// A's circular analysis: a_0 read-read with itself, no intersection with a_1;
+// loopBars = [false, false] and the wrap-around is non-LOCAL.
+//
+// Cross-pipeline dep (a_0, b_0) WAR on %smem at merged distance K=2 →
+// barrierLoc = dst-1 = 1.  isCovered(0, 1) walks slot 1 (loopBars[1]=false)
+// and returns false; the pair intersects and is uncovered → unsafe.
+// Boundary barriers must be kept.
+
+#crossa_blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
+#crossa_mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [2, 4], instrShape = [16, 16, 16], isTransposed = true}>
+#crossa_shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [1, 0]}>
+#crossa_smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @cross_pipeline_dep_in_a0(
+      %lb: i32, %ub: i32, %step: i32,
+      %acc: tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #crossa_mma, kWidth = 4}>>,
+      %ptr: tensor<256x64x!tt.ptr<f16>, #crossa_blocked>,
+      %gptr: !tt.ptr<f32>,
+      %dst: tensor<256x16x!tt.ptr<f16>, #ttg.dot_op<{opIdx = 0, parent = #crossa_mma, kWidth = 4}>>) {
+
+    %smem = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #crossa_shared, #crossa_smem, mutable>
+    %v0 = arith.constant 0.0 : f32
+
+    // Loop A: stage 0 reads %smem (threaded through iter_args so the
+    // local_load survives DCE), stage 1 no LDS.
+    %final = scf.for %i = %lb to %ub step %step
+        iter_args(%cur = %acc)
+        -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #crossa_mma, kWidth = 4}>> : i32 {
+      %ld = scf.execute_region -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #crossa_mma, kWidth = 4}>> no_inline {
+        %sub = ttg.memdesc_subslice %smem[0, 0] : !ttg.memdesc<256x64xf16, #crossa_shared, #crossa_smem, mutable> -> !ttg.memdesc<256x16xf16, #crossa_shared, #crossa_smem, mutable, 256x64>
+        %v = ttg.local_load %sub : !ttg.memdesc<256x16xf16, #crossa_shared, #crossa_smem, mutable, 256x64> -> tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #crossa_mma, kWidth = 4}>>
+        scf.yield %v : tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #crossa_mma, kWidth = 4}>>
+      } {triton.warp_pipeline.stage = "a_load"}
+
+      scf.execute_region no_inline {
+        tt.store %gptr, %v0 : !tt.ptr<f32>
+        scf.yield
+      } {triton.warp_pipeline.stage = "a_compute"}
+
+      scf.yield %ld : tensor<256x16xf16, #ttg.dot_op<{opIdx = 0, parent = #crossa_mma, kWidth = 4}>>
+    } {triton.warp_pipeline.pipelined_for}
+
+    // Flat B: b_0 writes %smem (the dep), b_1 no LDS.
+    scf.execute_region no_inline {
+      %data = tt.load %ptr : tensor<256x64x!tt.ptr<f16>, #crossa_blocked>
+      ttg.local_store %data, %smem : tensor<256x64xf16, #crossa_blocked> -> !ttg.memdesc<256x64xf16, #crossa_shared, #crossa_smem, mutable>
+      scf.yield
+    } {triton.warp_pipeline.stage = "b_lds"}
+
+    scf.execute_region no_inline {
+      tt.store %gptr, %v0 : !tt.ptr<f32>
+      scf.yield
+    } {triton.warp_pipeline.stage = "b_nolds"}
+
+    tt.store %dst, %final : tensor<256x16x!tt.ptr<f16>, #ttg.dot_op<{opIdx = 0, parent = #crossa_mma, kWidth = 4}>>
+
+    ttg.local_dealloc %smem : !ttg.memdesc<256x64xf16, #crossa_shared, #crossa_smem, mutable>
+    tt.return
+  }
+}
+
+// CHECK-LABEL: tt.func @cross_pipeline_dep_in_a0
+// Pre-barrier and phase shift for loop A.
+// CHECK: ttg.barrier local
+// CHECK: amdg.cond_barrier
+// CHECK: scf.for
+// Loop body: a_0 (local_load), internal s_barrier, a_1 (tt.store).
+// CHECK: ttg.local_load
+// CHECK: rocdl.s.barrier
+// CHECK: tt.store
+// Boundary barriers between loop A and flat B must be KEPT.  The (a_0, b_0)
+// WAR on %smem at merged distance K places at slot K-1 (inside A); the
+// cross-pipeline-pair sweep finds no LOCAL slot in (0, K-1] (loopBars[1] is
+// false because A's intra-cluster barrier is just s_barrier) and reports
+// the pair as uncovered.
+// CHECK: amdg.cond_barrier
+// CHECK: ttg.barrier local
+// CHECK: amdg.cond_barrier
+// Flat B stages: b_0 (local_store), internal s_barrier, b_1 (tt.store).
+// CHECK: ttg.local_store
+// CHECK: tt.store
+// Reconverge cond_barrier for flat B.
+// CHECK: amdg.cond_barrier
+// CHECK: tt.return
+
+// -----
+
+// ---- LDS effect nested inside scf.if must be detected ----
+//
+// Stage 0 wraps its ttg.local_store inside an scf.if, so the effect is not
+// visible on the top-level op.  buildBlockInfoFromBlock must walk
+// recursively to discover it; otherwise the cross-cluster RAW (stage0
+// writes, stage1 reads) is missed and the cluster barriers degrade from
+// ttg.barrier local to plain rocdl.s.barrier — leaving the LDS race
+// uncovered.
+
+#nest_blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [8, 1], order = [1, 0]}>
+#nest_mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [2, 4], instrShape = [16, 16, 16], isTransposed = true}>
+#nest_dot = #ttg.dot_op<{opIdx = 0, parent = #nest_mma, kWidth = 4}>
+#nest_shared = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 16, order = [1, 0]}>
+#nest_smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @nested_lds_effect_in_if(
+      %lb: i32, %ub: i32, %step: i32,
+      %cond: i1,
+      %acc: tensor<256x16xf16, #nest_dot>,
+      %ptr: tensor<256x64x!tt.ptr<f16>, #nest_blocked>) {
+
+    %smem = ttg.local_alloc : () -> !ttg.memdesc<256x64xf16, #nest_shared, #nest_smem, mutable>
+
+    %r:2 = scf.for %i = %lb to %ub step %step
+        iter_args(%a = %acc, %s = %smem)
+        -> (tensor<256x16xf16, #nest_dot>, !ttg.memdesc<256x64xf16, #nest_shared, #nest_smem, mutable>) : i32 {
+
+      // Stage 0: conditionally writes LDS via scf.if.  The ttg.local_store
+      // sits inside the if body, so a flat scan of the cluster body would
+      // miss it.
+      %st = scf.execute_region -> !ttg.memdesc<256x64xf16, #nest_shared, #nest_smem, mutable> no_inline {
+        scf.if %cond {
+          %data = tt.load %ptr : tensor<256x64x!tt.ptr<f16>, #nest_blocked>
+          ttg.local_store %data, %s : tensor<256x64xf16, #nest_blocked> -> !ttg.memdesc<256x64xf16, #nest_shared, #nest_smem, mutable>
+        }
+        scf.yield %s : !ttg.memdesc<256x64xf16, #nest_shared, #nest_smem, mutable>
+      } {triton.warp_pipeline.stage = "cond_store"}
+
+      // Stage 1: reads LDS — RAW with the conditional write in stage 0.
+      %ld = scf.execute_region -> tensor<256x16xf16, #nest_dot> no_inline {
+        %sub = ttg.memdesc_subslice %s[0, 0] : !ttg.memdesc<256x64xf16, #nest_shared, #nest_smem, mutable> -> !ttg.memdesc<256x16xf16, #nest_shared, #nest_smem, mutable, 256x64>
+        %v = ttg.local_load %sub : !ttg.memdesc<256x16xf16, #nest_shared, #nest_smem, mutable, 256x64> -> tensor<256x16xf16, #nest_dot>
+        scf.yield %v : tensor<256x16xf16, #nest_dot>
+      } {triton.warp_pipeline.stage = "lds_load"}
+
+      scf.yield %ld, %s : tensor<256x16xf16, #nest_dot>, !ttg.memdesc<256x64xf16, #nest_shared, #nest_smem, mutable>
+    } {triton.warp_pipeline.pipelined_for}
+
+    ttg.local_dealloc %smem : !ttg.memdesc<256x64xf16, #nest_shared, #nest_smem, mutable>
+    tt.return
+  }
+}
+
+// CHECK-LABEL: tt.func @nested_lds_effect_in_if
+// CHECK: scf.for
+// Stage 0 with the nested scf.if + local_store.
+// CHECK: scf.if
+// CHECK:   ttg.local_store
+// Cluster barrier between stage 0 and stage 1 is LOCAL (nested write seen).
+// CHECK: rocdl.sched.barrier
+// CHECK-NEXT: ttg.barrier local
+// CHECK-NEXT: rocdl.sched.barrier
+// Stage 1 reads LDS.
+// CHECK: ttg.local_load
+// Wrap-around barrier is also LOCAL (stage1 read vs stage0 write next iter).
+// CHECK: rocdl.sched.barrier
+// CHECK-NEXT: ttg.barrier local
+// CHECK-NEXT: rocdl.sched.barrier
+// CHECK: scf.yield
diff --git a/test/TritonGPU/amd/amd-warp-pipeline-invalid.mlir b/test/TritonGPU/amd/amd-warp-pipeline-invalid.mlir
new file mode 100644
index 000000000000..65c4bdd00512
--- /dev/null
+++ b/test/TritonGPU/amd/amd-warp-pipeline-invalid.mlir
@@ -0,0 +1,117 @@
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-warp-pipeline -verify-diagnostics
+
+// Loops are not allowed inside a warp_pipeline_stage region; see isLoopOp
+// in WarpPipeliner.cpp for the rationale (no scheduling benefit, opaque to
+// MemoryEffectOpInterface, also covers the "no nested warp pipelines"
+// rule).  Both the loop-form (createPipeline) and flat-form
+// (createFlatPipeline) must reject loops between borders.
+
+// ---- Loop-form: scf.for inside a stage ----
+
+tt.func @loop_form_for_in_cluster(%n: index) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+
+  scf.for %i = %c0 to %n step %c1 {
+    %a = arith.addi %i, %c1 : index
+    rocdl.sched.barrier 0 {triton.warp_pipeline.border = "stage"}
+
+    // expected-error @+1 {{loop op cannot appear inside a warp_pipeline_stage region}}
+    scf.for %j = %c0 to %n step %c1 {
+      scf.yield
+    }
+
+    rocdl.sched.barrier 0 {triton.warp_pipeline.border = "stage"}
+    %b = arith.addi %a, %i : index
+
+    scf.yield
+  }
+
+  tt.return
+}
+
+// -----
+
+// ---- Loop-form: scf.while inside a stage ----
+
+tt.func @loop_form_while_in_cluster(%n: index) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+
+  scf.for %i = %c0 to %n step %c1 {
+    %a = arith.addi %i, %c1 : index
+    rocdl.sched.barrier 0 {triton.warp_pipeline.border = "stage"}
+
+    // expected-error @+1 {{loop op cannot appear inside a warp_pipeline_stage region}}
+    scf.while (%w = %c0) : (index) -> index {
+      %cond = arith.cmpi slt, %w, %n : index
+      scf.condition(%cond) %w : index
+    } do {
+    ^bb0(%w: index):
+      %wn = arith.addi %w, %c1 : index
+      scf.yield %wn : index
+    }
+
+    rocdl.sched.barrier 0 {triton.warp_pipeline.border = "stage"}
+    %b = arith.addi %a, %i : index
+
+    scf.yield
+  }
+
+  tt.return
+}
+
+// -----
+
+// ---- Loop-form: nested warp-pipelined scf.for is still a loop ----
+//
+// Even an already-pipelined inner loop is rejected: nesting warp pipelines
+// is a hard constraint, and the loop-op check enforces it for free.
+
+tt.func @loop_form_nested_pipelined_for(%n: index) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+
+  scf.for %i = %c0 to %n step %c1 {
+    %a = arith.addi %i, %c1 : index
+    rocdl.sched.barrier 0 {triton.warp_pipeline.border = "stage"}
+
+    // expected-error @+1 {{loop op cannot appear inside a warp_pipeline_stage region}}
+    scf.for %j = %c0 to %n step %c1 {
+      scf.execute_region {
+        scf.yield
+      } {triton.warp_pipeline.stage = "inner"}
+      scf.yield
+    } {triton.warp_pipeline.pipelined_for}
+
+    rocdl.sched.barrier 0 {triton.warp_pipeline.border = "stage"}
+    %b = arith.addi %a, %i : index
+
+    scf.yield
+  }
+
+  tt.return
+}
+
+// -----
+
+// ---- Flat-form: scf.for between flat borders ----
+
+tt.func @flat_form_for_in_cluster(%n: index, %ptr: !tt.ptr<f32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %v0 = arith.constant 0.0 : f32
+
+  tt.store %ptr, %v0 : !tt.ptr<f32>
+  rocdl.sched.barrier 0 {triton.warp_pipeline.border = "stage0"}
+
+  // expected-error @+1 {{loop op cannot appear inside a warp_pipeline_stage region}}
+  scf.for %j = %c0 to %n step %c1 {
+    scf.yield
+  }
+
+  rocdl.sched.barrier 0 {triton.warp_pipeline.border = "stage1"}
+  tt.store %ptr, %v0 : !tt.ptr<f32>
+
+  tt.return
+}
diff --git a/test/TritonGPU/amd/amd-warp-pipeline.mlir b/test/TritonGPU/amd/amd-warp-pipeline.mlir
index fee2da41f878..55f390591c2e 100644
--- a/test/TritonGPU/amd/amd-warp-pipeline.mlir
+++ b/test/TritonGPU/amd/amd-warp-pipeline.mlir
@@ -145,18 +145,25 @@ tt.func public @triple_buf_two_stages(%arg0: i32, %arg1: i32, %arg2: i32, %arg3:
 
 // -- Flat (unrolled) pipeline: borders outside scf.for ----
 //
-// Simulates a static_range epilogue that was unrolled at the Python level.
-// The border markers sit in the function body, not inside a loop.
+// Simulates a static_range epilogue that was unrolled at the Python level
+// following a regular pipelined main loop.  The flat backward walk must stop
+// at the prior scf.for (loops are disallowed inside a stage) so the main
+// loop is not absorbed into stage 0.
 
 tt.func @flat_pipeline_example(%n: index) {
   %c0  = arith.constant 0 : index
   %c1  = arith.constant 1 : index
 
+  // Pipelined main loop: gets the pipelined_for attribute and acts as a
+  // hard boundary for the flat epilogue's backward walk.
   scf.for %i = %c0 to %n step %c1 {
+    %x = arith.addi %i, %c1 : index
+    rocdl.sched.barrier 0 {triton.warp_pipeline.border = "load"}
+    %y = arith.muli %x, %c1 : index
     scf.yield
   }
 
-  // Stage 0 (ops before the first border)
+  // Stage 0 (ops before the first epilogue border)
   %a  = arith.addi %c0, %c1 : index
   %a2 = arith.muli %a, %c1 : index
 
@@ -172,8 +179,14 @@ tt.func @flat_pipeline_example(%n: index) {
 }
 
 // CHECK-LABEL: tt.func @flat_pipeline_example(
+// Pipelined main loop forms its own warp pipeline (one execute_region per
+// stage, then the pipelined_for attribute on the loop).
 // CHECK: scf.for
-// Flat execute_regions created from the borders:
+// CHECK:   scf.execute_region
+// CHECK:   scf.execute_region
+// CHECK: triton.warp_pipeline.pipelined_for
+// Flat epilogue execute_regions created from the borders.  Crucially, they
+// must NOT absorb the pipelined main loop above.
 // CHECK: scf.execute_region
 // CHECK:   arith.addi
 // CHECK:   arith.muli
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
index a1ce910b3559..5ed99da39dae 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
@@ -52,31 +52,33 @@ namespace mlir::triton {
 
 namespace {
 
-// construct a virtual block from each pipeline cluster
-// block contains its buffer R/W information.
+// Construct a virtual block describing a pipeline cluster's buffer R/W set.
+// Walks recursively so that LDS effects inside nested non-loop regions
+// (scf.if / tt.reduce / tt.scan / etc.) are accounted for.  Loops (scf.for /
+// scf.while) cannot legally appear inside a cluster, so this walk never has
+// to reason about iteration-multiplied effects.
 static BlockInfo buildBlockInfoFromBlock(Block *block, Allocation *allocation) {
-  BlockInfo info; // running fact for this block
-  for (Operation &opRef : *block) {
-    Operation *op = &opRef;
-    if (auto mei = dyn_cast<MemoryEffectOpInterface>(op)) {
-      SmallVector<SideEffects::EffectInstance<MemoryEffects::Effect>> effs;
-      mei.getEffects(effs);
-      for (auto &eff : effs) {
-        if (Value v = eff.getValue()) {
-          for (auto bufId : allocation->getAllBufferIdsWithAliases(v)) {
-            if (bufId == Allocation::InvalidBufferId)
-              continue;
-            auto interval = allocation->getAllocatedInterval(bufId);
-            auto slice = AllocationSlice(v, interval, bufId);
-            if (isa<MemoryEffects::Write>(eff.getEffect()))
-              info.syncWriteSlices[slice].insert(op);
-            else if (isa<MemoryEffects::Read>(eff.getEffect()))
-              info.syncReadSlices[slice].insert(op);
-          }
-        }
+  BlockInfo info;
+  block->walk([&](MemoryEffectOpInterface mei) {
+    Operation *op = mei.getOperation();
+    SmallVector<SideEffects::EffectInstance<MemoryEffects::Effect>> effs;
+    mei.getEffects(effs);
+    for (auto &eff : effs) {
+      Value v = eff.getValue();
+      if (!v)
+        continue;
+      for (auto bufId : allocation->getAllBufferIdsWithAliases(v)) {
+        if (bufId == Allocation::InvalidBufferId)
+          continue;
+        auto interval = allocation->getAllocatedInterval(bufId);
+        auto slice = AllocationSlice(v, interval, bufId);
+        if (isa<MemoryEffects::Write>(eff.getEffect()))
+          info.syncWriteSlices[slice].insert(op);
+        else if (isa<MemoryEffects::Read>(eff.getEffect()))
+          info.syncReadSlices[slice].insert(op);
       }
     }
-  }
+  });
   return info;
 }
 
@@ -102,6 +104,43 @@ static scf::ExecuteRegionOp getPipelineStage(Operation *op) {
   return isPipelineStage(exec) ? exec : nullptr;
 }
 
+// Validate the body of a `pipelined_for` loop.  After WarpPipeliner the body
+// must consist of: a sequence of pipeline-stage execute_regions, at most one
+// pre-existing barrier/wait op per stage boundary (never both a top-of-loop
+// and a bottom-of-loop one), and a terminator scf.yield -- nothing else.
+// Emits an error and returns failure on any deviation.  Side-effect free:
+// leaves the IR untouched so callers can fail fast before mutating anything.
+static LogicalResult validatePipelinedForBody(scf::ForOp forOp) {
+  std::map<int, Operation *> existingBarrierMap;
+  int numClusters = 0;
+  bool seenYield = false;
+  for (auto &op : *forOp.getBody()) {
+    if (auto exeOp = dyn_cast<scf::ExecuteRegionOp>(op)) {
+      if (!isPipelineStage(exeOp))
+        return op.emitError(
+            "non-warp-pipeline scf.execute_region inside pipelined_for body");
+      ++numClusters;
+    } else if (isWarpPipelineIgnorableBarrier(&op)) {
+      if (existingBarrierMap.count(numClusters))
+        return op.emitError("multiple pre-existing barriers between pipeline "
+                            "stages; insert a dummy stage instead");
+      existingBarrierMap[numClusters] = &op;
+    } else if (isa<scf::YieldOp>(op)) {
+      seenYield = true;
+    } else {
+      return op.emitError("unexpected op inside pipelined_for body; only "
+                          "warp-pipeline stages and barrier/wait ops are "
+                          "allowed");
+    }
+  }
+  if (!seenYield)
+    return forOp.emitError("pipelined_for body has no scf.yield terminator");
+  if (existingBarrierMap.count(0) && existingBarrierMap.count(numClusters))
+    return forOp.emitError("pipelined_for body has both top-of-loop and "
+                           "bottom-of-loop pre-existing barriers");
+  return success();
+}
+
 // Pairwise LDS-dependency analysis between pipeline clusters.
 //
 // `circular` selects the index topology used by the analysis:
@@ -218,6 +257,21 @@ static void emitClusterPriority(OpBuilder &r, Location loc,
   }
 }
 
+// Wrap a pre-existing barrier op (e.g. async_wait) with sched_barriers so the
+// backend scheduler cannot move ops across it, and emit the cluster's
+// priority just before the barrier.  Used in place of inserting a fresh
+// cluster barrier when one already exists at the cluster boundary.
+static void wrapExistingBarrier(OpBuilder &b, Location loc,
+                                Operation *clusterOp,
+                                Operation *existingBarrier,
+                                bool anyHasPriority) {
+  b.setInsertionPoint(existingBarrier);
+  emitClusterPriority(b, loc, clusterOp, anyHasPriority);
+  ROCDL::SchedBarrier::create(b, loc, 0);
+  b.setInsertionPointAfter(existingBarrier);
+  ROCDL::SchedBarrier::create(b, loc, 0);
+}
+
 // Emit pre-barrier, thread-ID partitioning, and phase-shift cond_barrier.
 // Returns warpLow (for reconverge) and warpHigh (consumed by phase shift).
 static std::pair<Value, Value>
@@ -264,25 +318,25 @@ class ConvertPipelinedForPattern : public OpRewritePattern<scf::ForOp> {
     // Only handle loops that the frontend marked with pipelined_for.
     if (!forOp->getAttr("triton.warp_pipeline.pipelined_for"))
       return rewriter.notifyMatchFailure(forOp, "no pipelined_for");
-    forOp->removeAttr("triton.warp_pipeline.pipelined_for");
 
-    // Look up allocation info as in original pass.
+    // Look up allocation info as in original pass.  Bail out *before* we
+    // mutate the IR so a soft match-failure cannot leave a half-converted
+    // loop behind (no marker, no barriers).
     auto func = forOp->getParentOfType<mlir::triton::FuncOp>();
     Allocation *allocation = moduleAllocation.getFuncData(func);
     if (!allocation)
       return rewriter.notifyMatchFailure(forOp, "no Allocation for function");
 
-    if (failed(emitPipelinedFor(rewriter, forOp.getLoc(), forOp, allocation,
-                                threadsPerPipelineGroup)))
-      return failure();
-
+    forOp->removeAttr("triton.warp_pipeline.pipelined_for");
+    emitPipelinedFor(rewriter, forOp.getLoc(), forOp, allocation,
+                     threadsPerPipelineGroup);
     return success();
   }
 
 private:
-  LogicalResult emitPipelinedFor(PatternRewriter &b, Location loc,
-                                 scf::ForOp forOp, Allocation *allocation,
-                                 int threadsPerPipelineGroup) const {
+  void emitPipelinedFor(PatternRewriter &b, Location loc, scf::ForOp forOp,
+                        Allocation *allocation,
+                        int threadsPerPipelineGroup) const {
     // 1. Pre-barrier, thread partitioning, and phase shift.
     b.setInsertionPoint(forOp);
     auto [warpLow, warpHigh] =
@@ -300,30 +354,18 @@ class ConvertPipelinedForPattern : public OpRewritePattern<scf::ForOp> {
     std::map<int, Operation *> existingBarrierMap;
     Operation *terminatorOp = nullptr;
 
+    // Body shape was already validated by validatePipelinedForBody before
+    // the pattern ran, so we trust the structure here.
     for (auto &op : *forOp.getBody()) {
       if (auto exeOp = dyn_cast<scf::ExecuteRegionOp>(op)) {
-        // Fail conversion with executeRegion from unkown source.
-        if (!isPipelineStage(exeOp))
-          return failure();
         exeOp.setNoInline(false);
         clusterOps.push_back(&op);
         clusterBlocks.push_back(&exeOp->getRegion(0).front());
         bars.push_back(false);
       } else if (isWarpPipelineIgnorableBarrier(&op)) {
-        int currCluster = clusterBlocks.size();
-        // Reject if multiple barriers appear without an intervening cluster.
-        // This is functionally valid but may cause unpredictable timing. Users
-        // should insert a dummy cluster explicitly if a pipeline bubble is
-        // required.
-        // Also only allow ops which waits local memory,
-        // e.g., s_barrier is NOT allowed.
-        if (existingBarrierMap.find(currCluster) != existingBarrierMap.end())
-          return failure();
-        existingBarrierMap[currCluster] = &op;
-      } else if (auto yieldOp = dyn_cast<scf::YieldOp>(op)) {
+        existingBarrierMap[clusterBlocks.size()] = &op;
+      } else if (isa<scf::YieldOp>(op)) {
         terminatorOp = &op;
-      } else { // Fail conversion if any other op found outside of the cluster.
-        return failure();
       }
     }
 
@@ -343,8 +385,10 @@ class ConvertPipelinedForPattern : public OpRewritePattern<scf::ForOp> {
     auto topBar = existingBarrierMap.find(0);
     auto bottomBar = existingBarrierMap.find(numClusters);
     if (bottomBar != existingBarrierMap.end()) {
-      if (topBar != existingBarrierMap.end())
-        return failure(); // Unreachable
+      // validatePipelinedForBody guarantees we cannot have both top and
+      // bottom barriers, so rotating bottom -> 0 is unambiguous.
+      assert(topBar == existingBarrierMap.end() &&
+             "validatePipelinedForBody should have rejected this");
       existingBarrierMap[0] = bottomBar->second;
       existingBarrierMap.erase(bottomBar);
     }
@@ -367,12 +411,8 @@ class ConvertPipelinedForPattern : public OpRewritePattern<scf::ForOp> {
     for (int i = 0; i < numClusters; i++) {
       if (auto exBar = existingBarrierMap.find(i);
           exBar != existingBarrierMap.end()) {
-        auto exBarOp = exBar->second;
-        b.setInsertionPoint(exBarOp);
-        emitClusterPriority(b, loc, clusterOps[i], anyHasPriority);
-        ROCDL::SchedBarrier::create(b, loc, 0);
-        b.setInsertionPointAfter(exBarOp);
-        ROCDL::SchedBarrier::create(b, loc, 0);
+        wrapExistingBarrier(b, loc, clusterOps[i], exBar->second,
+                            anyHasPriority);
       } else {
         b.setInsertionPoint(clusterOps[i]);
         // The first one wraps back to the last of the loop
@@ -391,7 +431,6 @@ class ConvertPipelinedForPattern : public OpRewritePattern<scf::ForOp> {
     // 5. Post-loop priority reset and reconverge.
     b.setInsertionPointAfter(forOp);
     emitPipelinePostlude(b, loc, anyHasPriority, warpLow);
-    return success();
   }
 
   ModuleAllocation &moduleAllocation;
@@ -503,11 +542,8 @@ static void emitPipelinedFlat(SmallVector<scf::ExecuteRegionOp> &clusterOps,
     }
 
     if (existingBarrier) {
-      b.setInsertionPoint(existingBarrier);
-      emitClusterPriority(b, loc, clusterOps[i], anyHasPriority);
-      ROCDL::SchedBarrier::create(b, loc, 0);
-      b.setInsertionPointAfter(existingBarrier);
-      ROCDL::SchedBarrier::create(b, loc, 0);
+      wrapExistingBarrier(b, loc, clusterOps[i], existingBarrier,
+                          anyHasPriority);
     } else {
       b.setInsertionPoint(clusterOps[i]);
       emitClusterPriority(b, loc, clusterOps[i], anyHasPriority);
@@ -531,6 +567,10 @@ static void processUnrolledPipelineRegions(ModuleOp m,
     if (!allocation)
       return;
 
+    // NOTE: We only iterate the function's top-level blocks; flat-pipeline
+    // execute_regions inside nested non-loop regions (e.g. scf.if bodies)
+    // are not collected.  WarpPipeliner's flat-pipeline frontend has the
+    // same scope, so the two stay in sync.
     for (Block &block : funcOp.getBody()) {
       // Collect contiguous sequences of flat warp-pipeline execute_regions,
       // splitting at any non-ignorable, non-pipeline op.
@@ -539,10 +579,8 @@ static void processUnrolledPipelineRegions(ModuleOp m,
 
       for (auto &op : block) {
         if (auto exec = getPipelineStage(&op)) {
-          if (!isa<scf::ForOp>(exec->getParentOp())) {
-            current.push_back(exec);
-            continue;
-          }
+          current.push_back(exec);
+          continue;
         }
         if (isWarpPipelineIgnorableBarrier(&op))
           continue;
@@ -675,21 +713,40 @@ static bool collectNextPipelineClusters(Operation *startOp,
   return false;
 }
 
-// Check whether merging two pipelines creates a cross-pipeline LDS dependency
-// at the boundary.  Concatenates the cluster infos and barrier flags from both
-// pipelines and runs analyzePipelineDependencies in linear mode on the merged
-// sequence.
+// Returns true when two back-to-back pipelines can be merged without a
+// barrier at the boundary.  Enumerates *only* cross-pipeline pairs
+// (a_i, b_j) and verifies each intersected pair already has a LOCAL barrier
+// on its path in the merged schedule.
+//
+// Why not reuse analyzePipelineDependencies on the merged sequence?
+//   The merged linear analysis would also visit intra-A and intra-B pairs,
+//   and may try to flip an internal slot (e.g. between a_0 and a_1) when
+//   A's IR has only a non-LOCAL pre-existing barrier (such as
+//   amdg.async_tdm_wait).  Such slots are A's own responsibility — A
+//   already accepted that wait as sufficient for intra-warp ordering — and
+//   re-flipping them is a false positive that prevents elimination on
+//   otherwise safe kernels.  Restricting the sweep to cross-pipeline pairs
+//   sidesteps the ambiguity entirely.
 //
-// Note on concurrency vs memory ordering: with a one-stage phase offset the
-// only cross-warp concurrent pair at the boundary is (a_{K-1}, b_0); all
-// other pairs execute sequentially within the same warp.  However, within a
-// warp LDS write→read ordering still requires a LOCAL barrier (ds_wait)
-// between producer and consumer, so the merged analysis must check every
-// (a_i, b_j) pair, not just the concurrent one.  The single-distance sweep
-// inside analyzePipelineDependencies covers both cases uniformly.
+// Cross-warp concurrency vs intra-warp ordering:
+//   With a one-stage phase offset the only truly concurrent cross-warp
+//   pair at the boundary is (a_{K-1}, b_0).  All other (a_i, b_j) pairs
+//   execute sequentially within a single warp.  Both kinds, however,
+//   require a LOCAL barrier on AMD: the concurrent pair needs cross-warp
+//   sync, and the sequential pair needs ds_wait to order async ds_read /
+//   ds_write within a warp (pre-existing async_tdm_wait does *not*
+//   guarantee ds completion ordering in general).  So the coverage check
+//   uses LOCAL-only mergedBars uniformly across all pairs.
 //
-// Returns true if the boundary position stays dependency-free after analysis
-// (i.e. safe to eliminate).
+// Layout of mergedBars (linear, LOCAL-only):
+//   i < K      A's internal LOCAL barriers (loopBars[i]).
+//   i == K     boundary seed; set to loopBars[0] because A's wrap-around
+//              physically sits at the bottom of A's loop body and, when
+//              LOCAL, is the most recent LDS sync the merged schedule
+//              inherits as it crosses into B.
+//   i > K      B's internal LOCAL barriers (nextBars[i - K]).  nextBars[0]
+//              is skipped (flat B has no slot before b_0; loop B's
+//              wrap-around lives inside B's body, irrelevant here).
 static bool isCrossPipelineSafe(ArrayRef<Block *> loopBlocks,
                                 ArrayRef<bool> loopBars,
                                 ArrayRef<Block *> nextBlocks,
@@ -704,31 +761,38 @@ static bool isCrossPipelineSafe(ArrayRef<Block *> loopBlocks,
   for (auto *b : nextBlocks)
     mergedInfo.push_back(buildBlockInfoFromBlock(b, allocation));
 
-  // Merged layout: [a_0..a_{K-1}, b_0..b_{M-1}]
-  // mergedBars[i] = LOCAL barrier immediately before cluster i.
-  //   i < K     : A's internal barriers (loopBars[i]).  loopBars[0]
-  //               corresponds to A's wrap-around inside the loop body and is
-  //               never consulted in linear mode (analyzePipelineDependencies
-  //               only reads bars[idx] for idx > src ≥ 0).
-  //   i == K    : boundary — initialized false; this is what we decide.
-  //   i > K     : B's internal barriers (nextBars[i - K]).  nextBars[0] is
-  //               skipped: for flat B it is always false, for loop B it is
-  //               B's own wrap-around (inside B's loop body) which is
-  //               covered by B's own circular analysis.
   SmallVector<bool> mergedBars;
   mergedBars.reserve(K + M);
   for (bool b : loopBars)
     mergedBars.push_back(b);
-  mergedBars.push_back(false); // boundary
+  mergedBars.push_back(loopBars[0]); // boundary, seeded by A's wrap-around
   for (int i = 1; i < M; i++)
     mergedBars.push_back(nextBars[i]);
 
-  analyzePipelineDependencies(mergedInfo, mergedBars, allocation,
-                              /*circular=*/false);
-
-  if (mergedBars[K]) {
-    LDBG("cross-pipeline LDS dependency at boundary");
+  // True if any slot in (src, stop] is LOCAL.  Linear topology, no wrap.
+  auto isCovered = [&](int src, int stop) {
+    for (int i = src + 1; i <= stop; i++)
+      if (mergedBars[i])
+        return true;
     return false;
+  };
+
+  // Sweep cross-pipeline pairs only.  Placement choice mirrors
+  // analyzePipelineDependencies (dist == 1 → dst, dist > 1 → dst - 1).
+  for (int i = 0; i < K; i++) {
+    for (int j = 0; j < M; j++) {
+      int src = i, dst = K + j;
+      int dist = dst - src;
+      int barrierLoc = (dist == 1) ? dst : dst - 1;
+      if (isCovered(src, barrierLoc))
+        continue;
+      if (!mergedInfo[src].isIntersected(
+              mergedInfo[dst], mlir::triton::AMD::membarFilter, allocation))
+        continue;
+      LDBG("cross-pipeline LDS dep (a_"
+           << i << ", b_" << j << ") uncovered at slot " << barrierLoc);
+      return false;
+    }
   }
   return true;
 }
@@ -788,8 +852,10 @@ static void eliminateRedundantCondBarriers(ModuleOp m,
         Operation *prev = postLoopCB->getPrevNode();
         if (prev && isa<ROCDL::SetPrioOp>(prev))
           prev = prev->getPrevNode();
-        if (!isa_and_nonnull<scf::ForOp>(prev))
+        if (!isa_and_nonnull<scf::ForOp>(prev)) {
+          LDBG("post-loop cond_barrier not preceded by scf.for; skipping");
           continue;
+        }
         auto prevFor = cast<scf::ForOp>(prev);
 
         // The pre-loop cond_barrier must be followed by a warp-pipelined
@@ -800,26 +866,36 @@ static void eliminateRedundantCondBarriers(ModuleOp m,
           next = next->getNextNode();
         bool nextIsPipeline =
             isa_and_nonnull<scf::ForOp>(next) || getPipelineStage(next);
-        if (!nextIsPipeline)
+        if (!nextIsPipeline) {
+          LDBG("pre-loop cond_barrier not followed by a warp-pipeline; "
+               "skipping");
           continue;
+        }
 
         // The post-loop cond_barrier must be immediately followed by the
         // prelude's ttg.barrier local — this proves no operations were
         // inserted between the two pipelines.
         auto preBarrier =
             dyn_cast_or_null<triton::gpu::BarrierOp>(postLoopCB->getNextNode());
-        if (!preBarrier || !preBarrier.hasLocal())
+        if (!preBarrier || !preBarrier.hasLocal()) {
+          LDBG("post-loop cond_barrier not immediately followed by prelude "
+               "ttg.barrier local; skipping");
           continue;
+        }
 
         // Cross-pipeline LDS dependency analysis.  When the phase carries
         // over, stages from different pipelines execute concurrently at the
         // boundary.  We must verify that no uncovered LDS conflict exists.
         SmallVector<Block *> loopBlocks, nextBlocks;
         SmallVector<bool> loopBars, nextBars;
-        if (!collectLoopClusters(prevFor, loopBlocks, loopBars))
+        if (!collectLoopClusters(prevFor, loopBlocks, loopBars)) {
+          LDBG("could not collect prior loop's clusters; skipping");
           continue;
-        if (!collectNextPipelineClusters(next, nextBlocks, nextBars))
+        }
+        if (!collectNextPipelineClusters(next, nextBlocks, nextBars)) {
+          LDBG("could not collect next pipeline's clusters; skipping");
           continue;
+        }
         if (!isCrossPipelineSafe(loopBlocks, loopBars, nextBlocks, nextBars,
                                  allocation)) {
           LDBG("cross-pipeline LDS dependency at boundary — keeping barriers");
@@ -866,6 +942,20 @@ struct ConvertWarpPipeline
     // stages at different times.
     int threadsPerPipelineGroup = targetInfo.getWarpSize() * 4;
 
+    // Up-front structural validation: catch malformed pipelined_for bodies
+    // before any rewrite mutates the IR.  Errors are emitted at the
+    // offending op; we bail out hard rather than producing half-converted
+    // IR.
+    bool malformed = false;
+    m.walk([&](scf::ForOp forOp) {
+      if (!forOp->getAttr("triton.warp_pipeline.pipelined_for"))
+        return;
+      if (failed(validatePipelinedForBody(forOp)))
+        malformed = true;
+    });
+    if (malformed)
+      return signalPassFailure();
+
     RewritePatternSet patternFor(&getContext());
     RewritePatternSet patternInline(&getContext());
     patternFor.add<ConvertPipelinedForPattern>(&getContext(), moduleAllocation,
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
index a1015c90f5bb..5a500b5f169e 100644
--- a/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
@@ -28,7 +28,7 @@ namespace mlir {
 // Ops that may appear between pipeline stages but never inside one.  Pre-
 // existing memory-fence/wait ops at cluster boundaries are tolerated so that
 // prefetch patterns continue to work; encountering one mid-cluster is treated
-// as a pattern mismatch by the callers.
+// as malformed input by the callers.
 static bool isPipelineIgnorable(Operation *op) {
   return isa<ttg::AsyncWaitOp, gpu::BarrierOp, triton::gpu::BarrierOp,
              tt::amdgpu::AsyncTDMWait>(op);
@@ -39,6 +39,33 @@ static bool isPipelineBorder(Operation *op) {
   return op->hasAttr("triton.warp_pipeline.border");
 }
 
+// True if `op` is a structured loop (scf.for / scf.while).  Loops are
+// disallowed inside a pipeline stage for several reasons:
+//   1. No useful effect: a non-pipelined loop runs to completion within a
+//      single warp's phase while the other warp waits at the cluster
+//      boundary; the pipeline cannot interleave anything across iterations.
+//      Iteration-level pipelining is expressed by the loop-pipeline form
+//      (scf.for containing warp_pipeline_stage blocks).
+//   2. A cluster is meant to be a straight-line scheduling unit so the
+//      backend can interleave its ops with the other warp's stage; a loop
+//      is an opaque control-flow region the scheduler cannot see across.
+//   3. Our cross-cluster memory-effect analysis counts each op's LDS effects
+//      once and has no model of iteration-multiplied effects, and
+//      loop-carried LDS hazards have nowhere to be barriered (cluster
+//      barriers are boundary-only).
+// As a side effect this also serves as the "no nested warp pipelines" check,
+// since a warp-pipelined scf.for is still an scf.for.
+static bool isLoopOp(Operation *op) {
+  return isa<scf::ForOp, scf::WhileOp>(op);
+}
+
+// Outcome of attempting to build a pipeline from a region.
+//   NotApplicable: no border markers were present (the region opted out).
+//   Created:       a pipeline was successfully materialized.
+//   Malformed:     border markers were present but the pipeline could not be
+//                  built; an error has been emitted at the offending op.
+enum class PipelineResult { NotApplicable, Created, Malformed };
+
 // Read (cluster-name, priority) from a border marker op.  Priority defaults
 // to -1 when the marker doesn't carry the optional priority attribute.
 static std::pair<StringAttr, int> readBorderMarker(Operation *op) {
@@ -148,11 +175,19 @@ static void createClusterOp(OpBuilder &b, Location loc,
   return;
 }
 
-// Turns a partitioned region into the warp-pipelined clusters
-static LogicalResult createPipeline(OpBuilder &b, Location loc,
-                                    scf::ForOp forOp) {
-  // Collect ops in the loop body
+// Turns a partitioned region into the warp-pipelined clusters.  Returns
+// NotApplicable when the loop has no border markers (user opted out), Created
+// on success, or Malformed when border markers are present but the loop body
+// cannot be split into a valid pipeline (an error is emitted in that case).
+static PipelineResult createPipeline(OpBuilder &b, Location loc,
+                                     scf::ForOp forOp) {
   Block &blk = *forOp.getBody();
+
+  // Opt-in gate: if the loop body has no borders, the user did not request
+  // warp-pipelining for this loop and we must leave it untouched.
+  if (llvm::none_of(blk, [](Operation &op) { return isPipelineBorder(&op); }))
+    return PipelineResult::NotApplicable;
+
   SmallVector<Operation *> cluster;
   SmallVector<std::pair<StringAttr, int>> clusterMarkers;
   SmallVector<SmallVector<Operation *>> clusters;
@@ -170,12 +205,23 @@ static LogicalResult createPipeline(OpBuilder &b, Location loc,
       continue;
     }
     if (isPipelineIgnorable(op)) {
-      // Ignorable ops may appear before or after a stage, but not inside it.
-      // If encountered while building an execute_region, reject warp-pipeline.
-      if (!cluster.empty())
-        return failure();
+      // Ignorable ops (barrier / async_wait family) belong between stages,
+      // never inside one.  Encountering one while a cluster is being built
+      // means the user inserted it inside a warp_pipeline_stage region.
+      if (!cluster.empty()) {
+        op->emitError("barrier or wait op cannot appear inside a "
+                      "warp_pipeline_stage region");
+        return PipelineResult::Malformed;
+      }
       continue;
     }
+    if (isLoopOp(op)) {
+      // Loops are not permitted inside a stage; see isLoopOp for rationale.
+      op->emitError("loop op cannot appear inside a warp_pipeline_stage "
+                    "region; to pipeline loop iterations, place "
+                    "warp_pipeline_stage blocks inside the loop body");
+      return PipelineResult::Malformed;
+    }
     if (isa<scf::YieldOp>(op)) // End of the loop
       break;
 
@@ -188,9 +234,14 @@ static LogicalResult createPipeline(OpBuilder &b, Location loc,
     clusterMarkers.push_back({clusterStr, -1});
   }
 
-  // no pipeline clusters detected if 1 or 0 chunk found
-  if (clusters.size() < 2)
-    return failure();
+  // We only reach here when at least one border existed; a single cluster
+  // means the borders are degenerate (e.g. a lone trailing border with no
+  // operations after it).  Treat as malformed user input.
+  if (clusters.size() < 2) {
+    forOp->emitError(
+        "warp_pipeline_stage borders did not produce at least two stages");
+    return PipelineResult::Malformed;
+  }
 
   // Materialize each cluster as an execute_region.
   int totalStages = clusters.size();
@@ -205,7 +256,7 @@ static LogicalResult createPipeline(OpBuilder &b, Location loc,
   forOp->setAttr("triton.warp_pipeline.pipelined_for", b.getUnitAttr());
 
   LDBG("[warp-pipeline] total_stages=" << totalStages << "\n");
-  return success();
+  return PipelineResult::Created;
 }
 
 // Create a pipelined region from flat (non-loop) border markers in a block.
@@ -213,16 +264,30 @@ static LogicalResult createPipeline(OpBuilder &b, Location loc,
 // (e.g. via static_range) but the body still has warp_pipeline_stage
 // annotations producing border markers.  The grouping logic mirrors
 // createPipeline but without a loop wrapper.
-static LogicalResult createFlatPipeline(OpBuilder &b, Block &block) {
-  // 1. Find all border markers in this block.  Need at least two to form
-  //    a pipeline (one per stage boundary).
+//
+// Returns NotApplicable when the block has no border markers, Created when
+// a flat pipeline was materialized, or Malformed when borders are present
+// but a valid pipeline could not be built (an error is emitted in that case).
+static PipelineResult createFlatPipeline(OpBuilder &b, Block &block) {
+  // 1. Find all border markers in this block.
   SmallVector<Operation *> allBorders;
   for (auto &op : block)
     if (isPipelineBorder(&op))
       allBorders.push_back(&op);
 
-  if (allBorders.size() < 2)
-    return failure();
+  // No borders at all means the block did not opt into flat pipelining.
+  if (allBorders.empty())
+    return PipelineResult::NotApplicable;
+
+  // A single border cannot form a 2-stage pipeline; treat as malformed input
+  // since the user did opt in (the lone border would otherwise leak through
+  // unprocessed).
+  if (allBorders.size() < 2) {
+    allBorders.front()->emitError(
+        "warp_pipeline_stage requires at least two borders to form a flat "
+        "pipeline");
+    return PipelineResult::Malformed;
+  }
 
   Location loc = allBorders.front()->getLoc();
   Operation *firstBorder = allBorders.front();
@@ -230,13 +295,20 @@ static LogicalResult createFlatPipeline(OpBuilder &b, Block &block) {
 
   // 2. Locate the start of the first stage.  Unlike createPipeline, the flat
   //    sequence has no loop body to anchor against, so walk backwards from
-  //    the first border, stopping at control-flow boundaries (scf.for,
-  //    cond_barrier) or ignorable ops belonging to a previous pipeline.
+  //    the first border, stopping at things that must not be folded into
+  //    stage 0:
+  //      - a structured loop (scf.for / scf.while) -- see isLoopOp for
+  //        rationale; this also covers already-warp-pipelined loops, so the
+  //        "no nesting" rule falls out for free
+  //      - a phase-control op (defensive; not produced before this pass)
+  //      - an inter-stage barrier from a previous pipeline
+  //    Other structured control flow (scf.if, etc.) is absorbed into stage 0
+  //    since execution falls through it linearly at this stage of the
+  //    pipeline.
   Operation *regionStart = firstBorder;
   for (Operation *op = firstBorder->getPrevNode(); op; op = op->getPrevNode()) {
-    if (isa<scf::ForOp>(op) || isa<tt::amdgpu::CondBarrierOp>(op))
-      break;
-    if (isPipelineIgnorable(op))
+    if (isLoopOp(op) || isa<tt::amdgpu::CondBarrierOp>(op) ||
+        isPipelineIgnorable(op))
       break;
     regionStart = op;
   }
@@ -266,18 +338,36 @@ static LogicalResult createFlatPipeline(OpBuilder &b, Block &block) {
     }
 
     if (isPipelineIgnorable(op)) {
-      if (!cluster.empty())
-        return failure();
+      // Same rule as createPipeline: barriers/waits cannot live inside a
+      // stage.
+      if (!cluster.empty()) {
+        op->emitError("barrier or wait op cannot appear inside a "
+                      "warp_pipeline_stage region");
+        return PipelineResult::Malformed;
+      }
       continue;
     }
 
+    if (isLoopOp(op)) {
+      // Same rule as createPipeline: loops cannot live inside a stage.
+      op->emitError("loop op cannot appear inside a warp_pipeline_stage "
+                    "region; to pipeline loop iterations, place "
+                    "warp_pipeline_stage blocks inside the loop body");
+      return PipelineResult::Malformed;
+    }
+
     cluster.push_back(op);
   }
 
-  // 4. Materialize each cluster as an execute_region.  Bail out if fewer than
-  //    two real clusters survived (e.g., dummies-only).
-  if (clusters.size() < 2)
-    return failure();
+  // 4. Materialize each cluster as an execute_region.  With at least two
+  //    borders the sweep should always produce >= 2 clusters; treat anything
+  //    less as a defensive malformed case.  Note: the borders themselves
+  //    were erased during the sweep, so we attach the diagnostic to `loc`.
+  if (clusters.size() < 2) {
+    mlir::emitError(
+        loc, "warp_pipeline_stage borders did not produce at least two stages");
+    return PipelineResult::Malformed;
+  }
 
   for (auto &&[stageOps, marker] : llvm::zip(clusters, clusterMarkers)) {
     if (stageOps.empty())
@@ -286,7 +376,7 @@ static LogicalResult createFlatPipeline(OpBuilder &b, Block &block) {
   }
 
   LDBG("[warp-pipeline] flat pipeline with " << clusters.size() << " stages");
-  return success();
+  return PipelineResult::Created;
 }
 
 struct TritonAMDGPUWarpPipelinePass
@@ -296,19 +386,39 @@ struct TritonAMDGPUWarpPipelinePass
   void runOnOperation() override {
     ModuleOp m = getOperation();
     OpBuilder builder(m);
+    bool malformed = false;
     for (auto funcOp : m.getOps<tt::FuncOp>()) {
       funcOp.walk([&](scf::ForOp forOp) {
         Location loc = forOp.getLoc();
-        if (createPipeline(builder, loc, forOp).failed())
-          LDBG("Failed warp-pipelining");
+        switch (createPipeline(builder, loc, forOp)) {
+        case PipelineResult::NotApplicable:
+          LDBG("scf.for has no warp_pipeline_stage borders; skipping");
+          break;
+        case PipelineResult::Created:
+          break;
+        case PipelineResult::Malformed:
+          malformed = true;
+          break;
+        }
       });
 
-      // Process remaining border markers in flat (non-loop) code.
+      // Process remaining border markers in flat (non-loop) code.  Only the
+      // function's top-level blocks are visited; borders inside nested
+      // non-loop regions (e.g. scf.if bodies) are not handled here.
       for (Block &block : funcOp.getBody()) {
-        if (createFlatPipeline(builder, block).failed())
-          LDBG("No flat warp-pipeline in block");
+        switch (createFlatPipeline(builder, block)) {
+        case PipelineResult::NotApplicable:
+          break;
+        case PipelineResult::Created:
+          break;
+        case PipelineResult::Malformed:
+          malformed = true;
+          break;
+        }
       }
     }
+    if (malformed)
+      signalPassFailure();
   }
 };
 

From 5aceca52624ab29cb4c02ba98545459be53e7c1f Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Mon, 27 Apr 2026 18:31:14 +0000
Subject: [PATCH 17/19] fix test: pass gfx-arch (not arch) to convert-warp-pipeline

---
 test/TritonGPU/amd/amd-convert-warp-pipeline-invalid.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/TritonGPU/amd/amd-convert-warp-pipeline-invalid.mlir b/test/TritonGPU/amd/amd-convert-warp-pipeline-invalid.mlir
index 81db83cd436b..532cfcd00f23 100644
--- a/test/TritonGPU/amd/amd-convert-warp-pipeline-invalid.mlir
+++ b/test/TritonGPU/amd/amd-convert-warp-pipeline-invalid.mlir
@@ -1,4 +1,4 @@
-// RUN: triton-opt %s -split-input-file -convert-warp-pipeline="arch=gfx950" -verify-diagnostics
+// RUN: triton-opt %s -split-input-file -convert-warp-pipeline="gfx-arch=gfx950" -verify-diagnostics
 
 // validatePipelinedForBody runs upfront, before any IR mutation, so a
 // malformed `pipelined_for` body fails the pass with no partial conversion.

From cdbcad790f4736d30e7b3c340d2659f7c7745816 Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Tue, 28 Apr 2026 22:43:29 +0000
Subject: [PATCH 18/19] last few fixes: require two stages, prime
 first-iteration priority, parameterize example prefetch

---
 .../ConvertWarpPipeline.cpp                   | 60 +++++++++++--------
 .../gluon/f16_gemm_warp_pipeline_gfx1250.py   |  8 +--
 2 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
index 0a2210b7309c..4669efd83c75 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertWarpPipeline.cpp
@@ -113,7 +113,6 @@ static scf::ExecuteRegionOp getPipelineStage(Operation *op) {
 static LogicalResult validatePipelinedForBody(scf::ForOp forOp) {
   std::map<int, Operation *> existingBarrierMap;
   int numClusters = 0;
-  bool seenYield = false;
   for (auto &op : *forOp.getBody()) {
     if (auto exeOp = dyn_cast<scf::ExecuteRegionOp>(op)) {
       if (!isPipelineStage(exeOp))
@@ -126,15 +125,16 @@ static LogicalResult validatePipelinedForBody(scf::ForOp forOp) {
                             "stages; insert a dummy stage instead");
       existingBarrierMap[numClusters] = &op;
     } else if (isa<scf::YieldOp>(op)) {
-      seenYield = true;
+      continue;
     } else {
       return op.emitError("unexpected op inside pipelined_for body; only "
                           "warp-pipeline stages and barrier/wait ops are "
                           "allowed");
     }
   }
-  if (!seenYield)
-    return forOp.emitError("pipelined_for body has no scf.yield terminator");
+  if (numClusters < 2)
+    return forOp.emitError(
+        "pipelined_for body must contain at least two pipeline stages");
   if (existingBarrierMap.count(0) && existingBarrierMap.count(numClusters))
     return forOp.emitError("pipelined_for body has both top-of-loop and "
                            "bottom-of-loop pre-existing barriers");
@@ -342,20 +342,14 @@ class ConvertPipelinedForPattern : public OpRewritePattern<scf::ForOp> {
     auto [warpLow, warpHigh] =
         emitPipelinePrelude(b, loc, threadsPerPipelineGroup);
 
-    // 2. Collect existing barrier information.
-    // Scanning the loop body and classifying each consecutive block of
-    // operations into a pipeline cluster (one cluster per execute_region).
-    // While doing this, we also detect any pre-existing barriers located
-    // between clusters.  These barriers may come from prefetch patterns, and
-    // must be preserved, but only at valid cluster boundaries.
+    // 2. Walk the (already-validated) body once to collect clusters and any
+    // pre-existing inter-cluster barriers (e.g. from prefetch patterns).
     SmallVector<Block *> clusterBlocks;
     SmallVector<Operation *> clusterOps;
     SmallVector<bool> bars;
     std::map<int, Operation *> existingBarrierMap;
     Operation *terminatorOp = nullptr;
 
-    // Body shape was already validated by validatePipelinedForBody before
-    // the pattern ran, so we trust the structure here.
     for (auto &op : *forOp.getBody()) {
       if (auto exeOp = dyn_cast<scf::ExecuteRegionOp>(op)) {
         exeOp.setNoInline(false);
@@ -384,10 +378,11 @@ class ConvertPipelinedForPattern : public OpRewritePattern<scf::ForOp> {
     // but sometimes required by memory prefetching pattern.
     auto topBar = existingBarrierMap.find(0);
     auto bottomBar = existingBarrierMap.find(numClusters);
+    bool hasTopBarrier = topBar != existingBarrierMap.end();
     if (bottomBar != existingBarrierMap.end()) {
       // validatePipelinedForBody guarantees we cannot have both top and
       // bottom barriers, so rotating bottom -> 0 is unambiguous.
-      assert(topBar == existingBarrierMap.end() &&
+      assert(!hasTopBarrier &&
              "validatePipelinedForBody should have rejected this");
       existingBarrierMap[0] = bottomBar->second;
       existingBarrierMap.erase(bottomBar);
@@ -409,17 +404,26 @@ class ConvertPipelinedForPattern : public OpRewritePattern<scf::ForOp> {
     //    the first cluster barrier must be inserted just before the loop’s
     //    terminator, forming the wrap-around dependency.
     for (int i = 0; i < numClusters; i++) {
+      if (i == 0 && !hasTopBarrier) {
+        // Prime the first iteration's priority.  The loop-carried cluster-0
+        // barrier sits at the bottom of the loop body, so it only controls
+        // the next iteration.
+        b.setInsertionPoint(forOp);
+        emitClusterPriority(b, loc, clusterOps[i], anyHasPriority);
+      }
+
       if (auto exBar = existingBarrierMap.find(i);
           exBar != existingBarrierMap.end()) {
+        // FIXME: If bars[i] is true, wrapping a non-LOCAL pre-existing
+        // barrier is not enough to satisfy LDS ordering.  For now we rely on
+        // the producer to place such barriers only where no local fence is
+        // needed.
         wrapExistingBarrier(b, loc, clusterOps[i], exBar->second,
                             anyHasPriority);
       } else {
         b.setInsertionPoint(clusterOps[i]);
         // The first one wraps back to the last of the loop
-        if (i == 0 && topBar == existingBarrierMap.end()) {
-          // Extra setprio needed before the loop for the first cluster
-          b.setInsertionPoint(forOp);
-          emitClusterPriority(b, loc, clusterOps[i], anyHasPriority);
+        if (i == 0 && !hasTopBarrier) {
           // inserts just before yield (=End of the loop).
           b.setInsertionPoint(terminatorOp);
         }
@@ -542,6 +546,9 @@ static void emitPipelinedFlat(SmallVector<scf::ExecuteRegionOp> &clusterOps,
     }
 
     if (existingBarrier) {
+      // FIXME: If bars[i] is true, wrapping a non-LOCAL pre-existing barrier
+      // is not enough to satisfy LDS ordering.  For now we rely on the
+      // producer to place such barriers only where no local fence is needed.
       wrapExistingBarrier(b, loc, clusterOps[i], existingBarrier,
                           anyHasPriority);
     } else {
@@ -605,12 +612,12 @@ static void processUnrolledPipelineRegions(ModuleOp m,
 // Return true if `op` is intra-pipeline glue between two clusters — the
 // sequence emitted by emitClusterBarrier/emitClusterPriority and any
 // pre-existing barrier op that emitPipelinedFlat wraps with sched_barriers.
+// Defined in terms of isWarpPipelineIgnorableBarrier so the two sets stay in
+// sync; the extra cases below are the ones we emit ourselves.
 static bool isIntraPipelineGlue(Operation *op) {
-  return isa<ROCDL::SchedBarrier, ROCDL::SetPrioOp, ROCDL::SBarrierOp,
-             ROCDL::BarrierOp, gpu::BarrierOp, triton::gpu::BarrierOp,
-             triton::gpu::AsyncWaitOp, triton::amdgpu::AsyncWaitOp,
-             triton::amdgpu::AsyncTDMWait,
-             triton::amdgpu::AsyncTDMIntrinsicWait>(op);
+  return isWarpPipelineIgnorableBarrier(op) ||
+         isa<ROCDL::SchedBarrier, ROCDL::SetPrioOp, ROCDL::SBarrierOp,
+             triton::gpu::BarrierOp>(op);
 }
 
 // Walk backward from `exec` past `sched_barrier` / `s_setprio` and check
@@ -754,6 +761,8 @@ static bool isCrossPipelineSafe(ArrayRef<Block *> loopBlocks,
                                 Allocation *allocation) {
   int K = loopBlocks.size();
   int M = nextBlocks.size();
+  assert(!loopBars.empty() &&
+         "expected at least one cluster in the prior loop");
 
   SmallVector<BlockInfo> mergedInfo;
   for (auto *b : loopBlocks)
@@ -817,6 +826,9 @@ static bool isCrossPipelineSafe(ArrayRef<Block *> loopBlocks,
 //
 // The "next pipeline" can be either another scf.for or a flat (unrolled)
 // pipeline represented as a sequence of scf.execute_region ops.
+// TODO: This could be generalized to flat-to-loop / flat-to-flat boundaries,
+// but those cases cannot reuse a prior loop's wrap-around barrier as the
+// boundary seed and are not expected to matter for common codegen.
 //
 // Before:                              After:
 //   scf.for { loop 1 }                  scf.for { loop 1 }
@@ -963,7 +975,7 @@ struct ConvertWarpPipeline
     patternInline.add<InlineWarpPipelineExecuteRegionPattern>(&getContext());
 
     if (failed(applyPatternsGreedily(m, std::move(patternFor))))
-      signalPassFailure();
+      return signalPassFailure();
 
     // Flat (unrolled) pipeline regions are still wrapped in execute_regions
     // with no_inline=true from WarpPipeliner.  Process them before inlining.
@@ -976,7 +988,7 @@ struct ConvertWarpPipeline
     eliminateRedundantCondBarriers(m, moduleAllocation);
 
     if (failed(applyPatternsGreedily(m, std::move(patternInline))))
-      signalPassFailure();
+      return signalPassFailure();
   }
 };
 
diff --git a/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py b/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py
index 87c623612cd8..70e463c2c567 100644
--- a/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py
+++ b/third_party/amd/python/examples/gluon/f16_gemm_warp_pipeline_gfx1250.py
@@ -66,13 +66,13 @@ def gemm_tdm_pipelined_warp_pipelined_kernel(a_ptr, b_ptr, c_ptr,  #
     consumer = 0
     accumulator = ttgl.zeros((BLOCK_M, BLOCK_N), dtype=c_ptr.type.element_ty, layout=WMMA_LAYOUT)
 
-    # Triple buffering
-    # prefetch 2, the other one is overlapped.
-    for _ in ttgl.static_range(2):
+    # Prefetch NUM_BUFFERS - 1 tiles; the main loop produces one tile for
+    # each tile it consumes, and the epilogue drains the prefetched tail.
+    for _ in ttgl.static_range(NUM_BUFFERS - 1):
         producer = issue_loads(producer, a_desc, b_desc, 0, 0, a_buffer, b_buffer, BLOCK_K, NUM_BUFFERS, TRANSPOSE_B)
 
     # Wait for the first prefetch
-    ttgl.amd.gfx1250.tdm.async_wait(1 * 2)
+    ttgl.amd.gfx1250.tdm.async_wait((NUM_BUFFERS - 2) * 2)
     for _ in range(0, ttgl.cdiv(K, BLOCK_K) - (NUM_BUFFERS - 1)):
         with ttgl.amd.warp_pipeline_stage("stage0", priority=1):
             consumer, a, b = lds_load(consumer, a_buffer, OPERAND_LAYOUT_A, b_buffer, OPERAND_LAYOUT_B, NUM_BUFFERS,

From 13469956bc3158a82fb1844687b485214d45ddd0 Mon Sep 17 00:00:00 2001
From: Jungwook Park <jungwook.park@amd.com>
Date: Thu, 30 Apr 2026 22:41:18 +0000
Subject: [PATCH 19/19] address review

---
 .../TritonAMDGPUTransforms/WarpPipeliner.cpp  | 88 +++++--------------
 1 file changed, 23 insertions(+), 65 deletions(-)

diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
index d591d3424a48..431b00a1bf61 100644
--- a/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/WarpPipeliner.cpp
@@ -30,7 +30,7 @@ namespace mlir {
 // existing memory-fence/wait ops at cluster boundaries are tolerated so that
 // prefetch patterns continue to work; encountering one mid-cluster is treated
 // as malformed input by the callers.
-static bool isPipelineIgnorable(Operation *op) {
+static bool canSitBetweenStages(Operation *op) {
   return isa<ttg::AsyncWaitOp, gpu::BarrierOp, triton::gpu::BarrierOp,
              tt::amdgpu::AsyncTDMWait>(op);
 }
@@ -40,22 +40,9 @@ static bool isPipelineBorder(Operation *op) {
   return op->hasAttr("triton.warp_pipeline.border");
 }
 
-// True if `op` is a structured loop (scf.for / scf.while).  Loops are
-// disallowed inside a pipeline stage for several reasons:
-//   1. No useful effect: a non-pipelined loop runs to completion within a
-//      single warp's phase while the other warp waits at the cluster
-//      boundary; the pipeline cannot interleave anything across iterations.
-//      Iteration-level pipelining is expressed by the loop-pipeline form
-//      (scf.for containing warp_pipeline_stage blocks).
-//   2. A cluster is meant to be a straight-line scheduling unit so the
-//      backend can interleave its ops with the other warp's stage; a loop
-//      is an opaque control-flow region the scheduler cannot see across.
-//   3. Our cross-cluster memory-effect analysis is a flat scan over the
-//      cluster body; loop bodies aren't visible through MemoryEffectOpInterface
-//      and loop-carried LDS hazards have nowhere to be barriered (cluster
-//      barriers are boundary-only).
-// As a side effect this also serves as the "no nested warp pipelines" check,
-// since a warp-pipelined scf.for is still an scf.for.
+// True if `op` is a structured loop (scf.for / scf.while).  Pipeline clusters
+// are straight-line scheduling units, so loops remain boundaries instead of
+// being absorbed.  This also rejects nested warp-pipelined scf.for ops.
 static bool isLoopOp(Operation *op) {
   return isa<scf::ForOp, scf::WhileOp>(op);
 }
@@ -176,19 +163,10 @@ static void createClusterOp(OpBuilder &b, Location loc,
   return;
 }
 
-// Sink pure-scalar ops past adjacent ignorable ops so they join the next
-// cluster. After loop unrolling, scalar IV-remap ops (arith.addi/muli) land
-// between borders and the ignorable that starts the next iteration (FA
-// pattern); without this, WarpPipeliner sees scalars as an incomplete
-// cluster when it hits the ignorable and bails out. Single forward pass,
-// O(N).
-//
-// `pending` only accumulates pure scalars and is cleared at any other op,
-// so it forms a closed SSA DAG: any use of a pending scalar by the trailing
-// ignorable run must be a direct operand, so the dependency check is a
-// simple operand scan.
-static void sinkPureScalarsPastIgnorables(Block &blk) {
-  // Accumulates a run of consecutive pure scalars that might be sunk.
+// Move pure scalar IV-remap ops after adjacent inter-stage barriers/waits so
+// they become part of the next stage.  If a barrier/wait uses one of those
+// scalars, leave the run in place to preserve SSA.
+static void sinkPureScalarsIntoNextStage(Block &blk) {
   SmallVector<Operation *> pending;
   auto consumesPending = [&](Operation *user) {
     return llvm::any_of(user->getOperands(), [&](Value v) {
@@ -202,15 +180,11 @@ static void sinkPureScalarsPastIgnorables(Block &blk) {
       op = next;
       continue;
     }
-    // Non-scalar op: try to sink pending past an ignorable run, then reset.
-    if (isPipelineIgnorable(op) && !pending.empty()) {
-      // Extend `anchor` to the last ignorable in the consecutive run.
+    if (canSitBetweenStages(op) && !pending.empty()) {
       Operation *anchor = op;
       while (anchor->getNextNode() &&
-             isPipelineIgnorable(anchor->getNextNode()))
+             canSitBetweenStages(anchor->getNextNode()))
         anchor = anchor->getNextNode();
-      // Abort the sink if any ignorable in [op..anchor] consumes a pending
-      // scalar -- moving its producer past it would break SSA.
       bool conflict = false;
       for (Operation *ign = op; !conflict; ign = ign->getNextNode()) {
         conflict = consumesPending(ign);
@@ -218,7 +192,6 @@ static void sinkPureScalarsPastIgnorables(Block &blk) {
           break;
       }
       if (!conflict) {
-        // Skip past the moved scalars on the next iteration.
         next = anchor->getNextNode();
         // Reverse iteration + moveAfter(anchor) preserves source order:
         // each earlier-inserted scalar is pushed right by later inserts.
@@ -226,8 +199,6 @@ static void sinkPureScalarsPastIgnorables(Block &blk) {
           s->moveAfter(anchor);
       }
     }
-    // Pending is always cleared at a non-scalar op: the run is broken,
-    // either by a successful sink or by an op that anchors them in place.
     pending.clear();
     op = next;
   }
@@ -251,21 +222,21 @@ static PipelineResult createPipeline(OpBuilder &b, Location loc,
   SmallVector<SmallVector<Operation *>> clusters;
   auto ctx = forOp.getContext();
 
-  sinkPureScalarsPastIgnorables(blk);
+  sinkPureScalarsIntoNextStage(blk);
 
   // One pass over the body; collect clusters split by explicit borders.
   for (Operation &opRef : llvm::make_early_inc_range(blk)) {
     Operation *op = &opRef;
-    if (isPipelineBorder(op)) { // Wrap-up one cluster at a border.
+    if (isPipelineBorder(op)) { // Wrap up one cluster at a border.
       clusterMarkers.push_back(readBorderMarker(op));
       addDummyOpIfEmptyCluster(b, loc, op, cluster);
       clusters.push_back(std::move(cluster));
       cluster.clear();
-      op->erase(); // remove the marker
+      op->erase(); // Remove the marker.
       continue;
     }
-    if (isPipelineIgnorable(op)) {
-      // Ignorable ops (barrier / async_wait family) belong between stages,
+    if (canSitBetweenStages(op)) {
+      // Barrier / async_wait family ops belong between stages,
       // never inside one.  Encountering one while a cluster is being built
       // means the user inserted it inside a warp_pipeline_stage region.
       if (!cluster.empty()) {
@@ -282,13 +253,13 @@ static PipelineResult createPipeline(OpBuilder &b, Location loc,
                     "warp_pipeline_stage blocks inside the loop body");
       return PipelineResult::Malformed;
     }
-    if (isa<scf::YieldOp>(op)) // End of the loop
+    if (isa<scf::YieldOp>(op)) // End of the loop.
       break;
 
-    // Keep collecting ops for a cluster.
+    // Keep collecting ops for the current cluster.
     cluster.push_back(op);
   }
-  if (!cluster.empty()) { // create the last cluster if needed.
+  if (!cluster.empty()) { // Create the last cluster if needed.
     clusters.push_back(std::move(cluster));
     auto clusterStr = StringAttr::get(ctx, "last_cluster");
     clusterMarkers.push_back({clusterStr, -1});
@@ -353,22 +324,12 @@ static PipelineResult createFlatPipeline(OpBuilder &b, Block &block) {
   Operation *firstBorder = allBorders.front();
   Operation *lastBorder = allBorders.back();
 
-  // 2. Locate the start of the first stage.  Unlike createPipeline, the flat
-  //    sequence has no loop body to anchor against, so walk backwards from
-  //    the first border, stopping at things that must not be folded into
-  //    stage 0:
-  //      - a structured loop (scf.for / scf.while) -- see isLoopOp for
-  //        rationale; this also covers already-warp-pipelined loops, so the
-  //        "no nesting" rule falls out for free
-  //      - a phase-control op (defensive; not produced before this pass)
-  //      - an inter-stage barrier from a previous pipeline
-  //    Other structured control flow (scf.if, etc.) is absorbed into stage 0
-  //    since execution falls through it linearly at this stage of the
-  //    pipeline.
+  // 2. For flat pipelines, stage 0 may include the ops immediately before the
+  //    first border.  Stop at ops that must stay outside this pipeline.
   Operation *regionStart = firstBorder;
   for (Operation *op = firstBorder->getPrevNode(); op; op = op->getPrevNode()) {
     if (isLoopOp(op) || isa<tt::amdgpu::CondBarrierOp>(op) ||
-        isPipelineIgnorable(op))
+        canSitBetweenStages(op))
       break;
     regionStart = op;
   }
@@ -397,7 +358,7 @@ static PipelineResult createFlatPipeline(OpBuilder &b, Block &block) {
       continue;
     }
 
-    if (isPipelineIgnorable(op)) {
+    if (canSitBetweenStages(op)) {
       // Same rule as createPipeline: barriers/waits cannot live inside a
       // stage.
       if (!cluster.empty()) {
@@ -419,10 +380,7 @@ static PipelineResult createFlatPipeline(OpBuilder &b, Block &block) {
     cluster.push_back(op);
   }
 
-  // 4. Materialize each cluster as an execute_region.  With at least two
-  //    borders the sweep should always produce >= 2 clusters; treat anything
-  //    less as a defensive malformed case.  Note: the borders themselves
-  //    were erased during the sweep, so we attach the diagnostic to `loc`.
+  // 4. The bounded sweep should produce at least two clusters.
   if (clusters.size() < 2) {
     mlir::emitError(
         loc, "warp_pipeline_stage borders did not produce at least two stages");