Skip to content
Merged
7 changes: 6 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3025,7 +3025,7 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
bool StrictDom = false;

LLVM_DEBUG(dbgs() << "Merging async marks ...");
// Early exit: both empty
// Early exit: nothing to merge when both sides are empty.
if (AsyncMarks.empty() && OtherMarks.empty()) {
LLVM_DEBUG(dbgs() << " nothing to merge\n");
return false;
Expand Down Expand Up @@ -3067,6 +3067,11 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef<MergeInfo> MergeInfos,
unsigned OtherSize = OtherMarks.size();
unsigned OurSize = AsyncMarks.size();
unsigned MergeCount = std::min(OtherSize, OurSize);
// OtherMarks is empty -> OtherSize == 0 -> MergeCount == 0.
// Our existing marks are the conservative result; return early to avoid
// passing MergeCount == 0 to seq_inclusive which asserts Begin <= End.
if (MergeCount == 0)
return StrictDom;
for (auto Idx : seq_inclusive<unsigned>(1, MergeCount)) {
for (auto T : inst_counter_types(Context->MaxCounter)) {
StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T],
Expand Down
139 changes: 139 additions & 0 deletions llvm/test/CodeGen/AMDGPU/asyncmark-merge-empty-other.mir
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -o - %s | FileCheck %s

# Regression test for mergeAsyncMarks() asserting when OtherMarks is empty.
#
# At a CFG join point where one predecessor has an ASYNCMARK (non-empty
# AsyncMarks) and the other does not (empty OtherMarks), MergeCount becomes
# min(0, N) = 0. Before the fix, seq_inclusive<unsigned>(1, 0) asserted
# Begin <= End. After the fix the function returns early when MergeCount
# is 0, so seq_inclusive is never asked for the empty range [1, 0].
#
# GLOBAL_LOAD_ASYNC_TO_LDS_B32 is a GFX1250 async LDS DMA instruction tracked
# via ASYNC_CNT. isAsync() returns true for it, so the score is recorded into
# AsyncScore before ASYNCMARK pushes it onto AsyncMarks.
#
# The join block contains WAIT_ASYNCMARK 0 to consume the pending mark.
# Before the fix, mergeAsyncMarks() asserted before reaching the wait.
# After the fix the pass completes without asserting.
#
# Two patterns are tested:
# asyncmark_in_then - ASYNCMARK in the then-successor, else-successor is sync
# asyncmark_in_else - ASYNCMARK in the else-successor, then-successor is sync

---
# Pattern 1: ASYNCMARK in then-successor, else-successor is sync.
# Control flow: bb.0 either branches to bb.2 or falls through to bb.1; both
# paths rejoin at bb.3. Only bb.1 issues an async load followed by ASYNCMARK.
name: asyncmark_in_then
tracksRegLiveness: true
machineFunctionInfo:
occupancy: 8
body: |
; CHECK-LABEL: name: asyncmark_in_then
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_WAIT_LOADCNT_DSCNT 0
; CHECK-NEXT: S_WAIT_KMCNT 0
; CHECK-NEXT: S_CMP_LG_U32 $sgpr0, $sgpr1, implicit-def $scc
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: liveins: $vgpr0_vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load (s32), addrspace 1), (store (s32), addrspace 3)
; CHECK-NEXT: ASYNCMARK
; CHECK-NEXT: S_BRANCH %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_BRANCH %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: WAIT_ASYNCMARK 0
; CHECK-NEXT: S_WAIT_ASYNCCNT 0, implicit-def $asynccnt, implicit $asynccnt
; CHECK-NEXT: S_ENDPGM 0
bb.0:
liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2

S_CMP_LG_U32 $sgpr0, $sgpr1, implicit-def $scc
S_CBRANCH_SCC1 %bb.2, implicit killed $scc

; then branch — issues async LDS DMA + ASYNCMARK
bb.1:
liveins: $vgpr0_vgpr1, $vgpr2

GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load (s32), addrspace 1), (store (s32), addrspace 3)
ASYNCMARK
S_BRANCH %bb.3

; else branch — sync path, no async operations; OtherMarks is empty at join
bb.2:
S_BRANCH %bb.3

; join — mergeAsyncMarks sees non-empty AsyncMarks (then) and empty OtherMarks (else).
; Before fix: assertion. After fix: the merge returns early and the pass
; completes; the only wait emitted in this block is the S_WAIT_ASYNCCNT 0 that
; follows WAIT_ASYNCMARK 0 in the CHECK lines above.
bb.3:
WAIT_ASYNCMARK 0
S_ENDPGM 0
...
---
# Pattern 2: ASYNCMARK in else-successor, then-successor is sync.
# Mirror of asyncmark_in_then — exercises the opposite predecessor ordering.
# Control flow: bb.0 either branches to bb.2 or falls through to bb.1; both
# paths rejoin at bb.3. Only bb.2 issues an async load followed by ASYNCMARK.
name: asyncmark_in_else
tracksRegLiveness: true
machineFunctionInfo:
occupancy: 8
body: |
; CHECK-LABEL: name: asyncmark_in_else
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_WAIT_LOADCNT_DSCNT 0
; CHECK-NEXT: S_WAIT_KMCNT 0
; CHECK-NEXT: S_CMP_LG_U32 $sgpr0, $sgpr1, implicit-def $scc
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_BRANCH %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: liveins: $vgpr0_vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load (s32), addrspace 1), (store (s32), addrspace 3)
; CHECK-NEXT: ASYNCMARK
; CHECK-NEXT: S_BRANCH %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: WAIT_ASYNCMARK 0
; CHECK-NEXT: S_WAIT_ASYNCCNT 0, implicit-def $asynccnt, implicit $asynccnt
; CHECK-NEXT: S_ENDPGM 0
bb.0:
liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2

S_CMP_LG_U32 $sgpr0, $sgpr1, implicit-def $scc
S_CBRANCH_SCC1 %bb.2, implicit killed $scc

; then branch — sync path, no async operations
bb.1:
S_BRANCH %bb.3

; else branch — issues async LDS DMA + ASYNCMARK
bb.2:
liveins: $vgpr0_vgpr1, $vgpr2

GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load (s32), addrspace 1), (store (s32), addrspace 3)
ASYNCMARK
S_BRANCH %bb.3

; join block — merges the sync path from bb.1 (no marks) with the async path
; from bb.2 (one ASYNCMARK). The pass must finish without asserting and emit
; S_WAIT_ASYNCCNT 0 after WAIT_ASYNCMARK 0, as the CHECK lines above expect.
bb.3:
WAIT_ASYNCMARK 0
S_ENDPGM 0
...
Loading