diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index f00b99075b022..2da41348244d7 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -3025,7 +3025,7 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef MergeInfos, bool StrictDom = false; LLVM_DEBUG(dbgs() << "Merging async marks ..."); - // Early exit: both empty + // Early exit: nothing to merge when both sides are empty. if (AsyncMarks.empty() && OtherMarks.empty()) { LLVM_DEBUG(dbgs() << " nothing to merge\n"); return false; @@ -3067,6 +3067,11 @@ bool WaitcntBrackets::mergeAsyncMarks(ArrayRef MergeInfos, unsigned OtherSize = OtherMarks.size(); unsigned OurSize = AsyncMarks.size(); unsigned MergeCount = std::min(OtherSize, OurSize); + // OtherMarks is empty -> OtherSize == 0 -> MergeCount == 0. + // Our existing marks are the conservative result; return early to avoid + // passing MergeCount == 0 to seq_inclusive which asserts Begin <= End. + if (MergeCount == 0) + return StrictDom; for (auto Idx : seq_inclusive(1, MergeCount)) { for (auto T : inst_counter_types(Context->MaxCounter)) { StrictDom |= mergeScore(MergeInfos[T], AsyncMarks[OurSize - Idx][T], diff --git a/llvm/test/CodeGen/AMDGPU/asyncmark-merge-empty-other.mir b/llvm/test/CodeGen/AMDGPU/asyncmark-merge-empty-other.mir new file mode 100644 index 0000000000000..cf3a46d39fcb2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/asyncmark-merge-empty-other.mir @@ -0,0 +1,139 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=si-insert-waitcnts -o - %s | FileCheck %s + +# Regression test for mergeAsyncMarks() asserting when OtherMarks is empty. +# +# At a CFG join point where one predecessor has an ASYNCMARK (non-empty +# AsyncMarks) and the other does not (empty OtherMarks), MergeCount becomes +# min(0, N) = 0. 
Before the fix, seq_inclusive(1, 0) asserted +# Begin <= End. After the fix the function returns early when either side +# is empty. +# +# GLOBAL_LOAD_ASYNC_TO_LDS_B32 is a GFX1250 async LDS DMA instruction tracked +# via ASYNC_CNT. isAsync() returns true for it, so the score is recorded into +# AsyncScore before ASYNCMARK pushes it onto AsyncMarks. +# +# The join block contains WAIT_ASYNCMARK 0 to consume the pending mark. +# Before the fix, mergeAsyncMarks() asserted before reaching the wait. +# After the fix the pass completes without asserting. +# +# Two patterns are tested: +# asyncmark_in_then - ASYNCMARK in the then-successor, else-successor is sync +# asyncmark_in_else - ASYNCMARK in the else-successor, then-successor is sync + +--- +# Pattern 1: ASYNCMARK in then-successor, else-successor is sync. +name: asyncmark_in_then +tracksRegLiveness: true +machineFunctionInfo: + occupancy: 8 +body: | + ; CHECK-LABEL: name: asyncmark_in_then + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; CHECK-NEXT: S_WAIT_KMCNT 0 + ; CHECK-NEXT: S_CMP_LG_U32 $sgpr0, $sgpr1, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0_vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load (s32), addrspace 1), (store (s32), addrspace 3) + ; CHECK-NEXT: ASYNCMARK + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: WAIT_ASYNCMARK 0 + ; CHECK-NEXT: S_WAIT_ASYNCCNT 0, implicit-def 
$asynccnt, implicit $asynccnt + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2 + + S_CMP_LG_U32 $sgpr0, $sgpr1, implicit-def $scc + S_CBRANCH_SCC1 %bb.2, implicit killed $scc + + ; then branch — issues async LDS DMA + ASYNCMARK + bb.1: + liveins: $vgpr0_vgpr1, $vgpr2 + + GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load (s32), addrspace 1), (store (s32), addrspace 3) + ASYNCMARK + S_BRANCH %bb.3 + + ; else branch — sync path, no async operations; OtherMarks is empty at join + bb.2: + S_BRANCH %bb.3 + + ; join — mergeAsyncMarks sees non-empty AsyncMarks (then) and empty OtherMarks (else). + ; Before fix: assertion. After fix: returns early; the only wait emitted in bb.3 is the S_WAIT_ASYNCCNT 0 following WAIT_ASYNCMARK 0. + bb.3: + WAIT_ASYNCMARK 0 + S_ENDPGM 0 +... +--- +# Pattern 2: ASYNCMARK in else-successor, then-successor is sync. +# Mirror of asyncmark_in_then — exercises the opposite predecessor ordering. +name: asyncmark_in_else +tracksRegLiveness: true +machineFunctionInfo: + occupancy: 8 +body: | + ; CHECK-LABEL: name: asyncmark_in_else + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_WAIT_LOADCNT_DSCNT 0 + ; CHECK-NEXT: S_WAIT_KMCNT 0 + ; CHECK-NEXT: S_CMP_LG_U32 $sgpr0, $sgpr1, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: liveins: $vgpr0_vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load (s32), addrspace 1), (store (s32), addrspace 3) + ; CHECK-NEXT: ASYNCMARK + ; 
CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: WAIT_ASYNCMARK 0 + ; CHECK-NEXT: S_WAIT_ASYNCCNT 0, implicit-def $asynccnt, implicit $asynccnt + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0_vgpr1, $vgpr2 + + S_CMP_LG_U32 $sgpr0, $sgpr1, implicit-def $scc + S_CBRANCH_SCC1 %bb.2, implicit killed $scc + + ; then branch — sync path, no async operations + bb.1: + S_BRANCH %bb.3 + + ; else branch — issues async LDS DMA + ASYNCMARK + bb.2: + liveins: $vgpr0_vgpr1, $vgpr2 + + GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt :: (load (s32), addrspace 1), (store (s32), addrspace 3) + ASYNCMARK + S_BRANCH %bb.3 + + ; join block + bb.3: + WAIT_ASYNCMARK 0 + S_ENDPGM 0 +...