diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index b7fa899678ec7..306d59d0867cd 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1291,21 +1291,15 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
   // On entry to a block with multiple predescessors, there may
   // be pending SMEM and VMEM events active at the same time.
   // In such cases, only clear one active event at a time.
-  auto applyPendingXcntGroup = [this](unsigned E) {
-    unsigned LowerBound = getScoreLB(X_CNT);
-    applyWaitcnt(X_CNT, 0);
-    PendingEvents |= (1 << E);
-    setScoreLB(X_CNT, LowerBound);
-  };
 
   // Wait on XCNT is redundant if we are already waiting for a load to complete.
   // SMEM can return out of order, so only omit XCNT wait if we are waiting till
   // zero.
   if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
-    if (hasPendingEvent(VMEM_GROUP))
-      applyPendingXcntGroup(VMEM_GROUP);
-    else
+    if (!hasMixedPendingEvents(X_CNT))
       applyWaitcnt(X_CNT, 0);
+    else
+      PendingEvents &= ~(1 << SMEM_GROUP);
     return;
   }
 
@@ -1314,10 +1308,10 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
   // decremented to the same number as LOADCnt.
   if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
       !hasPendingEvent(STORE_CNT)) {
-    if (hasPendingEvent(SMEM_GROUP))
-      applyPendingXcntGroup(SMEM_GROUP);
-    else
+    if (!hasMixedPendingEvents(X_CNT))
       applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+    else if (Wait.LoadCnt == 0)
+      PendingEvents &= ~(1 << VMEM_GROUP);
     return;
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index f964480dcc633..fe16f0d44dd1c 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -1069,7 +1069,6 @@ body: |
     $sgpr0 = S_MOV_B32 $sgpr0
 ...
 
-# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
 ---
 name: mixed_pending_events
 tracksRegLiveness: true
@@ -1088,8 +1087,8 @@ body: |
     ; GCN-NEXT: successors: %bb.2(0x80000000)
     ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2
     ; GCN-NEXT: {{ $}}
-    ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
-    ; GCN-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+    ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 100, 0, implicit $exec
+    ; GCN-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 200, 0, implicit $exec
     ; GCN-NEXT: {{ $}}
     ; GCN-NEXT: bb.2:
     ; GCN-NEXT: liveins: $sgpr2, $vgpr2
@@ -1098,6 +1097,7 @@ body: |
     ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec
    ; GCN-NEXT: S_WAIT_KMCNT 0
     ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+    ; GCN-NEXT: S_WAIT_XCNT 0
     ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
   bb.0:
     liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
@@ -1105,8 +1105,8 @@ body: |
     S_CBRANCH_SCC1 %bb.2, implicit $scc
   bb.1:
     liveins: $vgpr0_vgpr1, $sgpr2
-    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
-    $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+    $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 100, 0, implicit $exec
+    $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 200, 0, implicit $exec
   bb.2:
     liveins: $sgpr2, $vgpr2
     $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec