Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 30 additions & 7 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1051,18 +1051,20 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry));

InitialOccupancy = DAG.MinOccupancy;
// Aggressivly try to reduce register pressure in the unclustered high RP
// Aggressively try to reduce register pressure in the unclustered high RP
// stage. Temporarily increase occupancy target in the region.
TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy
? InitialOccupancy + 1
: InitialOccupancy;
IsAnyRegionScheduled = false;
S.SGPRLimitBias = S.HighRPSGPRBias;
S.VGPRLimitBias = S.HighRPVGPRBias;
if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy)
MFI.increaseOccupancy(MF, ++DAG.MinOccupancy);

LLVM_DEBUG(
dbgs()
<< "Retrying function scheduling without clustering. "
"Aggressivly try to reduce register pressure to achieve occupancy "
<< DAG.MinOccupancy << ".\n");
"Aggressively try to reduce register pressure to achieve occupancy "
<< TempTargetOccupancy << ".\n");

return true;
}
Expand Down Expand Up @@ -1143,9 +1145,16 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() {
SavedMutations.swap(DAG.Mutations);
S.SGPRLimitBias = S.VGPRLimitBias = 0;
if (DAG.MinOccupancy > InitialOccupancy) {
assert(IsAnyRegionScheduled);
LLVM_DEBUG(dbgs() << StageID
<< " stage successfully increased occupancy to "
<< DAG.MinOccupancy << '\n');
} else if (!IsAnyRegionScheduled) {
assert(DAG.MinOccupancy == InitialOccupancy);
LLVM_DEBUG(dbgs() << StageID
<< ": No regions scheduled, min occupancy stays at "
<< DAG.MinOccupancy << ", MFI occupancy stays at "
<< MFI.getOccupancy() << ".\n");
}

GCNSchedStage::finalizeGCNSchedStage();
Expand Down Expand Up @@ -1219,13 +1228,27 @@ bool UnclusteredHighRPStage::initGCNRegion() {
// rescheduling of previous regions did not make occupancy drop back down to
// the initial minimum).
unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize();
// If no region has been scheduled yet, the DAG has not yet been updated with
// the occupancy target. So retrieve it from the temporary.
unsigned CurrentTargetOccupancy =
IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy;
if (!DAG.RegionsWithExcessRP[RegionIdx] &&
(DAG.MinOccupancy <= InitialOccupancy ||
(CurrentTargetOccupancy <= InitialOccupancy ||
DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) !=
InitialOccupancy))
return false;

return GCNSchedStage::initGCNRegion();
bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion();
// If this is the first region scheduled during this stage, make the target
// occupancy changes in the DAG and MFI.
if (!IsAnyRegionScheduled && IsSchedulingThisRegion) {
IsAnyRegionScheduled = true;
if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) {
DAG.MinOccupancy = TempTargetOccupancy;
MFI.increaseOccupancy(MF, TempTargetOccupancy);
}
}
return IsSchedulingThisRegion;
}

bool ClusteredLowOccStage::initGCNRegion() {
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,10 @@ class UnclusteredHighRPStage : public GCNSchedStage {
private:
// Save the initial occupancy before starting this stage.
unsigned InitialOccupancy;
// Save the temporary target occupancy before starting this stage.
unsigned TempTargetOccupancy;
// Track whether any region was scheduled by this stage.
bool IsAnyRegionScheduled;

public:
bool initGCNSchedStage() override;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# REQUIRES: asserts
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -stress-regalloc=4 -debug-only=machine-scheduler %s -o - 2>&1 | FileCheck %s

--- |
define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 {
ret void
}

attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
...

# This test checks the following scenario: the unclustered
# high-RP-reschedule stage temporarily raises the occupancy target, but no
# region ends up being scheduled because of constraints. In that case, the DAG
# and MFI minimum occupancy must be left unchanged at the end of the stage.
# CHECK: Retrying function scheduling without clustering. Aggressively try to reduce register pressure to achieve occupancy 5.
# CHECK: Unclustered High Register Pressure Reschedule: No regions scheduled, min occupancy stays at 4, MFI occupancy stays at 4.

---
name: no_sched_metric_due_to_spills
tracksRegLiveness: true
machineFunctionInfo:
stackPtrOffsetReg: '$sgpr32'
occupancy: 4
body: |
bb.0:
liveins: $vgpr0, $sgpr0_sgpr1, $sgpr15

%0:sgpr_32 = COPY $sgpr15
%1:sgpr_64 = COPY $sgpr0_sgpr1
%2:vgpr_32 = COPY $vgpr0
%3:sgpr_128 = S_LOAD_DWORDX4_IMM %1, 0, 0 :: (dereferenceable invariant load (s128), addrspace 4)
undef %4.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %1, 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
%5:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 32, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
%6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 64, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
%7:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 84, 0 :: (dereferenceable invariant load (s32), addrspace 4)
%8:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 112, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
%9:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 128, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
%10:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 176, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
%11:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 192, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4)
%12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1, 216, 0 :: (dereferenceable invariant load (s64), addrspace 4)
%13:sreg_32 = S_ADD_I32 %12.sub0, 127, implicit-def dead $scc
%14:sreg_32 = S_ASHR_I32 %13, 31, implicit-def dead $scc
%15:sreg_32 = S_LSHR_B32 %14, 25, implicit-def dead $scc
%16:sreg_32 = S_ADD_I32 %13, %15, implicit-def dead $scc
%17:sreg_32 = S_ASHR_I32 %16, 7, implicit-def dead $scc
%18:sreg_32 = S_ADD_I32 %12.sub1, 255, implicit-def dead $scc
%19:sreg_32 = S_ASHR_I32 %18, 31, implicit-def dead $scc
%20:sreg_32 = S_LSHR_B32 %19, 24, implicit-def dead $scc
%21:sreg_32 = S_ADD_I32 %18, %20, implicit-def dead $scc
%22:sreg_32 = S_ASHR_I32 %21, 8, implicit-def dead $scc
%23:sreg_32 = nsw S_MUL_I32 %22, %17
%24:sreg_32 = S_ASHR_I32 %0, 31, implicit-def dead $scc
S_ENDPGM 0

...
Loading