diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index bdc08101c7119..12086ba92f2a3 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -1051,18 +1051,20 @@ bool UnclusteredHighRPStage::initGCNSchedStage() { createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PreRAReentry)); InitialOccupancy = DAG.MinOccupancy; - // Aggressivly try to reduce register pressure in the unclustered high RP + // Aggressively try to reduce register pressure in the unclustered high RP // stage. Temporarily increase occupancy target in the region. + TempTargetOccupancy = MFI.getMaxWavesPerEU() > DAG.MinOccupancy + ? InitialOccupancy + 1 + : InitialOccupancy; + IsAnyRegionScheduled = false; S.SGPRLimitBias = S.HighRPSGPRBias; S.VGPRLimitBias = S.HighRPVGPRBias; - if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) - MFI.increaseOccupancy(MF, ++DAG.MinOccupancy); LLVM_DEBUG( dbgs() << "Retrying function scheduling without clustering. " - "Aggressivly try to reduce register pressure to achieve occupancy " - << DAG.MinOccupancy << ".\n"); + "Aggressively try to reduce register pressure to achieve occupancy " + << TempTargetOccupancy << ".\n"); return true; } @@ -1143,9 +1145,16 @@ void UnclusteredHighRPStage::finalizeGCNSchedStage() { SavedMutations.swap(DAG.Mutations); S.SGPRLimitBias = S.VGPRLimitBias = 0; if (DAG.MinOccupancy > InitialOccupancy) { + assert(IsAnyRegionScheduled); LLVM_DEBUG(dbgs() << StageID << " stage successfully increased occupancy to " << DAG.MinOccupancy << '\n'); + } else if (!IsAnyRegionScheduled) { + assert(DAG.MinOccupancy == InitialOccupancy); + LLVM_DEBUG(dbgs() << StageID + << ": No regions scheduled, min occupancy stays at " + << DAG.MinOccupancy << ", MFI occupancy stays at " + << MFI.getOccupancy() << ".\n"); } GCNSchedStage::finalizeGCNSchedStage(); @@ -1219,13 +1228,27 @@ bool UnclusteredHighRPStage::initGCNRegion() { // rescheduling of previous regions did not make occupancy drop back down to // the initial minimum). unsigned DynamicVGPRBlockSize = DAG.MFI.getDynamicVGPRBlockSize(); + // If no region has been scheduled yet, the DAG has not yet been updated with + // the occupancy target. So retrieve it from the temporary. + unsigned CurrentTargetOccupancy = + IsAnyRegionScheduled ? DAG.MinOccupancy : TempTargetOccupancy; if (!DAG.RegionsWithExcessRP[RegionIdx] && - (DAG.MinOccupancy <= InitialOccupancy || + (CurrentTargetOccupancy <= InitialOccupancy || DAG.Pressure[RegionIdx].getOccupancy(ST, DynamicVGPRBlockSize) != InitialOccupancy)) return false; - return GCNSchedStage::initGCNRegion(); + bool IsSchedulingThisRegion = GCNSchedStage::initGCNRegion(); + // If this is the first region scheduled during this stage, make the target + // occupancy changes in the DAG and MFI. + if (!IsAnyRegionScheduled && IsSchedulingThisRegion) { + IsAnyRegionScheduled = true; + if (MFI.getMaxWavesPerEU() > DAG.MinOccupancy) { + DAG.MinOccupancy = TempTargetOccupancy; + MFI.increaseOccupancy(MF, TempTargetOccupancy); + } + } + return IsSchedulingThisRegion; } bool ClusteredLowOccStage::initGCNRegion() { diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h index 8ea42677454e4..026f1056af241 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -402,6 +402,10 @@ class UnclusteredHighRPStage : public GCNSchedStage { private: // Save the initial occupancy before starting this stage. unsigned InitialOccupancy; + // Save the temporary target occupancy before starting this stage. + unsigned TempTargetOccupancy; + // Track whether any region was scheduled by this stage. + bool IsAnyRegionScheduled; public: bool initGCNSchedStage() override; diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-no-unclustered-regions.mir b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-no-unclustered-regions.mir new file mode 100644 index 0000000000000..f08facb503f24 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-no-unclustered-regions.mir @@ -0,0 +1,56 @@ +# REQUIRES: asserts +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -start-before=machine-scheduler -stop-after=greedy,2 -stress-regalloc=4 -debug-only=machine-scheduler %s -o - 2>&1 | FileCheck %s + +--- | + define amdgpu_kernel void @no_sched_metric_due_to_spills() #0 { + ret void + } + + attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } +... + +# This test checks for the following scenario: Unclustered high-RP-reschedule +# stage raises the occupancy target temporarily but no region gets scheduled +# because of constraints. Then, DAG and MFI min-occupancy should not be changed +# at the end of the unclustered schedule stage. +# CHECK: Retrying function scheduling without clustering. Aggressively try to reduce register pressure to achieve occupancy 5. +# CHECK: Unclustered High Register Pressure Reschedule: No regions scheduled, min occupancy stays at 4, MFI occupancy stays at 4. + +--- +name: no_sched_metric_due_to_spills +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + occupancy: 4 +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1, $sgpr15 + + %0:sgpr_32 = COPY $sgpr15 + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %2:vgpr_32 = COPY $vgpr0 + %3:sgpr_128 = S_LOAD_DWORDX4_IMM %1, 0, 0 :: (dereferenceable invariant load (s128), addrspace 4) + undef %4.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %1, 16, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4) + %5:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 32, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + %6:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 64, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + %7:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 84, 0 :: (dereferenceable invariant load (s32), addrspace 4) + %8:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 112, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + %9:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 128, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + %10:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 176, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + %11:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %1, 192, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + %12:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1, 216, 0 :: (dereferenceable invariant load (s64), addrspace 4) + %13:sreg_32 = S_ADD_I32 %12.sub0, 127, implicit-def dead $scc + %14:sreg_32 = S_ASHR_I32 %13, 31, implicit-def dead $scc + %15:sreg_32 = S_LSHR_B32 %14, 25, implicit-def dead $scc + %16:sreg_32 = S_ADD_I32 %13, %15, implicit-def dead $scc + %17:sreg_32 = S_ASHR_I32 %16, 7, implicit-def dead $scc + %18:sreg_32 = S_ADD_I32 %12.sub1, 255, implicit-def dead $scc + %19:sreg_32 = S_ASHR_I32 %18, 31, implicit-def dead $scc + %20:sreg_32 = S_LSHR_B32 %19, 24, implicit-def dead $scc + %21:sreg_32 = S_ADD_I32 %18, %20, implicit-def dead $scc + %22:sreg_32 = S_ASHR_I32 %21, 8, implicit-def dead $scc + %23:sreg_32 = nsw S_MUL_I32 %22, %17 + %24:sreg_32 = S_ASHR_I32 %0, 31, implicit-def dead $scc + S_ENDPGM 0 + +...