Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1328,6 +1328,16 @@ class TargetTransformInfo {
/// \return the target-provided register class name
LLVM_ABI const char *getRegisterClassName(unsigned ClassID) const;

/// \return the cost of spilling a register in the target-provided register
/// class to the stack.
LLVM_ABI InstructionCost
getRegisterClassSpillCost(unsigned ClassID, TargetCostKind CostKind) const;

/// \return the cost of reloading a register in the target-provided register
/// class from the stack.
LLVM_ABI InstructionCost
getRegisterClassReloadCost(unsigned ClassID, TargetCostKind CostKind) const;

enum RegisterKind { RGK_Scalar, RGK_FixedWidthVector, RGK_ScalableVector };

/// \return The width of the largest scalar or vector register type.
Expand Down
12 changes: 12 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -628,6 +628,18 @@ class TargetTransformInfoImplBase {
}
}

/// Default implementation: assume spilling a register of any class to the
/// stack costs a single basic instruction (TTI::TCC_Basic). Targets with a
/// more expensive store-to-stack for some register classes should override
/// this. \p ClassID and \p CostKind are ignored by this default.
virtual InstructionCost
getRegisterClassSpillCost(unsigned ClassID,
TTI::TargetCostKind CostKind) const {
return TTI::TCC_Basic;
}

/// Default implementation: assume reloading a register of any class from the
/// stack costs a single basic instruction (TTI::TCC_Basic). Targets with a
/// more expensive load-from-stack for some register classes should override
/// this. \p ClassID and \p CostKind are ignored by this default.
virtual InstructionCost
getRegisterClassReloadCost(unsigned ClassID,
TTI::TargetCostKind CostKind) const {
return TTI::TCC_Basic;
}

virtual TypeSize
getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
return TypeSize::getFixed(32);
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -815,6 +815,16 @@ const char *TargetTransformInfo::getRegisterClassName(unsigned ClassID) const {
return TTIImpl->getRegisterClassName(ClassID);
}

// Thin forwarder to the target's implementation; see the interface
// documentation in TargetTransformInfo.h for the contract.
InstructionCost TargetTransformInfo::getRegisterClassSpillCost(
unsigned ClassID, TTI::TargetCostKind CostKind) const {
return TTIImpl->getRegisterClassSpillCost(ClassID, CostKind);
}

// Thin forwarder to the target's implementation; see the interface
// documentation in TargetTransformInfo.h for the contract.
InstructionCost TargetTransformInfo::getRegisterClassReloadCost(
unsigned ClassID, TTI::TargetCostKind CostKind) const {
return TTIImpl->getRegisterClassReloadCost(ClassID, CostKind);
}

TypeSize TargetTransformInfo::getRegisterBitWidth(
TargetTransformInfo::RegisterKind K) const {
return TTIImpl->getRegisterBitWidth(K);
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class OptimizationRemarkEmitter;
class TargetTransformInfo;
class TargetLibraryInfo;
class VPRecipeBuilder;
struct VPRegisterUsage;
struct VFRange;

extern cl::opt<bool> EnableVPlanNativePath;
Expand Down Expand Up @@ -497,7 +498,7 @@ class LoopVectorizationPlanner {
///
/// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has
/// been retired.
InstructionCost cost(VPlan &Plan, ElementCount VF) const;
InstructionCost cost(VPlan &Plan, ElementCount VF, VPRegisterUsage *RU) const;

/// Precompute costs for certain instructions using the legacy cost model. The
/// function is used to bring up the VPlan-based cost model to initially avoid
Expand Down
44 changes: 21 additions & 23 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4247,13 +4247,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
if (VF.isScalar())
continue;

/// If the register pressure needs to be considered for VF,
/// don't consider the VF as valid if it exceeds the number
/// of registers for the target.
if (CM.shouldConsiderRegPressureForVF(VF) &&
RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
continue;

InstructionCost C = CM.expectedCost(VF);

// Add on other costs that are modelled in VPlan, but not in the legacy
Expand Down Expand Up @@ -4302,6 +4295,10 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
}
}

// Add the cost of any spills due to excess register usage
if (CM.shouldConsiderRegPressureForVF(VF))
C += RUs[I].spillCost(CostCtx, ForceTargetNumVectorRegs);

VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
unsigned Width =
estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
Expand Down Expand Up @@ -4687,22 +4684,23 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
if (hasFindLastReductionPhi(Plan))
return 1;

VPRegisterUsage R =
calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];

// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
if (VF.isScalar())
LoopCost = CM.expectedCost(VF);
else
LoopCost = cost(Plan, VF);
LoopCost = cost(Plan, VF, &R);
assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");

// Loop body is free and there is no need for interleaving.
if (LoopCost == 0)
return 1;
}

VPRegisterUsage R =
calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
// We divide by these constants so assume that we have at least one
// instruction that uses at least one register.
for (auto &Pair : R.MaxLocalUsers) {
Expand Down Expand Up @@ -7027,13 +7025,18 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
return Cost;
}

InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF,
VPRegisterUsage *RU) const {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, PSE, OrigLoop);
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

// Now compute and add the VPlan-based cost.
Cost += Plan.cost(VF, CostCtx);

// Add the cost of spills due to excess register usage
if (CM.shouldConsiderRegPressureForVF(VF))
Cost += RU->spillCost(CostCtx, ForceTargetNumVectorRegs);

#ifndef NDEBUG
unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
Expand Down Expand Up @@ -7233,9 +7236,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
P->vectorFactors().end());

SmallVector<VPRegisterUsage, 8> RUs;
if (any_of(VFs, [this](ElementCount VF) {
return CM.shouldConsiderRegPressureForVF(VF);
}))
bool ConsiderRegPressure = any_of(VFs, [this](ElementCount VF) {
return CM.shouldConsiderRegPressureForVF(VF);
});
if (ConsiderRegPressure)
RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);

for (unsigned I = 0; I < VFs.size(); I++) {
Expand All @@ -7258,16 +7262,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
continue;
}

InstructionCost Cost = cost(*P, VF);
InstructionCost Cost =
cost(*P, VF, ConsiderRegPressure ? &RUs[I] : nullptr);
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);

if (CM.shouldConsiderRegPressureForVF(VF) &&
RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
<< VF << " because it uses too many registers\n");
continue;
}

if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
BestFactor = CurrentFactor;

Expand Down
29 changes: 22 additions & 7 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,13 +389,28 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
return Base::properlyDominates(ParentA, ParentB);
}

bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI,
unsigned OverrideMaxNumRegs) const {
return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) {
return LU.second > (OverrideMaxNumRegs > 0
? OverrideMaxNumRegs
: TTI.getNumberOfRegisters(LU.first));
});
InstructionCost VPRegisterUsage::spillCost(VPCostContext &Ctx,
                                           unsigned OverrideMaxNumRegs) const {
  // Sum the estimated spill/reload cost over every register class whose peak
  // number of simultaneous live values exceeds the registers available for
  // that class. A non-zero OverrideMaxNumRegs replaces the target's register
  // count for all classes.
  InstructionCost TotalCost;
  for (const auto &[ClassID, NumUsers] : MaxLocalUsers) {
    unsigned NumAvailable = OverrideMaxNumRegs > 0
                                ? OverrideMaxNumRegs
                                : Ctx.TTI.getNumberOfRegisters(ClassID);
    if (NumUsers <= NumAvailable)
      continue;
    // Assume one spill plus one reload for each live value beyond the number
    // of available registers in this class.
    unsigned NumSpills = NumUsers - NumAvailable;
    InstructionCost CostPerSpill =
        Ctx.TTI.getRegisterClassSpillCost(ClassID, Ctx.CostKind) +
        Ctx.TTI.getRegisterClassReloadCost(ClassID, Ctx.CostKind);
    InstructionCost ClassCost = NumSpills * CostPerSpill;
    LLVM_DEBUG(dbgs() << "LV(REG): Cost of " << ClassCost << " from "
                      << NumSpills << " spills of "
                      << Ctx.TTI.getRegisterClassName(ClassID) << "\n");
    TotalCost += ClassCost;
  }
  return TotalCost;
}

SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
Expand Down
11 changes: 7 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ class VPlan;
class Value;
class TargetTransformInfo;
class Type;
class InstructionCost;

struct VPCostContext;

/// An analysis for type-inference for VPValues.
/// It infers the scalar type for a given VPValue by bottom-up traversing
Expand Down Expand Up @@ -79,11 +82,11 @@ struct VPRegisterUsage {
/// The key is ClassID of target-provided register class.
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;

/// Check if any of the tracked live intervals exceeds the number of
/// available registers for the target. If non-zero, OverrideMaxNumRegs
/// Calculate the estimated cost of any spills due to using more registers
/// than the number available for the target. If non-zero, OverrideMaxNumRegs
/// is used in place of the target's number of registers.
bool exceedsMaxNumRegs(const TargetTransformInfo &TTI,
unsigned OverrideMaxNumRegs = 0) const;
InstructionCost spillCost(VPCostContext &Ctx,
unsigned OverrideMaxNumRegs = 0) const;
};

/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,31 @@
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-REGS-VP
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NOREGS-VP
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=false -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOMAX
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-REGS-VP
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOREGS-VP

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"

; The use of the dotp instruction means we never have an i32 vector, so we don't
; get any spills normally and with a reduced number of registers the number of
; spills is small enough that it doesn't prevent use of a larger VF.
define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-LABEL: LV: Checking a loop in 'dotp'
;
; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
; CHECK-NOMAX: LV: Selecting VF: vscale x 4.
;
; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
; CHECK-REGS-VP: Cost for VF vscale x 8: 6 (Estimated cost per lane: 0.8)
; CHECK-REGS-VP: Cost for VF vscale x 16: 5 (Estimated cost per lane: 0.3)
; CHECK-REGS-VP: LV: Selecting VF: vscale x 16.
;
; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 8 because it uses too many registers
; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4.
; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC
; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 14 (Estimated cost per lane: 1.8)
; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC
; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 13 (Estimated cost per lane: 0.8)
; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 16.
entry:
br label %for.body

Expand All @@ -24,8 +39,7 @@ for.body: ; preds = %for.body, %entry
%load.b = load i8, ptr %gep.b, align 1
%ext.b = zext i8 %load.b to i32
%mul = mul i32 %ext.b, %ext.a
%sub = sub i32 0, %mul
%add = add i32 %accum, %sub
%add = add i32 %accum, %mul
%iv.next = add i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %for.exit, label %for.body
Expand All @@ -34,4 +48,70 @@ for.exit: ; preds = %for.body
ret i32 %add
}

; The largest type used in the loop is small enough that we already consider all
; VFs and maximize-bandwidth does nothing.
define void @type_too_small(ptr %a, ptr %b) #0 {
; CHECK-LABEL: LV: Checking a loop in 'type_too_small'
; CHECK: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
; CHECK: Cost for VF vscale x 8: 6 (Estimated cost per lane: 0.8)
; CHECK: Cost for VF vscale x 16: 6 (Estimated cost per lane: 0.4)
; CHECK: LV: Selecting VF: vscale x 16.
; Loop body: a[i] = a[i] + b[i] over i8 elements, trip count 1024. Using only
; i8 keeps register pressure low, so every VF costs the same per operation.
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%gep.a = getelementptr i8, ptr %a, i64 %iv
%load.a = load i8, ptr %gep.a, align 1
%gep.b = getelementptr i8, ptr %b, i64 %iv
%load.b = load i8, ptr %gep.b, align 1
%add = add i8 %load.a, %load.b
store i8 %add, ptr %gep.a, align 1
%iv.next = add i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1024
br i1 %exitcond, label %exit, label %loop

exit:
ret void
}

; With reduced number of registers the spills from high pressure are enough that
; we use the same VF as if we hadn't maximized the bandwidth.
define void @high_pressure(ptr %a, ptr %b) #0 {
; CHECK-LABEL: LV: Checking a loop in 'high_pressure'
;
; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
; CHECK-NOMAX: LV: Selecting VF: vscale x 4.
;
; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
; CHECK-REGS-VP: Cost for VF vscale x 8: 10 (Estimated cost per lane: 1.2)
; CHECK-REGS-VP: Cost for VF vscale x 16: 21 (Estimated cost per lane: 1.3)
; CHECK-REGS-VP: LV: Selecting VF: vscale x 8.

; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
; CHECK-NOREGS-VP: LV(REG): Cost of 6 from 3 spills of Generic::VectorRC
; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 20 (Estimated cost per lane: 2.5)
; CHECK-NOREGS-VP: LV(REG): Cost of 14 from 7 spills of Generic::VectorRC
; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 39 (Estimated cost per lane: 2.4)
; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4.
; Loop body: a[i] = a[i] + zext(b[i]) with i32 stores, trip count 1024. The
; i8 -> i32 extension means wide VFs need multiple i32 vector registers per
; i8 vector loaded, driving up register pressure at large VFs.
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%gep.a = getelementptr i32, ptr %a, i64 %iv
%load.a = load i32, ptr %gep.a, align 4
%gep.b = getelementptr i8, ptr %b, i64 %iv
%load.b = load i8, ptr %gep.b, align 1
%ext.b = zext i8 %load.b to i32
%add = add i32 %load.a, %ext.b
store i32 %add, ptr %gep.a, align 4
%iv.next = add i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1024
br i1 %exitcond, label %exit, label %loop

exit:
ret void
}

attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
Loading