diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index a013122df5f06..799452094b7b9 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1328,6 +1328,16 @@ class TargetTransformInfo {
   /// \return the target-provided register class name
   LLVM_ABI const char *getRegisterClassName(unsigned ClassID) const;
 
+  /// \return the cost, under \p CostKind, of spilling one register of the
+  /// target-provided register class \p ClassID to the stack.
+  LLVM_ABI InstructionCost
+  getRegisterClassSpillCost(unsigned ClassID, TargetCostKind CostKind) const;
+
+  /// \return the cost, under \p CostKind, of reloading one register of the
+  /// target-provided register class \p ClassID from the stack.
+  LLVM_ABI InstructionCost
+  getRegisterClassReloadCost(unsigned ClassID, TargetCostKind CostKind) const;
+
   enum RegisterKind { RGK_Scalar, RGK_FixedWidthVector, RGK_ScalableVector };
 
   /// \return The width of the largest scalar or vector register type.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 6d27cabf404f8..6cf1aa916c25f 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -628,6 +628,18 @@ class TargetTransformInfoImplBase {
     }
   }
 
+  virtual InstructionCost
+  getRegisterClassSpillCost(unsigned ClassID,
+                            TTI::TargetCostKind CostKind) const {
+    return TTI::TCC_Basic; // Base default; ignores ClassID and CostKind.
+  }
+
+  virtual InstructionCost
+  getRegisterClassReloadCost(unsigned ClassID,
+                             TTI::TargetCostKind CostKind) const {
+    return TTI::TCC_Basic; // Base default; ignores ClassID and CostKind.
+  }
+
   virtual TypeSize
   getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
     return TypeSize::getFixed(32);
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 19785204ed2b3..204738cd714a0 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -815,6 +815,16 @@ const char *TargetTransformInfo::getRegisterClassName(unsigned ClassID) const {
   return TTIImpl->getRegisterClassName(ClassID);
 }
 
+InstructionCost TargetTransformInfo::getRegisterClassSpillCost(
+    unsigned ClassID, TTI::TargetCostKind CostKind) const {
+  return TTIImpl->getRegisterClassSpillCost(ClassID, CostKind);
+}
+
+InstructionCost TargetTransformInfo::getRegisterClassReloadCost(
+    unsigned ClassID, TTI::TargetCostKind CostKind) const {
+  return TTIImpl->getRegisterClassReloadCost(ClassID, CostKind);
+}
+
 TypeSize TargetTransformInfo::getRegisterBitWidth(
     TargetTransformInfo::RegisterKind K) const {
   return TTIImpl->getRegisterBitWidth(K);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 44d4d92d4a7e2..56e94782abd07 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -45,6 +45,7 @@ class OptimizationRemarkEmitter;
 class TargetTransformInfo;
 class TargetLibraryInfo;
 class VPRecipeBuilder;
+struct VPRegisterUsage;
 struct VFRange;
 
 extern cl::opt<bool> EnableVPlanNativePath;
@@ -497,7 +498,7 @@ class LoopVectorizationPlanner {
   ///
   /// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has
   /// been retired.
-  InstructionCost cost(VPlan &Plan, ElementCount VF) const;
+  InstructionCost cost(VPlan &Plan, ElementCount VF, VPRegisterUsage *RU) const;
 
   /// Precompute costs for certain instructions using the legacy cost model. The
   /// function is used to bring up the VPlan-based cost model to initially avoid
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index abac45b265d10..492e716fd6ad2 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4247,13 +4247,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
       if (VF.isScalar())
         continue;
 
-      /// If the register pressure needs to be considered for VF,
-      /// don't consider the VF as valid if it exceeds the number
-      /// of registers for the target.
-      if (CM.shouldConsiderRegPressureForVF(VF) &&
-          RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
-        continue;
-
       InstructionCost C = CM.expectedCost(VF);
 
       // Add on other costs that are modelled in VPlan, but not in the legacy
@@ -4302,6 +4295,10 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
         }
       }
 
+      // Add the cost of any spills due to excess register usage.
+      if (CM.shouldConsiderRegPressureForVF(VF))
+        C += RUs[I].spillCost(CostCtx, ForceTargetNumVectorRegs);
+
       VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
       unsigned Width =
           estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
@@ -4687,13 +4684,16 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
   if (hasFindLastReductionPhi(Plan))
     return 1;
 
+  VPRegisterUsage R =
+      calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
+
   // If we did not calculate the cost for VF (because the user selected the VF)
   // then we calculate the cost of VF here.
   if (LoopCost == 0) {
     if (VF.isScalar())
       LoopCost = CM.expectedCost(VF);
     else
-      LoopCost = cost(Plan, VF);
+      LoopCost = cost(Plan, VF, &R);
     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
 
     // Loop body is free and there is no need for interleaving.
@@ -4701,8 +4701,6 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
       return 1;
   }
 
-  VPRegisterUsage R =
-      calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
   // We divide by these constants so assume that we have at least one
   // instruction that uses at least one register.
   for (auto &Pair : R.MaxLocalUsers) {
@@ -7027,13 +7025,18 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
   return Cost;
 }
 
-InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
-                                               ElementCount VF) const {
+InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF,
+                                               VPRegisterUsage *RU) const {
   VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, PSE, OrigLoop);
   InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
 
   // Now compute and add the VPlan-based cost.
   Cost += Plan.cost(VF, CostCtx);
+
+  // Add the cost of spills due to excess register usage.
+  if (CM.shouldConsiderRegPressureForVF(VF))
+    Cost += RU->spillCost(CostCtx, ForceTargetNumVectorRegs);
+
 #ifndef NDEBUG
   unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
   LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
@@ -7233,9 +7236,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
                                P->vectorFactors().end());
 
     SmallVector<VPRegisterUsage, 8> RUs;
-    if (any_of(VFs, [this](ElementCount VF) {
-          return CM.shouldConsiderRegPressureForVF(VF);
-        }))
+    bool ConsiderRegPressure = any_of(VFs, [this](ElementCount VF) {
+      return CM.shouldConsiderRegPressureForVF(VF);
+    });
+    if (ConsiderRegPressure)
       RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
 
     for (unsigned I = 0; I < VFs.size(); I++) {
@@ -7258,16 +7262,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
         continue;
       }
 
-      InstructionCost Cost = cost(*P, VF);
+      InstructionCost Cost =
+          cost(*P, VF, ConsiderRegPressure ? &RUs[I] : nullptr);
       VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
 
-      if (CM.shouldConsiderRegPressureForVF(VF) &&
-          RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
-        LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
-                          << VF << " because it uses too many registers\n");
-        continue;
-      }
-
       if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
         BestFactor = CurrentFactor;
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 8fbe7d93e6f45..a45b001ebb9ba 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -389,13 +389,28 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
   return Base::properlyDominates(ParentA, ParentB);
 }
 
-bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI,
-                                        unsigned OverrideMaxNumRegs) const {
-  return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) {
-    return LU.second > (OverrideMaxNumRegs > 0
-                            ? OverrideMaxNumRegs
-                            : TTI.getNumberOfRegisters(LU.first));
-  });
+InstructionCost VPRegisterUsage::spillCost(VPCostContext &Ctx,
+                                           unsigned OverrideMaxNumRegs) const {
+  InstructionCost Total;
+  for (const auto &[ClassID, NumUsers] : MaxLocalUsers) {
+    unsigned NumAvail = OverrideMaxNumRegs > 0
+                            ? OverrideMaxNumRegs
+                            : Ctx.TTI.getNumberOfRegisters(ClassID);
+    // Usage that fits in the available registers incurs no spill cost.
+    if (NumUsers <= NumAvail)
+      continue;
+    // Model each register beyond the available ones as one spill + reload.
+    unsigned NumSpills = NumUsers - NumAvail;
+    InstructionCost PerSpill =
+        Ctx.TTI.getRegisterClassSpillCost(ClassID, Ctx.CostKind) +
+        Ctx.TTI.getRegisterClassReloadCost(ClassID, Ctx.CostKind);
+    InstructionCost ClassCost = NumSpills * PerSpill;
+    LLVM_DEBUG(dbgs() << "LV(REG): Cost of " << ClassCost << " from "
+                      << NumSpills << " spills of "
+                      << Ctx.TTI.getRegisterClassName(ClassID) << "\n");
+    Total += ClassCost;
+  }
+  return Total;
+}
 
 SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index dc4be4270f7f1..ab81fca62efee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -30,6 +30,9 @@ class VPlan;
 class Value;
 class TargetTransformInfo;
 class Type;
+class InstructionCost;
+
+struct VPCostContext;
 
 /// An analysis for type-inference for VPValues.
 /// It infers the scalar type for a given VPValue by bottom-up traversing
@@ -79,11 +82,11 @@ struct VPRegisterUsage {
   /// The key is ClassID of target-provided register class.
   SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
 
-  /// Check if any of the tracked live intervals exceeds the number of
-  /// available registers for the target. If non-zero, OverrideMaxNumRegs
+  /// Calculate the estimated cost of any spills due to using more registers
+  /// than the number available for the target. If non-zero, OverrideMaxNumRegs
   /// is used in place of the target's number of registers.
-  bool exceedsMaxNumRegs(const TargetTransformInfo &TTI,
-                         unsigned OverrideMaxNumRegs = 0) const;
+  InstructionCost spillCost(VPCostContext &Ctx,
+                            unsigned OverrideMaxNumRegs = 0) const;
 };
 
 /// Estimate the register usage for \p Plan and vectorization factors in \p VFs
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
index 8109d0683fe71..2addb840d47b9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll
@@ -1,16 +1,31 @@
 ; REQUIRES: asserts
-; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-REGS-VP
-; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NOREGS-VP
+; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=false -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOMAX
+; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-REGS-VP
+; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOREGS-VP
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-none-unknown-elf"
 
+; The use of the dotp instruction means we never have an i32 vector, so we don't
+; get any spills normally, and with a reduced number of registers the number of
+; spills is small enough that it doesn't prevent use of a larger VF.
 define i32 @dotp(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: LV: Checking a loop in 'dotp'
+;
+; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
+; CHECK-NOMAX: LV: Selecting VF: vscale x 4.
+;
+; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
+; CHECK-REGS-VP: Cost for VF vscale x 8: 6 (Estimated cost per lane: 0.8)
+; CHECK-REGS-VP: Cost for VF vscale x 16: 5 (Estimated cost per lane: 0.3)
 ; CHECK-REGS-VP: LV: Selecting VF: vscale x 16.
 ;
-; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 8 because it uses too many registers
-; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
-; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4.
+; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
+; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC
+; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 14 (Estimated cost per lane: 1.8)
+; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC
+; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 13 (Estimated cost per lane: 0.8)
+; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 16.
 entry:
   br label %for.body
 
@@ -24,8 +39,7 @@ for.body:                                         ; preds = %for.body, %entry
   %load.b = load i8, ptr %gep.b, align 1
   %ext.b = zext i8 %load.b to i32
   %mul = mul i32 %ext.b, %ext.a
-  %sub = sub i32 0, %mul
-  %add = add i32 %accum, %sub
+  %add = add i32 %accum, %mul
   %iv.next = add i64 %iv, 1
   %exitcond.not = icmp eq i64 %iv.next, 1024
   br i1 %exitcond.not, label %for.exit, label %for.body
@@ -34,4 +48,70 @@ for.exit:                        ; preds = %for.body
   ret i32 %add
 }
 
+; The largest type used in the loop is small enough that we already consider all
+; VFs and maximize-bandwidth does nothing.
+define void @type_too_small(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: LV: Checking a loop in 'type_too_small'
+; CHECK: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
+; CHECK: Cost for VF vscale x 8: 6 (Estimated cost per lane: 0.8)
+; CHECK: Cost for VF vscale x 16: 6 (Estimated cost per lane: 0.4)
+; CHECK: LV: Selecting VF: vscale x 16.
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.a = getelementptr i8, ptr %a, i64 %iv
+  %load.a = load i8, ptr %gep.a, align 1
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %add = add i8 %load.a, %load.b
+  store i8 %add, ptr %gep.a, align 1
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; With a reduced number of registers the spills from high pressure are enough
+; that we use the same VF as if we hadn't maximized the bandwidth.
+define void @high_pressure(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: LV: Checking a loop in 'high_pressure'
+;
+; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
+; CHECK-NOMAX: LV: Selecting VF: vscale x 4.
+;
+; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
+; CHECK-REGS-VP: Cost for VF vscale x 8: 10 (Estimated cost per lane: 1.2)
+; CHECK-REGS-VP: Cost for VF vscale x 16: 21 (Estimated cost per lane: 1.3)
+; CHECK-REGS-VP: LV: Selecting VF: vscale x 8.
+;
+; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
+; CHECK-NOREGS-VP: LV(REG): Cost of 6 from 3 spills of Generic::VectorRC
+; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 20 (Estimated cost per lane: 2.5)
+; CHECK-NOREGS-VP: LV(REG): Cost of 14 from 7 spills of Generic::VectorRC
+; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 39 (Estimated cost per lane: 2.4)
+; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4.
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.a = getelementptr i32, ptr %a, i64 %iv
+  %load.a = load i32, ptr %gep.a, align 4
+  %gep.b = getelementptr i8, ptr %b, i64 %iv
+  %load.b = load i8, ptr %gep.b, align 1
+  %ext.b = zext i8 %load.b to i32
+  %add = add i32 %load.a, %ext.b
+  store i32 %add, ptr %gep.a, align 4
+  %iv.next = add i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
 attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-spills.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-spills.ll
new file mode 100644
index 0000000000000..59b42990ee4e4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-spills.ll
@@ -0,0 +1,266 @@
+; RUN: opt -mcpu=cortex-m55 -passes=loop-vectorize -disable-output -debug-only=loop-vectorize,vplan -vectorizer-consider-reg-pressure=false %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOPRESSURE
+; RUN: opt -mcpu=cortex-m55 -passes=loop-vectorize -disable-output -debug-only=loop-vectorize,vplan -vectorizer-consider-reg-pressure=true %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PRESSURE
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-unknown-none-eabihf"
+
+; In this function the spills make it not profitable to vectorize if considering
+; register pressure.
+define void @spills_not_profitable(ptr %in1, ptr %in2, ptr %out, i32 %n) {
+; CHECK-LABEL: LV: Checking a loop in 'spills_not_profitable'
+; CHECK: LV: Scalar loop costs: 86
+; CHECK-NOPRESSURE: Cost for VF 2: 394 (Estimated cost per lane: 197.0)
+; CHECK-NOPRESSURE: Cost for VF 4: 338 (Estimated cost per lane: 84.5)
+; CHECK-NOPRESSURE: LV: Selecting VF: 4
+; CHECK-PRESSURE: LV(REG): Cost of 50 from 25 spills of Generic::VectorRC
+; CHECK-PRESSURE-NEXT: Cost for VF 2: 444 (Estimated cost per lane: 222.0)
+; CHECK-PRESSURE: LV(REG): Cost of 50 from 25 spills of Generic::VectorRC
+; CHECK-PRESSURE-NEXT: Cost for VF 4: 388 (Estimated cost per lane: 97.0)
+; CHECK-PRESSURE: LV: Selecting VF: 1
+entry:
+  %cmp = icmp eq i32 %n, 0
+  br i1 %cmp, label %exit, label %for.body
+
+for.body:
+  %i = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+  %x4 = phi float [ %x4.next, %for.body ], [ 0.000000e+00, %entry ]
+  %x3 = phi float [ %x3.next, %for.body ], [ 0.000000e+00, %entry ]
+  %x2 = phi float [ %x2.next, %for.body ], [ 0.000000e+00, %entry ]
+  %x1 = phi float [ %x1.next, %for.body ], [ 0.000000e+00, %entry ]
+  %x0 = phi float [ %x0.next, %for.body ], [ 0.000000e+00, %entry ]
+  %acc7 = phi float [ %acc7.next, %for.body ], [ 0.000000e+00, %entry ]
+  %acc6 = phi float [ %acc6.next, %for.body ], [ 0.000000e+00, %entry ]
+  %acc5 = phi float [ %acc5.next, %for.body ], [ 0.000000e+00, %entry ]
+  %acc4 = phi float [ %acc4.next, %for.body ], [ 0.000000e+00, %entry ]
+  %acc3 = phi float [ %acc3.next, %for.body ], [ 0.000000e+00, %entry ]
+  %acc2 = phi float [ %acc2.next, %for.body ], [ 0.000000e+00, %entry ]
+  %acc1 = phi float [ %acc1.next, %for.body ], [ 0.000000e+00, %entry ]
+  %acc0 = phi float [ %acc0.next, %for.body ], [ 0.000000e+00, %entry ]
+  %in1.addr = phi ptr [ %in1.addr.next, %for.body ], [ %in1, %entry ]
+  %in2.addr = phi ptr [ %in2.addr.next, %for.body ], [ %in2, %entry ]
+  %incdec.ptr = getelementptr inbounds nuw i8, ptr %in1.addr, i32 4
+  %0 = load float, ptr %in1.addr, align 4
+  %incdec.ptr1 = getelementptr inbounds nuw i8, ptr %in2.addr, i32 4
+  %1 = load float, ptr %in2.addr, align 4
+  %mul = fmul fast float %0, %x0
+  %add = fadd fast float %mul, %acc0
+  %mul2 = fmul fast float %0, %x1
+  %add3 = fadd fast float %mul2, %acc1
+  %mul4 = fmul fast float %0, %x2
+  %add5 = fadd fast float %mul4, %acc2
+  %mul6 = fmul fast float %0, %x3
+  %add7 = fadd fast float %mul6, %acc3
+  %mul8 = fmul fast float %0, %x4
+  %add9 = fadd fast float %mul8, %acc4
+  %mul10 = fmul fast float %1, %0
+  %add11 = fadd fast float %mul10, %acc7
+  %incdec.ptr12 = getelementptr inbounds nuw i8, ptr %in1.addr, i32 8
+  %2 = load float, ptr %incdec.ptr, align 4
+  %incdec.ptr13 = getelementptr inbounds nuw i8, ptr %in2.addr, i32 8
+  %x0.next = load float, ptr %incdec.ptr1, align 4
+  %mul14 = fmul fast float %2, %x1
+  %add15 = fadd fast float %add, %mul14
+  %mul16 = fmul fast float %2, %x2
+  %add17 = fadd fast float %add3, %mul16
+  %mul18 = fmul fast float %2, %x3
+  %add19 = fadd fast float %add5, %mul18
+  %mul20 = fmul fast float %2, %x4
+  %add21 = fadd fast float %add7, %mul20
+  %mul22 = fmul fast float %2, %1
+  %add23 = fadd fast float %mul22, %acc6
+  %mul24 = fmul fast float %x0.next, %2
+  %add25 = fadd fast float %add11, %mul24
+  %incdec.ptr26 = getelementptr inbounds nuw i8, ptr %in1.addr, i32 12
+  %4 = load float, ptr %incdec.ptr12, align 4
+  %incdec.ptr27 = getelementptr inbounds nuw i8, ptr %in2.addr, i32 12
+  %x1.next = load float, ptr %incdec.ptr13, align 4
+  %mul28 = fmul fast float %4, %x2
+  %add29 = fadd fast float %add15, %mul28
+  %mul30 = fmul fast float %4, %x3
+  %add31 = fadd fast float %add17, %mul30
+  %mul32 = fmul fast float %4, %x4
+  %add33 = fadd fast float %add19, %mul32
+  %mul34 = fmul fast float %4, %1
+  %add35 = fadd fast float %mul34, %acc5
+  %mul36 = fmul fast float %4, %x0.next
+  %add37 = fadd fast float %add23, %mul36
+  %mul38 = fmul fast float %x1.next, %4
+  %add39 = fadd fast float %add25, %mul38
+  %incdec.ptr40 = getelementptr inbounds nuw i8, ptr %in1.addr, i32 16
+  %6 = load float, ptr %incdec.ptr26, align 4
+  %incdec.ptr41 = getelementptr inbounds nuw i8, ptr %in2.addr, i32 16
+  %x2.next = load float, ptr %incdec.ptr27, align 4
+  %mul42 = fmul fast float %6, %x3
+  %add43 = fadd fast float %add29, %mul42
+  %mul44 = fmul fast float %6, %x4
+  %acc1.next = fadd fast float %add31, %mul44
+  %mul46 = fmul fast float %6, %1
+  %add47 = fadd fast float %add9, %mul46
+  %mul48 = fmul fast float %6, %x0.next
+  %add49 = fadd fast float %add35, %mul48
+  %mul50 = fmul fast float %6, %x1.next
+  %add51 = fadd fast float %add37, %mul50
+  %mul52 = fmul fast float %x2.next, %6
+  %add53 = fadd fast float %add39, %mul52
+  %incdec.ptr54 = getelementptr inbounds nuw i8, ptr %in1.addr, i32 20
+  %8 = load float, ptr %incdec.ptr40, align 4
+  %incdec.ptr55 = getelementptr inbounds nuw i8, ptr %in2.addr, i32 20
+  %x3.next = load float, ptr %incdec.ptr41, align 4
+  %mul56 = fmul fast float %8, %x4
+  %acc0.next = fadd fast float %add43, %mul56
+  %mul58 = fmul fast float %8, %1
+  %add59 = fadd fast float %add21, %mul58
+  %mul60 = fmul fast float %8, %x0.next
+  %add61 = fadd fast float %add47, %mul60
+  %mul62 = fmul fast float %8, %x1.next
+  %add63 = fadd fast float %add49, %mul62
+  %mul64 = fmul fast float %8, %x2.next
+  %add65 = fadd fast float %add51, %mul64
+  %mul66 = fmul fast float %x3.next, %8
+  %add67 = fadd fast float %add53, %mul66
+  %in1.addr.next = getelementptr inbounds nuw i8, ptr %in1.addr, i32 24
+  %10 = load float, ptr %incdec.ptr54, align 4
+  %in2.addr.next = getelementptr inbounds nuw i8, ptr %in2.addr, i32 24
+  %x4.next = load float, ptr %incdec.ptr55, align 4
+  %mul70 = fmul fast float %10, %1
+  %acc2.next = fadd fast float %add33, %mul70
+  %mul72 = fmul fast float %10, %x0.next
+  %acc3.next = fadd fast float %add59, %mul72
+  %mul74 = fmul fast float %10, %x1.next
+  %acc4.next = fadd fast float %add61, %mul74
+  %mul76 = fmul fast float %10, %x2.next
+  %acc5.next = fadd fast float %add63, %mul76
+  %mul78 = fmul fast float %10, %x3.next
+  %acc6.next = fadd fast float %add65, %mul78
+  %mul80 = fmul fast float %x4.next, %10
+  %acc7.next = fadd fast float %add67, %mul80
+  %inc = add nuw i32 %i, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:
+  %acc0.exit = phi float [ 0.000000e+00, %entry ], [ %acc0.next, %for.body ]
+  %acc1.exit = phi float [ 0.000000e+00, %entry ], [ %acc1.next, %for.body ]
+  %acc2.exit = phi float [ 0.000000e+00, %entry ], [ %acc2.next, %for.body ]
+  %acc3.exit = phi float [ 0.000000e+00, %entry ], [ %acc3.next, %for.body ]
+  %acc4.exit = phi float [ 0.000000e+00, %entry ], [ %acc4.next, %for.body ]
+  %acc5.exit = phi float [ 0.000000e+00, %entry ], [ %acc5.next, %for.body ]
+  %acc6.exit = phi float [ 0.000000e+00, %entry ], [ %acc6.next, %for.body ]
+  %acc7.exit = phi float [ 0.000000e+00, %entry ], [ %acc7.next, %for.body ]
+  store float %acc0.exit, ptr %out, align 4
+  %arrayidx82 = getelementptr inbounds nuw i8, ptr %out, i32 4
+  store float %acc1.exit, ptr %arrayidx82, align 4
+  %arrayidx83 = getelementptr inbounds nuw i8, ptr %out, i32 8
+  store float %acc2.exit, ptr %arrayidx83, align 4
+  %arrayidx84 = getelementptr inbounds nuw i8, ptr %out, i32 12
+  store float %acc3.exit, ptr %arrayidx84, align 4
+  %arrayidx85 = getelementptr inbounds nuw i8, ptr %out, i32 16
+  store float %acc4.exit, ptr %arrayidx85, align 4
+  %arrayidx86 = getelementptr inbounds nuw i8, ptr %out, i32 20
+  store float %acc5.exit, ptr %arrayidx86, align 4
+  %arrayidx87 = getelementptr inbounds nuw i8, ptr %out, i32 24
+  store float %acc6.exit, ptr %arrayidx87, align 4
+  %arrayidx88 = getelementptr inbounds nuw i8, ptr %out, i32 28
+  store float %acc7.exit, ptr %arrayidx88, align 4
+  ret void
+}
+
+; In this function we have spills but it is still profitable to vectorize when
+; considering register pressure.
+define void @spills_profitable(ptr %in1, ptr %in2, ptr %out, i32 %n, i32 %m) {
+; CHECK-LABEL: LV: Checking a loop in 'spills_profitable'
+; CHECK: LV: Scalar loop costs: 54
+; CHECK-NOPRESSURE: Cost for VF 2: 1530 (Estimated cost per lane: 765.0)
+; CHECK-NOPRESSURE: Cost for VF 4: 38 (Estimated cost per lane: 9.5)
+; CHECK-PRESSURE: LV(REG): Cost of 4 from 2 spills of Generic::ScalarRC
+; CHECK-PRESSURE-NEXT: Cost for VF 2: 1534 (Estimated cost per lane: 767.0)
+; CHECK-PRESSURE: LV(REG): Cost of 6 from 3 spills of Generic::VectorRC
+; CHECK-PRESSURE-NEXT: Cost for VF 4: 44 (Estimated cost per lane: 11.0)
+; CHECK: LV: Selecting VF: 4
+entry:
+  %cmp = icmp eq i32 %n, 0
+  br i1 %cmp, label %exit, label %for.body.preheader
+
+for.body.preheader:
+  %add.ptr3.idx = mul i32 %m, 12
+  %add.ptr3 = getelementptr inbounds nuw i8, ptr %in1, i32 %add.ptr3.idx
+  %add.ptr1.idx = shl i32 %m, 3
+  %add.ptr1 = getelementptr inbounds nuw i8, ptr %in1, i32 %add.ptr1.idx
+  %add.ptr = getelementptr inbounds nuw i32, ptr %in1, i32 %m
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %acc3 = phi i64 [ %acc3.next, %for.body ], [ 0, %for.body.preheader ]
+  %acc2 = phi i64 [ %acc2.next, %for.body ], [ 0, %for.body.preheader ]
+  %acc1 = phi i64 [ %acc1.next, %for.body ], [ 0, %for.body.preheader ]
+  %acc0 = phi i64 [ %acc0.next, %for.body ], [ 0, %for.body.preheader ]
+  %in2.addr = phi ptr [ %in2.addr.next, %for.body ], [ %in2, %for.body.preheader ]
+  %px3 = phi ptr [ %px3.next, %for.body ], [ %add.ptr3, %for.body.preheader ]
+  %px2 = phi ptr [ %px2.next, %for.body ], [ %add.ptr1, %for.body.preheader ]
+  %px1 = phi ptr [ %px1.next, %for.body ], [ %add.ptr, %for.body.preheader ]
+  %px0 = phi ptr [ %px0.next, %for.body ], [ %in1, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds nuw i8, ptr %in2.addr, i32 4
+  %0 = load i32, ptr %in2.addr, align 4
+  %incdec.ptr4 = getelementptr inbounds nuw i8, ptr %px0, i32 4
+  %1 = load i32, ptr %px0, align 4
+  %incdec.ptr5 = getelementptr inbounds nuw i8, ptr %px1, i32 4
+  %2 = load i32, ptr %px1, align 4
+  %incdec.ptr6 = getelementptr inbounds nuw i8, ptr %px2, i32 4
+  %3 = load i32, ptr %px2, align 4
+  %incdec.ptr7 = getelementptr inbounds nuw i8, ptr %px3, i32 4
+  %4 = load i32, ptr %px3, align 4
+  %conv = sext i32 %1 to i64
+  %conv8 = sext i32 %0 to i64
+  %mul9 = mul nsw i64 %conv, %conv8
+  %add = add nsw i64 %mul9, %acc0
+  %conv10 = sext i32 %2 to i64
+  %mul12 = mul nsw i64 %conv10, %conv8
+  %add13 = add nsw i64 %mul12, %acc1
+  %conv14 = sext i32 %3 to i64
+  %mul16 = mul nsw i64 %conv14, %conv8
+  %add17 = add nsw i64 %mul16, %acc2
+  %conv18 = sext i32 %4 to i64
+  %mul20 = mul nsw i64 %conv18, %conv8
+  %add21 = add nsw i64 %mul20, %acc3
+  %in2.addr.next = getelementptr inbounds nuw i8, ptr %in2.addr, i32 8
+  %5 = load i32, ptr %incdec.ptr, align 4
+  %px0.next = getelementptr inbounds nuw i8, ptr %px0, i32 8
+  %6 = load i32, ptr %incdec.ptr4, align 4
+  %px1.next = getelementptr inbounds nuw i8, ptr %px1, i32 8
+  %7 = load i32, ptr %incdec.ptr5, align 4
+  %px2.next = getelementptr inbounds nuw i8, ptr %px2, i32 8
+  %8 = load i32, ptr %incdec.ptr6, align 4
+  %px3.next = getelementptr inbounds nuw i8, ptr %px3, i32 8
+  %9 = load i32, ptr %incdec.ptr7, align 4
+  %conv27 = sext i32 %6 to i64
+  %conv28 = sext i32 %5 to i64
+  %mul29 = mul nsw i64 %conv27, %conv28
+  %acc0.next = add nsw i64 %add, %mul29
+  %conv31 = sext i32 %7 to i64
+  %mul33 = mul nsw i64 %conv31, %conv28
+  %acc1.next = add nsw i64 %add13, %mul33
+  %conv35 = sext i32 %8 to i64
+  %mul37 = mul nsw i64 %conv35, %conv28
+  %acc2.next = add nsw i64 %add17, %mul37
+  %conv39 = sext i32 %9 to i64
+  %mul41 = mul nsw i64 %conv39, %conv28
+  %acc3.next = add nsw i64 %add21, %mul41
+  %inc = add nuw nsw i32 %i, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:
+  %acc0.exit = phi i64 [ 0, %entry ], [ %acc0.next, %for.body ]
+  %acc1.exit = phi i64 [ 0, %entry ], [ %acc1.next, %for.body ]
+  %acc2.exit = phi i64 [ 0, %entry ], [ %acc2.next, %for.body ]
+  %acc3.exit = phi i64 [ 0, %entry ], [ %acc3.next, %for.body ]
+  store i64 %acc0.exit, ptr %out, align 8
+  %arrayidx43 = getelementptr inbounds nuw i8, ptr %out, i32 8
+  store i64 %acc1.exit, ptr %arrayidx43, align 8
+  %arrayidx44 = getelementptr inbounds nuw i8, ptr %out, i32 16
+  store i64 %acc2.exit, ptr %arrayidx44, align 8
+  %arrayidx45 = getelementptr inbounds nuw i8, ptr %out, i32 24
+  store i64 %acc3.exit, ptr %arrayidx45, align 8
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll
index de49337c185ac..3654e82423317 100644
--- a/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll
+++ b/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll
@@ -13,14 +13,14 @@ define void @bar(ptr %A, i32 signext %n) {
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::FPRRC, 1 registers
 ; CHECK-SCALAR-NEXT: LV(REG): Found invariant usage: 1 item
 ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers
-; CHECK-SCALAR-NEXT: LV: The target has 30 registers of LoongArch::GPRRC register class
+; CHECK-SCALAR:      LV: The target has 30 registers of LoongArch::GPRRC register class
 ; CHECK-SCALAR-NEXT: LV: The target has 32 registers of LoongArch::FPRRC register class
 ; CHECK-VECTOR:      LV(REG): Found max usage: 2 item
 ; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 2 registers
 ; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::VRRC, 2 registers
 ; CHECK-VECTOR-NEXT: LV(REG): Found invariant usage: 1 item
 ; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers
-; CHECK-VECTOR-NEXT: LV: The target has 30 registers of LoongArch::GPRRC register class
+; CHECK-VECTOR:      LV: The target has 30 registers of LoongArch::GPRRC register class
 ; CHECK-VECTOR-NEXT: LV: The target has 32 registers of LoongArch::VRRC register class
 
 entry: