diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index a013122df5f06..799452094b7b9 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1328,6 +1328,16 @@ class TargetTransformInfo { /// \return the target-provided register class name LLVM_ABI const char *getRegisterClassName(unsigned ClassID) const; + /// \return the cost, for \p CostKind, of spilling one register of the + /// target-provided register class \p ClassID to the stack. + LLVM_ABI InstructionCost + getRegisterClassSpillCost(unsigned ClassID, TargetCostKind CostKind) const; + + /// \return the cost, for \p CostKind, of reloading one register of the + /// target-provided register class \p ClassID from the stack. + LLVM_ABI InstructionCost + getRegisterClassReloadCost(unsigned ClassID, TargetCostKind CostKind) const; + enum RegisterKind { RGK_Scalar, RGK_FixedWidthVector, RGK_ScalableVector }; /// \return The width of the largest scalar or vector register type. 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 6d27cabf404f8..6cf1aa916c25f 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -628,6 +628,18 @@ class TargetTransformInfoImplBase { } } + virtual InstructionCost + getRegisterClassSpillCost(unsigned ClassID, + TTI::TargetCostKind CostKind) const { + return TTI::TCC_Basic; + } + + virtual InstructionCost + getRegisterClassReloadCost(unsigned ClassID, + TTI::TargetCostKind CostKind) const { + return TTI::TCC_Basic; + } + virtual TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { return TypeSize::getFixed(32); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 19785204ed2b3..204738cd714a0 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -815,6 +815,16 @@ const char *TargetTransformInfo::getRegisterClassName(unsigned ClassID) const { return TTIImpl->getRegisterClassName(ClassID); } +InstructionCost TargetTransformInfo::getRegisterClassSpillCost( + unsigned ClassID, TTI::TargetCostKind CostKind) const { + return TTIImpl->getRegisterClassSpillCost(ClassID, CostKind); +} + +InstructionCost TargetTransformInfo::getRegisterClassReloadCost( + unsigned ClassID, TTI::TargetCostKind CostKind) const { + return TTIImpl->getRegisterClassReloadCost(ClassID, CostKind); +} + TypeSize TargetTransformInfo::getRegisterBitWidth( TargetTransformInfo::RegisterKind K) const { return TTIImpl->getRegisterBitWidth(K); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 44d4d92d4a7e2..56e94782abd07 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -45,6 +45,7 @@ class OptimizationRemarkEmitter; 
class TargetTransformInfo; class TargetLibraryInfo; class VPRecipeBuilder; +struct VPRegisterUsage; struct VFRange; extern cl::opt EnableVPlanNativePath; @@ -497,7 +498,7 @@ class LoopVectorizationPlanner { /// /// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has /// been retired. - InstructionCost cost(VPlan &Plan, ElementCount VF) const; + InstructionCost cost(VPlan &Plan, ElementCount VF, VPRegisterUsage *RU) const; /// Precompute costs for certain instructions using the legacy cost model. The /// function is used to bring up the VPlan-based cost model to initially avoid diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index abac45b265d10..492e716fd6ad2 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4247,13 +4247,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { if (VF.isScalar()) continue; - /// If the register pressure needs to be considered for VF, - /// don't consider the VF as valid if it exceeds the number - /// of registers for the target. 
- if (CM.shouldConsiderRegPressureForVF(VF) && - RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) - continue; - InstructionCost C = CM.expectedCost(VF); // Add on other costs that are modelled in VPlan, but not in the legacy @@ -4302,6 +4295,10 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { } } + // Add the cost of any spills due to excess register usage + if (CM.shouldConsiderRegPressureForVF(VF)) + C += RUs[I].spillCost(CostCtx, ForceTargetNumVectorRegs); + VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost); unsigned Width = estimateElementCount(Candidate.Width, CM.getVScaleForTuning()); @@ -4687,13 +4684,16 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, if (hasFindLastReductionPhi(Plan)) return 1; + VPRegisterUsage R = + calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0]; + // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. if (LoopCost == 0) { if (VF.isScalar()) LoopCost = CM.expectedCost(VF); else - LoopCost = cost(Plan, VF); + LoopCost = cost(Plan, VF, &R); assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); // Loop body is free and there is no need for interleaving. @@ -4701,8 +4701,6 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF, return 1; } - VPRegisterUsage R = - calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. 
for (auto &Pair : R.MaxLocalUsers) { @@ -7027,13 +7025,18 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF, return Cost; } -InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, - ElementCount VF) const { +InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF, + VPRegisterUsage *RU) const { VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, PSE, OrigLoop); InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx); // Now compute and add the VPlan-based cost. Cost += Plan.cost(VF, CostCtx); + + // Add the spill cost from excess register usage; RU may be null if unused. + if (RU && CM.shouldConsiderRegPressureForVF(VF)) + Cost += RU->spillCost(CostCtx, ForceTargetNumVectorRegs); + #ifndef NDEBUG unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning()); LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost @@ -7233,9 +7236,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { P->vectorFactors().end()); SmallVector RUs; - if (any_of(VFs, [this](ElementCount VF) { - return CM.shouldConsiderRegPressureForVF(VF); - })) + bool ConsiderRegPressure = any_of(VFs, [this](ElementCount VF) { + return CM.shouldConsiderRegPressureForVF(VF); + }); + if (ConsiderRegPressure) RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore); for (unsigned I = 0; I < VFs.size(); I++) { @@ -7258,16 +7262,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { continue; } - InstructionCost Cost = cost(*P, VF); + InstructionCost Cost = + cost(*P, VF, ConsiderRegPressure ? 
&RUs[I] : nullptr); VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); - if (CM.shouldConsiderRegPressureForVF(VF) && - RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) { - LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width " - << VF << " because it uses too many registers\n"); - continue; - } - if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail())) BestFactor = CurrentFactor; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 8fbe7d93e6f45..a45b001ebb9ba 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -389,13 +389,28 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A, return Base::properlyDominates(ParentA, ParentB); } -bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI, - unsigned OverrideMaxNumRegs) const { - return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) { - return LU.second > (OverrideMaxNumRegs > 0 - ? OverrideMaxNumRegs - : TTI.getNumberOfRegisters(LU.first)); - }); +InstructionCost VPRegisterUsage::spillCost(VPCostContext &Ctx, + unsigned OverrideMaxNumRegs) const { + InstructionCost Cost; + for (const auto &[RegClass, MaxUsers] : MaxLocalUsers) { + unsigned AvailableRegs = OverrideMaxNumRegs > 0 + ? OverrideMaxNumRegs + : Ctx.TTI.getNumberOfRegisters(RegClass); + if (MaxUsers > AvailableRegs) { + // Assume that for each register used past what's available we get one + // spill and reload. 
+ unsigned Spills = MaxUsers - AvailableRegs; + InstructionCost SpillCost = + Ctx.TTI.getRegisterClassSpillCost(RegClass, Ctx.CostKind) + + Ctx.TTI.getRegisterClassReloadCost(RegClass, Ctx.CostKind); + InstructionCost TotalCost = Spills * SpillCost; + LLVM_DEBUG(dbgs() << "LV(REG): Cost of " << TotalCost << " from " + << Spills << " spills of " + << Ctx.TTI.getRegisterClassName(RegClass) << "\n"); + Cost += TotalCost; + } + } + return Cost; } SmallVector llvm::calculateRegisterUsageForPlan( diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h index dc4be4270f7f1..ab81fca62efee 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h @@ -30,6 +30,9 @@ class VPlan; class Value; class TargetTransformInfo; class Type; +class InstructionCost; + +struct VPCostContext; /// An analysis for type-inference for VPValues. /// It infers the scalar type for a given VPValue by bottom-up traversing @@ -79,11 +82,11 @@ struct VPRegisterUsage { /// The key is ClassID of target-provided register class. SmallMapVector MaxLocalUsers; - /// Check if any of the tracked live intervals exceeds the number of - /// available registers for the target. If non-zero, OverrideMaxNumRegs + /// Calculate the estimated cost of any spills due to using more registers + /// than the number available for the target. If non-zero, OverrideMaxNumRegs /// is used in place of the target's number of registers. 
- bool exceedsMaxNumRegs(const TargetTransformInfo &TTI, - unsigned OverrideMaxNumRegs = 0) const; + InstructionCost spillCost(VPCostContext &Ctx, + unsigned OverrideMaxNumRegs = 0) const; }; /// Estimate the register usage for \p Plan and vectorization factors in \p VFs diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll index 8109d0683fe71..2addb840d47b9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll @@ -1,16 +1,31 @@ ; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-REGS-VP -; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NOREGS-VP +; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=false -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOMAX +; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-REGS-VP +; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOREGS-VP target datalayout = 
"e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-unknown-elf" +; The use of the dotp instruction means we never have an i32 vector, so we don't +; get any spills normally and with a reduced number of registers the number of +; spills is small enough that it doesn't prevent use of a larger VF. define i32 @dotp(ptr %a, ptr %b) #0 { +; CHECK-LABEL: LV: Checking a loop in 'dotp' +; +; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5) +; CHECK-NOMAX: LV: Selecting VF: vscale x 4. +; +; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5) +; CHECK-REGS-VP: Cost for VF vscale x 8: 6 (Estimated cost per lane: 0.8) +; CHECK-REGS-VP: Cost for VF vscale x 16: 5 (Estimated cost per lane: 0.3) ; CHECK-REGS-VP: LV: Selecting VF: vscale x 16. ; -; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 8 because it uses too many registers -; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers -; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4. +; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5) +; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC +; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 14 (Estimated cost per lane: 1.8) +; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC +; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 13 (Estimated cost per lane: 0.8) +; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 16. 
entry: br label %for.body @@ -24,8 +39,7 @@ for.body: ; preds = %for.body, %entry %load.b = load i8, ptr %gep.b, align 1 %ext.b = zext i8 %load.b to i32 %mul = mul i32 %ext.b, %ext.a - %sub = sub i32 0, %mul - %add = add i32 %accum, %sub + %add = add i32 %accum, %mul %iv.next = add i64 %iv, 1 %exitcond.not = icmp eq i64 %iv.next, 1024 br i1 %exitcond.not, label %for.exit, label %for.body @@ -34,4 +48,70 @@ for.exit: ; preds = %for.body ret i32 %add } +; The largest type used in the loop is small enough that we already consider all +; VFs and maximize-bandwidth does nothing. +define void @type_too_small(ptr %a, ptr %b) #0 { +; CHECK-LABEL: LV: Checking a loop in 'type_too_small' +; CHECK: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5) +; CHECK: Cost for VF vscale x 8: 6 (Estimated cost per lane: 0.8) +; CHECK: Cost for VF vscale x 16: 6 (Estimated cost per lane: 0.4) +; CHECK: LV: Selecting VF: vscale x 16. +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %add = add i8 %load.a, %load.b + store i8 %add, ptr %gep.a, align 1 + %iv.next = add i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1024 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + +; With reduced number of registers the spills from high pressure are enough that +; we use the same VF as if we hadn't maximized the bandwidth. +define void @high_pressure(ptr %a, ptr %b) #0 { +; CHECK-LABEL: LV: Checking a loop in 'high_pressure' +; +; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5) +; CHECK-NOMAX: LV: Selecting VF: vscale x 4. 
+; +; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5) +; CHECK-REGS-VP: Cost for VF vscale x 8: 10 (Estimated cost per lane: 1.2) +; CHECK-REGS-VP: Cost for VF vscale x 16: 21 (Estimated cost per lane: 1.3) +; CHECK-REGS-VP: LV: Selecting VF: vscale x 8. + +; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5) +; CHECK-NOREGS-VP: LV(REG): Cost of 6 from 3 spills of Generic::VectorRC +; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 20 (Estimated cost per lane: 2.5) +; CHECK-NOREGS-VP: LV(REG): Cost of 14 from 7 spills of Generic::VectorRC +; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 39 (Estimated cost per lane: 2.4) +; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4. +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.a = getelementptr i32, ptr %a, i64 %iv + %load.a = load i32, ptr %gep.a, align 4 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %add = add i32 %load.a, %ext.b + store i32 %add, ptr %gep.a, align 4 + %iv.next = add i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1024 + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} + attributes #0 = { vscale_range(1,16) "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-spills.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-spills.ll new file mode 100644 index 0000000000000..59b42990ee4e4 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reg-pressure-spills.ll @@ -0,0 +1,266 @@ +; RUN: opt -mcpu=cortex-m55 -passes=loop-vectorize -disable-output -debug-only=loop-vectorize,vplan -vectorizer-consider-reg-pressure=false %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOPRESSURE +; RUN: opt -mcpu=cortex-m55 -passes=loop-vectorize -disable-output -debug-only=loop-vectorize,vplan -vectorizer-consider-reg-pressure=true %s 2>&1 | FileCheck %s 
--check-prefixes=CHECK,CHECK-PRESSURE + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m.main-unknown-none-eabihf" + +; In this function the spills make it not profitable to vectorize if considering +; register pressure. +define void @spills_not_profitable(ptr %in1, ptr %in2, ptr %out, i32 %n) { +; CHECK-LABEL: LV: Checking a loop in 'spills_not_profitable' +; CHECK: LV: Scalar loop costs: 86 +; CHECK-NOPRESSURE: Cost for VF 2: 394 (Estimated cost per lane: 197.0) +; CHECK-NOPRESSURE: Cost for VF 4: 338 (Estimated cost per lane: 84.5) +; CHECK-NOPRESSURE: LV: Selecting VF: 4 +; CHECK-PRESSURE: LV(REG): Cost of 50 from 25 spills of Generic::VectorRC +; CHECK-PRESSURE-NEXT: Cost for VF 2: 444 (Estimated cost per lane: 222.0) +; CHECK-PRESSURE: LV(REG): Cost of 50 from 25 spills of Generic::VectorRC +; CHECK-PRESSURE-NEXT: Cost for VF 4: 388 (Estimated cost per lane: 97.0) +; CHECK-PRESSURE: LV: Selecting VF: 1 +entry: + %cmp = icmp eq i32 %n, 0 + br i1 %cmp, label %exit, label %for.body + +for.body: + %i = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %x4 = phi float [ %x4.next, %for.body ], [ 0.000000e+00, %entry ] + %x3 = phi float [ %x3.next, %for.body ], [ 0.000000e+00, %entry ] + %x2 = phi float [ %x2.next, %for.body ], [ 0.000000e+00, %entry ] + %x1 = phi float [ %x1.next, %for.body ], [ 0.000000e+00, %entry ] + %x0 = phi float [ %x0.next, %for.body ], [ 0.000000e+00, %entry ] + %acc7 = phi float [ %acc7.next, %for.body ], [ 0.000000e+00, %entry ] + %acc6 = phi float [ %acc6.next, %for.body ], [ 0.000000e+00, %entry ] + %acc5 = phi float [ %acc5.next, %for.body ], [ 0.000000e+00, %entry ] + %acc4 = phi float [ %acc4.next, %for.body ], [ 0.000000e+00, %entry ] + %acc3 = phi float [ %acc3.next, %for.body ], [ 0.000000e+00, %entry ] + %acc2 = phi float [ %acc2.next, %for.body ], [ 0.000000e+00, %entry ] + %acc1 = phi float [ %acc1.next, %for.body ], [ 0.000000e+00, %entry ] + %acc0 = phi float [ %acc0.next, 
%for.body ], [ 0.000000e+00, %entry ] + %in1.addr = phi ptr [ %in1.addr.next, %for.body ], [ %in1, %entry ] + %in2.addr = phi ptr [ %in2.addr.next, %for.body ], [ %in2, %entry ] + %incdec.ptr = getelementptr inbounds nuw i8, ptr %in1.addr, i32 4 + %0 = load float, ptr %in1.addr, align 4 + %incdec.ptr1 = getelementptr inbounds nuw i8, ptr %in2.addr, i32 4 + %1 = load float, ptr %in2.addr, align 4 + %mul = fmul fast float %0, %x0 + %add = fadd fast float %mul, %acc0 + %mul2 = fmul fast float %0, %x1 + %add3 = fadd fast float %mul2, %acc1 + %mul4 = fmul fast float %0, %x2 + %add5 = fadd fast float %mul4, %acc2 + %mul6 = fmul fast float %0, %x3 + %add7 = fadd fast float %mul6, %acc3 + %mul8 = fmul fast float %0, %x4 + %add9 = fadd fast float %mul8, %acc4 + %mul10 = fmul fast float %1, %0 + %add11 = fadd fast float %mul10, %acc7 + %incdec.ptr12 = getelementptr inbounds nuw i8, ptr %in1.addr, i32 8 + %2 = load float, ptr %incdec.ptr, align 4 + %incdec.ptr13 = getelementptr inbounds nuw i8, ptr %in2.addr, i32 8 + %x0.next = load float, ptr %incdec.ptr1, align 4 + %mul14 = fmul fast float %2, %x1 + %add15 = fadd fast float %add, %mul14 + %mul16 = fmul fast float %2, %x2 + %add17 = fadd fast float %add3, %mul16 + %mul18 = fmul fast float %2, %x3 + %add19 = fadd fast float %add5, %mul18 + %mul20 = fmul fast float %2, %x4 + %add21 = fadd fast float %add7, %mul20 + %mul22 = fmul fast float %2, %1 + %add23 = fadd fast float %mul22, %acc6 + %mul24 = fmul fast float %x0.next, %2 + %add25 = fadd fast float %add11, %mul24 + %incdec.ptr26 = getelementptr inbounds nuw i8, ptr %in1.addr, i32 12 + %4 = load float, ptr %incdec.ptr12, align 4 + %incdec.ptr27 = getelementptr inbounds nuw i8, ptr %in2.addr, i32 12 + %x1.next = load float, ptr %incdec.ptr13, align 4 + %mul28 = fmul fast float %4, %x2 + %add29 = fadd fast float %add15, %mul28 + %mul30 = fmul fast float %4, %x3 + %add31 = fadd fast float %add17, %mul30 + %mul32 = fmul fast float %4, %x4 + %add33 = fadd fast float %add19, 
%mul32 + %mul34 = fmul fast float %4, %1 + %add35 = fadd fast float %mul34, %acc5 + %mul36 = fmul fast float %4, %x0.next + %add37 = fadd fast float %add23, %mul36 + %mul38 = fmul fast float %x1.next, %4 + %add39 = fadd fast float %add25, %mul38 + %incdec.ptr40 = getelementptr inbounds nuw i8, ptr %in1.addr, i32 16 + %6 = load float, ptr %incdec.ptr26, align 4 + %incdec.ptr41 = getelementptr inbounds nuw i8, ptr %in2.addr, i32 16 + %x2.next = load float, ptr %incdec.ptr27, align 4 + %mul42 = fmul fast float %6, %x3 + %add43 = fadd fast float %add29, %mul42 + %mul44 = fmul fast float %6, %x4 + %acc1.next = fadd fast float %add31, %mul44 + %mul46 = fmul fast float %6, %1 + %add47 = fadd fast float %add9, %mul46 + %mul48 = fmul fast float %6, %x0.next + %add49 = fadd fast float %add35, %mul48 + %mul50 = fmul fast float %6, %x1.next + %add51 = fadd fast float %add37, %mul50 + %mul52 = fmul fast float %x2.next, %6 + %add53 = fadd fast float %add39, %mul52 + %incdec.ptr54 = getelementptr inbounds nuw i8, ptr %in1.addr, i32 20 + %8 = load float, ptr %incdec.ptr40, align 4 + %incdec.ptr55 = getelementptr inbounds nuw i8, ptr %in2.addr, i32 20 + %x3.next = load float, ptr %incdec.ptr41, align 4 + %mul56 = fmul fast float %8, %x4 + %acc0.next = fadd fast float %add43, %mul56 + %mul58 = fmul fast float %8, %1 + %add59 = fadd fast float %add21, %mul58 + %mul60 = fmul fast float %8, %x0.next + %add61 = fadd fast float %add47, %mul60 + %mul62 = fmul fast float %8, %x1.next + %add63 = fadd fast float %add49, %mul62 + %mul64 = fmul fast float %8, %x2.next + %add65 = fadd fast float %add51, %mul64 + %mul66 = fmul fast float %x3.next, %8 + %add67 = fadd fast float %add53, %mul66 + %in1.addr.next = getelementptr inbounds nuw i8, ptr %in1.addr, i32 24 + %10 = load float, ptr %incdec.ptr54, align 4 + %in2.addr.next = getelementptr inbounds nuw i8, ptr %in2.addr, i32 24 + %x4.next = load float, ptr %incdec.ptr55, align 4 + %mul70 = fmul fast float %10, %1 + %acc2.next = fadd fast float 
%add33, %mul70 + %mul72 = fmul fast float %10, %x0.next + %acc3.next = fadd fast float %add59, %mul72 + %mul74 = fmul fast float %10, %x1.next + %acc4.next = fadd fast float %add61, %mul74 + %mul76 = fmul fast float %10, %x2.next + %acc5.next = fadd fast float %add63, %mul76 + %mul78 = fmul fast float %10, %x3.next + %acc6.next = fadd fast float %add65, %mul78 + %mul80 = fmul fast float %x4.next, %10 + %acc7.next = fadd fast float %add67, %mul80 + %inc = add nuw i32 %i, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %exit, label %for.body + +exit: + %acc0.exit = phi float [ 0.000000e+00, %entry ], [ %acc0.next, %for.body ] + %acc1.exit = phi float [ 0.000000e+00, %entry ], [ %acc1.next, %for.body ] + %acc2.exit = phi float [ 0.000000e+00, %entry ], [ %acc2.next, %for.body ] + %acc3.exit = phi float [ 0.000000e+00, %entry ], [ %acc3.next, %for.body ] + %acc4.exit = phi float [ 0.000000e+00, %entry ], [ %acc4.next, %for.body ] + %acc5.exit = phi float [ 0.000000e+00, %entry ], [ %acc5.next, %for.body ] + %acc6.exit = phi float [ 0.000000e+00, %entry ], [ %acc6.next, %for.body ] + %acc7.exit = phi float [ 0.000000e+00, %entry ], [ %acc7.next, %for.body ] + store float %acc0.exit, ptr %out, align 4 + %arrayidx82 = getelementptr inbounds nuw i8, ptr %out, i32 4 + store float %acc1.exit, ptr %arrayidx82, align 4 + %arrayidx83 = getelementptr inbounds nuw i8, ptr %out, i32 8 + store float %acc2.exit, ptr %arrayidx83, align 4 + %arrayidx84 = getelementptr inbounds nuw i8, ptr %out, i32 12 + store float %acc3.exit, ptr %arrayidx84, align 4 + %arrayidx85 = getelementptr inbounds nuw i8, ptr %out, i32 16 + store float %acc4.exit, ptr %arrayidx85, align 4 + %arrayidx86 = getelementptr inbounds nuw i8, ptr %out, i32 20 + store float %acc5.exit, ptr %arrayidx86, align 4 + %arrayidx87 = getelementptr inbounds nuw i8, ptr %out, i32 24 + store float %acc6.exit, ptr %arrayidx87, align 4 + %arrayidx88 = getelementptr inbounds nuw i8, ptr %out, i32 28 + store float 
%acc7.exit, ptr %arrayidx88, align 4 + ret void +} + +; In this function we have spills but it is still profitable to vectorize when +; considering register pressure. +define void @spills_profitable(ptr %in1, ptr %in2, ptr %out, i32 %n, i32 %m) { +; CHECK-LABEL: LV: Checking a loop in 'spills_profitable' +; CHECK: LV: Scalar loop costs: 54 +; CHECK-NOPRESSURE: Cost for VF 2: 1530 (Estimated cost per lane: 765.0) +; CHECK-NOPRESSURE: Cost for VF 4: 38 (Estimated cost per lane: 9.5) +; CHECK-PRESSURE: LV(REG): Cost of 4 from 2 spills of Generic::ScalarRC +; CHECK-PRESSURE-NEXT: Cost for VF 2: 1534 (Estimated cost per lane: 767.0) +; CHECK-PRESSURE: LV(REG): Cost of 6 from 3 spills of Generic::VectorRC +; CHECK-PRESSURE-NEXT: Cost for VF 4: 44 (Estimated cost per lane: 11.0) +; CHECK: LV: Selecting VF: 4 +entry: + %cmp = icmp eq i32 %n, 0 + br i1 %cmp, label %exit, label %for.body.preheader + +for.body.preheader: + %add.ptr3.idx = mul i32 %m, 12 + %add.ptr3 = getelementptr inbounds nuw i8, ptr %in1, i32 %add.ptr3.idx + %add.ptr1.idx = shl i32 %m, 3 + %add.ptr1 = getelementptr inbounds nuw i8, ptr %in1, i32 %add.ptr1.idx + %add.ptr = getelementptr inbounds nuw i32, ptr %in1, i32 %m + br label %for.body + +for.body: + %i = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %acc3 = phi i64 [ %acc3.next, %for.body ], [ 0, %for.body.preheader ] + %acc2 = phi i64 [ %acc2.next, %for.body ], [ 0, %for.body.preheader ] + %acc1 = phi i64 [ %acc1.next, %for.body ], [ 0, %for.body.preheader ] + %acc0 = phi i64 [ %acc0.next, %for.body ], [ 0, %for.body.preheader ] + %in2.addr = phi ptr [ %in2.addr.next, %for.body ], [ %in2, %for.body.preheader ] + %px3 = phi ptr [ %px3.next, %for.body ], [ %add.ptr3, %for.body.preheader ] + %px2 = phi ptr [ %px2.next, %for.body ], [ %add.ptr1, %for.body.preheader ] + %px1 = phi ptr [ %px1.next, %for.body ], [ %add.ptr, %for.body.preheader ] + %px0 = phi ptr [ %px0.next, %for.body ], [ %in1, %for.body.preheader ] + %incdec.ptr = 
getelementptr inbounds nuw i8, ptr %in2.addr, i32 4 + %0 = load i32, ptr %in2.addr, align 4 + %incdec.ptr4 = getelementptr inbounds nuw i8, ptr %px0, i32 4 + %1 = load i32, ptr %px0, align 4 + %incdec.ptr5 = getelementptr inbounds nuw i8, ptr %px1, i32 4 + %2 = load i32, ptr %px1, align 4 + %incdec.ptr6 = getelementptr inbounds nuw i8, ptr %px2, i32 4 + %3 = load i32, ptr %px2, align 4 + %incdec.ptr7 = getelementptr inbounds nuw i8, ptr %px3, i32 4 + %4 = load i32, ptr %px3, align 4 + %conv = sext i32 %1 to i64 + %conv8 = sext i32 %0 to i64 + %mul9 = mul nsw i64 %conv, %conv8 + %add = add nsw i64 %mul9, %acc0 + %conv10 = sext i32 %2 to i64 + %mul12 = mul nsw i64 %conv10, %conv8 + %add13 = add nsw i64 %mul12, %acc1 + %conv14 = sext i32 %3 to i64 + %mul16 = mul nsw i64 %conv14, %conv8 + %add17 = add nsw i64 %mul16, %acc2 + %conv18 = sext i32 %4 to i64 + %mul20 = mul nsw i64 %conv18, %conv8 + %add21 = add nsw i64 %mul20, %acc3 + %in2.addr.next = getelementptr inbounds nuw i8, ptr %in2.addr, i32 8 + %5 = load i32, ptr %incdec.ptr, align 4 + %px0.next = getelementptr inbounds nuw i8, ptr %px0, i32 8 + %6 = load i32, ptr %incdec.ptr4, align 4 + %px1.next = getelementptr inbounds nuw i8, ptr %px1, i32 8 + %7 = load i32, ptr %incdec.ptr5, align 4 + %px2.next = getelementptr inbounds nuw i8, ptr %px2, i32 8 + %8 = load i32, ptr %incdec.ptr6, align 4 + %px3.next = getelementptr inbounds nuw i8, ptr %px3, i32 8 + %9 = load i32, ptr %incdec.ptr7, align 4 + %conv27 = sext i32 %6 to i64 + %conv28 = sext i32 %5 to i64 + %mul29 = mul nsw i64 %conv27, %conv28 + %acc0.next = add nsw i64 %add, %mul29 + %conv31 = sext i32 %7 to i64 + %mul33 = mul nsw i64 %conv31, %conv28 + %acc1.next = add nsw i64 %add13, %mul33 + %conv35 = sext i32 %8 to i64 + %mul37 = mul nsw i64 %conv35, %conv28 + %acc2.next = add nsw i64 %add17, %mul37 + %conv39 = sext i32 %9 to i64 + %mul41 = mul nsw i64 %conv39, %conv28 + %acc3.next = add nsw i64 %add21, %mul41 + %inc = add nuw nsw i32 %i, 1 + %exitcond = icmp 
eq i32 %inc, %n + br i1 %exitcond, label %exit, label %for.body + +exit: + %acc0.exit = phi i64 [ 0, %entry ], [ %acc0.next, %for.body ] + %acc1.exit = phi i64 [ 0, %entry ], [ %acc1.next, %for.body ] + %acc2.exit = phi i64 [ 0, %entry ], [ %acc2.next, %for.body ] + %acc3.exit = phi i64 [ 0, %entry ], [ %acc3.next, %for.body ] + store i64 %acc0.exit, ptr %out, align 8 + %arrayidx43 = getelementptr inbounds nuw i8, ptr %out, i32 8 + store i64 %acc1.exit, ptr %arrayidx43, align 8 + %arrayidx44 = getelementptr inbounds nuw i8, ptr %out, i32 16 + store i64 %acc2.exit, ptr %arrayidx44, align 8 + %arrayidx45 = getelementptr inbounds nuw i8, ptr %out, i32 24 + store i64 %acc3.exit, ptr %arrayidx45, align 8 + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll index de49337c185ac..3654e82423317 100644 --- a/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll +++ b/llvm/test/Transforms/LoopVectorize/LoongArch/reg-usage.ll @@ -13,14 +13,14 @@ define void @bar(ptr %A, i32 signext %n) { ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::FPRRC, 1 registers ; CHECK-SCALAR-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-SCALAR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers -; CHECK-SCALAR-NEXT: LV: The target has 30 registers of LoongArch::GPRRC register class +; CHECK-SCALAR: LV: The target has 30 registers of LoongArch::GPRRC register class ; CHECK-SCALAR-NEXT: LV: The target has 32 registers of LoongArch::FPRRC register class ; CHECK-VECTOR: LV(REG): Found max usage: 2 item ; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 2 registers ; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::VRRC, 2 registers ; CHECK-VECTOR-NEXT: LV(REG): Found invariant usage: 1 item ; CHECK-VECTOR-NEXT: LV(REG): RegisterClass: LoongArch::GPRRC, 1 registers -; CHECK-VECTOR-NEXT: LV: The target has 30 registers of LoongArch::GPRRC register class +; CHECK-VECTOR: 
LV: The target has 30 registers of LoongArch::GPRRC register class ; CHECK-VECTOR-NEXT: LV: The target has 32 registers of LoongArch::VRRC register class entry: