Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1328,6 +1328,16 @@ class TargetTransformInfo {
/// \return the target-provided register class name
LLVM_ABI const char *getRegisterClassName(unsigned ClassID) const;

/// \return the cost of spilling a register in the target-provided register
/// class to the stack.
LLVM_ABI InstructionCost
getRegisterClassSpillCost(unsigned ClassID, TargetCostKind CostKind) const;

/// \return the cost of reloading a register in the target-provided register
/// class from the stack.
LLVM_ABI InstructionCost
getRegisterClassReloadCost(unsigned ClassID, TargetCostKind CostKind) const;

enum RegisterKind { RGK_Scalar, RGK_FixedWidthVector, RGK_ScalableVector };

/// \return The width of the largest scalar or vector register type.
Expand Down
12 changes: 12 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -628,6 +628,18 @@ class TargetTransformInfoImplBase {
}
}

/// Default implementation: assume spilling a register of any class to the
/// stack costs a single basic instruction (TTI::TCC_Basic). Targets with a
/// more expensive store-to-stack for some register classes should override
/// this. \p ClassID and \p CostKind are ignored by this default.
virtual InstructionCost
getRegisterClassSpillCost(unsigned ClassID,
TTI::TargetCostKind CostKind) const {
return TTI::TCC_Basic;
}

/// Default implementation: assume reloading a register of any class from the
/// stack costs a single basic instruction (TTI::TCC_Basic). Targets with a
/// more expensive load-from-stack for some register classes should override
/// this. \p ClassID and \p CostKind are ignored by this default.
virtual InstructionCost
getRegisterClassReloadCost(unsigned ClassID,
TTI::TargetCostKind CostKind) const {
return TTI::TCC_Basic;
}

virtual TypeSize
getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
return TypeSize::getFixed(32);
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -815,6 +815,16 @@ const char *TargetTransformInfo::getRegisterClassName(unsigned ClassID) const {
return TTIImpl->getRegisterClassName(ClassID);
}

// Thin forwarder to the target's implementation; see the interface
// documentation in TargetTransformInfo.h for the contract.
InstructionCost TargetTransformInfo::getRegisterClassSpillCost(
unsigned ClassID, TTI::TargetCostKind CostKind) const {
return TTIImpl->getRegisterClassSpillCost(ClassID, CostKind);
}

// Thin forwarder to the target's implementation; see the interface
// documentation in TargetTransformInfo.h for the contract.
InstructionCost TargetTransformInfo::getRegisterClassReloadCost(
unsigned ClassID, TTI::TargetCostKind CostKind) const {
return TTIImpl->getRegisterClassReloadCost(ClassID, CostKind);
}

TypeSize TargetTransformInfo::getRegisterBitWidth(
TargetTransformInfo::RegisterKind K) const {
return TTIImpl->getRegisterBitWidth(K);
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ class OptimizationRemarkEmitter;
class TargetTransformInfo;
class TargetLibraryInfo;
class VPRecipeBuilder;
struct VPRegisterUsage;
struct VFRange;

extern cl::opt<bool> EnableVPlanNativePath;
Expand Down Expand Up @@ -497,7 +498,7 @@ class LoopVectorizationPlanner {
///
/// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has
/// been retired.
InstructionCost cost(VPlan &Plan, ElementCount VF) const;
InstructionCost cost(VPlan &Plan, ElementCount VF, VPRegisterUsage *RU) const;

/// Precompute costs for certain instructions using the legacy cost model. The
/// function is used to bring up the VPlan-based cost model to initially avoid
Expand Down
44 changes: 21 additions & 23 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4247,13 +4247,6 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
if (VF.isScalar())
continue;

/// If the register pressure needs to be considered for VF,
/// don't consider the VF as valid if it exceeds the number
/// of registers for the target.
if (CM.shouldConsiderRegPressureForVF(VF) &&
RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
continue;

InstructionCost C = CM.expectedCost(VF);

// Add on other costs that are modelled in VPlan, but not in the legacy
Expand Down Expand Up @@ -4302,6 +4295,10 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
}
}

// Add the cost of any spills due to excess register usage
if (CM.shouldConsiderRegPressureForVF(VF))
C += RUs[I].spillCost(CostCtx, ForceTargetNumVectorRegs);

VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
unsigned Width =
estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
Expand Down Expand Up @@ -4687,22 +4684,23 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
if (hasFindLastReductionPhi(Plan))
return 1;

VPRegisterUsage R =
calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];

// If we did not calculate the cost for VF (because the user selected the VF)
// then we calculate the cost of VF here.
if (LoopCost == 0) {
if (VF.isScalar())
LoopCost = CM.expectedCost(VF);
else
LoopCost = cost(Plan, VF);
LoopCost = cost(Plan, VF, &R);
assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");

// Loop body is free and there is no need for interleaving.
if (LoopCost == 0)
return 1;
}

VPRegisterUsage R =
calculateRegisterUsageForPlan(Plan, {VF}, TTI, CM.ValuesToIgnore)[0];
// We divide by these constants so assume that we have at least one
// instruction that uses at least one register.
for (auto &Pair : R.MaxLocalUsers) {
Expand Down Expand Up @@ -7027,13 +7025,18 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
return Cost;
}

InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
ElementCount VF) const {
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF,
VPRegisterUsage *RU) const {
VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, PSE, OrigLoop);
InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

// Now compute and add the VPlan-based cost.
Cost += Plan.cost(VF, CostCtx);

// Add the cost of spills due to excess register usage
if (CM.shouldConsiderRegPressureForVF(VF))
Cost += RU->spillCost(CostCtx, ForceTargetNumVectorRegs);

#ifndef NDEBUG
unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
Expand Down Expand Up @@ -7233,9 +7236,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
P->vectorFactors().end());

SmallVector<VPRegisterUsage, 8> RUs;
if (any_of(VFs, [this](ElementCount VF) {
return CM.shouldConsiderRegPressureForVF(VF);
}))
bool ConsiderRegPressure = any_of(VFs, [this](ElementCount VF) {
return CM.shouldConsiderRegPressureForVF(VF);
});
if (ConsiderRegPressure)
RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);

for (unsigned I = 0; I < VFs.size(); I++) {
Expand All @@ -7258,16 +7262,10 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
continue;
}

InstructionCost Cost = cost(*P, VF);
InstructionCost Cost =
cost(*P, VF, ConsiderRegPressure ? &RUs[I] : nullptr);
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);

if (CM.shouldConsiderRegPressureForVF(VF) &&
RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
<< VF << " because it uses too many registers\n");
continue;
}

if (isMoreProfitable(CurrentFactor, BestFactor, P->hasScalarTail()))
BestFactor = CurrentFactor;

Expand Down
29 changes: 22 additions & 7 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,13 +389,28 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
return Base::properlyDominates(ParentA, ParentB);
}

bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI,
unsigned OverrideMaxNumRegs) const {
return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) {
return LU.second > (OverrideMaxNumRegs > 0
? OverrideMaxNumRegs
: TTI.getNumberOfRegisters(LU.first));
});
InstructionCost VPRegisterUsage::spillCost(VPCostContext &Ctx,
                                           unsigned OverrideMaxNumRegs) const {
  // Sum the estimated spill/reload cost over every register class whose peak
  // number of simultaneous live values exceeds the registers available for
  // that class. A non-zero OverrideMaxNumRegs replaces the target's register
  // count for all classes.
  InstructionCost TotalCost;
  for (const auto &[ClassID, NumUsers] : MaxLocalUsers) {
    unsigned NumAvailable = OverrideMaxNumRegs > 0
                                ? OverrideMaxNumRegs
                                : Ctx.TTI.getNumberOfRegisters(ClassID);
    if (NumUsers <= NumAvailable)
      continue;
    // Assume one spill plus one reload for each live value beyond the number
    // of available registers in this class.
    unsigned NumSpills = NumUsers - NumAvailable;
    InstructionCost CostPerSpill =
        Ctx.TTI.getRegisterClassSpillCost(ClassID, Ctx.CostKind) +
        Ctx.TTI.getRegisterClassReloadCost(ClassID, Ctx.CostKind);
    InstructionCost ClassCost = NumSpills * CostPerSpill;
    LLVM_DEBUG(dbgs() << "LV(REG): Cost of " << ClassCost << " from "
                      << NumSpills << " spills of "
                      << Ctx.TTI.getRegisterClassName(ClassID) << "\n");
    TotalCost += ClassCost;
  }
  return TotalCost;
}

SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
Expand Down
11 changes: 7 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ class VPlan;
class Value;
class TargetTransformInfo;
class Type;
class InstructionCost;

struct VPCostContext;

/// An analysis for type-inference for VPValues.
/// It infers the scalar type for a given VPValue by bottom-up traversing
Expand Down Expand Up @@ -79,11 +82,11 @@ struct VPRegisterUsage {
/// The key is ClassID of target-provided register class.
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;

/// Check if any of the tracked live intervals exceeds the number of
/// available registers for the target. If non-zero, OverrideMaxNumRegs
/// Calculate the estimated cost of any spills due to using more registers
/// than the number available for the target. If non-zero, OverrideMaxNumRegs
/// is used in place of the target's number of registers.
bool exceedsMaxNumRegs(const TargetTransformInfo &TTI,
unsigned OverrideMaxNumRegs = 0) const;
InstructionCost spillCost(VPCostContext &Ctx,
unsigned OverrideMaxNumRegs = 0) const;
};

/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,31 @@
; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-REGS-VP
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NOREGS-VP
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=false -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOMAX
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-REGS-VP
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth=true -debug-only=loop-vectorize,vplan -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NOREGS-VP

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"

; The use of the dotp instruction means we never have an i32 vector, so we don't
; get any spills normally and with a reduced number of registers the number of
; spills is small enough that it doesn't prevent use of a larger VF.
define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-LABEL: LV: Checking a loop in 'dotp'
;
; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
; CHECK-NOMAX: LV: Selecting VF: vscale x 4.
;
; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
; CHECK-REGS-VP: Cost for VF vscale x 8: 6 (Estimated cost per lane: 0.8)
; CHECK-REGS-VP: Cost for VF vscale x 16: 5 (Estimated cost per lane: 0.3)
; CHECK-REGS-VP: LV: Selecting VF: vscale x 16.
;
; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 8 because it uses too many registers
; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4.
; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC
; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 14 (Estimated cost per lane: 1.8)
; CHECK-NOREGS-VP: LV(REG): Cost of 4 from 2 spills of Generic::VectorRC
; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 13 (Estimated cost per lane: 0.8)
; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 16.
entry:
br label %for.body

Expand All @@ -24,8 +39,7 @@ for.body: ; preds = %for.body, %entry
%load.b = load i8, ptr %gep.b, align 1
%ext.b = zext i8 %load.b to i32
%mul = mul i32 %ext.b, %ext.a
%sub = sub i32 0, %mul
%add = add i32 %accum, %sub
%add = add i32 %accum, %mul
%iv.next = add i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %for.exit, label %for.body
Expand All @@ -34,4 +48,70 @@ for.exit: ; preds = %for.body
ret i32 %add
}

; The largest type used in the loop is small enough that we already consider all
; VFs and maximize-bandwidth does nothing.
define void @type_too_small(ptr %a, ptr %b) #0 {
; CHECK-LABEL: LV: Checking a loop in 'type_too_small'
; CHECK: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
; CHECK: Cost for VF vscale x 8: 6 (Estimated cost per lane: 0.8)
; CHECK: Cost for VF vscale x 16: 6 (Estimated cost per lane: 0.4)
; CHECK: LV: Selecting VF: vscale x 16.
; Loop body: a[i] = a[i] + b[i] over i8 elements, trip count 1024. Using only
; i8 keeps register pressure low, so every VF costs the same per operation.
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%gep.a = getelementptr i8, ptr %a, i64 %iv
%load.a = load i8, ptr %gep.a, align 1
%gep.b = getelementptr i8, ptr %b, i64 %iv
%load.b = load i8, ptr %gep.b, align 1
%add = add i8 %load.a, %load.b
store i8 %add, ptr %gep.a, align 1
%iv.next = add i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1024
br i1 %exitcond, label %exit, label %loop

exit:
ret void
}

; With reduced number of registers the spills from high pressure are enough that
; we use the same VF as if we hadn't maximized the bandwidth.
define void @high_pressure(ptr %a, ptr %b) #0 {
; CHECK-LABEL: LV: Checking a loop in 'high_pressure'
;
; CHECK-NOMAX: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
; CHECK-NOMAX: LV: Selecting VF: vscale x 4.
;
; CHECK-REGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
; CHECK-REGS-VP: Cost for VF vscale x 8: 10 (Estimated cost per lane: 1.2)
; CHECK-REGS-VP: Cost for VF vscale x 16: 21 (Estimated cost per lane: 1.3)
; CHECK-REGS-VP: LV: Selecting VF: vscale x 8.

; CHECK-NOREGS-VP: Cost for VF vscale x 4: 6 (Estimated cost per lane: 1.5)
; CHECK-NOREGS-VP: LV(REG): Cost of 6 from 3 spills of Generic::VectorRC
; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 8: 20 (Estimated cost per lane: 2.5)
; CHECK-NOREGS-VP: LV(REG): Cost of 14 from 7 spills of Generic::VectorRC
; CHECK-NOREGS-VP-NEXT: Cost for VF vscale x 16: 39 (Estimated cost per lane: 2.4)
; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4.
; Loop body: a[i] = a[i] + zext(b[i]) with i32 stores, trip count 1024. The
; i8 -> i32 extension means wide VFs need multiple i32 vector registers per
; i8 vector loaded, driving up register pressure at large VFs.
entry:
br label %loop

loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%gep.a = getelementptr i32, ptr %a, i64 %iv
%load.a = load i32, ptr %gep.a, align 4
%gep.b = getelementptr i8, ptr %b, i64 %iv
%load.b = load i8, ptr %gep.b, align 1
%ext.b = zext i8 %load.b to i32
%add = add i32 %load.a, %ext.b
store i32 %add, ptr %gep.a, align 4
%iv.next = add i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 1024
br i1 %exitcond, label %exit, label %loop

exit:
ret void
}

attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
Loading