Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 27 additions & 11 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -964,9 +964,8 @@ class LoopVectorizationCostModel {
/// user options, for the given register kind.
bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);

/// \return True if maximizing vector bandwidth is enabled by the target or
/// user options, for the given vector factor.
bool useMaxBandwidth(ElementCount VF);
/// \return True if register pressure should be calculated for \p VF. This is
/// the case only when maximizing vector bandwidth is enabled for the register
/// kind matching \p VF (scalable or fixed-width) and \p VF is known to be
/// greater than the corresponding entry of MaxPermissibleVFWithoutMaxBW.
bool shouldCalculateRegPressureForVF(ElementCount VF);

/// \return The size (in bits) of the smallest and widest types in the code
/// that needs to be vectorized. We ignore values that remain scalar such as
Expand Down Expand Up @@ -1753,6 +1752,9 @@ class LoopVectorizationCostModel {
/// Whether this loop should be optimized for size based on function attribute
/// or profile information.
bool OptForSize;

/// The highest VF possible for this loop, without using MaxBandwidth.
FixedScalableVFPair MaxPermissibleVFWithoutMaxBW;
};
} // end namespace llvm

Expand Down Expand Up @@ -3943,10 +3945,16 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return FixedScalableVFPair::getNone();
}

bool LoopVectorizationCostModel::useMaxBandwidth(ElementCount VF) {
return useMaxBandwidth(VF.isScalable()
? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector);
/// Returns true when register pressure has to be computed for \p VF, i.e.
/// when maximizing bandwidth is in effect for the register kind of \p VF and
/// \p VF is known to be larger than the VF recorded for that kind in
/// MaxPermissibleVFWithoutMaxBW.
bool LoopVectorizationCostModel::shouldCalculateRegPressureForVF(
    ElementCount VF) {
  const bool IsScalable = VF.isScalable();
  const TargetTransformInfo::RegisterKind RegKind =
      IsScalable ? TargetTransformInfo::RGK_ScalableVector
                 : TargetTransformInfo::RGK_FixedWidthVector;
  if (!useMaxBandwidth(RegKind))
    return false;

  // VFs at or below this bound are not attributed to MaxBandwidth, so only
  // strictly larger ones need a register pressure check.
  const ElementCount MaxVFWithoutMaxBW =
      IsScalable ? MaxPermissibleVFWithoutMaxBW.ScalableVF
                 : MaxPermissibleVFWithoutMaxBW.FixedVF;
  return ElementCount::isKnownGT(VF, MaxVFWithoutMaxBW);
}

bool LoopVectorizationCostModel::useMaxBandwidth(
Expand Down Expand Up @@ -4022,6 +4030,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
: TargetTransformInfo::RGK_FixedWidthVector;
ElementCount MaxVF = MaxVectorElementCount;

if (MaxVF.isScalable())
MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
else
MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;

if (useMaxBandwidth(RegKind)) {
auto MaxVectorElementCountMaxBW = ElementCount::get(
llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType),
Expand Down Expand Up @@ -4375,9 +4389,10 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
if (VF.isScalar())
continue;

/// Don't consider the VF if it exceeds the number of registers for the
/// target.
if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI))
/// If the VF was proposed due to MaxBandwidth, don't consider the VF if
/// it exceeds the number of registers for the target.
if (CM.shouldCalculateRegPressureForVF(VF) &&
RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
continue;

InstructionCost C = CM.expectedCost(VF);
Expand Down Expand Up @@ -7155,7 +7170,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
InstructionCost Cost = cost(*P, VF);
VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);

if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI)) {
if (CM.shouldCalculateRegPressureForVF(VF) &&
RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) {
LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
<< VF << " because it uses too many registers\n");
continue;
Expand Down
9 changes: 6 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,9 +405,12 @@ static unsigned getVFScaleFactor(VPRecipeBase *R) {
return 1;
}

bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI) const {
return any_of(MaxLocalUsers, [&TTI](auto &LU) {
return LU.second > TTI.getNumberOfRegisters(LU.first);
bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI,
unsigned OverrideMaxNumRegs) const {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this change independent of the updated shouldCalculateRegPressureForVF? If so, it might have been good to do it separately.

return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) {
return LU.second > (OverrideMaxNumRegs > 0
? OverrideMaxNumRegs
: TTI.getNumberOfRegisters(LU.first));
});
}

Expand Down
6 changes: 4 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,10 @@ struct VPRegisterUsage {
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;

/// Check if any of the tracked live intervals exceeds the number of
/// available registers for the target.
bool exceedsMaxNumRegs(const TargetTransformInfo &TTI) const;
/// available registers for the target. If non-zero, OverrideMaxNumRegs
/// is used in place of the target's number of registers.
bool exceedsMaxNumRegs(const TargetTransformInfo &TTI,
unsigned OverrideMaxNumRegs = 0) const;
};

/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-REGS-VP
; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NOREGS-VP

target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"

; Dot-product style reduction used to exercise register-pressure pruning of
; vectorization factors. Each of the 1024 iterations loads one i8 from %a and
; one from %b, zero-extends both to i32, multiplies them, negates the product
; (sub 0, %mul) and adds the result to the i32 accumulator, which is returned.
;
; Expected behaviour per the directives below: without a forced register
; count, vscale x 16 must not be rejected for register usage and vscale x 8 is
; selected; with -force-target-num-vector-regs=1, both vscale x 8 and
; vscale x 16 are rejected for register usage and vscale x 4 is selected.
define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-REGS-VP-NOT: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
; CHECK-REGS-VP: LV: Selecting VF: vscale x 8.
;
; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 8 because it uses too many registers
; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers
; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4.
entry:
br label %for.body

for.body: ; preds = %for.body, %entry
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%accum = phi i32 [ 0, %entry ], [ %add, %for.body ]
%gep.a = getelementptr i8, ptr %a, i64 %iv
%load.a = load i8, ptr %gep.a, align 1
%ext.a = zext i8 %load.a to i32
%gep.b = getelementptr i8, ptr %b, i64 %iv
%load.b = load i8, ptr %gep.b, align 1
%ext.b = zext i8 %load.b to i32
%mul = mul i32 %ext.b, %ext.a
%sub = sub i32 0, %mul
%add = add i32 %accum, %sub
%iv.next = add i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, 1024
br i1 %exitcond.not, label %for.exit, label %for.body

for.exit: ; preds = %for.body
ret i32 %add
}

attributes #0 = { vscale_range(1,16) "target-features"="+sve" }