Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
Original file line number Diff line number Diff line change
Expand Up @@ -300,9 +300,17 @@ class LoopVectorizationLegality {
/// masking.
bool canFoldTailByMasking() const;

/// Returns true if all instructions in the loop support masking or
/// speculation.
///
/// The mask may be loop-invariant if it represents a maximum safe dependence
/// distance (alias mask) or loop-variant if it is based on the induction
/// variable (e.g. tail-folding).
bool canMaskLoop() const;

/// Mark all respective loads/stores for masking. Must only be called when
/// tail-folding is possible.
void prepareToFoldTailByMasking();
/// masking is possible.
void prepareToMaskLoop();

/// Returns the primary induction variable.
PHINode *getPrimaryInduction() { return PrimaryInduction; }
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Analysis/VectorUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
return (ScalarOpdIdx == 2);
case Intrinsic::experimental_vp_splice:
return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
case Intrinsic::loop_dependence_war_mask:
return true;
default:
return false;
}
Expand Down
15 changes: 11 additions & 4 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2131,6 +2131,15 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
}
}

if (!canMaskLoop())
return false;

LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");

return true;
}

bool LoopVectorizationLegality::canMaskLoop() const {
// The list of pointers that we can safely read and write to remains empty.
SmallPtrSet<Value *, 8> SafePointers;

Expand All @@ -2139,17 +2148,15 @@ bool LoopVectorizationLegality::canFoldTailByMasking() const {
SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
for (BasicBlock *BB : TheLoop->blocks()) {
if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking.\n");
LLVM_DEBUG(dbgs() << "LV: Cannot mask loop.\n");
return false;
}
}

LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");

return true;
}

void LoopVectorizationLegality::prepareToFoldTailByMasking() {
void LoopVectorizationLegality::prepareToMaskLoop() {
// The list of pointers that we can safely read and write to remains empty.
SmallPtrSet<Value *, 8> SafePointers;

Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -668,6 +668,10 @@ class LoopVectorizationPlanner {
void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks,
bool HasBranchWeights) const;

VPValue *materializeAliasMask(VPlan &Plan,
ArrayRef<PointerDiffInfo> DiffChecks,
bool HasBranchWeights);

#ifndef NDEBUG
/// \return The most profitable vectorization factor for the available VPlans
/// and the cost of that VF.
Expand Down
139 changes: 123 additions & 16 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
STATISTIC(LoopsPartialAliasVectorized,
"Number of partial aliasing loops vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
Expand Down Expand Up @@ -198,6 +200,10 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
"vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum allowed number of runtime memory checks"));

static cl::opt<bool> ForcePartialAliasingVectorization(
"force-partial-aliasing-vectorization", cl::init(false), cl::Hidden,
cl::desc("Replace pointer diff checks with alias masks."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
Expand Down Expand Up @@ -1386,6 +1392,47 @@ class LoopVectorizationCostModel {
return getTailFoldingStyle() != TailFoldingStyle::None;
}

/// Returns true if every block of the loop must be masked, either because
/// the tail is folded by masking or because partial-alias masking is active.
bool allLoopBlocksMasked() const {
  if (foldTailByMasking())
    return true;
  return maskPartialAliasing();
}

/// Decide, exactly once, whether partial alias masking can be used for this
/// loop, and cache the decision in IsPartialAliasMaskingEnabled.
///
/// Masking is only enabled when the force flag is set, the loop is maskable,
/// there are no fixed-order recurrences (we cannot yet combine the required
/// `splice.right` with the alias mask), and the loop's runtime checks are
/// non-empty pointer-diff checks that an alias mask can replace.
void checkIfPartialAliasMaskingIsEnabled() {
  assert(!IsPartialAliasMaskingEnabled &&
         "Partial alias masking already checked!");
  IsPartialAliasMaskingEnabled = [&]() -> bool {
    // Option not enabled, loop cannot be masked, or the loop has
    // fixed-order recurrences we cannot yet support.
    if (!ForcePartialAliasingVectorization || !Legal->canMaskLoop() ||
        !Legal->getFixedOrderRecurrences().empty())
      return false;
    // No runtime checks at all means no alias mask is required.
    const RuntimePointerChecking *Checks = Legal->getRuntimePointerChecking();
    if (!Checks)
      return false;
    // Only non-empty diff checks can be replaced with an alias mask.
    auto DiffChecks = Checks->getDiffChecks();
    return DiffChecks && !DiffChecks->empty();
  }();
}

/// Force partial alias masking off once a decision has been recorded.
/// If the decision is still pending (nullopt) this is a no-op; an engaged
/// value — whether true or false — is overwritten with false.
void disablePartialAliasMaskingIfEnabled() {
  if (IsPartialAliasMaskingEnabled.has_value())
    IsPartialAliasMaskingEnabled = false;
}

/// Returns true if all loop blocks should have partial aliases masked.
/// An undecided (nullopt) state is treated as disabled.
bool maskPartialAliasing() const {
  return IsPartialAliasMaskingEnabled.has_value() &&
         *IsPartialAliasMaskingEnabled;
}

/// Returns true if the use of wide lane masks is requested and the loop is
/// using tail-folding with a lane mask for control flow.
bool useWideActiveLaneMask() const {
Expand All @@ -1410,7 +1457,7 @@ class LoopVectorizationCostModel {
/// for any reason, e.g. because tail folding now requires a predicate
/// or because the block in the original loop was predicated.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
return foldTailByMasking() || Legal->blockNeedsPredication(BB);
return allLoopBlocksMasked() || Legal->blockNeedsPredication(BB);
}

/// Returns true if VP intrinsics with explicit vector length support should
Expand Down Expand Up @@ -1604,6 +1651,9 @@ class LoopVectorizationCostModel {
std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
ChosenTailFoldingStyle;

/// true if partial alias masking is enabled (nullopt = undecided).
std::optional<bool> IsPartialAliasMaskingEnabled;

/// true if scalable vectorization is supported and enabled.
std::optional<bool> IsScalableVectorizationAllowed;

Expand Down Expand Up @@ -1825,14 +1875,18 @@ class GeneratedRTChecks {
/// The kind of cost that we are calculating
TTI::TargetCostKind CostKind;

/// True if the loop is alias-masked (which allows us to omit diff checks).
bool LoopUsesAliasMasking = false;

public:
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
LoopInfo *LI, TargetTransformInfo *TTI,
TTI::TargetCostKind CostKind)
TTI::TargetCostKind CostKind, bool LoopUsesAliasMasking)
: DT(DT), LI(LI), TTI(TTI),
SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
PSE(PSE), CostKind(CostKind) {}
PSE(PSE), CostKind(CostKind),
LoopUsesAliasMasking(LoopUsesAliasMasking) {}

/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
Expand Down Expand Up @@ -1885,7 +1939,7 @@ class GeneratedRTChecks {
}

const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
if (RtPtrChecking.Need) {
if (RtPtrChecking.Need && !LoopUsesAliasMasking) {
auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
"vector.memcheck");
Expand Down Expand Up @@ -2883,8 +2937,8 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
if (Legal->blockNeedsPredication(I->getParent()))
return true;

// If we're not folding the tail by masking, predication is unnecessary.
if (!foldTailByMasking())
// If we're not masking, predication is unnecessary.
if (!allLoopBlocksMasked())
return false;

// All that remain are instructions with side-effects originally executed in
Expand Down Expand Up @@ -3088,10 +3142,17 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
auto *Ptr = getLoadStorePointerOperand(I);
auto *ScalarTy = getLoadStoreType(I);

int Stride = Legal->isConsecutivePtr(ScalarTy, Ptr);
// In order to be widened, the pointer should be consecutive, first of all.
if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
if (!Stride)
return false;

// Currently, we can't handle alias masking in reverse. Reversing the alias
// mask is not correct (or necessary). When combined with tail-folding the ALM
// should only be reversed where the alias-mask is true.
if (Stride < 0)
disablePartialAliasMaskingIfEnabled();

// If the instruction is a store located in a predicated block, it will be
// scalarized.
if (isScalarWithPredication(I, VF))
Expand Down Expand Up @@ -3613,6 +3674,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return FixedScalableVFPair::getNone();
}

checkIfPartialAliasMaskingIsEnabled();

switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed:
return computeFeasibleMaxVF(MaxTC, UserVF, UserIC, false);
Expand Down Expand Up @@ -4451,6 +4514,13 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
return Result;
}

if (CM.maskPartialAliasing()) {
LLVM_DEBUG(
dbgs()
<< "LEV: Epilogue vectorization not supported with alias masking");
return Result;
}

// Not really a cost consideration, but check for unsupported cases here to
// simplify the logic.
if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
Expand Down Expand Up @@ -5729,7 +5799,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// stores. Note that even with tail folding we know that at least
// one lane is active (i.e. generalized predication is not possible
// here), and the logic below depends on this fact.
if (!foldTailByMasking())
if (!allLoopBlocksMasked())
return true;

// For scalable vectors, a uniform memop load is always
Expand Down Expand Up @@ -6824,8 +6894,8 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.invalidateCostModelingDecisions();
}

if (CM.foldTailByMasking())
Legal->prepareToFoldTailByMasking();
if (CM.allLoopBlocksMasked())
Legal->prepareToMaskLoop();

ElementCount MaxUserVF =
UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
Expand Down Expand Up @@ -6937,7 +7007,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
// TODO: Remove this code after stepping away from the legacy cost model and
// adding code to simplify VPlans before calculating their costs.
auto TC = getSmallConstantTripCount(PSE.getSE(), OrigLoop);
if (TC == VF && !CM.foldTailByMasking())
if (TC == VF && !CM.allLoopBlocksMasked())
addFullyUnrolledInstructionsToIgnore(OrigLoop, Legal->getInductionVars(),
CostCtx.SkipCostComputation);

Expand Down Expand Up @@ -7431,6 +7501,14 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// compactness.
attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);

VPValue *ClampedVF = nullptr;
if (CM.maskPartialAliasing()) {
ClampedVF = materializeAliasMask(
BestVPlan, *CM.Legal->getRuntimePointerChecking()->getDiffChecks(),
HasBranchWeights);
++LoopsPartialAliasVectorized;
}

// Retrieving VectorPH now when it's easier while VPlan still has Regions.
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());

Expand Down Expand Up @@ -7467,6 +7545,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::materializeVectorTripCount(
BestVPlan, VectorPH, CM.foldTailByMasking(),
CM.requiresScalarEpilogue(BestVF.isVector()));
VPlanTransforms::fixupVFUsersForClampedVF(BestVPlan, ClampedVF);
VPlanTransforms::materializeFactors(BestVPlan, VectorPH, BestVF);
VPlanTransforms::cse(BestVPlan);
VPlanTransforms::simplifyRecipes(BestVPlan);
Expand Down Expand Up @@ -8237,7 +8316,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
// Predicate and linearize the top-level loop region.
// ---------------------------------------------------------------------------
RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::introduceMasksAndLinearize, *Plan,
CM.foldTailByMasking());
CM.foldTailByMasking(), CM.maskPartialAliasing());

// ---------------------------------------------------------------------------
// Construct wide recipes and apply predication for original scalar
Expand Down Expand Up @@ -8483,9 +8562,9 @@ void LoopVectorizationPlanner::addReductionResultComputation(
// with fewer lanes than the VF. So the operands of the select would have
// different numbers of lanes. Partial reductions mask the input instead.
auto *RR = dyn_cast<VPReductionRecipe>(OrigExitingVPV->getDefiningRecipe());
if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
if (!PhiR->isInLoop() && CM.allLoopBlocksMasked() &&
(!RR || !RR->isPartialReduction())) {
VPValue *Cond = vputils::findHeaderMask(*Plan);
VPValue *Cond = vputils::findLoopBodyMask(*Plan);
VPIRFlags Flags = PhiTy->isFloatingPointTy()
? VPIRFlags(RdxDesc.getFastMathFlags())
: VPIRFlags();
Expand Down Expand Up @@ -8688,6 +8767,21 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
}
}

/// Materialize an alias mask for \p DiffChecks in \p Plan and guard the
/// vector loop with a minimum-VF check block.
///
/// \param Plan the VPlan to modify.
/// \param DiffChecks the pointer-diff checks the alias mask replaces.
/// \param HasBranchWeights whether to attach branch weights to the new
///        check block's terminator.
/// \returns the VF clamped by the alias mask (the number of lanes that are
///          known free of aliasing).
VPValue *LoopVectorizationPlanner::materializeAliasMask(
    VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
  VPBasicBlock *MinVFCheck = Plan.createVPBasicBlock("vector.min.vf.check");
  // Use the DiffChecks passed by the caller rather than re-querying them from
  // the legality analysis; the parameter was previously ignored, duplicating
  // the caller's lookup.
  VPValue *ClampedVF =
      VPlanTransforms::materializeAliasMask(Plan, MinVFCheck, DiffChecks);
  VPBuilder Builder(MinVFCheck);
  Type *IVTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
  // Check the "ClampedVF" from the alias mask contains at least two elements;
  // otherwise vectorizing is pointless and we fall back to the scalar loop.
  VPValue *Cond = Builder.createICmp(
      CmpInst::ICMP_ULT, ClampedVF, Plan.getConstantInt(IVTy, 2), {}, "cmp.vf");
  VPlanTransforms::attachCheckBlock(Plan, Cond, MinVFCheck, HasBranchWeights);
  return ClampedVF;
}

void LoopVectorizationPlanner::addMinimumIterationCheck(
VPlan &Plan, ElementCount VF, unsigned UF,
ElementCount MinProfitableTripCount) const {
Expand Down Expand Up @@ -8800,7 +8894,8 @@ static bool processLoopInVPlanNativePath(
VPlan &BestPlan = LVP.getPlanFor(VF.Width);

{
GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
CM.maskPartialAliasing());
InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
Expand Down Expand Up @@ -9649,7 +9744,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (ORE->allowExtraAnalysis(LV_NAME))
LVP.emitInvalidCostRemarks(ORE);

GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
CM.maskPartialAliasing());
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
Expand Down Expand Up @@ -9768,6 +9864,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
IC = 1;
}

if (CM.maskPartialAliasing()) {
LLVM_DEBUG(
dbgs()
<< "LV: Not interleaving due to partial aliasing vectorization.\n");
IntDiagMsg = {
"PartialAliasingVectorization",
"Unable to interleave due to partial aliasing vectorization."};
InterleaveLoop = false;
IC = 1;
}

// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
Expand Down
Loading
Loading