Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -391,12 +391,12 @@ class LoopVectorizationLegality {

/// Returns true if value V is uniform across \p VF lanes, when \p VF is
/// provided, and otherwise if \p V is invariant across all loop iterations.
bool isUniform(Value *V, ElementCount VF) const;
bool isUniform(Value *V, std::optional<ElementCount> VF) const;

/// A uniform memory op is a load or store which accesses the same memory
/// location on all \p VF lanes, if \p VF is provided and otherwise if the
/// memory location is invariant.
bool isUniformMemOp(Instruction &I, ElementCount VF) const;
bool isUniformMemOp(Instruction &I, std::optional<ElementCount> VF) const;

/// Returns the information that we collected about runtime memory check.
const RuntimePointerChecking *getRuntimePointerChecking() const {
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Analysis/VectorUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
return (ScalarOpdIdx == 2);
case Intrinsic::experimental_vp_splice:
return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
case Intrinsic::loop_dependence_war_mask:
return true;
default:
return false;
}
Expand Down
13 changes: 7 additions & 6 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -585,12 +585,13 @@ class SCEVAddRecForUniformityRewriter

} // namespace

bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
bool LoopVectorizationLegality::isUniform(
Value *V, std::optional<ElementCount> VF) const {
if (isInvariant(V))
return true;
if (VF.isScalable())
if (!VF || VF->isScalable())
return false;
if (VF.isScalar())
if (VF->isScalar())
return true;

// Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
Expand All @@ -602,7 +603,7 @@ bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {

// Rewrite AddRecs in TheLoop to step by VF and check if the expression for
// lane 0 matches the expressions for all other lanes.
unsigned FixedVF = VF.getKnownMinValue();
unsigned FixedVF = VF->getKnownMinValue();
const SCEV *FirstLaneExpr =
SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop);
if (isa<SCEVCouldNotCompute>(FirstLaneExpr))
Expand All @@ -618,8 +619,8 @@ bool LoopVectorizationLegality::isUniform(Value *V, ElementCount VF) const {
});
}

bool LoopVectorizationLegality::isUniformMemOp(Instruction &I,
ElementCount VF) const {
bool LoopVectorizationLegality::isUniformMemOp(
Instruction &I, std::optional<ElementCount> VF) const {
Value *Ptr = getLoadStorePointerOperand(&I);
if (!Ptr)
return false;
Expand Down
166 changes: 155 additions & 11 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,8 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
STATISTIC(LoopsPartialAliasVectorized,
"Number of partial aliasing loops vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
Expand Down Expand Up @@ -205,6 +207,10 @@ static cl::opt<bool> ForceTargetSupportsMaskedMemoryOps(
cl::desc("Assume the target supports masked memory operations (used for "
"testing)."));

static cl::opt<bool> ForcePartialAliasingVectorization(
"force-partial-aliasing-vectorization", cl::init(false), cl::Hidden,
cl::desc("Replace pointer diff checks with alias masks."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
Expand Down Expand Up @@ -861,6 +867,8 @@ enum ScalarEpilogueLowering {
CM_ScalarEpilogueNotAllowedUsePredicate
};

enum class AliasMaskingStatus { NotDecided, Disabled, Enabled };

/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
Expand Down Expand Up @@ -1371,6 +1379,64 @@ class LoopVectorizationCostModel {
return getTailFoldingStyle() != TailFoldingStyle::None;
}

/// Decide whether partial alias masking can be used for this loop.
/// On success sets PartialAliasMaskingStatus to Enabled; on any early exit
/// it stays Disabled. Requires tail folding (asserted) and is driven by the
/// ForcePartialAliasingVectorization flag plus the loop's runtime
/// pointer-diff checks.
void tryToEnablePartialAliasMasking() {
  assert(foldTailByMasking() && "Expected tail folding to be enabled!");
  assert(!foldTailWithEVL() &&
         "Did not expect to enable alias masking with EVL!");
  assert(PartialAliasMaskingStatus == AliasMaskingStatus::NotDecided);

  // Assume we fail to enable alias masking (in case we early exit).
  PartialAliasMaskingStatus = AliasMaskingStatus::Disabled;

  // Note: FixedOrderRecurrences are not supported yet as we cannot handle
  // the required `splice.right` with the alias-mask.
  if (!ForcePartialAliasingVectorization ||
      !Legal->getFixedOrderRecurrences().empty())
    return;

  const RuntimePointerChecking *Checks = Legal->getRuntimePointerChecking();
  if (!Checks)
    return;

  // Alias masking replaces the pointer-diff checks; without any there is
  // nothing to mask.
  auto DiffChecks = Checks->getDiffChecks();
  if (!DiffChecks || DiffChecks->empty())
    return;

  // Only referenced inside the assert below; unused in NDEBUG builds.
  [[maybe_unused]] auto HasPointerArgs = [](CallBase *CB) {
    return any_of(CB->args(), [](Value const *Arg) {
      return Arg->getType()->isPointerTy();
    });
  };

  for (BasicBlock *BB : TheLoop->blocks()) {
    for (Instruction &I : *BB) {
      if (!isa<LoadInst, StoreInst>(I)) {
        [[maybe_unused]] auto *Call = dyn_cast<CallInst>(&I);
        // Parenthesize the conjunction explicitly: `&&` binds tighter than
        // `||`, and the unparenthesized form trips -Wlogical-op-parentheses.
        assert((!I.mayReadOrWriteMemory() ||
                (Call && !HasPointerArgs(Call))) &&
               "Skipped unexpected memory access");
        continue;
      }

      Type *ScalarTy = getLoadStoreType(&I);
      Value *Ptr = getLoadStorePointerOperand(&I);

      // Currently, we can't handle alias masking in reverse. Reversing the
      // alias mask is not correct (or necessary). When combined with
      // tail-folding the active lane mask should only be reversed where the
      // alias-mask is true.
      if (Legal->isConsecutivePtr(ScalarTy, Ptr) == -1)
        return;
    }
  }

  PartialAliasMaskingStatus = AliasMaskingStatus::Enabled;
}

/// Returns true if all loop blocks should have partial aliases masked.
/// Note: NotDecided also reads as "not enabled" here — callers that need a
/// final answer should only query this after the status has been decided.
bool maskPartialAliasing() const {
  return PartialAliasMaskingStatus == AliasMaskingStatus::Enabled;
}

/// Returns true if the use of wide lane masks is requested and the loop is
/// using tail-folding with a lane mask for control flow.
bool useWideActiveLaneMask() const {
Expand Down Expand Up @@ -1488,6 +1554,33 @@ class LoopVectorizationCostModel {
/// initialized during object construction.
std::optional<unsigned> VScaleForTuning;

/// Wrapper around LoopVectorizationLegality::isUniform() that accounts for
/// alias masking: when enabled, the effective runtime VF is unknown, so the
/// legality query is made without a concrete VF.
bool isUniform(Value *V, ElementCount VF) const {
  // With alias-masking our runtime VF is [2, VF] (and not necessarily a
  // power-of-two). Something that is uniform for VF may not be for the full
  // range.
  assert(PartialAliasMaskingStatus != AliasMaskingStatus::NotDecided &&
         "alias-mask status must be decided already");
  std::optional<ElementCount> QueryVF;
  if (PartialAliasMaskingStatus == AliasMaskingStatus::Disabled)
    QueryVF = VF;
  return Legal->isUniform(V, QueryVF);
}

/// Wrapper around LoopVectorizationLegality::isUniformMemOp() that accounts
/// for alias masking: when enabled, the effective runtime VF is unknown, so
/// the legality query is made without a concrete VF.
bool isUniformMemOp(Instruction &I, ElementCount VF) const {
  assert(PartialAliasMaskingStatus != AliasMaskingStatus::NotDecided &&
         "alias-mask status must be decided already");
  std::optional<ElementCount> QueryVF;
  if (PartialAliasMaskingStatus == AliasMaskingStatus::Disabled)
    QueryVF = VF;
  return Legal->isUniformMemOp(I, QueryVF);
}

/// Initializes the value of vscale used for tuning the cost model. If
/// vscale_range.min == vscale_range.max then return vscale_range.max, else
/// return the value returned by the corresponding TTI method.
Expand Down Expand Up @@ -1592,6 +1685,9 @@ class LoopVectorizationCostModel {
/// Control finally chosen tail folding style.
TailFoldingStyle ChosenTailFoldingStyle = TailFoldingStyle::None;

/// If partial alias masking is enabled/disabled or not decided.
AliasMaskingStatus PartialAliasMaskingStatus = AliasMaskingStatus::NotDecided;

/// true if scalable vectorization is supported and enabled.
std::optional<bool> IsScalableVectorizationAllowed;

Expand Down Expand Up @@ -1813,14 +1909,18 @@ class GeneratedRTChecks {
/// The kind of cost that we are calculating
TTI::TargetCostKind CostKind;

/// True if the loop is alias-masked (which allows us to omit diff checks).
bool LoopUsesAliasMasking = false;

public:
/// Construct the runtime-check helper. \p LoopUsesAliasMasking records
/// whether the vectorized loop guards memory accesses with an alias mask,
/// in which case the pointer-diff memory-check block is not emitted (see
/// the RtPtrChecking.Need handling below).
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                  LoopInfo *LI, TargetTransformInfo *TTI,
                  TTI::TargetCostKind CostKind, bool LoopUsesAliasMasking)
    : DT(DT), LI(LI), TTI(TTI),
      SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
      MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
      PSE(PSE), CostKind(CostKind),
      LoopUsesAliasMasking(LoopUsesAliasMasking) {}

/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
Expand Down Expand Up @@ -1873,7 +1973,10 @@ class GeneratedRTChecks {
}

const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
if (RtPtrChecking.Need) {
// TODO: We need to estimate the cost of alias-masking in
// GeneratedRTChecks::getCost(). We can't check the MemCheckBlock as the
// alias-mask is generated later in VPlan.
if (RtPtrChecking.Need && !LoopUsesAliasMasking) {
auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
"vector.memcheck");
Expand Down Expand Up @@ -3057,7 +3160,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (Iter != Uniforms.end() && !Iter->second.contains(I))
return false;
}
if (!Legal->isUniformMemOp(*I, VF))
if (!isUniformMemOp(*I, VF))
return false;
if (isa<LoadInst>(I))
// Loading the same address always produces the same result - at least
Expand Down Expand Up @@ -3134,7 +3237,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {

// If the pointer can be proven to be uniform, always add it to the
// worklist.
if (isa<Instruction>(Ptr) && Legal->isUniform(Ptr, VF))
if (isa<Instruction>(Ptr) && isUniform(Ptr, VF))
AddToWorklistIfAllowed(cast<Instruction>(Ptr));

if (IsUniformMemOpUse(&I))
Expand Down Expand Up @@ -3453,6 +3556,12 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(

FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// Make sure once we return PartialAliasMaskingStatus is not "NotDecided".
scope_exit EnsureAliasMaskingStatusIsDecidedOnReturn([this] {
if (PartialAliasMaskingStatus == AliasMaskingStatus::NotDecided)
PartialAliasMaskingStatus = AliasMaskingStatus::Disabled;
});

if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
// TODO: It may be useful to do since it's still likely to be dynamically
// uniform if the target can skip.
Expand Down Expand Up @@ -3628,6 +3737,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
assert(ContainsScalableVF && "Expected scalable vector factor.");

MaxFactors.FixedVF = ElementCount::getFixed(1);
} else {
tryToEnablePartialAliasMasking();
}
return MaxFactors;
}
Expand Down Expand Up @@ -4351,6 +4462,13 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
return Result;
}

if (CM.maskPartialAliasing()) {
LLVM_DEBUG(
dbgs()
<< "LEV: Epilogue vectorization not supported with alias masking.\n");
return Result;
}

// Not really a cost consideration, but check for unsupported cases here to
// simplify the logic.
if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
Expand Down Expand Up @@ -5277,7 +5395,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
ElementCount VF) {
assert(Legal->isUniformMemOp(*I, VF));
assert(isUniformMemOp(*I, VF));

Type *ValTy = getLoadStoreType(I);
Type *PtrTy = getLoadStorePointerOperand(I)->getType();
Expand Down Expand Up @@ -5316,7 +5434,7 @@ LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
Value *Ptr = getLoadStorePointerOperand(I);
Type *PtrTy = Ptr->getType();

if (!Legal->isUniform(Ptr, VF))
if (!isUniform(Ptr, VF))
PtrTy = toVectorTy(PtrTy, VF);

unsigned IID = I->getOpcode() == Instruction::Load
Expand Down Expand Up @@ -5645,7 +5763,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
NumPredStores++;

if (Legal->isUniformMemOp(I, VF)) {
if (isUniformMemOp(I, VF)) {
auto IsLegalToScalarize = [&]() {
if (!VF.isScalable())
// Scalarization of fixed length vectors "just works".
Expand Down Expand Up @@ -5816,7 +5934,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
InstWidening Decision = getWideningDecision(I, VF);
if (!isPredicatedInst(I) &&
(Decision == CM_Widen || Decision == CM_Widen_Reverse ||
(!Legal->isUniformMemOp(*I, VF) && Decision == CM_Scalarize))) {
(!isUniformMemOp(*I, VF) && Decision == CM_Scalarize))) {
// Scalarize a widened load of address or update the cost of a scalar
// load of an address.
setWideningDecision(
Expand Down Expand Up @@ -7298,6 +7416,14 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
BestVPlan, BestVF, VScale);
}

if (CM.maskPartialAliasing()) {
assert(CM.foldTailByMasking() && "Expected tail folding to be enabled");
VPlanTransforms::materializeAliasMaskCheckBlock(
BestVPlan, *CM.Legal->getRuntimePointerChecking()->getDiffChecks(),
HasBranchWeights);
++LoopsPartialAliasVectorized;
}

// Retrieving VectorPH now when it's easier while VPlan still has Regions.
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());

Expand Down Expand Up @@ -8215,6 +8341,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow);
}

if (CM.maskPartialAliasing())
RUN_VPLAN_PASS(VPlanTransforms::attachAliasMaskToHeaderMask, *Plan);

assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
return Plan;
}
Expand Down Expand Up @@ -8590,7 +8719,10 @@ static bool processLoopInVPlanNativePath(
VPlan &BestPlan = LVP.getPlanFor(VF.Width);

{
GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
assert(!CM.maskPartialAliasing() &&
"Did not expect to alias-mask outer loop");
GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
/*UsesAliasMasking=*/false);
InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
Expand Down Expand Up @@ -9429,7 +9561,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (ORE->allowExtraAnalysis(LV_NAME))
LVP.emitInvalidCostRemarks(ORE);

GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
CM.maskPartialAliasing());
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
Expand Down Expand Up @@ -9539,6 +9672,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;

if (CM.maskPartialAliasing()) {
LLVM_DEBUG(
dbgs()
<< "LV: Not interleaving due to partial aliasing vectorization.\n");
IntDiagMsg = {
"PartialAliasingVectorization",
"Unable to interleave due to partial aliasing vectorization."};
InterleaveLoop = false;
IC = 1;
}

// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
Expand Down
Loading