[LV] Add support for partial alias masking with tail folding#182457
[LV] Add support for partial alias masking with tail folding#182457
Conversation
|
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-llvm-analysis Author: Benjamin Maxwell (MacDue) ChangesThis patch adds basic support for partial alias masking, which allows entering the vector loop even when there is aliasing within a single vector iteration. It does this by clamping the VF to the safe distance between pointers. This allows the runtime VF to be anywhere from 2 to the "static" VF. Conceptually, this transform looks like: (before/after loop diagram omitted in this text extract) This initial patch has a number of limitations: (list omitted in this text extract)
This PR supersedes #100579 (closes #100579). Patch is 80.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/182457.diff 17 Files Affected:
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index d4083c49626fe..e3cf650ddb76b 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -170,6 +170,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
return (ScalarOpdIdx == 2);
case Intrinsic::experimental_vp_splice:
return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
+ case Intrinsic::loop_dependence_war_mask:
+ return true;
default:
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 54bb073eb4f81..1019849b1d011 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -668,6 +668,10 @@ class LoopVectorizationPlanner {
void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks,
bool HasBranchWeights) const;
+ VPValue *materializeAliasMask(VPlan &Plan,
+ ArrayRef<PointerDiffInfo> DiffChecks,
+ bool HasBranchWeights);
+
#ifndef NDEBUG
/// \return The most profitable vectorization factor for the available VPlans
/// and the cost of that VF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6299e8c2dbd32..5bf474a89157b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -170,6 +170,8 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
+STATISTIC(LoopsPartialAliasVectorized,
+ "Number of partial aliasing loops vectorized");
static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -198,6 +200,10 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
"vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum allowed number of runtime memory checks"));
+static cl::opt<bool> ForcePartialAliasingVectorization(
+ "force-partial-aliasing-vectorization", cl::init(false), cl::Hidden,
+ cl::desc("Replace pointer diff checks with alias masks."));
+
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -1386,6 +1392,42 @@ class LoopVectorizationCostModel {
return getTailFoldingStyle() != TailFoldingStyle::None;
}
+ void checkIfPartialAliasMaskingIsEnabled() {
+ assert(foldTailByMasking() && "Expected tail folding to be enabled!");
+ assert(!IsPartialAliasMaskingEnabled &&
+ "Partial alias masking already checked!");
+ if (!ForcePartialAliasingVectorization ||
+ !Legal->getFixedOrderRecurrences().empty()) {
+ // Note: FixedOrderRecurrences are not supported yet as we cannot handle
+ // the required `splice.right` with the alias-mask.
+ IsPartialAliasMaskingEnabled = false;
+ return;
+ }
+ const RuntimePointerChecking *Checks = Legal->getRuntimePointerChecking();
+ if (!Checks) {
+ // Runtime checks not needed for this loop (no alias mask required).
+ IsPartialAliasMaskingEnabled = false;
+ return;
+ }
+ if (auto DiffChecks = Checks->getDiffChecks()) {
+ // We have diff checks. We can use an alias mask.
+ IsPartialAliasMaskingEnabled = !DiffChecks->empty();
+ return;
+ }
+ // Runtime checks are not diff checks (can't be replaced with alias mask).
+ IsPartialAliasMaskingEnabled = false;
+ }
+
+ void disablePartialAliasMaskingIfEnabled() {
+ if (IsPartialAliasMaskingEnabled)
+ IsPartialAliasMaskingEnabled = false;
+ }
+
+ /// Returns true if all loop blocks should have partial aliases masked.
+ bool maskPartialAliasing() const {
+ return IsPartialAliasMaskingEnabled.value_or(false);
+ }
+
/// Returns true if the use of wide lane masks is requested and the loop is
/// using tail-folding with a lane mask for control flow.
bool useWideActiveLaneMask() const {
@@ -1604,6 +1646,9 @@ class LoopVectorizationCostModel {
std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
ChosenTailFoldingStyle;
+ /// true if partial alias masking is enabled (nullopt = undecided).
+ std::optional<bool> IsPartialAliasMaskingEnabled;
+
/// true if scalable vectorization is supported and enabled.
std::optional<bool> IsScalableVectorizationAllowed;
@@ -1825,14 +1870,18 @@ class GeneratedRTChecks {
/// The kind of cost that we are calculating
TTI::TargetCostKind CostKind;
+ /// True if the loop is alias-masked (which allows us to omit diff checks).
+ bool LoopUsesAliasMasking = false;
+
public:
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
LoopInfo *LI, TargetTransformInfo *TTI,
- TTI::TargetCostKind CostKind)
+ TTI::TargetCostKind CostKind, bool LoopUsesAliasMasking)
: DT(DT), LI(LI), TTI(TTI),
SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
- PSE(PSE), CostKind(CostKind) {}
+ PSE(PSE), CostKind(CostKind),
+ LoopUsesAliasMasking(LoopUsesAliasMasking) {}
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
@@ -1885,7 +1934,7 @@ class GeneratedRTChecks {
}
const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
- if (RtPtrChecking.Need) {
+ if (RtPtrChecking.Need && !LoopUsesAliasMasking) {
auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
"vector.memcheck");
@@ -3088,10 +3137,17 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
auto *Ptr = getLoadStorePointerOperand(I);
auto *ScalarTy = getLoadStoreType(I);
+ int Stride = Legal->isConsecutivePtr(ScalarTy, Ptr);
// In order to be widened, the pointer should be consecutive, first of all.
- if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
+ if (!Stride)
return false;
+ // Currently, we can't handle alias masking in reverse. Reversing the alias
+ // mask is not correct (or necessary). When combined with tail-folding the ALM
+ // should only be reversed where the alias-mask is true.
+ if (Stride < 0)
+ disablePartialAliasMaskingIfEnabled();
+
// If the instruction is a store located in a predicated block, it will be
// scalarized.
if (isScalarWithPredication(I, VF))
@@ -3747,6 +3803,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
assert(ContainsScalableVF && "Expected scalable vector factor.");
MaxFactors.FixedVF = ElementCount::getFixed(1);
+ } else {
+ checkIfPartialAliasMaskingIsEnabled();
}
return MaxFactors;
}
@@ -4465,6 +4523,13 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
return Result;
}
+ if (CM.maskPartialAliasing()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LEV: Epilogue vectorization not supported with alias masking");
+ return Result;
+ }
+
// Not really a cost consideration, but check for unsupported cases here to
// simplify the logic.
if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
@@ -7445,6 +7510,14 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// compactness.
attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
+ VPValue *ClampedVF = nullptr;
+ if (CM.maskPartialAliasing()) {
+ ClampedVF = materializeAliasMask(
+ BestVPlan, *CM.Legal->getRuntimePointerChecking()->getDiffChecks(),
+ HasBranchWeights);
+ ++LoopsPartialAliasVectorized;
+ }
+
// Retrieving VectorPH now when it's easier while VPlan still has Regions.
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
@@ -7481,6 +7554,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::materializeVectorTripCount(
BestVPlan, VectorPH, CM.foldTailByMasking(),
CM.requiresScalarEpilogue(BestVF.isVector()));
+ VPlanTransforms::fixupVFUsersForClampedVF(BestVPlan, ClampedVF);
VPlanTransforms::materializeFactors(BestVPlan, VectorPH, BestVF);
VPlanTransforms::cse(BestVPlan);
VPlanTransforms::simplifyRecipes(BestVPlan);
@@ -8694,6 +8768,21 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
}
}
+VPValue *LoopVectorizationPlanner::materializeAliasMask(
+ VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
+ VPBasicBlock *MinVFCheck = Plan.createVPBasicBlock("vector.min.vf.check");
+ VPValue *ClampedVF = VPlanTransforms::materializeAliasMask(
+ Plan, MinVFCheck,
+ *CM.Legal->getRuntimePointerChecking()->getDiffChecks());
+ VPBuilder Builder(MinVFCheck);
+ Type *IVTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
+ // Check the "ClampedVF" from the alias mask contains at least two elements.
+ VPValue *Cond = Builder.createICmp(
+ CmpInst::ICMP_ULT, ClampedVF, Plan.getConstantInt(IVTy, 2), {}, "cmp.vf");
+ VPlanTransforms::attachCheckBlock(Plan, Cond, MinVFCheck, HasBranchWeights);
+ return ClampedVF;
+}
+
void LoopVectorizationPlanner::addMinimumIterationCheck(
VPlan &Plan, ElementCount VF, unsigned UF,
ElementCount MinProfitableTripCount) const {
@@ -8806,7 +8895,8 @@ static bool processLoopInVPlanNativePath(
VPlan &BestPlan = LVP.getPlanFor(VF.Width);
{
- GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
+ GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
+ CM.maskPartialAliasing());
InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -9677,7 +9767,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (ORE->allowExtraAnalysis(LV_NAME))
LVP.emitInvalidCostRemarks(ORE);
- GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
+ GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
+ CM.maskPartialAliasing());
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
@@ -9796,6 +9887,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
IC = 1;
}
+ if (CM.maskPartialAliasing()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Not interleaving due to partial aliasing vectorization.\n");
+ IntDiagMsg = {
+ "PartialAliasingVectorization",
+ "Unable to interleave due to partial aliasing vectorization."};
+ InterleaveLoop = false;
+ IC = 1;
+ }
+
// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a0c23df0b3c38..bd61fd4c92310 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1209,8 +1209,9 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
// part if it is scalar. In the latter case, the recipe will be removed
// during unrolling.
ExtractPenultimateElement,
- LogicalAnd, // Non-poison propagating logical And.
- LogicalOr, // Non-poison propagating logical Or.
+ LogicalAnd, // Non-poison propagating logical And.
+ LogicalOr, // Non-poison propagating logical Or.
+ NumActiveLanes, // Counts the number of active lanes in a mask.
// Add an offset in bytes (second operand) to a base pointer (first
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 4b744b9128171..d552e1cb2c38c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -147,6 +147,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return inferScalarType(R->getOperand(0));
case Instruction::ExtractValue:
return cast<ExtractValueInst>(R->getUnderlyingValue())->getType();
+ case VPInstruction::NumActiveLanes:
+ return Type::getInt64Ty(Ctx);
default:
break;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 1af7392b904da..683c4c9bad465 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1030,13 +1030,19 @@ static void addBypassBranch(VPlan &Plan, VPBasicBlock *CheckBlockVPBB,
}
}
+void VPlanTransforms::attachCheckBlock(VPlan &Plan, VPValue *Cond,
+ VPBasicBlock *CheckBlock,
+ bool AddBranchWeights) {
+ insertCheckBlockBeforeVectorLoop(Plan, CheckBlock);
+ addBypassBranch(Plan, CheckBlock, Cond, AddBranchWeights);
+}
+
void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
BasicBlock *CheckBlock,
bool AddBranchWeights) {
VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock);
- insertCheckBlockBeforeVectorLoop(Plan, CheckBlockVPBB);
- addBypassBranch(Plan, CheckBlockVPBB, CondVPV, AddBranchWeights);
+ attachCheckBlock(Plan, CondVPV, CheckBlockVPBB, AddBranchWeights);
}
void VPlanTransforms::addMinimumIterationCheck(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 33cb1509565d5..e9cc6d27381f3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -461,6 +461,7 @@ unsigned VPInstruction::getNumOperandsForOpcode() const {
case VPInstruction::ResumeForEpilogue:
case VPInstruction::Reverse:
case VPInstruction::Unpack:
+ case VPInstruction::NumActiveLanes:
return 1;
case Instruction::ICmp:
case Instruction::FCmp:
@@ -609,6 +610,20 @@ Value *VPInstruction::generate(VPTransformState &State) {
{PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, Name);
}
+ case VPInstruction::NumActiveLanes: {
+ Value *Op = State.get(getOperand(0));
+ auto *VecTy = cast<VectorType>(Op->getType());
+ assert(VecTy->getScalarSizeInBits() == 1 &&
+ "NumActiveLanes only implemented for i1 vectors");
+
+ Value *ZExt = Builder.CreateCast(
+ Instruction::ZExt, Op,
+ VectorType::get(Builder.getInt32Ty(), VecTy->getElementCount()));
+ Value *Count =
+ Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, ZExt);
+ return Builder.CreateCast(Instruction::ZExt, Count, Builder.getInt64Ty(),
+ "num.active.lanes");
+ }
case VPInstruction::FirstOrderRecurrenceSplice: {
// Generate code to combine the previous and current values in vector v3.
//
@@ -1271,7 +1286,8 @@ bool VPInstruction::isVectorToScalar() const {
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ExtractLastActive ||
getOpcode() == VPInstruction::ComputeReductionResult ||
- getOpcode() == VPInstruction::AnyOf;
+ getOpcode() == VPInstruction::AnyOf ||
+ getOpcode() == VPInstruction::NumActiveLanes;
}
bool VPInstruction::isSingleScalar() const {
@@ -1545,6 +1561,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractLastActive:
O << "extract-last-active";
break;
+ case VPInstruction::NumActiveLanes:
+ O << "num-active-lanes";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 22a8edaf30eb6..ee301712b6fcb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5112,6 +5112,73 @@ void VPlanTransforms::materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
"VF, UF, and VFxUF not expected to be used");
}
+VPValue *
+VPlanTransforms::materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheck,
+ ArrayRef<PointerDiffInfo> DiffChecks) {
+
+ VPBuilder Builder(AliasCheck, AliasCheck->begin());
+ Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
+ Type *I64Ty = IntegerType::getInt64Ty(Plan.getContext());
+ Type *PtrTy = PointerType::getUnqual(Plan.getContext());
+
+ VPValue *AliasMask = nullptr;
+ for (PointerDiffInfo Check : DiffChecks) {
+ VPValue *Src = vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SrcStart);
+ VPValue *Sink =
+ vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SinkStart);
+
+ VPValue *SrcPtr =
+ Builder.createScalarCast(Instruction::CastOps::IntToPtr, Src, PtrTy,
+ DebugLoc::getCompilerGenerated());
+ VPValue *SinkPtr =
+ Builder.createScalarCast(Instruction::CastOps::IntToPtr, Sink, PtrTy,
+ DebugLoc::getCompilerGenerated());
+
+ VPWidenIntrinsicRecipe *WARMask = new VPWidenIntrinsicRecipe(
+ Intrinsic::loop_dependence_war_mask,
+ {SrcPtr, SinkPtr, Plan.getConstantInt(I64Ty, Check.AccessSize)}, I1Ty);
+ Builder.insert(WARMask);
+
+ if (AliasMask)
+ AliasMask = Builder.createAnd(AliasMask, WARMask);
+ else
+ AliasMask = WARMask;
+ }
+
+ Type *IVTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
+ VPValue *NumActive =
+ Builder.createNaryOp(VPInstruction::NumActiveLanes, {AliasMask});
+ VPValue *ClampedVF = Builder.createScalarZExtOrTrunc(
+ NumActive, IVTy, I64Ty, DebugLoc::getCompilerGenerated());
+
+ // Find the existing header mask.
+ VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
+ auto *HeaderMaskDef = HeaderMask->getDefiningRecipe();
+ if (HeaderMaskDef->isPhi())
+ Builder.setInsertPoint(&*HeaderMaskDef->getParent()->getFirstNonPhi());
+ else
+ Builder = VPBuilder::getToInsertAfter(HeaderMaskDef);
+
+ // Update all existing users of the header mask to "HeaderMask & AliasMask".
+ auto *ClampedHeaderMask = Builder.createAnd(HeaderMask, AliasMask);
+ HeaderMask->replaceUsesWithIf(ClampedHeaderMask, [&](VPUser &U, unsigned) {
+ return dyn_cast<VPInstruction>(&U) != ClampedHeaderMask;
+ });
+
+ return ClampedVF;
+}
+
+void VPlanTransforms::fixupVFUsersForClampedVF(VPlan &Plan,
+ VPValue *ClampedVF) {
+ if (!ClampedVF)
+ return;
+
+ assert(Plan.getConcreteUF() == 1 &&
+ "Clamped VF not support with interleaving");
+ Plan.getVF().replaceAllUsesWith(ClampedVF);
+ Plan.getVFxUF().replaceAllUsesWith(ClampedVF);
+}
+
DenseMap<const SCEV *, Value *>
VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index f2dfc166cecc9..292a97b61817c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -180,6 +180,8 @@ struct VPlanTransforms {
/// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
/// VPValue and connect the block to \p Plan, using the VPValue as branch
/// condition.
+ static void attachCheckBlock(VPlan &Plan, VPValue *Cond,
+ ...
[truncated]
|
|
@llvm/pr-subscribers-llvm-transforms Author: Benjamin Maxwell (MacDue) ChangesThis patch adds basic support for partial alias masking, which allows entering the vector loop even when there is aliasing within a single vector iteration. It does this by clamping the VF to the safe distance between pointers. This allows the runtime VF to be anywhere from 2 to the "static" VF. Conceptually, this transform looks like: (before/after loop diagram omitted in this text extract) This initial patch has a number of limitations: (list omitted in this text extract)
This PR supersedes #100579 (closes #100579). Patch is 80.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/182457.diff 17 Files Affected:
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index d4083c49626fe..e3cf650ddb76b 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -170,6 +170,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
return (ScalarOpdIdx == 2);
case Intrinsic::experimental_vp_splice:
return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
+ case Intrinsic::loop_dependence_war_mask:
+ return true;
default:
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 54bb073eb4f81..1019849b1d011 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -668,6 +668,10 @@ class LoopVectorizationPlanner {
void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks,
bool HasBranchWeights) const;
+ VPValue *materializeAliasMask(VPlan &Plan,
+ ArrayRef<PointerDiffInfo> DiffChecks,
+ bool HasBranchWeights);
+
#ifndef NDEBUG
/// \return The most profitable vectorization factor for the available VPlans
/// and the cost of that VF.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6299e8c2dbd32..5bf474a89157b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -170,6 +170,8 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
+STATISTIC(LoopsPartialAliasVectorized,
+ "Number of partial aliasing loops vectorized");
static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -198,6 +200,10 @@ static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
"vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
cl::desc("The maximum allowed number of runtime memory checks"));
+static cl::opt<bool> ForcePartialAliasingVectorization(
+ "force-partial-aliasing-vectorization", cl::init(false), cl::Hidden,
+ cl::desc("Replace pointer diff checks with alias masks."));
+
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -1386,6 +1392,42 @@ class LoopVectorizationCostModel {
return getTailFoldingStyle() != TailFoldingStyle::None;
}
+ void checkIfPartialAliasMaskingIsEnabled() {
+ assert(foldTailByMasking() && "Expected tail folding to be enabled!");
+ assert(!IsPartialAliasMaskingEnabled &&
+ "Partial alias masking already checked!");
+ if (!ForcePartialAliasingVectorization ||
+ !Legal->getFixedOrderRecurrences().empty()) {
+ // Note: FixedOrderRecurrences are not supported yet as we cannot handle
+ // the required `splice.right` with the alias-mask.
+ IsPartialAliasMaskingEnabled = false;
+ return;
+ }
+ const RuntimePointerChecking *Checks = Legal->getRuntimePointerChecking();
+ if (!Checks) {
+ // Runtime checks not needed for this loop (no alias mask required).
+ IsPartialAliasMaskingEnabled = false;
+ return;
+ }
+ if (auto DiffChecks = Checks->getDiffChecks()) {
+ // We have diff checks. We can use an alias mask.
+ IsPartialAliasMaskingEnabled = !DiffChecks->empty();
+ return;
+ }
+ // Runtime checks are not diff checks (can't be replaced with alias mask).
+ IsPartialAliasMaskingEnabled = false;
+ }
+
+ void disablePartialAliasMaskingIfEnabled() {
+ if (IsPartialAliasMaskingEnabled)
+ IsPartialAliasMaskingEnabled = false;
+ }
+
+ /// Returns true if all loop blocks should have partial aliases masked.
+ bool maskPartialAliasing() const {
+ return IsPartialAliasMaskingEnabled.value_or(false);
+ }
+
/// Returns true if the use of wide lane masks is requested and the loop is
/// using tail-folding with a lane mask for control flow.
bool useWideActiveLaneMask() const {
@@ -1604,6 +1646,9 @@ class LoopVectorizationCostModel {
std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
ChosenTailFoldingStyle;
+ /// true if partial alias masking is enabled (nullopt = undecided).
+ std::optional<bool> IsPartialAliasMaskingEnabled;
+
/// true if scalable vectorization is supported and enabled.
std::optional<bool> IsScalableVectorizationAllowed;
@@ -1825,14 +1870,18 @@ class GeneratedRTChecks {
/// The kind of cost that we are calculating
TTI::TargetCostKind CostKind;
+ /// True if the loop is alias-masked (which allows us to omit diff checks).
+ bool LoopUsesAliasMasking = false;
+
public:
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
LoopInfo *LI, TargetTransformInfo *TTI,
- TTI::TargetCostKind CostKind)
+ TTI::TargetCostKind CostKind, bool LoopUsesAliasMasking)
: DT(DT), LI(LI), TTI(TTI),
SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
- PSE(PSE), CostKind(CostKind) {}
+ PSE(PSE), CostKind(CostKind),
+ LoopUsesAliasMasking(LoopUsesAliasMasking) {}
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
/// accurately estimate the cost of the runtime checks. The blocks are
@@ -1885,7 +1934,7 @@ class GeneratedRTChecks {
}
const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
- if (RtPtrChecking.Need) {
+ if (RtPtrChecking.Need && !LoopUsesAliasMasking) {
auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
"vector.memcheck");
@@ -3088,10 +3137,17 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
auto *Ptr = getLoadStorePointerOperand(I);
auto *ScalarTy = getLoadStoreType(I);
+ int Stride = Legal->isConsecutivePtr(ScalarTy, Ptr);
// In order to be widened, the pointer should be consecutive, first of all.
- if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
+ if (!Stride)
return false;
+ // Currently, we can't handle alias masking in reverse. Reversing the alias
+ // mask is not correct (or necessary). When combined with tail-folding the ALM
+ // should only be reversed where the alias-mask is true.
+ if (Stride < 0)
+ disablePartialAliasMaskingIfEnabled();
+
// If the instruction is a store located in a predicated block, it will be
// scalarized.
if (isScalarWithPredication(I, VF))
@@ -3747,6 +3803,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
assert(ContainsScalableVF && "Expected scalable vector factor.");
MaxFactors.FixedVF = ElementCount::getFixed(1);
+ } else {
+ checkIfPartialAliasMaskingIsEnabled();
}
return MaxFactors;
}
@@ -4465,6 +4523,13 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
return Result;
}
+ if (CM.maskPartialAliasing()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LEV: Epilogue vectorization not supported with alias masking");
+ return Result;
+ }
+
// Not really a cost consideration, but check for unsupported cases here to
// simplify the logic.
if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
@@ -7445,6 +7510,14 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// compactness.
attachRuntimeChecks(BestVPlan, ILV.RTChecks, HasBranchWeights);
+ VPValue *ClampedVF = nullptr;
+ if (CM.maskPartialAliasing()) {
+ ClampedVF = materializeAliasMask(
+ BestVPlan, *CM.Legal->getRuntimePointerChecking()->getDiffChecks(),
+ HasBranchWeights);
+ ++LoopsPartialAliasVectorized;
+ }
+
// Retrieving VectorPH now when it's easier while VPlan still has Regions.
VPBasicBlock *VectorPH = cast<VPBasicBlock>(BestVPlan.getVectorPreheader());
@@ -7481,6 +7554,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
VPlanTransforms::materializeVectorTripCount(
BestVPlan, VectorPH, CM.foldTailByMasking(),
CM.requiresScalarEpilogue(BestVF.isVector()));
+ VPlanTransforms::fixupVFUsersForClampedVF(BestVPlan, ClampedVF);
VPlanTransforms::materializeFactors(BestVPlan, VectorPH, BestVF);
VPlanTransforms::cse(BestVPlan);
VPlanTransforms::simplifyRecipes(BestVPlan);
@@ -8694,6 +8768,21 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
}
}
+VPValue *LoopVectorizationPlanner::materializeAliasMask(
+ VPlan &Plan, ArrayRef<PointerDiffInfo> DiffChecks, bool HasBranchWeights) {
+ VPBasicBlock *MinVFCheck = Plan.createVPBasicBlock("vector.min.vf.check");
+ VPValue *ClampedVF = VPlanTransforms::materializeAliasMask(
+ Plan, MinVFCheck,
+ *CM.Legal->getRuntimePointerChecking()->getDiffChecks());
+ VPBuilder Builder(MinVFCheck);
+ Type *IVTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
+ // Check the "ClampedVF" from the alias mask contains at least two elements.
+ VPValue *Cond = Builder.createICmp(
+ CmpInst::ICMP_ULT, ClampedVF, Plan.getConstantInt(IVTy, 2), {}, "cmp.vf");
+ VPlanTransforms::attachCheckBlock(Plan, Cond, MinVFCheck, HasBranchWeights);
+ return ClampedVF;
+}
+
void LoopVectorizationPlanner::addMinimumIterationCheck(
VPlan &Plan, ElementCount VF, unsigned UF,
ElementCount MinProfitableTripCount) const {
@@ -8806,7 +8895,8 @@ static bool processLoopInVPlanNativePath(
VPlan &BestPlan = LVP.getPlanFor(VF.Width);
{
- GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
+ GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
+ CM.maskPartialAliasing());
InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
Checks, BestPlan);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -9677,7 +9767,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
if (ORE->allowExtraAnalysis(LV_NAME))
LVP.emitInvalidCostRemarks(ORE);
- GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
+ GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind,
+ CM.maskPartialAliasing());
if (LVP.hasPlanWithVF(VF.Width)) {
// Select the interleave count.
IC = LVP.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
@@ -9796,6 +9887,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
IC = 1;
}
+ if (CM.maskPartialAliasing()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Not interleaving due to partial aliasing vectorization.\n");
+ IntDiagMsg = {
+ "PartialAliasingVectorization",
+ "Unable to interleave due to partial aliasing vectorization."};
+ InterleaveLoop = false;
+ IC = 1;
+ }
+
// Emit diagnostic messages, if any.
const char *VAPassName = Hints.vectorizeAnalysisPassName();
if (!VectorizeLoop && !InterleaveLoop) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a0c23df0b3c38..bd61fd4c92310 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1209,8 +1209,9 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
// part if it is scalar. In the latter case, the recipe will be removed
// during unrolling.
ExtractPenultimateElement,
- LogicalAnd, // Non-poison propagating logical And.
- LogicalOr, // Non-poison propagating logical Or.
+ LogicalAnd, // Non-poison propagating logical And.
+ LogicalOr, // Non-poison propagating logical Or.
+ NumActiveLanes, // Counts the number of active lanes in a mask.
// Add an offset in bytes (second operand) to a base pointer (first
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 4b744b9128171..d552e1cb2c38c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -147,6 +147,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
return inferScalarType(R->getOperand(0));
case Instruction::ExtractValue:
return cast<ExtractValueInst>(R->getUnderlyingValue())->getType();
+ case VPInstruction::NumActiveLanes:
+ return Type::getInt64Ty(Ctx);
default:
break;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 1af7392b904da..683c4c9bad465 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -1030,13 +1030,19 @@ static void addBypassBranch(VPlan &Plan, VPBasicBlock *CheckBlockVPBB,
}
}
+void VPlanTransforms::attachCheckBlock(VPlan &Plan, VPValue *Cond,
+ VPBasicBlock *CheckBlock,
+ bool AddBranchWeights) {
+ insertCheckBlockBeforeVectorLoop(Plan, CheckBlock);
+ addBypassBranch(Plan, CheckBlock, Cond, AddBranchWeights);
+}
+
void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
BasicBlock *CheckBlock,
bool AddBranchWeights) {
VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock);
- insertCheckBlockBeforeVectorLoop(Plan, CheckBlockVPBB);
- addBypassBranch(Plan, CheckBlockVPBB, CondVPV, AddBranchWeights);
+ attachCheckBlock(Plan, CondVPV, CheckBlockVPBB, AddBranchWeights);
}
void VPlanTransforms::addMinimumIterationCheck(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 33cb1509565d5..e9cc6d27381f3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -461,6 +461,7 @@ unsigned VPInstruction::getNumOperandsForOpcode() const {
case VPInstruction::ResumeForEpilogue:
case VPInstruction::Reverse:
case VPInstruction::Unpack:
+ case VPInstruction::NumActiveLanes:
return 1;
case Instruction::ICmp:
case Instruction::FCmp:
@@ -609,6 +610,20 @@ Value *VPInstruction::generate(VPTransformState &State) {
{PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, Name);
}
+ case VPInstruction::NumActiveLanes: {
+ Value *Op = State.get(getOperand(0));
+ auto *VecTy = cast<VectorType>(Op->getType());
+ assert(VecTy->getScalarSizeInBits() == 1 &&
+ "NumActiveLanes only implemented for i1 vectors");
+
+ Value *ZExt = Builder.CreateCast(
+ Instruction::ZExt, Op,
+ VectorType::get(Builder.getInt32Ty(), VecTy->getElementCount()));
+ Value *Count =
+ Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, ZExt);
+ return Builder.CreateCast(Instruction::ZExt, Count, Builder.getInt64Ty(),
+ "num.active.lanes");
+ }
case VPInstruction::FirstOrderRecurrenceSplice: {
// Generate code to combine the previous and current values in vector v3.
//
@@ -1271,7 +1286,8 @@ bool VPInstruction::isVectorToScalar() const {
getOpcode() == VPInstruction::ComputeAnyOfResult ||
getOpcode() == VPInstruction::ExtractLastActive ||
getOpcode() == VPInstruction::ComputeReductionResult ||
- getOpcode() == VPInstruction::AnyOf;
+ getOpcode() == VPInstruction::AnyOf ||
+ getOpcode() == VPInstruction::NumActiveLanes;
}
bool VPInstruction::isSingleScalar() const {
@@ -1545,6 +1561,9 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractLastActive:
O << "extract-last-active";
break;
+ case VPInstruction::NumActiveLanes:
+ O << "num-active-lanes";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 22a8edaf30eb6..ee301712b6fcb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -5112,6 +5112,73 @@ void VPlanTransforms::materializeFactors(VPlan &Plan, VPBasicBlock *VectorPH,
"VF, UF, and VFxUF not expected to be used");
}
+VPValue *
+VPlanTransforms::materializeAliasMask(VPlan &Plan, VPBasicBlock *AliasCheck,
+ ArrayRef<PointerDiffInfo> DiffChecks) {
+
+ VPBuilder Builder(AliasCheck, AliasCheck->begin());
+ Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext());
+ Type *I64Ty = IntegerType::getInt64Ty(Plan.getContext());
+ Type *PtrTy = PointerType::getUnqual(Plan.getContext());
+
+ VPValue *AliasMask = nullptr;
+ for (PointerDiffInfo Check : DiffChecks) {
+ VPValue *Src = vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SrcStart);
+ VPValue *Sink =
+ vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SinkStart);
+
+ VPValue *SrcPtr =
+ Builder.createScalarCast(Instruction::CastOps::IntToPtr, Src, PtrTy,
+ DebugLoc::getCompilerGenerated());
+ VPValue *SinkPtr =
+ Builder.createScalarCast(Instruction::CastOps::IntToPtr, Sink, PtrTy,
+ DebugLoc::getCompilerGenerated());
+
+ VPWidenIntrinsicRecipe *WARMask = new VPWidenIntrinsicRecipe(
+ Intrinsic::loop_dependence_war_mask,
+ {SrcPtr, SinkPtr, Plan.getConstantInt(I64Ty, Check.AccessSize)}, I1Ty);
+ Builder.insert(WARMask);
+
+ if (AliasMask)
+ AliasMask = Builder.createAnd(AliasMask, WARMask);
+ else
+ AliasMask = WARMask;
+ }
+
+ Type *IVTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
+ VPValue *NumActive =
+ Builder.createNaryOp(VPInstruction::NumActiveLanes, {AliasMask});
+ VPValue *ClampedVF = Builder.createScalarZExtOrTrunc(
+ NumActive, IVTy, I64Ty, DebugLoc::getCompilerGenerated());
+
+ // Find the existing header mask.
+ VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan);
+ auto *HeaderMaskDef = HeaderMask->getDefiningRecipe();
+ if (HeaderMaskDef->isPhi())
+ Builder.setInsertPoint(&*HeaderMaskDef->getParent()->getFirstNonPhi());
+ else
+ Builder = VPBuilder::getToInsertAfter(HeaderMaskDef);
+
+ // Update all existing users of the header mask to "HeaderMask & AliasMask".
+ auto *ClampedHeaderMask = Builder.createAnd(HeaderMask, AliasMask);
+ HeaderMask->replaceUsesWithIf(ClampedHeaderMask, [&](VPUser &U, unsigned) {
+ return dyn_cast<VPInstruction>(&U) != ClampedHeaderMask;
+ });
+
+ return ClampedVF;
+}
+
+void VPlanTransforms::fixupVFUsersForClampedVF(VPlan &Plan,
+ VPValue *ClampedVF) {
+ if (!ClampedVF)
+ return;
+
+ assert(Plan.getConcreteUF() == 1 &&
+ "Clamped VF not support with interleaving");
+ Plan.getVF().replaceAllUsesWith(ClampedVF);
+ Plan.getVFxUF().replaceAllUsesWith(ClampedVF);
+}
+
DenseMap<const SCEV *, Value *>
VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index f2dfc166cecc9..292a97b61817c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -180,6 +180,8 @@ struct VPlanTransforms {
/// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a
/// VPValue and connect the block to \p Plan, using the VPValue as branch
/// condition.
+ static void attachCheckBlock(VPlan &Plan, VPValue *Cond,
+ ...
[truncated]
|
gbossu
left a comment
There was a problem hiding this comment.
I'll do a proper review later, but I left some high-level comments for now. Code looks clean though! :)
| VPlanTransforms::materializeVectorTripCount( | ||
| BestVPlan, VectorPH, CM.foldTailByMasking(), | ||
| CM.requiresScalarEpilogue(BestVF.isVector())); | ||
| VPlanTransforms::fixupVFUsersForClampedVF(BestVPlan, ClampedVF); |
There was a problem hiding this comment.
Nit: AFAIU, a ClampedVF (as in, runtime-clamped) is a new concept? Would it be worth using a different name maybe? I feel it's too easy to confuse with the clamping that the LoopVectorizationPlanner does, which is completely different. I guess we will only clamp down at runtime, so maybe something like CappedRuntimeVF?
Feel free to ignore if you have a different opinion, I just thought I'd write this because I also got confused with the ClampedVF name in the previous PR.
e828d42 to
eb03b88
Compare
llvm/test/Transforms/LoopVectorize/VPlan/AArch64/vplan-printing-alias-mask.ll
Outdated
Show resolved
Hide resolved
llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing-alias-mask.ll
Outdated
Show resolved
Hide resolved
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed. |
da77b59 to
5949679
Compare
I've already done that. Have a look inside |
| CM.requiresScalarEpilogue(BestVF.isVector())); | ||
| // Do a late fix-up of the VF to replace any additional users of VF since the | ||
| // alias mask was materialized. | ||
| VPlanTransforms::fixupVFUsersForClampedVF(BestVPlan, ClampedVF); |
There was a problem hiding this comment.
Continuing the discussion at #177599 (comment)
Otherwise we end up with an incorrect VPlan throughout the pipeline.
I don't see it as an incorrect VPlan. Before VF is materialized it's a symbolic value that represents the runtime VF for the plan. For plans with alias-masking the runtime VF will be the number of lanes in the mask.
I don't think that matches the definition of VF today with tail folding. With tail folding the VF is always just the vector width regardless of the number of active lanes, and we materialize the VF as the vector width. Otherwise wouldn't ClampedVF just be VF then?
EVL folding handles the "variable step" or clamped VF as its called in this PR with VPEVLBasedIVPHIRecipe. arcbbb is working on generalizing it so it can be reused in contexts like this, renaming it to VPCurrentIterationPHIRecipe: #177114
I'm not sure if this applies here? With alias-masking there is not a "variable step". The step is fixed/loop-invariant, it's just not known until the runtime (in the pre-header). It's not that dissimilar to scalable VFs in that regard.
Yeah the clamped VF isn't variable, but the thing that needs fixed up is the fact that the canonical IV increment is no longer VFxUF, so recipes need updated to reflect that. With scalable VFs the increment is still VFxUF.
My main concern is that we might have transforms that depend on e.g. VPWidenIntOrFpInductionRecipe/VPScalarIVStepsRecipe having the correct value. If another transform introduces a VPWidenIntOrFpInductionRecipe with Plan.getVF() as an operand in between materializeAliasMask and the late call to fixupVFUsersForClampedVF, then it won't see that the VF operand should really be clampedVF.
I think we need to go through and separate out the users of VPlan.getVF() and figure out which ones are the "number of elements processed this iteration" and which ones are the "width of the vector type". The former would become the clamped VF with partial alias masking, EVL with EVL tail folding, and the faulting lane in vp.load.ff. It can be loop variant or invariant. The latter is the VF as we know today.
This is probably a larger chunk of work so I wouldn't block this PR on it. But I'd like to agree on a long-term direction which allows us to keep the vplan correct throughout, and avoids duplicating work between all the different non-VFxUF incrementing types of vectorization.
There was a problem hiding this comment.
My main concern is that we might have transforms that depend on e.g. VPWidenIntOrFpInductionRecipe/VPScalarIVStepsRecipe having the correct value. If another transform introduces a VPWidenIntOrFpInductionRecipe with Plan.getVF() as an operand in between materializeAliasMask and the late call to fixupVFUsersForClampedVF, then it won't see that the VF operand should really be clampedVF.
I think ensuring this is the key point, together with directly materializing all values depending on the clamped VF. As for the first point, we should be able to automatically catch this #182318
e6bb1bc to
56ff8ab
Compare
sdesmalen-arm
left a comment
There was a problem hiding this comment.
Generally looks fine to me, barring a few nits.
| NumActive, IVTy, I64Ty, DebugLoc::getCompilerGenerated()); | ||
|
|
||
| // Find the existing header mask. | ||
| VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan); |
There was a problem hiding this comment.
It would probably be good to check for the header mask at the outset, and either assert that it has been found or don't perform the alias mask transform if we cannot find it;
I think the current findHeaderMask implementation may miss some cases at the current position, e.g. https://llvm.godbolt.org/z/doz3rbKd3
There was a problem hiding this comment.
It's too late not to perform alias-masking at this point, we've already not emitted the standard diff checks.
There was a problem hiding this comment.
I think it'd make more sense to introduce a symbolic value for the alias mask while it's still easy to find the header mask. Waiting so late and matching N-possible forms is bound to be brittle.
There was a problem hiding this comment.
I've updated things so the alias-mask is attached early (just after the active-lane-mask transform), so we don't need to match whatever the final optimized form is. This also will help with implementing the non-tail-folded case.
There was a problem hiding this comment.
It's too late not to perform alias-masking at this point, we've already not emitted the standard diff checks.
Right, curious if we actually need to make the decision early or if we could generate the diff checks up front, and then decide purely on the info in VPlan whether we use them or the alias mask approach? Diff checks + block should be automatically discarded when not used. Not saying this is necessarily needed for the initial patch, but may be a good thing to check as a follow-up.
4e15776 to
a73ac40
Compare
| /// Represents the loop-invariant alias of the vector loop region. | ||
| VPSymbolicValue AliasMask; |
There was a problem hiding this comment.
I think it would be preferable if we don't have to add a new VPSymbolicValue for the AliasMask, which in most cases won't be used.
Can we instead add a VPInstruction opcode?
There was a problem hiding this comment.
Purely out of curiosity: why is adding a new (potentially unused) VPSymbolicValue problematic? Is it because it then requires a new transformation to materialise it, and adding a new VPInstruction instead would require smaller and more localised code changes?
There was a problem hiding this comment.
Can we instead add a VPInstruction opcode?
I think it could be made to work, but it's less than ideal, since now it's a real instruction, that you have to find again in subsequent transforms, or store a reference to somewhere.
There was a problem hiding this comment.
I've now switched to using a VPInstruction in the vector preheader to emulate a VPSymbolicValue.
There was a problem hiding this comment.
(Personally, I think tidying up VPSymbolicValue handling to so adding new values is not costly would be preferable, something like: #187041).
| NumActive, IVTy, I64Ty, DebugLoc::getCompilerGenerated()); | ||
|
|
||
| // Find the existing header mask. | ||
| VPSingleDefRecipe *HeaderMask = vputils::findHeaderMask(Plan); |
There was a problem hiding this comment.
It's too late not to perform alias-masking at this point, we've already not emitted the standard diff checks.
Right, curious if we actually need to make the decision early or if we could generate the diff checks up front, and then decide purely on the info in VPlan whether we use them or the alias mask approach? Diff checks + block should be automatically discarded when not used. Not saying this is necessarily needed for the initial patch, but may be a good thing to check as a follow-up.
| if (CM.maskPartialAliasing()) { | ||
| LLVM_DEBUG( | ||
| dbgs() | ||
| << "LEV: Epilogue vectorization not supported with alias masking"); |
There was a problem hiding this comment.
Nit: Maybe leave a comment in the code why that is?
| /// Represents the loop-invariant alias of the vector loop region. | ||
| VPSymbolicValue AliasMask; |
There was a problem hiding this comment.
Purely out of curiosity: why is adding a new (potentially unused) VPSymbolicValue problematic? Is it because it then requires a new transformation to materialise it, and adding a new VPInstruction instead would require smaller and more localised code changes?
| for (BasicBlock *BB : TheLoop->blocks()) { | ||
| for (Instruction &I : *BB) { | ||
| if (!isa<LoadInst, StoreInst>(I)) { | ||
| assert(isa<CallInst>(I) || !I.mayReadOrWriteMemory()); |
There was a problem hiding this comment.
Ah right, is there any way to tighten the assert for calls, like requiring that it only accesses inaccessible memory or at least no pointer args? So it is easier to catch and update the code here to do the correct thing, if the restriction gets removed?
There was a problem hiding this comment.
I've changed this to assert(!I.mayReadOrWriteMemory() || Call && !HasPointerArgs(Call)), as LAA will currently only accept calls without pointer arguments.
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
eas
left a comment
There was a problem hiding this comment.
Why should VPlan operate in terms of "partial mask" (sounds rather ISA-specific) and not something more logical/high-level like MaxSafeNumIters with the particular lowering to a target-specific intrinsic introduced only in the later stage? For example, that could have been introduced as first supporting
for (...)
  a[I+3] = a[I];
with "effective" VF == 3, and then expanding that infrastructure to support non-constant distances, which would require buy-in from the target.
That would have made the infrastructure available to more backends and could result in a more target-agnostic/high-level design of the VPlan representation.
| Type *Ty = State.TypeAnalysis.inferScalarType(this); | ||
| Value *ZExt = Builder.CreateCast( | ||
| Instruction::ZExt, Op, | ||
| VectorType::get(Builder.getInt32Ty(), VecTy->getElementCount())); | ||
| Value *Count = | ||
| Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, ZExt); | ||
| return Builder.CreateCast(Instruction::ZExt, Count, Ty, "num.active.lanes"); |
There was a problem hiding this comment.
There was a problem hiding this comment.
ctpop does not work for scalable vectors.
Note: There's an instcombine that will turn fixed-vector mask reductions to ctpop (which on AArch64 we have to undo, as that's not desirable at all from a codegen perspective).
We already have a target-agnostic intrinsic that represents the alias-mask. I'm not sure why other backends can't support/pattern match it if they want to support this feature? Additionally, to support alias masking (which we plan to support without tail folding), we require masked operations. If we hide the notion of a mask to some late stage, it becomes much harder to rewrite the plan later.
That's fine, but our first step here was to relax the existing runtime diff checks. I don't see an issue in extending the masking approach to compile-time known distances in the future. |
That's actually another question - why later in the pipeline vs doing it in a way similar to ? I think
matches what the tail folding transformation is doing rather closely (also not exactly), so why two similar things should be done in such distant points in the pipeline? |
fhahn
left a comment
There was a problem hiding this comment.
For example, that could have been introduced as first supporting
That's fine, but our first step here was to relax the existing runtime diff checks. I don't see an issue in extending the masking approach to compile-time known distances in the future.
IIUC we would need masking in both cases, just with different lowerings. IIRC there was some plan/desire to support non-power-of-2 VFs on RISCV in a somewhat similar fashion.
Conceptually, this transform looks like ...
matches what the tail folding transformation is doing rather closely (also not exactly), so why two similar things should be done in such distant points in the pipeline?
I think the goal of the initial patch is to add initial support for the simplest cases, where everything is already masked as needed due to tail-folding.
I think @MacDue is planning to handle the general case as follow-up
| VPBuilder Builder(AliasCheckVPBB); | ||
| Type *I1Ty = IntegerType::getInt1Ty(Plan.getContext()); | ||
| Type *I64Ty = IntegerType::getInt64Ty(Plan.getContext()); | ||
| Type *PtrTy = PointerType::getUnqual(Plan.getContext()); |
There was a problem hiding this comment.
I think we already restrict the diff checks currently to have addrspace 0. Not sure if we can assert here, but it may be good to add a test case as well with addrspace != 0
There was a problem hiding this comment.
There is a test case with addrspace != 0; however, we won't create a pointer with an address space here. I'm not sure we can actually recover that information here. So the codegen looks like:
%a_int = ptrtoaddr ptr addrspace(1) %a to i64
%b_int = ptrtoaddr ptr addrspace(1) %b to i64
%a_new = inttoptr i64 %a_int to ptr
%b_new = inttoptr i64 %b_int to ptr
call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a_new, ptr %b_new, i64 1)
I'm not sure if that's really an issue though. I think passing pointers to llvm.loop.dependence.mask was a bit of a mistake, as the mask does not care about anything but the address bits.
I think an more ideal form would be: call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(i64 %diff, i64 1).
There was a problem hiding this comment.
If that would work well with codegen, that seems like a nice simplification
This patch adds basic support for partial alias masking, which allows
entering the vector loop even when there is aliasing within a single
vector iteration. It does this by clamping the VF to the safe distance
between pointers. This allows the runtime VF to be anywhere from 2 to
the "static" VF.
Conceptually, this transform looks like:
```
// `c` and `b` may alias.
for (int i = 0; i < n; i++) {
c[i] = a[i] + b[i];
}
```
->
```
svbool_t alias_mask = loop.dependence.war.mask(b, c);
int num_active = num_active_lanes(mask);
if (num_active >= 2) {
for (int i = 0; i < n; i += num_active) {
// ... vector loop masked with `alias_mask`
}
}
// ... scalar tail
```
This initial patch has a number of limitations:
- The loop must be tail-folded
* We intend to follow-up with full alias-masking support for loops
without tail-folding
- The mask and transform is only valid for IC = 1
* Some recipes may not handle the "ClampedVF" correctly at IC > 1
* Note: On AArch64, we also only have native alias mask instructions
for IC = 1
- Reverse iteration is not supported
* The mask reversal logic is not correct for the alias mask
(or clamped ALM)
- First order recurrences are not supported
* The `splice.right` is not lowered correctly for clamped VFs
- This style of vectorization is not enabled by default/costed
* It can be enabled with `-force-partial-aliasing-vectorization`
* When enabled, alias masking is used instead of the standard diff
checks (when legal to do so)
This PR supersedes llvm#100579 (closes llvm#100579).
Rebase tests
Fixups
Fixups
Fixups
Fixups
Fixups
Fixups
Add test
Fixups
Attach alias-mask early
Use VPInstruction in preheader
Fixups
|
Is it possible to have fixed-width vectorization together with this partial alias masking? If so, it would be good to check that for:
define void @vf_dependent_uniform(ptr noalias %p, ptr %p.out, ptr %p.in) {
entry:
br label %header
header:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %header ]
%div = sdiv i64 %iv, 64
%gep = getelementptr inbounds i64, ptr %p, i64 %div
%ld = load i64, ptr %gep, align 8
%gep2 = getelementptr inbounds i64, ptr %p.in, i64 %iv
%ld2 = load i64, ptr %gep2, align 8
%val = add i64 %ld, %ld2
%store.gep = getelementptr i64, ptr %p.out, i64 %iv
store i64 %val, ptr %store.gep, align 8
%iv.next = add nsw i64 %iv, 1
%exitcond = icmp slt i64 %iv.next, 128
br i1 %exitcond, label %header, label %exit
exit:
ret void
}
|
|
We won't use an alias mask with that loop as we currently require consecutive strides in the loop (though that restriction probably can be lifted here). But yes, I think in general |
I've added some more checks around uniforms and added this test. In practice I think we'd only want to use alias-masking on AArch64 for scalable vectors, but the implementation can handle fixed-width vectors. |
|
|
||
| ; Tests `%div = sdiv i64 %iv, 64` (and its use for a load) is not considered | ||
| ; to be uniform with partial alias masking. | ||
| define void @vf_dependent_uniform(ptr noalias %p, ptr %p.out, ptr %p.in, i64 %n) { |
There was a problem hiding this comment.
With the latest changes, uniform loads/stores are also allowed, right? Do we have a test for that? I'd expect a test change for that, maybe I missed it? If not would be good to add for both uniform load and uniform store, and maybe uniform store of reduction result.
Would probably also be worth moving out to a separate file.
| @@ -1499,6 +1565,21 @@ class LoopVectorizationCostModel { | |||
| /// initialized during object construction. | |||
| std::optional<unsigned> VScaleForTuning; | |||
|
|
|||
| bool isUniform(Value *V, ElementCount VF) const { | |||
There was a problem hiding this comment.
| bool isUniform(Value *V, ElementCount VF) const { | |
| /// ... | |
| bool isUniform(Value *V, ElementCount VF) const { |
| Legal->isUniform(V, VF); | ||
| } | ||
|
|
||
| bool isUniformMemOp(Instruction &I, ElementCount VF) const { |
There was a problem hiding this comment.
| bool isUniformMemOp(Instruction &I, ElementCount VF) const { | |
| /// .... | |
| bool isUniformMemOp(Instruction &I, ElementCount VF) const { |
| // With alias-masking our runtime VF is [2, VF] (and not necessarily a | ||
| // power-of-two). Something that is uniform for VF may not be for the full | ||
| // range. | ||
| assert(PartialAliasMaskingStatus != AliasMaskingStatus::NotDecided); |
There was a problem hiding this comment.
| assert(PartialAliasMaskingStatus != AliasMaskingStatus::NotDecided); | |
| assert(PartialAliasMaskingStatus != AliasMaskingStatus::NotDecided && "alias-mask status must be decided already"); |
or something like that, same below
This patch adds basic support for partial alias masking, which allows entering the vector loop even when there is aliasing within a single vector iteration. It does this by clamping the VF to the safe distance between pointers. This allows the runtime VF to be anywhere from 2 to the "static" VF.
Conceptually, this transform looks like:
->
This initial patch has a number of limitations:
- The `splice.right` is not lowered correctly for clamped VFs
- It can be enabled with `-force-partial-aliasing-vectorization`
This PR supersedes #100579 (closes #100579).