-
Notifications
You must be signed in to change notification settings - Fork 16.1k
[VPlan] Split out optimizeEVLMasks. NFC #174925
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Addresses part of llvm#153144 and splits off part of llvm#166164 There are two parts to the EVL transform: 1) Convert the loop so the number of elements processed each iteration is EVL, not VF. The IV and header mask are replaced with EVL-based variants. 2) Optimize users of the EVL based header mask to VP intrinsic based recipes. 1) changes the semantics of the vector loop region, whereas the 2) needs to preserve it. This splits 2) out so we don't mix the two up, and allows us to move 1) earlier in the pipeline in a future PR.
|
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: Luke Lau (lukel97) ChangesAddresses part of #153144 and splits off part of #166164 There are two parts to the EVL transform:
Full diff: https://github.com/llvm/llvm-project/pull/174925.diff 3 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 31c44fc46c973..903fb7b25ad6a 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8328,10 +8328,12 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths,
*Plan, CM.getMinimalBitwidths());
VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan);
- // TODO: try to put it close to addActiveLaneMask().
- if (CM.foldTailWithEVL())
+ // TODO: try to put addExplicitVectorLength close to addActiveLaneMask
+ if (CM.foldTailWithEVL()) {
VPlanTransforms::runPass(VPlanTransforms::addExplicitVectorLength,
*Plan, CM.getMaxSafeElements());
+ VPlanTransforms::runPass(VPlanTransforms::optimizeEVLMasks, *Plan);
+ }
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index b1880517a4199..e1dba4cf4a064 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2979,8 +2979,42 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask,
return nullptr;
}
-/// Replace recipes with their EVL variants.
-static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
+/// Optimize away any EVL-based header masks to VP intrinsic based recipes.
+/// The transforms here need to preserve the original semantics.
+void VPlanTransforms::optimizeEVLMasks(VPlan &Plan) {
+ // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL
+ VPInstruction *HeaderMask = nullptr;
+ for (VPRecipeBase &R : *Plan.getVectorLoopRegion()->getEntryBasicBlock()) {
+ if (match(&R, m_ICmp(m_VPInstruction<VPInstruction::StepVector>(),
+ m_EVL(m_VPValue())))) {
+ HeaderMask = cast<VPInstruction>(&R);
+ break;
+ }
+ }
+ if (!HeaderMask)
+ return;
+
+ VPTypeAnalysis TypeInfo(Plan);
+ VPValue *EVL = HeaderMask->getOperand(1);
+
+ for (VPUser *U : collectUsersRecursively(HeaderMask)) {
+ VPRecipeBase *R = cast<VPRecipeBase>(U);
+ if (auto *NewR = optimizeMaskToEVL(HeaderMask, *R, TypeInfo, *EVL)) {
+ NewR->insertBefore(R);
+ for (auto [Old, New] :
+ zip_equal(R->definedValues(), NewR->definedValues()))
+ Old->replaceAllUsesWith(New);
+ // Erase dead stores, the rest will be removed by removeDeadRecipes.
+ if (R->getNumDefinedValues() == 0)
+ R->eraseFromParent();
+ }
+ }
+ removeDeadRecipes(Plan);
+}
+
+/// After replacing the IV with a EVL-based IV, fixup recipes that use VF to use
+/// the EVL instead to avoid incorrect updates on the penultimate iteration.
+static void fixupVFUsersForEVL(VPlan &Plan, VPValue &EVL) {
VPTypeAnalysis TypeInfo(Plan);
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
@@ -3008,10 +3042,6 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
return isa<VPWidenPointerInductionRecipe>(U);
});
- // Defer erasing recipes till the end so that we don't invalidate the
- // VPTypeAnalysis cache.
- SmallVector<VPRecipeBase *> ToErase;
-
// Create a scalar phi to track the previous EVL if fixed-order recurrence is
// contained.
bool ContainsFORs =
@@ -3046,7 +3076,6 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
R.getDebugLoc());
VPSplice->insertBefore(&R);
R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
- ToErase.push_back(&R);
}
}
}
@@ -3067,43 +3096,6 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) {
CmpInst::ICMP_ULT,
Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
HeaderMask->replaceAllUsesWith(EVLMask);
- ToErase.push_back(HeaderMask->getDefiningRecipe());
-
- // Try to optimize header mask recipes away to their EVL variants.
- // TODO: Split optimizeMaskToEVL out and move into
- // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
- // tryToBuildVPlanWithVPRecipes beforehand.
- for (VPUser *U : collectUsersRecursively(EVLMask)) {
- auto *CurRecipe = cast<VPRecipeBase>(U);
- VPRecipeBase *EVLRecipe =
- optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
- if (!EVLRecipe)
- continue;
-
- unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
- assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
- "New recipe must define the same number of values as the "
- "original.");
- EVLRecipe->insertBefore(CurRecipe);
- if (isa<VPSingleDefRecipe, VPWidenLoadEVLRecipe, VPInterleaveEVLRecipe>(
- EVLRecipe)) {
- for (unsigned I = 0; I < NumDefVal; ++I) {
- VPValue *CurVPV = CurRecipe->getVPValue(I);
- CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
- }
- }
- ToErase.push_back(CurRecipe);
- }
- // Remove dead EVL mask.
- if (EVLMask->getNumUsers() == 0)
- ToErase.push_back(EVLMask->getDefiningRecipe());
-
- for (VPRecipeBase *R : reverse(ToErase)) {
- SmallVector<VPValue *> PossiblyDead(R->operands());
- R->eraseFromParent();
- for (VPValue *Op : PossiblyDead)
- recursivelyDeleteDeadRecipes(Op);
- }
}
/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
@@ -3201,7 +3193,7 @@ void VPlanTransforms::addExplicitVectorLength(
DebugLoc::getCompilerGenerated(), "avl.next");
AVLPhi->addOperand(NextAVL);
- transformRecipestoEVLRecipes(Plan, *VPEVL);
+ fixupVFUsersForEVL(Plan, *VPEVL);
// Replace all uses of VPCanonicalIVPHIRecipe by
// VPEVLBasedIVPHIRecipe except for the canonical IV increment.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index f4ecee4e7f04f..35af0a76019dc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -266,6 +266,14 @@ struct VPlanTransforms {
addExplicitVectorLength(VPlan &Plan,
const std::optional<unsigned> &MaxEVLSafeElements);
+ /// Optimize recipes which use an EVL based header mask to a VP intrinsic:
+ ///
+ /// %mask = icmp step-vector, EVL
+ /// %load = load %ptr, %mask
+ /// -->
+ /// %load = vp.load %ptr, EVL
+ static void optimizeEVLMasks(VPlan &Plan);
+
// For each Interleave Group in \p InterleaveGroups replace the Recipes
// widening its memory instructions with a single VPInterleaveRecipe at its
// insertion point.
|
| /// Optimize away any EVL-based header masks to VP intrinsic based recipes. | ||
| /// The transforms here need to preserve the original semantics. | ||
| void VPlanTransforms::optimizeEVLMasks(VPlan &Plan) { | ||
| // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can the code use m_SpecificICmp to look for icmp ult?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done in e5dbe66, but I had to add a specific VPRecipeBase overload for SpecificCmp_match so we can directly call match on VPRecipeBase. Regular Cmp_match does the same thing
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Grea,t hat useful in any cae
|
|
||
| /// Optimize recipes which use an EVL based header mask to a VP intrinsic: | ||
| /// | ||
| /// %mask = icmp step-vector, EVL |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| /// %mask = icmp step-vector, EVL | |
| /// %mask = icmp ult step-vector, EVL |
?
Mel-Chen
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks
artagnon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks!
| addExplicitVectorLength(VPlan &Plan, | ||
| const std::optional<unsigned> &MaxEVLSafeElements); | ||
|
|
||
| /// Optimize recipes which use an EVL based header mask to a VP intrinsic: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| /// Optimize recipes which use an EVL based header mask to a VP intrinsic: | |
| /// Optimize recipes which use an EVL-based header mask to VP intrinsics, for example |
| removeDeadRecipes(Plan); | ||
| } | ||
|
|
||
| /// After replacing the IV with a EVL-based IV, fixup recipes that use VF to use |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| /// After replacing the IV with a EVL-based IV, fixup recipes that use VF to use | |
| /// After replacing the canonical IV with a EVL-based IV, fixup recipes that use VF to use |
| for (VPRecipeBase *OldR : OldRecipes) | ||
| OldR->eraseFromParent(); | ||
| removeDeadRecipes(Plan); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is there a reason to run full removeDeadRecipes insead of the original code? If so, would be good to add a quick commen explaining why both is needed? (removeDeadRecieps won't remove stores)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Calling removeDeadRecipes was just simpler than having to worry about erasing the old recipes in reverse order, passing through the old recipe's operands etc. Not strongly opinionated about it though and can restore it if you prefer.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, I guess it would be better to just move the existing code over, otherwise it may not be NFC as we may remove more recipes than previously earlier on possibly causing other changes elsewhere
| @@ -266,6 +266,14 @@ struct VPlanTransforms { | |||
| addExplicitVectorLength(VPlan &Plan, | |||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
migh be a good opporunity to clarfiy the documentation here, perhaps worth mentioning that this replaces the header mask.
/// replaces all uses except the canonical IV increment of
/// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe.
it may also have to introduce a new phi to track the EVL of the previous iteration
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Clarified the documentation in e550c42
fhahn
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, with a few more minor suggestions, thanks
| /// Optimize away any EVL-based header masks to VP intrinsic based recipes. | ||
| /// The transforms here need to preserve the original semantics. | ||
| void VPlanTransforms::optimizeEVLMasks(VPlan &Plan) { | ||
| // Find the EVL-based header mask if it exists: icmp ult step-vector, EVL |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Grea,t hat useful in any cae
| for (VPRecipeBase *OldR : OldRecipes) | ||
| OldR->eraseFromParent(); | ||
| removeDeadRecipes(Plan); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, I guess it would be better to just move the existing code over, otherwise it may not be NFC as we may remove more recipes than previously earlier on possibly causing other changes elsewhere
| /// - Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and | ||
| /// replaces all uses except the canonical IV increment of | ||
| /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe | ||
| /// is used only for loop iterations counting after this transformation. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: consistent alignment
| /// - Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and | |
| /// replaces all uses except the canonical IV increment of | |
| /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe | |
| /// is used only for loop iterations counting after this transformation. | |
| /// - Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and | |
| /// replaces all uses except the canonical IV increment of | |
| /// VPCanonicalIVPHIRecipe with a VPEVLBasedIVPHIRecipe. VPCanonicalIVPHIRecipe | |
| /// is used only for loop iterations counting after this transformation. |
| /// - Plans with FORs have a new phi added to keep track of the EVL of the | ||
| /// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with | ||
| /// @llvm.vp.splice. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: consistent alignment
| /// - Plans with FORs have a new phi added to keep track of the EVL of the | |
| /// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with | |
| /// @llvm.vp.splice. | |
| /// - Plans with FORs have a new phi added to keep track of the EVL of the | |
| /// previous iteration, and VPFirstOrderRecurrencePHIRecipes are replaced with | |
| /// @llvm.vp.splice. |
| } | ||
|
|
||
| /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and | ||
| /// Converts a tail folded vector loop region to step by @llvm.get.vector.length |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
might be good to mention the VPINstruction here too
Addresses part of llvm#153144 and splits off part of llvm#166164 There are two parts to the EVL transform: 1) Convert the loop so the number of elements processed each iteration is EVL, not VF. The IV and header mask are replaced with EVL-based variants. 2) Optimize users of the EVL based header mask to VP intrinsic based recipes. (1) changes the semantics of the vector loop region, whereas (2) needs to preserve them. This splits (2) out so we don't mix the two up, and allows us to move (1) earlier in the pipeline in a future PR.
Addresses part of llvm#153144 and splits off part of llvm#166164 There are two parts to the EVL transform: 1) Convert the loop so the number of elements processed each iteration is EVL, not VF. The IV and header mask are replaced with EVL-based variants. 2) Optimize users of the EVL based header mask to VP intrinsic based recipes. (1) changes the semantics of the vector loop region, whereas (2) needs to preserve them. This splits (2) out so we don't mix the two up, and allows us to move (1) earlier in the pipeline in a future PR.
Addresses part of #153144 and splits off part of #166164
There are two parts to the EVL transform:
(1) changes the semantics of the vector loop region, whereas (2) needs to preserve them. This splits (2) out so we don't mix the two up, and allows us to move (1) earlier in the pipeline in a future PR.