-
Notifications
You must be signed in to change notification settings - Fork 16.8k
[VPlan] Skip epilogue vectorization if dead after narrowing IGs. #187016
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6840253
441c784
89b3c26
f628bca
c04a80a
3409ee0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4441,7 +4441,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( | |
| } | ||
|
|
||
| VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( | ||
| const ElementCount MainLoopVF, unsigned IC) { | ||
| ElementCount MainLoopVF, unsigned IC) { | ||
| VectorizationFactor Result = VectorizationFactor::Disabled(); | ||
| if (!EnableEpilogueVectorization) { | ||
| LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); | ||
|
|
@@ -4485,6 +4485,25 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( | |
| return Result; | ||
| } | ||
|
|
||
| // Check if a plan's vector loop processes fewer iterations than VF (e.g. when | ||
| // interleave groups have been narrowed by narrowInterleaveGroups) and return | ||
| // the adjusted, effective VF. | ||
| using namespace VPlanPatternMatch; | ||
| auto GetEffectiveVF = [](VPlan &Plan, ElementCount VF) -> ElementCount { | ||
| auto *Exiting = Plan.getVectorLoopRegion()->getExitingBasicBlock(); | ||
| if (match(&Exiting->back(), | ||
| m_BranchOnCount(m_Add(m_CanonicalIV(), m_Specific(&Plan.getUF())), | ||
| m_VPValue()))) | ||
| return ElementCount::get(1, VF.isScalable()); | ||
| return VF; | ||
| }; | ||
|
|
||
| // Check if the main loop processes fewer than MainLoopVF elements per | ||
| // iteration (e.g. due to narrowing interleave groups). Adjust MainLoopVF | ||
| // as needed. | ||
| VPlan &MainPlan = getPlanFor(MainLoopVF); | ||
| MainLoopVF = GetEffectiveVF(MainPlan, MainLoopVF); | ||
|
|
||
| // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know | ||
| // the main loop handles 8 lanes per iteration. We could still benefit from | ||
| // vectorizing the epilogue loop with VF=4. | ||
|
|
@@ -4494,8 +4513,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( | |
| Type *TCType = Legal->getWidestInductionType(); | ||
| const SCEV *RemainingIterations = nullptr; | ||
| unsigned MaxTripCount = 0; | ||
| const SCEV *TC = vputils::getSCEVExprForVPValue( | ||
| getPlanFor(MainLoopVF).getTripCount(), PSE); | ||
| const SCEV *TC = vputils::getSCEVExprForVPValue(MainPlan.getTripCount(), PSE); | ||
| assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable"); | ||
| const SCEV *KnownMinTC; | ||
| bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale())); | ||
|
|
@@ -4538,29 +4556,32 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( | |
| if (!hasPlanWithVF(NextVF.Width)) | ||
| continue; | ||
|
|
||
| ElementCount EffectiveVF = | ||
| GetEffectiveVF(getPlanFor(NextVF.Width), NextVF.Width); | ||
| // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable | ||
| // vectors) or > the VF of the main loop (fixed vectors). | ||
| if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && | ||
| ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || | ||
| (NextVF.Width.isScalable() && | ||
| ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) || | ||
| (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() && | ||
| ElementCount::isKnownGT(NextVF.Width, MainLoopVF))) | ||
| if ((!EffectiveVF.isScalable() && MainLoopVF.isScalable() && | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be good to have a test showing the effect of this change too. It looks like this new code is also effectively disabling epilogue vectorisation since we'll discard any VF > 1. I suspect we don't even get as far as the SkipVF code below.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should be covered by existing epilogue tests; if we don't adjust the VFs here (and below) we will disable epilogue vectorization in cases where we should after narrowing interleave groups in both the main and epilogue plan, because we would compare VF = 1 from the main loop to VF > 1 from the epilogue plan. This is when vectorizing the epilogue due to interleaving the vector loop. |
||
| ElementCount::isKnownGE(EffectiveVF, EstimatedRuntimeVF)) || | ||
| (EffectiveVF.isScalable() && | ||
| ElementCount::isKnownGE(EffectiveVF, MainLoopVF)) || | ||
| (!EffectiveVF.isScalable() && !MainLoopVF.isScalable() && | ||
| ElementCount::isKnownGT(EffectiveVF, MainLoopVF))) | ||
| continue; | ||
|
|
||
| // If NextVF is greater than the number of remaining iterations, the | ||
| // epilogue loop would be dead. Skip such factors. | ||
| // If EffectiveVF is greater than the number of remaining iterations, the | ||
| // epilogue loop would be dead. Skip such factors. If the epilogue plan | ||
| // also has narrowed interleave groups, use the effective VF since | ||
| // the epilogue step will be reduced to its IC. | ||
| // TODO: We should also consider comparing against a scalable | ||
| // RemainingIterations when SCEV is able to evaluate non-canonical | ||
| // vscale-based expressions. | ||
| if (!ScalableRemIter) { | ||
| // Handle the case where NextVF and RemainingIterations are in different | ||
| // numerical spaces. | ||
| ElementCount EC = NextVF.Width; | ||
| if (NextVF.Width.isScalable()) | ||
| EC = ElementCount::getFixed( | ||
| estimateElementCount(NextVF.Width, CM.getVScaleForTuning())); | ||
| if (SkipVF(SE.getElementCount(TCType, EC), RemainingIterations)) | ||
| // Handle the case where EffectiveVF and RemainingIterations are in | ||
| // different numerical spaces. | ||
| if (EffectiveVF.isScalable()) | ||
| EffectiveVF = ElementCount::getFixed( | ||
| estimateElementCount(EffectiveVF, CM.getVScaleForTuning())); | ||
| if (SkipVF(SE.getElementCount(TCType, EffectiveVF), RemainingIterations)) | ||
| continue; | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 | ||
| ; RUN: opt -passes=loop-vectorize -mcpu=neoverse-v2 -S %s | FileCheck %s | ||
|
|
||
| target triple = "arm64-apple-macosx" | ||
|
|
||
| ; Test that epilogue vectorization is not selected when the main vector loop | ||
| ; covers all iterations after narrowInterleaveGroups reduces the effective | ||
| ; step from VF * UF to UF. | ||
| define void @no_epilogue_when_narrowed_covers_all(ptr %p) { | ||
| ; CHECK-LABEL: define void @no_epilogue_when_narrowed_covers_all( | ||
| ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { | ||
| ; CHECK-NEXT: [[ENTRY:.*:]] | ||
| ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] | ||
| ; CHECK: [[VECTOR_PH]]: | ||
| ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] | ||
| ; CHECK: [[VECTOR_BODY]]: | ||
| ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] | ||
| ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 | ||
| ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 2 | ||
| ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 4 | ||
| ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 6 | ||
| ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[OFFSET_IDX]] | ||
| ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP0]] | ||
| ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP1]] | ||
| ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]] | ||
| ; CHECK-NEXT: store <2 x i64> splat (i64 1), ptr [[TMP3]], align 8 | ||
| ; CHECK-NEXT: store <2 x i64> splat (i64 1), ptr [[TMP4]], align 8 | ||
| ; CHECK-NEXT: store <2 x i64> splat (i64 1), ptr [[TMP5]], align 8 | ||
| ; CHECK-NEXT: store <2 x i64> splat (i64 1), ptr [[TMP6]], align 8 | ||
| ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 | ||
| ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 | ||
| ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] | ||
| ; CHECK: [[MIDDLE_BLOCK]]: | ||
| ; CHECK-NEXT: br label %[[EXIT:.*]] | ||
| ; CHECK: [[EXIT]]: | ||
| ; CHECK-NEXT: ret void | ||
| ; | ||
| entry: | ||
| br label %loop | ||
|
|
||
| loop: | ||
| %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] | ||
| %p0 = getelementptr inbounds i64, ptr %p, i64 %iv | ||
| %p1 = getelementptr inbounds i64, ptr %p0, i64 1 | ||
| store i64 1, ptr %p0, align 8 | ||
| store i64 1, ptr %p1, align 8 | ||
| %iv.next = add nuw nsw i64 %iv, 2 | ||
| %ec = icmp eq i64 %iv.next, 1000 | ||
| br i1 %ec, label %exit, label %loop | ||
|
|
||
| exit: | ||
| ret void | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Doesn't
EstimatedRuntimeVF also need updating? Otherwise it's inconsistent with MainLoopVF.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep,
EstimatedRuntimeVF should only be computed after we adjusted MainLoopVF.