Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -609,8 +609,8 @@ class LoopVectorizationPlanner {
/// \return The most profitable vectorization factor and the cost of that VF
/// for vectorizing the epilogue. Returns VectorizationFactor::Disabled if
/// epilogue vectorization is not supported for the loop.
VectorizationFactor
selectEpilogueVectorizationFactor(const ElementCount MainLoopVF, unsigned IC);
VectorizationFactor selectEpilogueVectorizationFactor(ElementCount MainLoopVF,
unsigned IC);

/// Emit remarks for recipes with invalid costs in the available VPlans.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE);
Expand Down
57 changes: 39 additions & 18 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4441,7 +4441,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
}

VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
const ElementCount MainLoopVF, unsigned IC) {
ElementCount MainLoopVF, unsigned IC) {
VectorizationFactor Result = VectorizationFactor::Disabled();
if (!EnableEpilogueVectorization) {
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
Expand Down Expand Up @@ -4485,6 +4485,25 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
return Result;
}

// Check if a plan's vector loop processes fewer iterations than VF (e.g. when
// interleave groups have been narrowed by narrowInterleaveGroups) and return
// the adjusted, effective VF.
using namespace VPlanPatternMatch;
auto GetEffectiveVF = [](VPlan &Plan, ElementCount VF) -> ElementCount {
auto *Exiting = Plan.getVectorLoopRegion()->getExitingBasicBlock();
if (match(&Exiting->back(),
m_BranchOnCount(m_Add(m_CanonicalIV(), m_Specific(&Plan.getUF())),
m_VPValue())))
return ElementCount::get(1, VF.isScalable());
return VF;
};

// Check if the main loop processes fewer than MainLoopVF elements per
// iteration (e.g. due to narrowing interleave groups). Adjust MainLoopVF
// as needed.
VPlan &MainPlan = getPlanFor(MainLoopVF);
MainLoopVF = GetEffectiveVF(MainPlan, MainLoopVF);

// If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
// the main loop handles 8 lanes per iteration. We could still benefit from
// vectorizing the epilogue loop with VF=4.
Expand All @@ -4494,8 +4513,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
Type *TCType = Legal->getWidestInductionType();
const SCEV *RemainingIterations = nullptr;
unsigned MaxTripCount = 0;
const SCEV *TC = vputils::getSCEVExprForVPValue(
getPlanFor(MainLoopVF).getTripCount(), PSE);
const SCEV *TC = vputils::getSCEVExprForVPValue(MainPlan.getTripCount(), PSE);
assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
const SCEV *KnownMinTC;
bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale()));
Expand Down Expand Up @@ -4538,29 +4556,32 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
if (!hasPlanWithVF(NextVF.Width))
continue;

ElementCount EffectiveVF =
GetEffectiveVF(getPlanFor(NextVF.Width), NextVF.Width);
// Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
// vectors) or > the VF of the main loop (fixed vectors).
if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) ||
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't EstimatedRuntimeVF also need updating? Otherwise it's inconsistent with MainLoopVF.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, EstimatedRuntimeVF should only be computed after we adjusted MainLoopVF

(NextVF.Width.isScalable() &&
ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) ||
(!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
ElementCount::isKnownGT(NextVF.Width, MainLoopVF)))
if ((!EffectiveVF.isScalable() && MainLoopVF.isScalable() &&
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be good to have a test showing the effect of this change too. It looks like this new code is also effectively disabling epilogue vectorisation since we'll discard any VF > 1.

I suspect we don't even get as far as the SkipVF code below.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be covered by existing epilogue tests; if we don't adjust the VFs here (and below) we will disable epilogue vectorization in cases where we should after narrowing interleave groups in both the main and epilogue plan, because we would compare VF = 1 from main loop to VF > 1 from the epilogue plan.

This is when vectorizing the epilogue due to interleaving the vector loop.

ElementCount::isKnownGE(EffectiveVF, EstimatedRuntimeVF)) ||
(EffectiveVF.isScalable() &&
ElementCount::isKnownGE(EffectiveVF, MainLoopVF)) ||
(!EffectiveVF.isScalable() && !MainLoopVF.isScalable() &&
ElementCount::isKnownGT(EffectiveVF, MainLoopVF)))
continue;

// If NextVF is greater than the number of remaining iterations, the
// epilogue loop would be dead. Skip such factors.
// If EffectiveVF is greater than the number of remaining iterations, the
// epilogue loop would be dead. Skip such factors. If the epilogue plan
// also has narrowed interleave groups, use the effective VF since
// the epilogue step will be reduced to its IC.
// TODO: We should also consider comparing against a scalable
// RemainingIterations when SCEV be able to evaluate non-canonical
// vscale-based expressions.
if (!ScalableRemIter) {
// Handle the case where NextVF and RemainingIterations are in different
// numerical spaces.
ElementCount EC = NextVF.Width;
if (NextVF.Width.isScalable())
EC = ElementCount::getFixed(
estimateElementCount(NextVF.Width, CM.getVScaleForTuning()));
if (SkipVF(SE.getElementCount(TCType, EC), RemainingIterations))
// Handle the case where EffectiveVF and RemainingIterations are in
// different numerical spaces.
if (EffectiveVF.isScalable())
EffectiveVF = ElementCount::getFixed(
estimateElementCount(EffectiveVF, CM.getVScaleForTuning()));
if (SkipVF(SE.getElementCount(TCType, EffectiveVF), RemainingIterations))
continue;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
; RUN: opt -passes=loop-vectorize -mcpu=neoverse-v2 -S %s | FileCheck %s

target triple = "arm64-apple-macosx"

; Test that epilogue vectorization is not selected when the main vector loop
; covers all iterations after narrowInterleaveGroups reduces the effective
; step from VF * UF to UF.
define void @no_epilogue_when_narrowed_covers_all(ptr %p) {
; CHECK-LABEL: define void @no_epilogue_when_narrowed_covers_all(
; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 2
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 6
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]]
; CHECK-NEXT: store <2 x i64> splat (i64 1), ptr [[TMP3]], align 8
; CHECK-NEXT: store <2 x i64> splat (i64 1), ptr [[TMP4]], align 8
; CHECK-NEXT: store <2 x i64> splat (i64 1), ptr [[TMP5]], align 8
; CHECK-NEXT: store <2 x i64> splat (i64 1), ptr [[TMP6]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
entry:
br label %loop

; Each iteration stores i64 1 to two consecutive slots (%p0 and %p0 + 1) and
; steps %iv by 2, exiting once %iv.next hits 1000, i.e. 500 trips. The pair of
; adjacent stores forms an interleave group the vectorizer can narrow (see the
; file-level note above), so after narrowing the main vector loop covers every
; iteration; the CHECK lines verify no scalar epilogue / epilogue vector loop
; is emitted (middle block branches straight to exit).
loop:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
%p0 = getelementptr inbounds i64, ptr %p, i64 %iv
%p1 = getelementptr inbounds i64, ptr %p0, i64 1
store i64 1, ptr %p0, align 8
store i64 1, ptr %p1, align 8
%iv.next = add nuw nsw i64 %iv, 2
%ec = icmp eq i64 %iv.next, 1000
br i1 %ec, label %exit, label %loop

exit:
ret void
}