diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 5f19269e92e4f..7652ee37fe686 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -609,8 +609,8 @@ class LoopVectorizationPlanner { /// \return The most profitable vectorization factor and the cost of that VF /// for vectorizing the epilogue. Returns VectorizationFactor::Disabled if /// epilogue vectorization is not supported for the loop. - VectorizationFactor - selectEpilogueVectorizationFactor(const ElementCount MainLoopVF, unsigned IC); + VectorizationFactor selectEpilogueVectorizationFactor(ElementCount MainLoopVF, + unsigned IC); /// Emit remarks for recipes with invalid costs in the available VPlans. void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0ba819844416e..06b900fcb85a4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4441,7 +4441,7 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( } VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( - const ElementCount MainLoopVF, unsigned IC) { + ElementCount MainLoopVF, unsigned IC) { VectorizationFactor Result = VectorizationFactor::Disabled(); if (!EnableEpilogueVectorization) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); @@ -4485,6 +4485,25 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( return Result; } + // Check if a plan's vector loop processes fewer iterations than VF (e.g. when + // interleave groups have been narrowed by narrowInterleaveGroups) and return + // the adjusted, effective VF.
+ using namespace VPlanPatternMatch; + auto GetEffectiveVF = [](VPlan &Plan, ElementCount VF) -> ElementCount { + auto *Exiting = Plan.getVectorLoopRegion()->getExitingBasicBlock(); + if (match(&Exiting->back(), + m_BranchOnCount(m_Add(m_CanonicalIV(), m_Specific(&Plan.getUF())), + m_VPValue()))) + return ElementCount::get(1, VF.isScalable()); + return VF; + }; + + // Check if the main loop processes fewer than MainLoopVF elements per + // iteration (e.g. due to narrowing interleave groups). Adjust MainLoopVF + // as needed. + VPlan &MainPlan = getPlanFor(MainLoopVF); + MainLoopVF = GetEffectiveVF(MainPlan, MainLoopVF); + // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know // the main loop handles 8 lanes per iteration. We could still benefit from // vectorizing the epilogue loop with VF=4. @@ -4494,8 +4513,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( Type *TCType = Legal->getWidestInductionType(); const SCEV *RemainingIterations = nullptr; unsigned MaxTripCount = 0; - const SCEV *TC = vputils::getSCEVExprForVPValue( - getPlanFor(MainLoopVF).getTripCount(), PSE); + const SCEV *TC = vputils::getSCEVExprForVPValue(MainPlan.getTripCount(), PSE); assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable"); const SCEV *KnownMinTC; bool ScalableTC = match(TC, m_scev_c_Mul(m_SCEV(KnownMinTC), m_SCEVVScale())); @@ -4538,29 +4556,32 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( if (!hasPlanWithVF(NextVF.Width)) continue; + ElementCount EffectiveVF = + GetEffectiveVF(getPlanFor(NextVF.Width), NextVF.Width); // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable // vectors) or > the VF of the main loop (fixed vectors).
- if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && - ElementCount::isKnownGE(NextVF.Width, EstimatedRuntimeVF)) || - (NextVF.Width.isScalable() && - ElementCount::isKnownGE(NextVF.Width, MainLoopVF)) || - (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() && - ElementCount::isKnownGT(NextVF.Width, MainLoopVF))) + if ((!EffectiveVF.isScalable() && MainLoopVF.isScalable() && + ElementCount::isKnownGE(EffectiveVF, EstimatedRuntimeVF)) || + (EffectiveVF.isScalable() && + ElementCount::isKnownGE(EffectiveVF, MainLoopVF)) || + (!EffectiveVF.isScalable() && !MainLoopVF.isScalable() && + ElementCount::isKnownGT(EffectiveVF, MainLoopVF))) continue; - // If NextVF is greater than the number of remaining iterations, the - // epilogue loop would be dead. Skip such factors. + // If EffectiveVF is greater than the number of remaining iterations, the + // epilogue loop would be dead. Skip such factors. If the epilogue plan + // also has narrowed interleave groups, use the effective VF since + // the epilogue step will be reduced to its IC. // TODO: We should also consider comparing against a scalable // RemainingIterations when SCEV be able to evaluate non-canonical // vscale-based expressions. if (!ScalableRemIter) { - // Handle the case where NextVF and RemainingIterations are in different - // numerical spaces. - ElementCount EC = NextVF.Width; - if (NextVF.Width.isScalable()) - EC = ElementCount::getFixed( - estimateElementCount(NextVF.Width, CM.getVScaleForTuning())); - if (SkipVF(SE.getElementCount(TCType, EC), RemainingIterations)) + // Handle the case where EffectiveVF and RemainingIterations are in + // different numerical spaces. 
+ if (EffectiveVF.isScalable()) + EffectiveVF = ElementCount::getFixed( + estimateElementCount(EffectiveVF, CM.getVScaleForTuning())); + if (SkipVF(SE.getElementCount(TCType, EffectiveVF), RemainingIterations)) continue; } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-epilogue-vec.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-epilogue-vec.ll new file mode 100644 index 0000000000000..e3a7bb8c6a17d --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-epilogue-vec.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -passes=loop-vectorize -mcpu=neoverse-v2 -S %s | FileCheck %s + +target triple = "arm64-apple-macosx" + +; Test that epilogue vectorization is not selected when the main vector loop +; covers all iterations after narrowInterleaveGroups reduces the effective +; step from VF * UF to UF. 
+define void @no_epilogue_when_narrowed_covers_all(ptr %p) { +; CHECK-LABEL: define void @no_epilogue_when_narrowed_covers_all( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 4 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 6 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]] +; CHECK-NEXT: store <2 x i64> splat (i64 1), ptr [[TMP3]], align 8 +; CHECK-NEXT: store <2 x i64> splat (i64 1), ptr [[TMP4]], align 8 +; CHECK-NEXT: store <2 x i64> splat (i64 1), ptr [[TMP5]], align 8 +; CHECK-NEXT: store <2 x i64> splat (i64 1), ptr [[TMP6]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500 +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %p0 = getelementptr inbounds i64, ptr %p, i64 %iv + %p1 = getelementptr inbounds i64, ptr %p0, i64 1 + store i64 1, ptr %p0, align 8 + store i64 1, ptr %p1, align 8 + %iv.next = add nuw nsw i64 %iv, 2 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop + +exit: + ret 
void +}