From 611246c7328ac226ac3773eadf57f92c7ef648cc Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 8 Aug 2025 20:17:43 +0000 Subject: [PATCH 1/2] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?= =?UTF-8?q?itial=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.5 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 83 ++++++-- .../AArch64/reused-scalar-repeated-in-node.ll | 187 +++++++++--------- .../AArch64/scalarization-overhead.ll | 57 ++---- .../SLPVectorizer/RISCV/vec3-base.ll | 94 +++++---- .../SLPVectorizer/X86/dot-product.ll | 94 ++------- .../SLPVectorizer/X86/horizontal-list.ll | 154 +++------------ .../X86/redux-feed-buildvector.ll | 69 ++----- .../X86/redux-feed-insertelement.ll | 22 +-- .../SLPVectorizer/X86/slp-fma-loss.ll | 24 ++- 9 files changed, 308 insertions(+), 476 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index ec06a217667c2..afdf73ab58184 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -22730,21 +22730,11 @@ class HorizontalReduction { /// Try to find a reduction tree. bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root, ScalarEvolution &SE, const DataLayout &DL, - const TargetLibraryInfo &TLI, - DominatorTree &DT, TargetTransformInfo &TTI) { + const TargetLibraryInfo &TLI) { RdxKind = HorizontalReduction::getRdxKind(Root); if (!isVectorizable(RdxKind, Root)) return false; - // FMA reduction root - skip. - auto CheckForFMA = [&](Instruction *I) { - return RdxKind == RecurKind::FAdd && - canConvertToFMA(I, getSameOpcode(I, TLI), DT, DL, TTI, TLI) - .isValid(); - }; - if (CheckForFMA(Root)) - return false; - // Analyze "regular" integer/FP types for reductions - no target-specific // types or pointers. Type *Ty = Root->getType(); @@ -22782,7 +22772,7 @@ class HorizontalReduction { // Also, do not try to reduce const values, if the operation is not // foldable. if (!EdgeInst || Level > RecursionMaxDepth || - getRdxKind(EdgeInst) != RdxKind || CheckForFMA(EdgeInst) || + getRdxKind(EdgeInst) != RdxKind || IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) || !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) || !isVectorizable(RdxKind, EdgeInst) || @@ -22901,7 +22891,8 @@ class HorizontalReduction { /// Attempt to vectorize the tree found by matchAssociativeReduction. Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI, - const TargetLibraryInfo &TLI, AssumptionCache *AC) { + const TargetLibraryInfo &TLI, AssumptionCache *AC, + DominatorTree &DT) { constexpr unsigned RegMaxNumber = 4; constexpr unsigned RedValsMaxNumber = 128; // If there are a sufficient number of reduction values, reduce @@ -23302,7 +23293,7 @@ class HorizontalReduction { // Estimate cost. 
InstructionCost ReductionCost = - getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V); + getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI); InstructionCost Cost = V.getTreeCost(VL, ReductionCost); LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n"); @@ -23607,7 +23598,9 @@ class HorizontalReduction { InstructionCost getReductionCost(TargetTransformInfo *TTI, ArrayRef ReducedVals, bool IsCmpSelMinMax, FastMathFlags FMF, - const BoUpSLP &R) { + const BoUpSLP &R, DominatorTree &DT, + const DataLayout &DL, + const TargetLibraryInfo &TLI) { TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; Type *ScalarTy = ReducedVals.front()->getType(); unsigned ReduxWidth = ReducedVals.size(); @@ -23632,6 +23625,22 @@ class HorizontalReduction { for (User *U : RdxVal->users()) { auto *RdxOp = cast(U); if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) { + if (RdxKind == RecurKind::FAdd) { + InstructionCost FMACost = canConvertToFMA( + RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI); + if (FMACost.isValid()) { + LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n"); + if (auto *I = dyn_cast(RdxVal)) { + // Also, exclude scalar fmul cost. + InstructionCost FMulCost = + TTI->getInstructionCost(I, CostKind); + LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n"); + FMACost -= FMulCost; + } + ScalarCost += FMACost; + continue; + } + } ScalarCost += TTI->getInstructionCost(RdxOp, CostKind); continue; } @@ -23696,8 +23705,42 @@ class HorizontalReduction { auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or( std::make_pair(RedTy, true)); VectorType *RVecTy = getWidenedType(RType, ReduxWidth); - VectorCost += - TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind); + InstructionCost FMACost = InstructionCost::getInvalid(); + if (RdxKind == RecurKind::FAdd) { + // Check if the reduction operands can be converted to FMA. + SmallVector Ops; + FastMathFlags FMF; + FMF.set(); + for (Value *RdxVal : ReducedVals) { + if (!RdxVal->hasOneUse()) { + Ops.clear(); + break; + } + if (auto *FPCI = dyn_cast(RdxVal)) + FMF &= FPCI->getFastMathFlags(); + Ops.push_back(RdxVal->user_back()); + } + FMACost = canConvertToFMA( + Ops, getSameOpcode(Ops, TLI), DT, DL, *TTI, TLI); + if (FMACost.isValid()) { + // Calculate actual FMAD cost. + IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy, + {RVecTy, RVecTy, RVecTy}, FMF); + FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind); + + LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n"); + // Also, exclude vector fmul cost. 
+ InstructionCost FMulCost = TTI->getArithmeticInstrCost( + Instruction::FMul, RVecTy, CostKind); + LLVM_DEBUG(dbgs() << "Minus vector FMul cost: " << FMulCost << "\n"); + FMACost -= FMulCost; + } + } + if (FMACost.isValid()) + VectorCost += FMACost; + else + VectorCost += + TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind); if (RType != RedTy) { unsigned Opcode = Instruction::Trunc; if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits()) @@ -24357,9 +24400,9 @@ bool SLPVectorizerPass::vectorizeHorReduction( if (!isReductionCandidate(Inst)) return nullptr; HorizontalReduction HorRdx; - if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI, *DT, *TTI)) + if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI)) return nullptr; - return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC); + return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT); }; auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) { if (TryOperandsAsNewSeeds && FutureSeed == Root) { @@ -24504,7 +24547,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { if (RedCost >= ScalarCost) return false; - return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr; + return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr; }; if (Candidates.size() == 1) return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R); diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll index 2e684320ba10e..cca58d8d66f04 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll @@ -27,114 +27,111 @@ define void @test() { ; CHECK-NEXT: [[I76:%.*]] = load float, ptr poison, align 4 ; CHECK-NEXT: br i1 poison, label %[[BB167:.*]], label %[[BB77:.*]] ; CHECK: [[BB77]]: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[I70]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[I68]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[I67]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[I69]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[I66]], i32 0 ; CHECK-NEXT: br label %[[BB78:.*]] ; CHECK: [[BB78]]: -; CHECK-NEXT: [[I85:%.*]] = phi nsz float [ [[I66]], %[[BB77]] ], [ [[I103:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[I80:%.*]] = phi nsz float [ [[I67]], %[[BB77]] ], [ [[I104:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[I81:%.*]] = phi nsz float [ [[I68]], %[[BB77]] ], [ [[I105:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[I82:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I106:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[I84:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I123:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[I127:%.*]] = phi nsz float [ [[I69]], %[[BB77]] ], [ [[I124:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[I131:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I125:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[I86:%.*]] = phi nsz float [ [[I70]], %[[BB77]] ], [ [[I126:%.*]], %[[BB78]] ] -; CHECK-NEXT: [[I87:%.*]] = fmul fast float [[I85]], poison -; CHECK-NEXT: [[I88:%.*]] = fmul fast float [[I80]], poison -; CHECK-NEXT: [[I89:%.*]] = fmul fast float [[I81]], poison -; CHECK-NEXT: [[I90:%.*]] = fmul fast float [[I82]], poison -; CHECK-NEXT: [[I91:%.*]] = fmul fast float [[I84]], poison -; CHECK-NEXT: [[I92:%.*]] = fadd fast float [[I91]], [[I87]] -; CHECK-NEXT: 
[[I93:%.*]] = fmul fast float [[I127]], poison -; CHECK-NEXT: [[I94:%.*]] = fadd fast float [[I93]], [[I88]] -; CHECK-NEXT: [[I95:%.*]] = fmul fast float [[I131]], poison -; CHECK-NEXT: [[I96:%.*]] = fadd fast float [[I95]], [[I89]] -; CHECK-NEXT: [[I97:%.*]] = fmul fast float [[I86]], poison -; CHECK-NEXT: [[I98:%.*]] = fadd fast float [[I97]], [[I90]] -; CHECK-NEXT: [[I99:%.*]] = fadd fast float [[I92]], poison -; CHECK-NEXT: [[I100:%.*]] = fadd fast float [[I94]], poison -; CHECK-NEXT: [[I101:%.*]] = fadd fast float [[I96]], poison -; CHECK-NEXT: [[I102:%.*]] = fadd fast float [[I98]], poison -; CHECK-NEXT: [[I103]] = fadd fast float [[I99]], poison -; CHECK-NEXT: [[I104]] = fadd fast float [[I100]], poison -; CHECK-NEXT: [[I105]] = fadd fast float [[I101]], poison -; CHECK-NEXT: [[I106]] = fadd fast float [[I102]], poison -; CHECK-NEXT: [[I107:%.*]] = fmul fast float [[I85]], poison -; CHECK-NEXT: [[I108:%.*]] = fmul fast float [[I80]], poison -; CHECK-NEXT: [[I109:%.*]] = fmul fast float [[I81]], poison -; CHECK-NEXT: [[I110:%.*]] = fmul fast float [[I82]], poison -; CHECK-NEXT: [[I111:%.*]] = fmul fast float [[I84]], poison -; CHECK-NEXT: [[I112:%.*]] = fadd fast float [[I111]], [[I107]] -; CHECK-NEXT: [[I113:%.*]] = fmul fast float [[I127]], poison -; CHECK-NEXT: [[I114:%.*]] = fadd fast float [[I113]], [[I108]] -; CHECK-NEXT: [[I115:%.*]] = fmul fast float [[I131]], poison -; CHECK-NEXT: [[I116:%.*]] = fadd fast float [[I115]], [[I109]] -; CHECK-NEXT: [[I117:%.*]] = fmul fast float [[I86]], poison -; CHECK-NEXT: [[I118:%.*]] = fadd fast float [[I117]], [[I110]] -; CHECK-NEXT: [[I119:%.*]] = fadd fast float [[I112]], poison -; CHECK-NEXT: [[I120:%.*]] = fadd fast float [[I114]], poison -; CHECK-NEXT: [[I121:%.*]] = fadd fast float [[I116]], poison -; CHECK-NEXT: [[I122:%.*]] = fadd fast float [[I118]], poison -; CHECK-NEXT: [[I123]] = fadd fast float [[I119]], poison -; CHECK-NEXT: [[I124]] = fadd fast float [[I120]], poison -; CHECK-NEXT: [[I125]] = fadd fast float [[I121]], poison -; CHECK-NEXT: [[I126]] = fadd fast float [[I122]], poison +; CHECK-NEXT: [[I85:%.*]] = phi nsz float [ [[I66]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I80:%.*]] = phi nsz float [ [[I67]], %[[BB77]] ], [ [[TMP46:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I81:%.*]] = phi nsz float [ [[I68]], %[[BB77]] ], [ [[TMP37:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I82:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[TMP39:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I84:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[TMP30:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I127:%.*]] = phi nsz float [ [[I69]], %[[BB77]] ], [ [[TMP53:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I131:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[I86:%.*]] = phi nsz float [ [[I70]], %[[BB77]] ], [ [[TMP40:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP0]], %[[BB77]] ], [ [[TMP38:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP1]], %[[BB77]] ], [ [[TMP35:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP3]], %[[BB77]] ], [ [[TMP32:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x float> [ [[TMP4]], %[[BB77]] ], [ [[TMP29:%.*]], %[[BB78]] ] +; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <2 x float> [[TMP8]], poison +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x float> [[TMP7]], poison +; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <2 x float> [[TMP6]], poison +; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x float> [[TMP5]], poison +; CHECK-NEXT: 
[[TMP13:%.*]] = fmul fast <2 x float> [[TMP8]], poison +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP13]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x float> [[TMP7]], poison +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP15]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = fmul fast <2 x float> [[TMP6]], poison +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP17]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = fmul fast <2 x float> [[TMP5]], poison +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP19]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = fadd fast <2 x float> [[TMP14]], [[TMP9]] +; CHECK-NEXT: [[TMP22:%.*]] = fadd fast <2 x float> [[TMP21]], poison +; CHECK-NEXT: [[TMP23:%.*]] = fadd fast <2 x float> [[TMP16]], [[TMP10]] +; CHECK-NEXT: [[TMP24:%.*]] = fadd fast <2 x float> [[TMP23]], poison +; CHECK-NEXT: [[TMP25:%.*]] = fadd fast <2 x float> [[TMP18]], [[TMP11]] +; CHECK-NEXT: [[TMP26:%.*]] = fadd fast <2 x float> [[TMP25]], poison +; CHECK-NEXT: [[TMP27:%.*]] = fadd fast <2 x float> [[TMP20]], [[TMP12]] +; CHECK-NEXT: [[TMP28:%.*]] = fadd fast <2 x float> [[TMP27]], poison +; CHECK-NEXT: [[TMP29]] = fadd fast <2 x float> [[TMP22]], poison +; CHECK-NEXT: [[TMP30]] = extractelement <2 x float> [[TMP29]], i32 1 +; CHECK-NEXT: [[TMP31]] = extractelement <2 x float> [[TMP29]], i32 0 +; CHECK-NEXT: [[TMP32]] = fadd fast <2 x float> [[TMP24]], poison +; CHECK-NEXT: [[TMP53]] = extractelement <2 x float> [[TMP32]], i32 1 +; CHECK-NEXT: [[TMP46]] = extractelement <2 x float> [[TMP32]], i32 0 +; CHECK-NEXT: [[TMP35]] = fadd fast <2 x float> [[TMP26]], poison +; CHECK-NEXT: [[TMP36]] = extractelement <2 x float> [[TMP35]], i32 1 +; CHECK-NEXT: [[TMP37]] = extractelement <2 x float> [[TMP35]], i32 0 +; CHECK-NEXT: [[TMP38]] = fadd fast <2 x float> [[TMP28]], poison +; CHECK-NEXT: [[TMP39]] = extractelement <2 x float> [[TMP38]], i32 1 +; CHECK-NEXT: [[TMP40]] = extractelement <2 x float> [[TMP38]], i32 0 ; CHECK-NEXT: [[I135:%.*]] = fmul fast float [[I85]], [[I65]] ; CHECK-NEXT: [[I128:%.*]] = fmul fast float [[I80]], [[I65]] ; CHECK-NEXT: [[I129:%.*]] = fmul fast float [[I81]], [[I65]] ; CHECK-NEXT: [[I130:%.*]] = fmul fast float [[I82]], [[I65]] ; CHECK-NEXT: [[I133:%.*]] = fmul fast float [[I84]], [[I77]] -; CHECK-NEXT: [[I134:%.*]] = fadd fast float [[I133]], [[I135]] ; CHECK-NEXT: [[I136:%.*]] = fmul fast float [[I127]], [[I77]] -; CHECK-NEXT: [[TMP51:%.*]] = fadd fast float [[I136]], [[I128]] ; CHECK-NEXT: [[I138:%.*]] = fmul fast float [[I131]], [[I77]] -; CHECK-NEXT: [[TMP52:%.*]] = fadd fast float [[I138]], [[I129]] ; CHECK-NEXT: [[I137:%.*]] = fmul fast float [[I86]], [[I77]] -; CHECK-NEXT: [[I139:%.*]] = fadd fast float [[I137]], [[I130]] -; CHECK-NEXT: [[I140:%.*]] = fadd fast float [[I134]], poison -; CHECK-NEXT: [[I141:%.*]] = fadd fast float [[TMP51]], poison -; CHECK-NEXT: [[I142:%.*]] = fadd fast float [[TMP52]], poison -; CHECK-NEXT: [[I143:%.*]] = fadd fast float [[I139]], poison -; CHECK-NEXT: [[I144:%.*]] = fadd fast float [[I140]], poison -; CHECK-NEXT: [[I145:%.*]] = fadd fast float [[I141]], poison -; CHECK-NEXT: [[I146:%.*]] = fadd fast float [[I142]], poison -; CHECK-NEXT: [[I152:%.*]] = fadd fast float [[I143]], poison -; CHECK-NEXT: [[I147:%.*]] = fmul fast float [[I85]], poison -; CHECK-NEXT: [[I148:%.*]] = fmul fast float [[I80]], poison -; CHECK-NEXT: [[I149:%.*]] = fmul fast float [[I81]], poison -; CHECK-NEXT: [[I150:%.*]] = 
fmul fast float [[I82]], poison -; CHECK-NEXT: [[I151:%.*]] = fmul fast float [[I84]], poison -; CHECK-NEXT: [[TMP57:%.*]] = fadd fast float [[I151]], [[I147]] -; CHECK-NEXT: [[I153:%.*]] = fmul fast float [[I127]], poison -; CHECK-NEXT: [[TMP58:%.*]] = fadd fast float [[I153]], [[I148]] -; CHECK-NEXT: [[I155:%.*]] = fmul fast float [[I131]], poison -; CHECK-NEXT: [[TMP59:%.*]] = fadd fast float [[I155]], [[I149]] -; CHECK-NEXT: [[I157:%.*]] = fmul fast float [[I86]], poison +; CHECK-NEXT: [[OP_RDX14:%.*]] = fadd fast float poison, [[I133]] +; CHECK-NEXT: [[OP_RDX15:%.*]] = fadd fast float [[OP_RDX14]], [[I135]] +; CHECK-NEXT: [[OP_RDX12:%.*]] = fadd fast float poison, [[I136]] +; CHECK-NEXT: [[OP_RDX13:%.*]] = fadd fast float [[OP_RDX12]], [[I128]] +; CHECK-NEXT: [[OP_RDX10:%.*]] = fadd fast float poison, [[I138]] +; CHECK-NEXT: [[OP_RDX11:%.*]] = fadd fast float [[OP_RDX10]], [[I129]] +; CHECK-NEXT: [[OP_RDX8:%.*]] = fadd fast float poison, [[I137]] +; CHECK-NEXT: [[OP_RDX9:%.*]] = fadd fast float [[OP_RDX8]], [[I130]] +; CHECK-NEXT: [[TMP41:%.*]] = fmul fast <2 x float> [[TMP8]], poison +; CHECK-NEXT: [[TMP42:%.*]] = fmul fast <2 x float> [[TMP7]], poison +; CHECK-NEXT: [[TMP43:%.*]] = fmul fast <2 x float> [[TMP6]], poison +; CHECK-NEXT: [[TMP44:%.*]] = fmul fast <2 x float> [[TMP5]], poison +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x float> [[TMP41]], i32 1 +; CHECK-NEXT: [[I157:%.*]] = fadd fast float poison, [[TMP45]] +; CHECK-NEXT: [[I150:%.*]] = extractelement <2 x float> [[TMP41]], i32 0 ; CHECK-NEXT: [[TMP60:%.*]] = fadd fast float [[I157]], [[I150]] -; CHECK-NEXT: [[I159:%.*]] = fadd fast float [[TMP57]], poison -; CHECK-NEXT: [[I160:%.*]] = fadd fast float [[TMP58]], poison -; CHECK-NEXT: [[I161:%.*]] = fadd fast float [[TMP59]], poison -; CHECK-NEXT: [[I162:%.*]] = fadd fast float [[TMP60]], poison -; CHECK-NEXT: [[I163:%.*]] = fadd fast float [[I159]], poison -; CHECK-NEXT: [[I164:%.*]] = fadd fast float [[I160]], poison -; CHECK-NEXT: [[I165:%.*]] = fadd fast float [[I161]], poison -; CHECK-NEXT: [[I166:%.*]] = fadd fast float [[I162]], poison +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x float> [[TMP42]], i32 1 +; CHECK-NEXT: [[OP_RDX4:%.*]] = fadd fast float poison, [[TMP47]] +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x float> [[TMP42]], i32 0 +; CHECK-NEXT: [[OP_RDX5:%.*]] = fadd fast float [[OP_RDX4]], [[TMP48]] +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <2 x float> [[TMP43]], i32 1 +; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float poison, [[TMP49]] +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <2 x float> [[TMP43]], i32 0 +; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast float [[OP_RDX2]], [[TMP50]] +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <2 x float> [[TMP44]], i32 0 +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float poison, [[TMP51]] +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <2 x float> [[TMP44]], i32 1 +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP52]] ; CHECK-NEXT: br i1 poison, label %[[BB78]], label %[[BB167]] ; CHECK: [[BB167]]: -; CHECK-NEXT: [[I168:%.*]] = phi nsz float [ [[I76]], %[[BB64]] ], [ [[I166]], %[[BB78]] ] -; CHECK-NEXT: [[I169:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I165]], %[[BB78]] ] -; CHECK-NEXT: [[I170:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I164]], %[[BB78]] ] -; CHECK-NEXT: [[I171:%.*]] = phi nsz float [ [[I75]], %[[BB64]] ], [ [[I163]], %[[BB78]] ] -; CHECK-NEXT: [[I172:%.*]] = phi nsz float [ [[I74]], %[[BB64]] ], [ [[I152]], %[[BB78]] ] -; CHECK-NEXT: [[I173:%.*]] = phi nsz float [ 
[[I73]], %[[BB64]] ], [ [[I146]], %[[BB78]] ] -; CHECK-NEXT: [[TMP34:%.*]] = phi nsz float [ [[I72]], %[[BB64]] ], [ [[I145]], %[[BB78]] ] -; CHECK-NEXT: [[I175:%.*]] = phi nsz float [ [[I71]], %[[BB64]] ], [ [[I144]], %[[BB78]] ] -; CHECK-NEXT: [[I176:%.*]] = phi nsz float [ [[I70]], %[[BB64]] ], [ [[I126]], %[[BB78]] ] -; CHECK-NEXT: [[I177:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I125]], %[[BB78]] ] -; CHECK-NEXT: [[I178:%.*]] = phi nsz float [ [[I69]], %[[BB64]] ], [ [[I124]], %[[BB78]] ] -; CHECK-NEXT: [[I179:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I123]], %[[BB78]] ] -; CHECK-NEXT: [[I180:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[I106]], %[[BB78]] ] -; CHECK-NEXT: [[I181:%.*]] = phi nsz float [ [[I68]], %[[BB64]] ], [ [[I105]], %[[BB78]] ] -; CHECK-NEXT: [[TMP33:%.*]] = phi nsz float [ [[I67]], %[[BB64]] ], [ [[I104]], %[[BB78]] ] -; CHECK-NEXT: [[I183:%.*]] = phi nsz float [ [[I66]], %[[BB64]] ], [ [[I103]], %[[BB78]] ] +; CHECK-NEXT: [[I168:%.*]] = phi nsz float [ [[I76]], %[[BB64]] ], [ [[OP_RDX1]], %[[BB78]] ] +; CHECK-NEXT: [[I169:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[OP_RDX3]], %[[BB78]] ] +; CHECK-NEXT: [[I170:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[OP_RDX5]], %[[BB78]] ] +; CHECK-NEXT: [[I171:%.*]] = phi nsz float [ [[I75]], %[[BB64]] ], [ [[TMP60]], %[[BB78]] ] +; CHECK-NEXT: [[I172:%.*]] = phi nsz float [ [[I74]], %[[BB64]] ], [ [[OP_RDX9]], %[[BB78]] ] +; CHECK-NEXT: [[I173:%.*]] = phi nsz float [ [[I73]], %[[BB64]] ], [ [[OP_RDX11]], %[[BB78]] ] +; CHECK-NEXT: [[TMP34:%.*]] = phi nsz float [ [[I72]], %[[BB64]] ], [ [[OP_RDX13]], %[[BB78]] ] +; CHECK-NEXT: [[I175:%.*]] = phi nsz float [ [[I71]], %[[BB64]] ], [ [[OP_RDX15]], %[[BB78]] ] +; CHECK-NEXT: [[I176:%.*]] = phi nsz float [ [[I70]], %[[BB64]] ], [ [[TMP40]], %[[BB78]] ] +; CHECK-NEXT: [[I177:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[TMP36]], %[[BB78]] ] +; CHECK-NEXT: [[I178:%.*]] = phi nsz float [ [[I69]], %[[BB64]] ], [ [[TMP53]], %[[BB78]] ] +; CHECK-NEXT: [[I179:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[TMP30]], %[[BB78]] ] +; CHECK-NEXT: [[I180:%.*]] = phi nsz float [ poison, %[[BB64]] ], [ [[TMP39]], %[[BB78]] ] +; CHECK-NEXT: [[I181:%.*]] = phi nsz float [ [[I68]], %[[BB64]] ], [ [[TMP37]], %[[BB78]] ] +; CHECK-NEXT: [[TMP33:%.*]] = phi nsz float [ [[I67]], %[[BB64]] ], [ [[TMP46]], %[[BB78]] ] +; CHECK-NEXT: [[I183:%.*]] = phi nsz float [ [[I66]], %[[BB64]] ], [ [[TMP31]], %[[BB78]] ] ; CHECK-NEXT: store float [[TMP33]], ptr poison, align 1 ; CHECK-NEXT: store float [[TMP34]], ptr poison, align 1 ; CHECK-NEXT: br i1 poison, label %[[BB186:.*]], label %[[BB184:.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll index 8093285ad8717..5fb468c0b0962 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll @@ -8,56 +8,35 @@ define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]] -; CHECK-NEXT: [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00 -; CHECK-NEXT: [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00 -; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 -; CHECK-NEXT: 
[[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00 -; CHECK-NEXT: [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00 -; CHECK-NEXT: [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00 -; CHECK-NEXT: [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 -; CHECK-NEXT: [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[ARG]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> , [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[ARG3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x float> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[TMP7]], ; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]] ; CHECK: bb18: -; CHECK-NEXT: [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ] -; CHECK-NEXT: [[VAL20:%.*]] = phi float [ [[VAL15]], [[BB]] ] -; CHECK-NEXT: [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ] -; CHECK-NEXT: [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x float> [ [[TMP8]], [[BB:%.*]] ] +; CHECK-NEXT: [[VAL16:%.*]] = extractelement <4 x float> [[TMP8]], i32 2 ; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00 +; CHECK-NEXT: [[VAL17:%.*]] = extractelement <4 x float> [[TMP8]], i32 3 ; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00 ; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]] ; CHECK: bb25: -; CHECK-NEXT: [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ] -; CHECK-NEXT: [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ] -; CHECK-NEXT: [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ] -; CHECK-NEXT: [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ] +; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x float> [ [[TMP9]], [[BB18]] ] ; CHECK-NEXT: br label [[BB30:%.*]] ; CHECK: bb30: ; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] ; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1 -; CHECK-NEXT: [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float -; CHECK-NEXT: [[VAL35:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 1 -; CHECK-NEXT: [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1 -; CHECK-NEXT: [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float -; CHECK-NEXT: [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2 -; CHECK-NEXT: [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1 -; CHECK-NEXT: [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float -; CHECK-NEXT: [[VAL41:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 3 -; CHECK-NEXT: [[VAL42:%.*]] = load i8, ptr [[VAL41]], align 1 -; CHECK-NEXT: [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float -; CHECK-NEXT: [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]] -; CHECK-NEXT: [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]] -; CHECK-NEXT: [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]] -; CHECK-NEXT: [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]] -; CHECK-NEXT: [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]] -; CHECK-NEXT: [[VAL49:%.*]] = fmul fast float [[VAL45]], 
[[VAL27]] -; CHECK-NEXT: [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]] -; CHECK-NEXT: [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]] -; CHECK-NEXT: [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]] -; CHECK-NEXT: [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]] -; CHECK-NEXT: [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]] +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = uitofp <4 x i8> [[TMP13]] to <4 x float> +; CHECK-NEXT: [[TMP15:%.*]] = fsub fast <4 x float> [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[TMP16:%.*]] = fmul fast <4 x float> [[TMP15]], [[TMP12]] +; CHECK-NEXT: [[VAL54:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP16]]) ; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]]) ; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]]) ; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]]) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll index 430a46beace9a..9d2e22bb454e4 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll @@ -600,25 +600,34 @@ define i32 @dot_product_i32_reorder(ptr %a, ptr %b) { } define float @dot_product_fp32(ptr %a, ptr %b) { -; CHECK-LABEL: @dot_product_fp32( -; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; CHECK-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4 -; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1 -; CHECK-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4 -; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 -; CHECK-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 -; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; CHECK-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4 -; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1 -; CHECK-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 -; CHECK-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 -; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]] -; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]] -; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] -; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]] -; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] -; CHECK-NEXT: ret float [[ADD_1]] +; NON-POW2-LABEL: @dot_product_fp32( +; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 +; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4 +; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]] +; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]]) +; NON-POW2-NEXT: ret float [[TMP4]] +; +; POW2-ONLY-LABEL: @dot_product_fp32( +; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 +; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4 +; 
POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1 +; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4 +; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 +; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 +; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4 +; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1 +; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 +; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]] +; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]] +; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] +; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]] +; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; POW2-ONLY-NEXT: ret float [[ADD_1]] ; %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0 %l.a.0 = load float, ptr %gep.a.0, align 4 @@ -646,25 +655,34 @@ define float @dot_product_fp32(ptr %a, ptr %b) { ; Same as above, except the reduction order has been perturbed. This ; is checking for our ability to reorder. define float @dot_product_fp32_reorder(ptr %a, ptr %b) { -; CHECK-LABEL: @dot_product_fp32_reorder( -; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 -; CHECK-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4 -; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1 -; CHECK-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4 -; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 -; CHECK-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 -; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 -; CHECK-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4 -; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1 -; CHECK-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 -; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 -; CHECK-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 -; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]] -; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]] -; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] -; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_1]], [[MUL_0]] -; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] -; CHECK-NEXT: ret float [[ADD_1]] +; NON-POW2-LABEL: @dot_product_fp32_reorder( +; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i32 0 +; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP_A_0]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[GEP_B_0]], align 4 +; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x float> [[TMP1]], [[TMP2]] +; NON-POW2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP3]]) +; NON-POW2-NEXT: ret float [[TMP4]] +; +; POW2-ONLY-LABEL: @dot_product_fp32_reorder( +; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds float, 
ptr [[A:%.*]], i32 0 +; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4 +; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i32 1 +; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4 +; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 +; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load float, ptr [[GEP_A_2]], align 4 +; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i32 0 +; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_0]], align 4 +; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B]], i32 1 +; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4 +; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 +; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4 +; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[L_A_0]], [[L_B_0]] +; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[L_A_1]], [[L_B_1]] +; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[L_A_2]], [[L_B_2]] +; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_1]], [[MUL_0]] +; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]] +; POW2-ONLY-NEXT: ret float [[ADD_1]] ; %gep.a.0 = getelementptr inbounds float, ptr %a, i32 0 %l.a.0 = load float, ptr %gep.a.0, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll index 0879ec239e287..a333d162297bc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll @@ -95,47 +95,12 @@ define float @dot4f32(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %pt } define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(32) %ptry) { -; SSE2-LABEL: @dot4f64_fast( -; SSE2-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 -; SSE2-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] -; SSE2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) -; SSE2-NEXT: ret double [[TMP4]] -; -; SSE4-LABEL: @dot4f64_fast( -; SSE4-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 -; SSE4-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 -; SSE4-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] -; SSE4-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) -; SSE4-NEXT: ret double [[TMP4]] -; -; AVX-LABEL: @dot4f64_fast( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 -; AVX-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] -; AVX-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) -; AVX-NEXT: ret double [[TMP4]] -; -; AVX2-LABEL: @dot4f64_fast( -; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds double, ptr [[PTRX:%.*]], i64 1 -; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds double, ptr [[PTRY:%.*]], i64 1 -; AVX2-NEXT: [[PTRX2:%.*]] = getelementptr inbounds double, ptr [[PTRX]], i64 2 -; AVX2-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, ptr [[PTRY]], i64 2 -; AVX2-NEXT: [[X0:%.*]] = load double, ptr [[PTRX]], align 4 -; AVX2-NEXT: [[Y0:%.*]] = load double, ptr [[PTRY]], align 4 -; 
AVX2-NEXT: [[X1:%.*]] = load double, ptr [[PTRX1]], align 4 -; AVX2-NEXT: [[Y1:%.*]] = load double, ptr [[PTRY1]], align 4 -; AVX2-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] -; AVX2-NEXT: [[MUL1:%.*]] = fmul double [[X1]], [[Y1]] -; AVX2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[PTRX2]], align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[PTRY2]], align 4 -; AVX2-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[MUL1]] -; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0 -; AVX2-NEXT: [[DOT012:%.*]] = fadd fast double [[DOT01]], [[TMP4]] -; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 -; AVX2-NEXT: [[DOT0123:%.*]] = fadd fast double [[DOT012]], [[TMP5]] -; AVX2-NEXT: ret double [[DOT0123]] +; CHECK-LABEL: @dot4f64_fast( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x double>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) +; CHECK-NEXT: ret double [[TMP4]] ; %ptrx1 = getelementptr inbounds double, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds double, ptr %ptry, i64 1 @@ -162,47 +127,12 @@ define double @dot4f64_fast(ptr dereferenceable(32) %ptrx, ptr dereferenceable(3 } define float @dot4f32_fast(ptr dereferenceable(16) %ptrx, ptr dereferenceable(16) %ptry) { -; SSE2-LABEL: @dot4f32_fast( -; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 -; SSE2-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] -; SSE2-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) -; SSE2-NEXT: ret float [[TMP4]] -; -; SSE4-LABEL: @dot4f32_fast( -; SSE4-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 -; SSE4-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 -; SSE4-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] -; SSE4-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) -; SSE4-NEXT: ret float [[TMP4]] -; -; AVX-LABEL: @dot4f32_fast( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 -; AVX-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] -; AVX-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) -; AVX-NEXT: ret float [[TMP4]] -; -; AVX2-LABEL: @dot4f32_fast( -; AVX2-NEXT: [[PTRX1:%.*]] = getelementptr inbounds float, ptr [[PTRX:%.*]], i64 1 -; AVX2-NEXT: [[PTRY1:%.*]] = getelementptr inbounds float, ptr [[PTRY:%.*]], i64 1 -; AVX2-NEXT: [[PTRX2:%.*]] = getelementptr inbounds float, ptr [[PTRX]], i64 2 -; AVX2-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, ptr [[PTRY]], i64 2 -; AVX2-NEXT: [[X0:%.*]] = load float, ptr [[PTRX]], align 4 -; AVX2-NEXT: [[Y0:%.*]] = load float, ptr [[PTRY]], align 4 -; AVX2-NEXT: [[X1:%.*]] = load float, ptr [[PTRX1]], align 4 -; AVX2-NEXT: [[Y1:%.*]] = load float, ptr [[PTRY1]], align 4 -; AVX2-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] -; AVX2-NEXT: [[MUL1:%.*]] = fmul float [[X1]], [[Y1]] -; AVX2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[PTRX2]], align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <2 x 
float>, ptr [[PTRY2]], align 4 -; AVX2-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[MUL1]] -; AVX2-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; AVX2-NEXT: [[DOT012:%.*]] = fadd fast float [[DOT01]], [[TMP4]] -; AVX2-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; AVX2-NEXT: [[DOT0123:%.*]] = fadd fast float [[DOT012]], [[TMP5]] -; AVX2-NEXT: ret float [[DOT0123]] +; CHECK-LABEL: @dot4f32_fast( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTRX:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTRY:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) +; CHECK-NEXT: ret float [[TMP4]] ; %ptrx1 = getelementptr inbounds float, ptr %ptrx, i64 1 %ptry1 = getelementptr inbounds float, ptr %ptry, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index 0bbdeb55e1516..378d3b00b1158 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -73,41 +73,14 @@ define float @bazz() { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 -; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 -; CHECK-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 -; CHECK-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 -; CHECK-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]] ; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float -; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 4), align 16 -; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 4), align 16 -; CHECK-NEXT: [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]] -; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 5), align 4 -; CHECK-NEXT: 
[[TMP12:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 5), align 4 -; CHECK-NEXT: [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]] -; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]] -; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 6), align 8 -; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 6), align 8 -; CHECK-NEXT: [[MUL18_2:%.*]] = fmul fast float [[TMP14]], [[TMP13]] -; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[MUL18_2]], [[ADD19_1]] -; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 7), align 4 -; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 7), align 4 -; CHECK-NEXT: [[MUL18_3:%.*]] = fmul fast float [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[MUL18_3]], [[ADD19_2]] +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]] +; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]] ; CHECK-NEXT: store float [[OP_RDX1]], ptr @res, align 4 ; CHECK-NEXT: ret float [[OP_RDX1]] ; @@ -116,41 +89,14 @@ define float @bazz() { ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 -; THRESHOLD-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[MUL4]], [[CONV]] -; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 -; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 -; THRESHOLD-NEXT: [[MUL4_1:%.*]] = fmul fast float [[TMP4]], [[TMP3]] -; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[MUL4_1]], [[ADD]] -; THRESHOLD-NEXT: [[TMP5:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 -; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 -; THRESHOLD-NEXT: [[MUL4_2:%.*]] = fmul fast float [[TMP6]], [[TMP5]] -; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float [[MUL4_2]], [[ADD_1]] -; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 -; THRESHOLD-NEXT: [[TMP8:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 -; THRESHOLD-NEXT: [[MUL4_3:%.*]] = fmul fast float [[TMP8]], [[TMP7]] -; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float [[MUL4_3]], [[ADD_2]] ; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 ; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float -; THRESHOLD-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]] -; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 4), align 16 -; THRESHOLD-NEXT: 
[[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 4), align 16 -; THRESHOLD-NEXT: [[MUL18:%.*]] = fmul fast float [[TMP10]], [[TMP9]] -; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL18]], [[ADD7]] -; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 5), align 4 -; THRESHOLD-NEXT: [[TMP12:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 5), align 4 -; THRESHOLD-NEXT: [[MUL18_1:%.*]] = fmul fast float [[TMP12]], [[TMP11]] -; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float [[MUL18_1]], [[ADD19]] -; THRESHOLD-NEXT: [[TMP13:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 6), align 8 -; THRESHOLD-NEXT: [[TMP14:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 6), align 8 -; THRESHOLD-NEXT: [[MUL18_2:%.*]] = fmul fast float [[TMP14]], [[TMP13]] -; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float [[MUL18_2]], [[ADD19_1]] -; THRESHOLD-NEXT: [[TMP15:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 7), align 4 -; THRESHOLD-NEXT: [[TMP16:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 7), align 4 -; THRESHOLD-NEXT: [[MUL18_3:%.*]] = fmul fast float [[TMP16]], [[TMP15]] -; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[MUL18_3]], [[ADD19_2]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr @arr, align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr @arr1, align 16 +; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP4]], [[CONV]] +; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[CONV6]] ; THRESHOLD-NEXT: store float [[OP_RDX1]], ptr @res, align 4 ; THRESHOLD-NEXT: ret float [[OP_RDX1]] ; @@ -202,21 +148,10 @@ define float @bazzz() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 -; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 -; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]] -; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 -; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 -; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr 
@arr, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; CHECK-NEXT: store float [[TMP5]], ptr @res, align 4 ; CHECK-NEXT: ret float [[TMP5]] @@ -225,21 +160,10 @@ define float @bazzz() { ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 -; THRESHOLD-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 -; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 -; THRESHOLD-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] -; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]] -; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 -; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 -; THRESHOLD-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] -; THRESHOLD-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]] -; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 -; THRESHOLD-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 -; THRESHOLD-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 +; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; THRESHOLD-NEXT: store float [[TMP5]], ptr @res, align 4 ; THRESHOLD-NEXT: ret float [[TMP5]] @@ -272,21 +196,10 @@ define i32 @foo() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 -; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 -; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]] -; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 -; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd fast float 
[[MUL_2]], [[TMP12]] -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 -; CHECK-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; CHECK-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32 ; CHECK-NEXT: store i32 [[CONV4]], ptr @n, align 4 @@ -296,21 +209,10 @@ define i32 @foo() { ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, ptr @n, align 4 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, ptr @arr, align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, ptr @arr1, align 16 -; THRESHOLD-NEXT: [[MUL:%.*]] = fmul fast float [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 1), align 4 -; THRESHOLD-NEXT: [[TMP11:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 1), align 4 -; THRESHOLD-NEXT: [[MUL_1:%.*]] = fmul fast float [[TMP11]], [[TMP3]] -; THRESHOLD-NEXT: [[TMP12:%.*]] = fadd fast float [[MUL_1]], [[MUL]] -; THRESHOLD-NEXT: [[TMP6:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 2), align 8 -; THRESHOLD-NEXT: [[TMP7:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 2), align 8 -; THRESHOLD-NEXT: [[MUL_2:%.*]] = fmul fast float [[TMP7]], [[TMP6]] -; THRESHOLD-NEXT: [[TMP8:%.*]] = fadd fast float [[MUL_2]], [[TMP12]] -; THRESHOLD-NEXT: [[TMP9:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr, i64 0, i64 3), align 4 -; THRESHOLD-NEXT: [[TMP10:%.*]] = load float, ptr getelementptr inbounds ([20 x float], ptr @arr1, i64 0, i64 3), align 4 -; THRESHOLD-NEXT: [[MUL_3:%.*]] = fmul fast float [[TMP10]], [[TMP9]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast float [[MUL_3]], [[TMP8]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr @arr, align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr @arr1, align 16 +; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP3]]) ; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast float [[CONV]], [[TMP4]] ; THRESHOLD-NEXT: [[CONV4:%.*]] = fptosi float [[TMP5]] to i32 ; THRESHOLD-NEXT: store i32 [[CONV4]], ptr @n, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll index 45279296d296a..8ba2859add9e0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll @@ -11,66 +11,23 @@ define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1 -; CHECK-NEXT: [[LD1_0:%.*]] = load 
double, ptr [[GEP1_0]], align 8 -; CHECK-NEXT: [[LD0_0:%.*]] = load double, ptr [[ARG1:%.*]], align 8 -; CHECK-NEXT: [[MUL1_0:%.*]] = fmul fast double [[LD0_0]], [[LD1_0]] -; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 16 -; CHECK-NEXT: [[LD2_0:%.*]] = load double, ptr [[GEP2_0]], align 8 -; CHECK-NEXT: [[MUL2_0:%.*]] = fmul fast double [[LD2_0]], [[LD1_0]] -; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 3 -; CHECK-NEXT: [[LD1_1:%.*]] = load double, ptr [[GEP1_1]], align 8 -; CHECK-NEXT: [[GEP0_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 1 -; CHECK-NEXT: [[LD0_1:%.*]] = load double, ptr [[GEP0_1]], align 8 -; CHECK-NEXT: [[MUL1_1:%.*]] = fmul fast double [[LD0_1]], [[LD1_1]] -; CHECK-NEXT: [[RDX1_0:%.*]] = fadd fast double [[MUL1_0]], [[MUL1_1]] -; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 17 -; CHECK-NEXT: [[LD2_1:%.*]] = load double, ptr [[GEP2_1]], align 8 -; CHECK-NEXT: [[MUL2_1:%.*]] = fmul fast double [[LD2_1]], [[LD1_1]] -; CHECK-NEXT: [[RDX2_0:%.*]] = fadd fast double [[MUL2_0]], [[MUL2_1]] -; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 5 -; CHECK-NEXT: [[LD1_2:%.*]] = load double, ptr [[GEP1_2]], align 8 -; CHECK-NEXT: [[GEP0_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 2 -; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 18 -; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 7 -; CHECK-NEXT: [[LD1_3:%.*]] = load double, ptr [[GEP1_3]], align 8 -; CHECK-NEXT: [[GEP1_4:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 9 -; CHECK-NEXT: [[LD1_4:%.*]] = load double, ptr [[GEP1_4]], align 8 -; CHECK-NEXT: [[GEP1_5:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 11 -; CHECK-NEXT: [[LD1_5:%.*]] = load double, ptr [[GEP1_5]], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP0_2]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[LD1_2]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[LD1_3]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[LD1_4]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[LD1_5]], i32 3 +; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP2_0]], align 8 +; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 20 +; CHECK-NEXT: [[TMP1:%.*]] = call <15 x double> @llvm.masked.load.v15f64.p0(ptr [[GEP1_0]], i32 8, <15 x i1> , <15 x double> poison) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <8 x double> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[TMP0]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP12]]) ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x double>, ptr [[GEP2_2]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <4 x double> [[TMP6]], [[TMP4]] -; CHECK-NEXT: [[GEP1_6:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 13 -; CHECK-NEXT: [[LD1_6:%.*]] = load double, ptr [[GEP1_6]], align 8 -; 
CHECK-NEXT: [[GEP0_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 6 -; CHECK-NEXT: [[LD0_6:%.*]] = load double, ptr [[GEP0_6]], align 8 -; CHECK-NEXT: [[MUL1_6:%.*]] = fmul fast double [[LD0_6]], [[LD1_6]] -; CHECK-NEXT: [[GEP2_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 22 -; CHECK-NEXT: [[LD2_6:%.*]] = load double, ptr [[GEP2_6]], align 8 -; CHECK-NEXT: [[MUL2_6:%.*]] = fmul fast double [[LD2_6]], [[LD1_6]] -; CHECK-NEXT: [[GEP1_7:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 15 -; CHECK-NEXT: [[LD1_7:%.*]] = load double, ptr [[GEP1_7]], align 8 -; CHECK-NEXT: [[GEP0_7:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 7 -; CHECK-NEXT: [[LD0_7:%.*]] = load double, ptr [[GEP0_7]], align 8 -; CHECK-NEXT: [[MUL1_7:%.*]] = fmul fast double [[LD0_7]], [[LD1_7]] -; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP5]]) -; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast double [[TMP10]], [[MUL1_6]] -; CHECK-NEXT: [[OP_RDX4:%.*]] = fadd fast double [[MUL1_7]], [[RDX1_0]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd fast double [[OP_RDX3]], [[OP_RDX4]] -; CHECK-NEXT: [[GEP2_7:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 23 -; CHECK-NEXT: [[LD2_7:%.*]] = load double, ptr [[GEP2_7]], align 8 -; CHECK-NEXT: [[MUL2_7:%.*]] = fmul fast double [[LD2_7]], [[LD1_7]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x double> [[TMP5]], [[TMP10]] ; CHECK-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP7]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP11]], [[MUL2_6]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast double [[MUL2_7]], [[RDX2_0]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd fast double [[OP_RDX]], [[OP_RDX1]] ; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0 -; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1 +; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP11]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true)) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll index 33c281d3f0166..f0272d591f0c3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll @@ -6,25 +6,9 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg define void @rdx_feeds_single_insert(<2 x double> %v, ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) { ; CHECK-LABEL: @rdx_feeds_single_insert( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[LD0_0:%.*]] = load double, ptr [[ARG1:%.*]], align 8 -; CHECK-NEXT: [[MUL1_0:%.*]] = fmul fast double [[LD0_0]], 1.000000e+01 -; CHECK-NEXT: [[GEP0_1:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 1 -; CHECK-NEXT: [[LD0_1:%.*]] = load double, ptr [[GEP0_1]], align 8 -; CHECK-NEXT: [[MUL1_1:%.*]] = fmul fast double [[LD0_1]], 1.100000e+01 -; CHECK-NEXT: [[RDX1_0:%.*]] = fadd fast double [[MUL1_0]], [[MUL1_1]] -; CHECK-NEXT: 
[[GEP0_2:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP0_2]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <4 x double> [[TMP0]], -; CHECK-NEXT: [[GEP0_6:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 6 -; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP1]]) -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[GEP0_6]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <2 x double> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[RDX1_0]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast double [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[ARG1:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <8 x double> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP1]]) ; CHECK-NEXT: [[I:%.*]] = insertelement <2 x double> [[V:%.*]], double [[TMP2]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true)) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll index 359c24b00e92e..a70a1f67354c2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes=slp-vectorizer -S -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=SSE4 ; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX -; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX +; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX2 ; This test checks for a case when a horizontal reduction of floating-point ; adds may look profitable, but is not because it eliminates generation of @@ -34,6 +34,19 @@ define void @hr() { ; AVX-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] ; AVX: exit: ; AVX-NEXT: ret void +; +; AVX2-LABEL: @hr( +; AVX2-NEXT: br label [[LOOP:%.*]] +; AVX2: loop: +; AVX2-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ] +; AVX2-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x double> , double [[CVT0]], i32 0 +; AVX2-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]] +; AVX2-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]]) +; AVX2-NEXT: [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]] +; AVX2-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]] +; AVX2: exit: +; AVX2-NEXT: ret void ; br label %loop @@ -80,6 +93,15 @@ define double @hr_or_mul() { ; 
AVX-NEXT: [[MUL3:%.*]] = fmul fast double 9.500000e+00, [[CVT0]] ; AVX-NEXT: [[ADD3:%.*]] = fadd fast double [[MUL3]], [[ADD2]] ; AVX-NEXT: ret double [[ADD3]] +; +; AVX2-LABEL: @hr_or_mul( +; AVX2-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0 +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer +; AVX2-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> , [[TMP2]] +; AVX2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]]) +; AVX2-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]] +; AVX2-NEXT: ret double [[OP_RDX]] ; %cvt0 = uitofp i16 3 to double %mul0 = fmul fast double 7.000000e+00, %cvt0 From 5bbf933ff0236d13b06b13f2d0e0c4289f64111b Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 8 Aug 2025 20:26:50 +0000 Subject: [PATCH 2/2] Fix formatting Created using spr 1.3.5 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index afdf73ab58184..9d5b73fc85779 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -23720,8 +23720,8 @@ class HorizontalReduction { FMF &= FPCI->getFastMathFlags(); Ops.push_back(RdxVal->user_back()); } - FMACost = canConvertToFMA( - Ops, getSameOpcode(Ops, TLI), DT, DL, *TTI, TLI); + FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL, + *TTI, TLI); if (FMACost.isValid()) { // Calculate actual FMAD cost. IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy, @@ -23732,7 +23732,8 @@ class HorizontalReduction { // Also, exclude vector fmul cost. InstructionCost FMulCost = TTI->getArithmeticInstrCost( Instruction::FMul, RVecTy, CostKind); - LLVM_DEBUG(dbgs() << "Minus vector FMul cost: " << FMulCost << "\n"); + LLVM_DEBUG(dbgs() + << "Minus vector FMul cost: " << FMulCost << "\n"); FMACost -= FMulCost; } }
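
Note on the cost bookkeeping in the hunk above (illustration only, not part of the patch): when the reduced fadd operands can fold into FMAs, the scalar side is charged the FMA cost minus the scalar fmul that is already costed elsewhere in the tree, and the vector side is charged the llvm.fmuladd intrinsic cost minus the vector fmul, on top of the reduce.fadd itself. The standalone C++ sketch below models that arithmetic with made-up cost numbers; the Costs struct and its values are hypothetical placeholders, not real TTI results.

    #include <iostream>

    // Toy per-instruction costs for a 4-wide fadd reduction whose operands
    // are fmuls (a dot product). All numbers are invented for illustration.
    struct Costs {
      int scalarFMA;    // one scalar fused multiply-add
      int scalarFMul;   // one scalar fmul
      int vectorFMA;    // one vector llvm.fmuladd
      int vectorFMul;   // one vector fmul
      int vectorReduce; // one llvm.vector.reduce.fadd
    };

    int main() {
      Costs C{1, 1, 2, 2, 3};
      unsigned Width = 4;

      // Scalar chain: each fadd folds with its fmul into an FMA; the fmul is
      // costed elsewhere, so only (FMA - fmul) is attributed to the chain.
      int ScalarCost = Width * (C.scalarFMA - C.scalarFMul);

      // Vectorized form: one vector fmuladd (minus the vector fmul already
      // costed in the tree) plus the reduction intrinsic.
      int VectorCost = (C.vectorFMA - C.vectorFMul) + C.vectorReduce;

      std::cout << "scalar=" << ScalarCost << " vector=" << VectorCost
                << (VectorCost <= ScalarCost ? " -> vectorize\n"
                                             : " -> keep scalar\n");
      return 0;
    }

With these toy numbers the scalar FMAs win (0 vs. 3), which is exactly the situation the slp-fma-loss.ll test guards against: a reduction that looks profitable until the lost FMA formation is charged back.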