diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 040e2dafb56a6..e18ff6fed7eab 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23842,7 +23842,8 @@ class HorizontalReduction {
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
   Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
-                     const TargetLibraryInfo &TLI, AssumptionCache *AC) {
+                     const TargetLibraryInfo &TLI, AssumptionCache *AC,
+                     DominatorTree &DT) {
     constexpr unsigned RegMaxNumber = 4;
     constexpr unsigned RedValsMaxNumber = 128;
     // If there are a sufficient number of reduction values, reduce
@@ -24241,7 +24242,7 @@ class HorizontalReduction {
       // Estimate cost.
       InstructionCost ReductionCost =
-          getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
+          getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
       InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
       LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for reduction\n");
@@ -24546,7 +24547,9 @@ class HorizontalReduction {
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                    ArrayRef<Value *> ReducedVals,
                                    bool IsCmpSelMinMax, FastMathFlags FMF,
-                                   const BoUpSLP &R) {
+                                   const BoUpSLP &R, DominatorTree &DT,
+                                   const DataLayout &DL,
+                                   const TargetLibraryInfo &TLI) {
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     Type *ScalarTy = ReducedVals.front()->getType();
     unsigned ReduxWidth = ReducedVals.size();
@@ -24571,6 +24574,22 @@ class HorizontalReduction {
         for (User *U : RdxVal->users()) {
           auto *RdxOp = cast<Instruction>(U);
           if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+            if (RdxKind == RecurKind::FAdd) {
+              InstructionCost FMACost = canConvertToFMA(
+                  RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
+              if (FMACost.isValid()) {
+                LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
+                if (auto *I = dyn_cast<Instruction>(RdxVal)) {
+                  // Also, exclude scalar fmul cost.
+                  InstructionCost FMulCost =
+                      TTI->getInstructionCost(I, CostKind);
+                  LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
+                  FMACost -= FMulCost;
+                }
+                ScalarCost += FMACost;
+                continue;
+              }
+            }
             ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
             continue;
           }
@@ -24635,8 +24654,43 @@ class HorizontalReduction {
       auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
          std::make_pair(RedTy, true));
       VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
-      VectorCost +=
-          TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
+      InstructionCost FMACost = InstructionCost::getInvalid();
+      if (RdxKind == RecurKind::FAdd) {
+        // Check if the reduction operands can be converted to FMA.
+        SmallVector<Value *> Ops;
+        FastMathFlags FMF;
+        FMF.set();
+        for (Value *RdxVal : ReducedVals) {
+          if (!RdxVal->hasOneUse()) {
+            Ops.clear();
+            break;
+          }
+          if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
+            FMF &= FPCI->getFastMathFlags();
+          Ops.push_back(RdxVal->user_back());
+        }
+        FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
+                                  *TTI, TLI);
+        if (FMACost.isValid()) {
+          // Calculate actual FMAD cost.
+          IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
+                                      {RVecTy, RVecTy, RVecTy}, FMF);
+          FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
+          LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
+          // Also, exclude vector fmul cost.
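+          // The fmul operands are expected to fold into the fmuladd, so the
+          // vector fmul cost is subtracted below to avoid double-counting.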
+          InstructionCost FMulCost = TTI->getArithmeticInstrCost(
+              Instruction::FMul, RVecTy, CostKind);
+          LLVM_DEBUG(dbgs()
+                     << "Minus vector FMul cost: " << FMulCost << "\n");
+          FMACost -= FMulCost;
+        }
+      }
+      if (FMACost.isValid())
+        VectorCost += FMACost;
+      else
+        VectorCost +=
+            TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
       if (RType != RedTy) {
         unsigned Opcode = Instruction::Trunc;
         if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
@@ -25304,7 +25358,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
     HorizontalReduction HorRdx;
     if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
       return nullptr;
-    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
+    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
   };
   auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
     if (TryOperandsAsNewSeeds && FutureSeed == Root) {
@@ -25449,7 +25503,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
     if (RedCost >= ScalarCost)
       return false;
-    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
   };
   if (Candidates.size() == 1)
     return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
index 9cc6b8739b20f..9d2e22bb454e4 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll
@@ -709,34 +709,25 @@ define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
 define double @dot_product_fp64(ptr %a, ptr %b) {
-; NON-POW2-LABEL: @dot_product_fp64(
-; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
-; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
-; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr [[GEP_A_0]], align 4
-; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4
-; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]]
-; NON-POW2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]])
-; NON-POW2-NEXT: ret double [[TMP4]]
-;
-; POW2-ONLY-LABEL: @dot_product_fp64(
-; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1
-; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
-; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
-; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1
-; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
-; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
-; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]]
-; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]]
-; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]]
-; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
-; POW2-ONLY-NEXT: ret double [[ADD_1]]
+; CHECK-LABEL: @dot_product_fp64(
+; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
+; CHECK-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
+; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1
+; CHECK-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
+; CHECK-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
+; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
+; CHECK-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
+; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1
+; CHECK-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
+; CHECK-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]]
+; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]]
+; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
+; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
+; CHECK-NEXT: ret double [[ADD_1]]
 ;
   %gep.a.0 = getelementptr inbounds double, ptr %a, i32 0
   %l.a.0 = load double, ptr %gep.a.0, align 4
@@ -793,21 +784,13 @@ entry:
 }
 define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) {
-; NON-POW2-LABEL: @reduce_fadd_after_fmul_of_buildvec(
-; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[A:%.*]], i32 0
-; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[B:%.*]], i32 1
-; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[C:%.*]], i32 2
-; NON-POW2-NEXT: [[TMP4:%.*]] = fmul fast <3 x float> [[TMP3]], splat (float 1.000000e+01)
-; NON-POW2-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP4]])
-; NON-POW2-NEXT: ret float [[TMP5]]
-;
-; POW2-ONLY-LABEL: @reduce_fadd_after_fmul_of_buildvec(
-; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
-; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
-; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
-; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
-; POW2-ONLY-NEXT: ret float [[ADD_1]]
+; CHECK-LABEL: @reduce_fadd_after_fmul_of_buildvec(
+; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
+; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
+; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
+; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
+; CHECK-NEXT: ret float [[ADD_1]]
 ;
   %mul.0 = fmul fast float %a, 10.0
   %mul.1 = fmul fast float %b, 10.0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
index 1922e935cee4b..f921278cdecf3 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll
@@ -10,19 +10,24 @@
 declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg, <2 x i1>)
 define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64>
+; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
 ; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x double> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP7]])
-; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP5]])
-; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
-; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP2_0]], align 8
+; CHECK-NEXT: [[GEP2_4:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 20
+; CHECK-NEXT: [[TMP1:%.*]] = call <15 x double> @llvm.masked.load.v15f64.p0(ptr [[GEP1_0]], i32 8, <15 x i1> , <15 x double> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <8 x double> [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x double> [[TMP0]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP4]])
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x double>, ptr [[GEP2_4]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <4 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[RDX_OP:%.*]] = fadd fast <4 x double> [[TMP6]], [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[RDX_OP]])
+; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
+; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP11]], i64 1
 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64>
 ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true))
 ; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
index 82fb5a46fee7c..8b65461028fb1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=slp-vectorizer -S -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=SSE4
 ; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX
-; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX
+; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX2
 
 ; This test checks for a case when a horizontal reduction of floating-point
 ; adds may look profitable, but is not because it eliminates generation of
@@ -26,13 +26,27 @@ define void @hr() {
 ; AVX: loop:
 ; AVX-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[ADD3:%.*]], [[LOOP]] ]
 ; AVX-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double
-; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x double> , double [[CVT0]], i32 0
-; AVX-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
-; AVX-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
-; AVX-NEXT: [[ADD3]] = fadd fast double [[TMP3]], [[PHI0]]
+; AVX-NEXT: [[MUL0:%.*]] = fmul fast double 0.000000e+00, [[CVT0]]
+; AVX-NEXT: [[ADD0:%.*]] = fadd fast double [[MUL0]], [[PHI0]]
+; AVX-NEXT: [[ADD1:%.*]] = fadd fast double 0.000000e+00, [[ADD0]]
+; AVX-NEXT: [[ADD2:%.*]] = fadd fast double 0.000000e+00, [[ADD1]]
+; AVX-NEXT: [[ADD3]] = fadd fast double 0.000000e+00, [[ADD2]]
 ; AVX-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]]
 ; AVX: exit:
 ; AVX-NEXT: ret void
+;
+; AVX2-LABEL: @hr(
+; AVX2-NEXT: br label [[LOOP:%.*]]
+; AVX2: loop:
+; AVX2-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ]
+; AVX2-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double
+; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x double> , double [[CVT0]], i32 0
+; AVX2-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
+; AVX2-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
+; AVX2-NEXT: [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]]
+; AVX2-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]]
+; AVX2: exit:
+; AVX2-NEXT: ret void
 ;
   br label %loop
@@ -70,12 +84,24 @@ define double @hr_or_mul() {
 ;
 ; AVX-LABEL: @hr_or_mul(
 ; AVX-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double
-; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer
-; AVX-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> , [[TMP2]]
-; AVX-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
+; AVX-NEXT: [[TMP4:%.*]] = fmul fast double 7.000000e+00, [[CVT0]]
 ; AVX-NEXT: [[ADD3:%.*]] = fadd fast double [[TMP4]], [[CVT0]]
-; AVX-NEXT: ret double [[ADD3]]
+; AVX-NEXT: [[MUL1:%.*]] = fmul fast double -4.300000e+01, [[CVT0]]
+; AVX-NEXT: [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD3]]
+; AVX-NEXT: [[MUL2:%.*]] = fmul fast double 2.200000e-02, [[CVT0]]
+; AVX-NEXT: [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]]
+; AVX-NEXT: [[MUL3:%.*]] = fmul fast double 9.500000e+00, [[CVT0]]
+; AVX-NEXT: [[ADD4:%.*]] = fadd fast double [[MUL3]], [[ADD2]]
+; AVX-NEXT: ret double [[ADD4]]
+;
+; AVX2-LABEL: @hr_or_mul(
+; AVX2-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double
+; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer
+; AVX2-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> , [[TMP2]]
+; AVX2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
+; AVX2-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]]
+; AVX2-NEXT: ret double [[OP_RDX]]
 ;
   %cvt0 = uitofp i16 3 to double
   %mul0 = fmul fast double 7.000000e+00, %cvt0