diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index aee9f0d36b3f0..82774de985d61 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8851,6 +8851,28 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
   return SDValue();
 }
 
+static SDValue determineFloatSign(SDValue N, SelectionDAG &DAG, bool Positive) {
+  SDLoc DL(N);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = N->getValueType(0);
+  EVT IntVT = VT.changeTypeToInteger();
+  SDValue NTrunc = N;
+  if (!TLI.isTypeLegal(IntVT)) {
+    EVT FloatVT = VT.changeElementType(*DAG.getContext(), MVT::f32);
+    IntVT = VT.changeElementType(*DAG.getContext(), MVT::i32);
+    NTrunc = DAG.getNode(ISD::FP_ROUND, DL, FloatVT, N,
+                         DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
+  }
+  EVT CCVT =
+      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), IntVT);
+
+  // FIXME: How to support 16-bit/8-bit targets?
+  SDValue IntN = DAG.getNode(ISD::BITCAST, DL, IntVT, NTrunc);
+
+  return DAG.getSetCC(DL, CCVT, IntN, DAG.getConstant(0, DL, IntVT),
+                      Positive ? ISD::SETGE : ISD::SETLT);
+}
+
 SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
                                                 SelectionDAG &DAG) const {
   if (SDValue Expanded = expandVectorNaryOpBySplitting(N, DAG))
@@ -8868,54 +8890,60 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
   // First, implement comparison not propagating NaN. If no native fmin or fmax
   // available, use plain select with setcc instead.
   SDValue MinMax;
-  unsigned CompOpcIeee = IsMax ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
-  unsigned CompOpc = IsMax ? ISD::FMAXNUM : ISD::FMINNUM;
-
-  // FIXME: We should probably define fminnum/fmaxnum variants with correct
-  // signed zero behavior.
-  bool MinMaxMustRespectOrderedZero = false;
-
-  if (isOperationLegalOrCustom(CompOpcIeee, VT)) {
-    MinMax = DAG.getNode(CompOpcIeee, DL, VT, LHS, RHS, Flags);
-    MinMaxMustRespectOrderedZero = true;
-  } else if (isOperationLegalOrCustom(CompOpc, VT)) {
-    MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS, Flags);
-  } else {
-    if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT))
-      return DAG.UnrollVectorOp(N);
-
-    // NaN (if exists) will be propagated later, so orderness doesn't matter.
-    SDValue Compare =
-        DAG.getSetCC(DL, CCVT, LHS, RHS, IsMax ? ISD::SETOGT : ISD::SETOLT);
-    MinMax = DAG.getSelect(DL, VT, Compare, LHS, RHS, Flags);
+  unsigned MinMaxOpcIeee = IsMax ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
+  // TODO: Add ISD::FMAXNUM or ISD::FMINNUM when we are sure that they have
+  // the same behavior on all platforms.
+  unsigned MinMaxOpcNum2019 = IsMax ? ISD::FMAXIMUMNUM : ISD::FMINIMUMNUM;
+  unsigned MinMaxOpc = ISD::DELETED_NODE;
+
+  if (isOperationLegalOrCustom(MinMaxOpcIeee, VT))
+    MinMaxOpc = MinMaxOpcIeee;
+  else if (isOperationLegalOrCustom(MinMaxOpcNum2019, VT))
+    MinMaxOpc = MinMaxOpcNum2019;
+  if (MinMaxOpc != ISD::DELETED_NODE) {
+    // TODO: We have another choice for NaNs:
+    //   if RHS is NaN; then LHS = RHS; fi
+    //   if LHS is NaN; then RHS = LHS; fi
+    //   MinMax = MinMaxOpc(LHS, RHS)
+    // With this we can keep the payloads of NaNs and avoid a load/store,
+    // though it may cause worse performance on platforms with advanced
+    // immediate loading support.
+    MinMax = DAG.getNode(MinMaxOpc, DL, VT, LHS, RHS, Flags);
+    if (!N->getFlags().hasNoNaNs() &&
+        (!DAG.isKnownNeverNaN(RHS) || !DAG.isKnownNeverNaN(LHS))) {
+      ConstantFP *FPNaN = ConstantFP::get(
+          *DAG.getContext(), APFloat::getNaN(VT.getFltSemantics()));
+      MinMax =
+          DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, LHS, RHS, ISD::SETUO),
+                        DAG.getConstantFP(*FPNaN, DL, VT), MinMax, Flags);
+    }
+    return MinMax;
   }
 
-  // Propagate any NaN of both operands
-  if (!N->getFlags().hasNoNaNs() &&
-      (!DAG.isKnownNeverNaN(RHS) || !DAG.isKnownNeverNaN(LHS))) {
-    ConstantFP *FPNaN = ConstantFP::get(*DAG.getContext(),
-                                        APFloat::getNaN(VT.getFltSemantics()));
-    MinMax = DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, LHS, RHS, ISD::SETUO),
-                           DAG.getConstantFP(*FPNaN, DL, VT), MinMax, Flags);
-  }
+  if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT))
+    return DAG.UnrollVectorOp(N);
+
+  if (!Flags.hasNoNaNs() && !DAG.isKnownNeverNaN(RHS))
+    LHS = DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, RHS, RHS, ISD::SETUO),
+                        RHS, LHS, Flags);
+  MinMax = DAG.getSelect(
+      DL, VT,
+      DAG.getSetCC(DL, CCVT, LHS, RHS, IsMax ? ISD::SETUGT : ISD::SETULT), LHS,
+      RHS, Flags);
 
   // fminimum/fmaximum requires -0.0 less than +0.0
-  if (!MinMaxMustRespectOrderedZero && !N->getFlags().hasNoSignedZeros() &&
-      !DAG.isKnownNeverZeroFloat(RHS) && !DAG.isKnownNeverZeroFloat(LHS)) {
-    SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax,
-                                  DAG.getConstantFP(0.0, DL, VT), ISD::SETOEQ);
-    SDValue TestZero =
-        DAG.getTargetConstant(IsMax ? fcPosZero : fcNegZero, DL, MVT::i32);
-    SDValue LCmp = DAG.getSelect(
-        DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, LHS, TestZero), LHS,
-        MinMax, Flags);
-    SDValue RCmp = DAG.getSelect(
-        DL, VT, DAG.getNode(ISD::IS_FPCLASS, DL, CCVT, RHS, TestZero), RHS,
-        LCmp, Flags);
-    MinMax = DAG.getSelect(DL, VT, IsZero, RCmp, MinMax, Flags);
-  }
-
-  return MinMax;
+  bool LHSNotZero = DAG.isKnownNeverZeroFloat(LHS);
+  bool RHSNotZero = DAG.isKnownNeverZeroFloat(RHS);
+  if (Flags.hasNoSignedZeros() || LHSNotZero || RHSNotZero) {
+    return MinMax;
+  }
+  SDValue IsZero = DAG.getSetCC(DL, CCVT, MinMax,
+                                DAG.getConstantFP(0.0, DL, VT), ISD::SETEQ);
+
+  SDValue RetZero =
+      DAG.getSelect(DL, VT, determineFloatSign(LHS, DAG, IsMax ?
true : false), + LHS, MinMax, Flags); + return DAG.getSelect(DL, VT, IsZero, RetZero, MinMax, Flags); } SDValue TargetLowering::expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *Node, diff --git a/llvm/test/CodeGen/Mips/fp-maximum-minimum.ll b/llvm/test/CodeGen/Mips/fp-maximum-minimum.ll new file mode 100644 index 0000000000000..2fe353002fcf9 --- /dev/null +++ b/llvm/test/CodeGen/Mips/fp-maximum-minimum.ll @@ -0,0 +1,960 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=mipsisa32r6 < %s | FileCheck %s --check-prefix=MIPS32R6 +; RUN: llc --mtriple=mips64 -mattr=+mips64r2 < %s | FileCheck %s --check-prefix=MIPS64R2 +; RUN: llc --mtriple=mips64 -mattr=+mips64 < %s | FileCheck %s --check-prefix=MIPS64 +; RUN: llc --mtriple=mips -mattr=+mips32r2 < %s | FileCheck %s --check-prefix=MIPS32R2 +; RUN: llc --mtriple=mips -mattr=+mips32 < %s | FileCheck %s --check-prefix=MIPS32 + +declare float @llvm.maximum.f32(float, float) +declare double @llvm.maximum.f64(double, double) +declare float @llvm.minimum.f32(float, float) +declare double @llvm.minimum.f64(double, double) + +define float @maximum_float(float %x, float %y) { +; MIPS32R6-LABEL: maximum_float: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: max.s $f1, $f12, $f14 +; MIPS32R6-NEXT: cmp.un.s $f0, $f12, $f14 +; MIPS32R6-NEXT: lui $1, %hi($CPI0_0) +; MIPS32R6-NEXT: lwc1 $f2, %lo($CPI0_0)($1) +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: sel.s $f0, $f1, $f2 +; +; MIPS64R2-LABEL: maximum_float: +; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: mov.s $f0, $f13 +; MIPS64R2-NEXT: c.un.s $f13, $f13 +; MIPS64R2-NEXT: movt.s $f12, $f13, $fcc0 +; MIPS64R2-NEXT: c.ole.s $f12, $f13 +; MIPS64R2-NEXT: movf.s $f0, $f12, $fcc0 +; MIPS64R2-NEXT: mfc1 $1, $f12 +; MIPS64R2-NEXT: slti $1, $1, 0 +; MIPS64R2-NEXT: mov.s $f1, $f0 +; MIPS64R2-NEXT: movz.s $f1, $f12, $1 +; MIPS64R2-NEXT: mtc1 $zero, $f2 +; MIPS64R2-NEXT: c.eq.s $f0, $f2 +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: movt.s $f0, $f1, $fcc0 +; +; MIPS64-LABEL: maximum_float: +; MIPS64: # %bb.0: +; MIPS64-NEXT: mov.s $f0, $f13 +; MIPS64-NEXT: c.un.s $f13, $f13 +; MIPS64-NEXT: movt.s $f12, $f13, $fcc0 +; MIPS64-NEXT: c.ole.s $f12, $f13 +; MIPS64-NEXT: movf.s $f0, $f12, $fcc0 +; MIPS64-NEXT: mfc1 $1, $f12 +; MIPS64-NEXT: slti $1, $1, 0 +; MIPS64-NEXT: mov.s $f1, $f0 +; MIPS64-NEXT: movz.s $f1, $f12, $1 +; MIPS64-NEXT: mtc1 $zero, $f2 +; MIPS64-NEXT: c.eq.s $f0, $f2 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: movt.s $f0, $f1, $fcc0 +; +; MIPS32R2-LABEL: maximum_float: +; MIPS32R2: # %bb.0: +; MIPS32R2-NEXT: mov.s $f0, $f14 +; MIPS32R2-NEXT: c.un.s $f14, $f14 +; MIPS32R2-NEXT: movt.s $f12, $f14, $fcc0 +; MIPS32R2-NEXT: c.ole.s $f12, $f14 +; MIPS32R2-NEXT: movf.s $f0, $f12, $fcc0 +; MIPS32R2-NEXT: mfc1 $1, $f12 +; MIPS32R2-NEXT: slti $1, $1, 0 +; MIPS32R2-NEXT: mov.s $f1, $f0 +; MIPS32R2-NEXT: movz.s $f1, $f12, $1 +; MIPS32R2-NEXT: mtc1 $zero, $f2 +; MIPS32R2-NEXT: c.eq.s $f0, $f2 +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: movt.s $f0, $f1, $fcc0 +; +; MIPS32-LABEL: maximum_float: +; MIPS32: # %bb.0: +; MIPS32-NEXT: mov.s $f0, $f14 +; MIPS32-NEXT: c.un.s $f14, $f14 +; MIPS32-NEXT: movt.s $f12, $f14, $fcc0 +; MIPS32-NEXT: c.ole.s $f12, $f14 +; MIPS32-NEXT: movf.s $f0, $f12, $fcc0 +; MIPS32-NEXT: mfc1 $1, $f12 +; MIPS32-NEXT: slti $1, $1, 0 +; MIPS32-NEXT: mov.s $f1, $f0 +; MIPS32-NEXT: movz.s $f1, $f12, $1 +; MIPS32-NEXT: mtc1 $zero, $f2 +; MIPS32-NEXT: c.eq.s $f0, $f2 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: movt.s $f0, $f1, $fcc0 +; MIPS32R5-LABEL: maximum_float: +; MIPS32R5: # %bb.0: +; 
MIPS32R5-NEXT: mov.s $f0, $f14 +; MIPS32R5-NEXT: c.un.s $f12, $f12 +; MIPS32R5-NEXT: movt.s $f12, $f14, $fcc0 +; MIPS32R5-NEXT: c.un.s $f14, $f14 +; MIPS32R5-NEXT: movt.s $f0, $f12, $fcc0 +; MIPS32R5-NEXT: c.ule.s $f12, $f0 +; MIPS32R5-NEXT: movf.s $f0, $f12, $fcc0 +; MIPS32R5-NEXT: mfc1 $1, $f12 +; MIPS32R5-NEXT: mov.s $f1, $f0 +; MIPS32R5-NEXT: movz.s $f1, $f12, $1 +; MIPS32R5-NEXT: mtc1 $zero, $f2 +; MIPS32R5-NEXT: c.eq.s $f0, $f2 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: movt.s $f0, $f1, $fcc0 + %z = call float @llvm.maximum.f32(float %x, float %y) + ret float %z +} + +define float @maximum_float_nsz(float %x, float %y) { +; MIPS32R6-LABEL: maximum_float_nsz: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: max.s $f1, $f12, $f14 +; MIPS32R6-NEXT: cmp.un.s $f0, $f12, $f14 +; MIPS32R6-NEXT: lui $1, %hi($CPI1_0) +; MIPS32R6-NEXT: lwc1 $f2, %lo($CPI1_0)($1) +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: sel.s $f0, $f1, $f2 +; +; MIPS64R2-LABEL: maximum_float_nsz: +; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: mov.s $f0, $f13 +; MIPS64R2-NEXT: c.un.s $f13, $f13 +; MIPS64R2-NEXT: movt.s $f12, $f13, $fcc0 +; MIPS64R2-NEXT: c.ole.s $f12, $f13 +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: movf.s $f0, $f12, $fcc0 +; +; MIPS64-LABEL: maximum_float_nsz: +; MIPS64: # %bb.0: +; MIPS64-NEXT: mov.s $f0, $f13 +; MIPS64-NEXT: c.un.s $f13, $f13 +; MIPS64-NEXT: movt.s $f12, $f13, $fcc0 +; MIPS64-NEXT: c.ole.s $f12, $f13 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: movf.s $f0, $f12, $fcc0 +; +; MIPS32R2-LABEL: maximum_float_nsz: +; MIPS32R2: # %bb.0: +; MIPS32R2-NEXT: mov.s $f0, $f14 +; MIPS32R2-NEXT: c.un.s $f14, $f14 +; MIPS32R2-NEXT: movt.s $f12, $f14, $fcc0 +; MIPS32R2-NEXT: c.ole.s $f12, $f14 +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: movf.s $f0, $f12, $fcc0 +; +; MIPS32-LABEL: maximum_float_nsz: +; MIPS32: # %bb.0: +; MIPS32-NEXT: mov.s $f0, $f14 +; MIPS32-NEXT: c.un.s $f14, $f14 +; MIPS32-NEXT: movt.s $f12, $f14, $fcc0 +; MIPS32-NEXT: c.ole.s $f12, $f14 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: movf.s $f0, $f12, $fcc0 +; MIPS32R5-LABEL: maximum_float_nsz: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: mov.s $f0, $f14 +; MIPS32R5-NEXT: c.un.s $f12, $f12 +; MIPS32R5-NEXT: movt.s $f12, $f14, $fcc0 +; MIPS32R5-NEXT: c.un.s $f14, $f14 +; MIPS32R5-NEXT: movt.s $f0, $f12, $fcc0 +; MIPS32R5-NEXT: c.ule.s $f12, $f0 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: movf.s $f0, $f12, $fcc0 + %z = call nsz float @llvm.maximum.f32(float %x, float %y) + ret float %z +} + +define float @maximum_float_nnan(float %x, float %y) { +; MIPS32R6-LABEL: maximum_float_nnan: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.s $f0, $f12, $f14 +; +; MIPS64R2-LABEL: maximum_float_nnan: +; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: mov.s $f0, $f13 +; MIPS64R2-NEXT: c.ole.s $f12, $f13 +; MIPS64R2-NEXT: movf.s $f0, $f12, $fcc0 +; MIPS64R2-NEXT: mfc1 $1, $f12 +; MIPS64R2-NEXT: slti $1, $1, 0 +; MIPS64R2-NEXT: mov.s $f1, $f0 +; MIPS64R2-NEXT: movz.s $f1, $f12, $1 +; MIPS64R2-NEXT: mtc1 $zero, $f2 +; MIPS64R2-NEXT: c.eq.s $f0, $f2 +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: movt.s $f0, $f1, $fcc0 +; +; MIPS64-LABEL: maximum_float_nnan: +; MIPS64: # %bb.0: +; MIPS64-NEXT: mov.s $f0, $f13 +; MIPS64-NEXT: c.ole.s $f12, $f13 +; MIPS64-NEXT: movf.s $f0, $f12, $fcc0 +; MIPS64-NEXT: mfc1 $1, $f12 +; MIPS64-NEXT: slti $1, $1, 0 +; MIPS64-NEXT: mov.s $f1, $f0 +; MIPS64-NEXT: movz.s $f1, $f12, $1 +; MIPS64-NEXT: mtc1 $zero, $f2 +; MIPS64-NEXT: c.eq.s $f0, $f2 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: movt.s $f0, $f1, $fcc0 +; +; MIPS32R2-LABEL: maximum_float_nnan: +; MIPS32R2: 
# %bb.0: +; MIPS32R2-NEXT: mov.s $f0, $f14 +; MIPS32R2-NEXT: c.ole.s $f12, $f14 +; MIPS32R2-NEXT: movf.s $f0, $f12, $fcc0 +; MIPS32R2-NEXT: mfc1 $1, $f12 +; MIPS32R2-NEXT: slti $1, $1, 0 +; MIPS32R2-NEXT: mov.s $f1, $f0 +; MIPS32R2-NEXT: movz.s $f1, $f12, $1 +; MIPS32R2-NEXT: mtc1 $zero, $f2 +; MIPS32R2-NEXT: c.eq.s $f0, $f2 +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: movt.s $f0, $f1, $fcc0 +; +; MIPS32-LABEL: maximum_float_nnan: +; MIPS32: # %bb.0: +; MIPS32-NEXT: mov.s $f0, $f14 +; MIPS32-NEXT: c.ole.s $f12, $f14 +; MIPS32-NEXT: movf.s $f0, $f12, $fcc0 +; MIPS32-NEXT: mfc1 $1, $f12 +; MIPS32-NEXT: slti $1, $1, 0 +; MIPS32-NEXT: mov.s $f1, $f0 +; MIPS32-NEXT: movz.s $f1, $f12, $1 +; MIPS32-NEXT: mtc1 $zero, $f2 +; MIPS32-NEXT: c.eq.s $f0, $f2 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: movt.s $f0, $f1, $fcc0 +; MIPS32R5-LABEL: maximum_float_nnan: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: mov.s $f0, $f14 +; MIPS32R5-NEXT: c.ule.s $f12, $f14 +; MIPS32R5-NEXT: movf.s $f0, $f12, $fcc0 +; MIPS32R5-NEXT: mfc1 $1, $f12 +; MIPS32R5-NEXT: mov.s $f1, $f0 +; MIPS32R5-NEXT: movz.s $f1, $f12, $1 +; MIPS32R5-NEXT: mtc1 $zero, $f2 +; MIPS32R5-NEXT: c.eq.s $f0, $f2 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: movt.s $f0, $f1, $fcc0 + %z = call nnan float @llvm.maximum.f32(float %x, float %y) + ret float %z +} + + +define double @maximum_double(double %x, double %y) { +; MIPS32R6-LABEL: maximum_double: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: max.d $f1, $f12, $f14 +; MIPS32R6-NEXT: cmp.un.d $f0, $f12, $f14 +; MIPS32R6-NEXT: lui $1, %hi($CPI3_0) +; MIPS32R6-NEXT: ldc1 $f2, %lo($CPI3_0)($1) +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: sel.d $f0, $f1, $f2 +; +; MIPS64R2-LABEL: maximum_double: +; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: mov.d $f0, $f13 +; MIPS64R2-NEXT: c.un.d $f13, $f13 +; MIPS64R2-NEXT: movt.d $f12, $f13, $fcc0 +; MIPS64R2-NEXT: c.ole.d $f12, $f13 +; MIPS64R2-NEXT: movf.d $f0, $f12, $fcc0 +; MIPS64R2-NEXT: dmfc1 $1, $f12 +; MIPS64R2-NEXT: slti $1, $1, 0 +; MIPS64R2-NEXT: mov.d $f1, $f0 +; MIPS64R2-NEXT: movz.d $f1, $f12, $1 +; MIPS64R2-NEXT: dmtc1 $zero, $f2 +; MIPS64R2-NEXT: c.eq.d $f0, $f2 +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: movt.d $f0, $f1, $fcc0 +; +; MIPS64-LABEL: maximum_double: +; MIPS64: # %bb.0: +; MIPS64-NEXT: mov.d $f0, $f13 +; MIPS64-NEXT: c.un.d $f13, $f13 +; MIPS64-NEXT: movt.d $f12, $f13, $fcc0 +; MIPS64-NEXT: c.ole.d $f12, $f13 +; MIPS64-NEXT: movf.d $f0, $f12, $fcc0 +; MIPS64-NEXT: dmfc1 $1, $f12 +; MIPS64-NEXT: slti $1, $1, 0 +; MIPS64-NEXT: mov.d $f1, $f0 +; MIPS64-NEXT: movz.d $f1, $f12, $1 +; MIPS64-NEXT: dmtc1 $zero, $f2 +; MIPS64-NEXT: c.eq.d $f0, $f2 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: movt.d $f0, $f1, $fcc0 +; +; MIPS32R2-LABEL: maximum_double: +; MIPS32R2: # %bb.0: +; MIPS32R2-NEXT: mov.d $f0, $f14 +; MIPS32R2-NEXT: c.un.d $f14, $f14 +; MIPS32R2-NEXT: movt.d $f12, $f14, $fcc0 +; MIPS32R2-NEXT: c.ole.d $f12, $f14 +; MIPS32R2-NEXT: movf.d $f0, $f12, $fcc0 +; MIPS32R2-NEXT: cvt.s.d $f2, $f12 +; MIPS32R2-NEXT: mfc1 $1, $f2 +; MIPS32R2-NEXT: slti $1, $1, 0 +; MIPS32R2-NEXT: mov.d $f2, $f0 +; MIPS32R2-NEXT: movz.d $f2, $f12, $1 +; MIPS32R2-NEXT: mtc1 $zero, $f4 +; MIPS32R2-NEXT: mthc1 $zero, $f4 +; MIPS32R2-NEXT: c.eq.d $f0, $f4 +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: movt.d $f0, $f2, $fcc0 +; +; MIPS32-LABEL: maximum_double: +; MIPS32: # %bb.0: +; MIPS32-NEXT: mov.d $f0, $f14 +; MIPS32-NEXT: c.un.d $f14, $f14 +; MIPS32-NEXT: movt.d $f12, $f14, $fcc0 +; MIPS32-NEXT: c.ole.d $f12, $f14 +; MIPS32-NEXT: movf.d $f0, $f12, $fcc0 +; MIPS32-NEXT: cvt.s.d $f2, $f12 +; MIPS32-NEXT: 
mfc1 $1, $f2 +; MIPS32-NEXT: slti $1, $1, 0 +; MIPS32-NEXT: mov.d $f2, $f0 +; MIPS32-NEXT: movz.d $f2, $f12, $1 +; MIPS32-NEXT: mtc1 $zero, $f4 +; MIPS32-NEXT: mtc1 $zero, $f5 +; MIPS32-NEXT: c.eq.d $f0, $f4 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: movt.d $f0, $f2, $fcc0 +; MIPS32R5-LABEL: maximum_double: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: mov.d $f0, $f14 +; MIPS32R5-NEXT: c.un.d $f12, $f12 +; MIPS32R5-NEXT: movt.d $f12, $f14, $fcc0 +; MIPS32R5-NEXT: c.un.d $f14, $f14 +; MIPS32R5-NEXT: movt.d $f0, $f12, $fcc0 +; MIPS32R5-NEXT: c.ule.d $f12, $f0 +; MIPS32R5-NEXT: movf.d $f0, $f12, $fcc0 +; MIPS32R5-NEXT: cvt.s.d $f1, $f12 +; MIPS32R5-NEXT: mfc1 $1, $f1 +; MIPS32R5-NEXT: mov.d $f1, $f0 +; MIPS32R5-NEXT: movz.d $f1, $f12, $1 +; MIPS32R5-NEXT: mtc1 $zero, $f2 +; MIPS32R5-NEXT: mthc1 $zero, $f2 +; MIPS32R5-NEXT: c.eq.d $f0, $f2 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: movt.d $f0, $f1, $fcc0 + %z = call double @llvm.maximum.f64(double %x, double %y) + ret double %z +} + +define double @maximum_double_nsz(double %x, double %y) { +; MIPS32R6-LABEL: maximum_double_nsz: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: max.d $f1, $f12, $f14 +; MIPS32R6-NEXT: cmp.un.d $f0, $f12, $f14 +; MIPS32R6-NEXT: lui $1, %hi($CPI4_0) +; MIPS32R6-NEXT: ldc1 $f2, %lo($CPI4_0)($1) +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: sel.d $f0, $f1, $f2 +; +; MIPS64R2-LABEL: maximum_double_nsz: +; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: mov.d $f0, $f13 +; MIPS64R2-NEXT: c.un.d $f13, $f13 +; MIPS64R2-NEXT: movt.d $f12, $f13, $fcc0 +; MIPS64R2-NEXT: c.ole.d $f12, $f13 +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: movf.d $f0, $f12, $fcc0 +; +; MIPS64-LABEL: maximum_double_nsz: +; MIPS64: # %bb.0: +; MIPS64-NEXT: mov.d $f0, $f13 +; MIPS64-NEXT: c.un.d $f13, $f13 +; MIPS64-NEXT: movt.d $f12, $f13, $fcc0 +; MIPS64-NEXT: c.ole.d $f12, $f13 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: movf.d $f0, $f12, $fcc0 +; +; MIPS32R2-LABEL: maximum_double_nsz: +; MIPS32R2: # %bb.0: +; MIPS32R2-NEXT: mov.d $f0, $f14 +; MIPS32R2-NEXT: c.un.d $f14, $f14 +; MIPS32R2-NEXT: movt.d $f12, $f14, $fcc0 +; MIPS32R2-NEXT: c.ole.d $f12, $f14 +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: movf.d $f0, $f12, $fcc0 +; +; MIPS32-LABEL: maximum_double_nsz: +; MIPS32: # %bb.0: +; MIPS32-NEXT: mov.d $f0, $f14 +; MIPS32-NEXT: c.un.d $f14, $f14 +; MIPS32-NEXT: movt.d $f12, $f14, $fcc0 +; MIPS32-NEXT: c.ole.d $f12, $f14 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: movf.d $f0, $f12, $fcc0 +; MIPS32R5-LABEL: maximum_double_nsz: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: mov.d $f0, $f14 +; MIPS32R5-NEXT: c.un.d $f12, $f12 +; MIPS32R5-NEXT: movt.d $f12, $f14, $fcc0 +; MIPS32R5-NEXT: c.un.d $f14, $f14 +; MIPS32R5-NEXT: movt.d $f0, $f12, $fcc0 +; MIPS32R5-NEXT: c.ule.d $f12, $f0 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: movf.d $f0, $f12, $fcc0 + %z = call nsz double @llvm.maximum.f64(double %x, double %y) + ret double %z +} + +define double @maximum_double_nnan(double %x, double %y) { +; MIPS32R6-LABEL: maximum_double_nnan: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: max.d $f0, $f12, $f14 +; +; MIPS64R2-LABEL: maximum_double_nnan: +; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: mov.d $f0, $f13 +; MIPS64R2-NEXT: c.ole.d $f12, $f13 +; MIPS64R2-NEXT: movf.d $f0, $f12, $fcc0 +; MIPS64R2-NEXT: dmfc1 $1, $f12 +; MIPS64R2-NEXT: slti $1, $1, 0 +; MIPS64R2-NEXT: mov.d $f1, $f0 +; MIPS64R2-NEXT: movz.d $f1, $f12, $1 +; MIPS64R2-NEXT: dmtc1 $zero, $f2 +; MIPS64R2-NEXT: c.eq.d $f0, $f2 +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: movt.d $f0, $f1, $fcc0 +; +; MIPS64-LABEL: maximum_double_nnan: +; MIPS64: 
# %bb.0: +; MIPS64-NEXT: mov.d $f0, $f13 +; MIPS64-NEXT: c.ole.d $f12, $f13 +; MIPS64-NEXT: movf.d $f0, $f12, $fcc0 +; MIPS64-NEXT: dmfc1 $1, $f12 +; MIPS64-NEXT: slti $1, $1, 0 +; MIPS64-NEXT: mov.d $f1, $f0 +; MIPS64-NEXT: movz.d $f1, $f12, $1 +; MIPS64-NEXT: dmtc1 $zero, $f2 +; MIPS64-NEXT: c.eq.d $f0, $f2 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: movt.d $f0, $f1, $fcc0 +; +; MIPS32R2-LABEL: maximum_double_nnan: +; MIPS32R2: # %bb.0: +; MIPS32R2-NEXT: mov.d $f0, $f14 +; MIPS32R2-NEXT: c.ole.d $f12, $f14 +; MIPS32R2-NEXT: movf.d $f0, $f12, $fcc0 +; MIPS32R2-NEXT: cvt.s.d $f2, $f12 +; MIPS32R2-NEXT: mfc1 $1, $f2 +; MIPS32R2-NEXT: slti $1, $1, 0 +; MIPS32R2-NEXT: mov.d $f2, $f0 +; MIPS32R2-NEXT: movz.d $f2, $f12, $1 +; MIPS32R2-NEXT: mtc1 $zero, $f4 +; MIPS32R2-NEXT: mthc1 $zero, $f4 +; MIPS32R2-NEXT: c.eq.d $f0, $f4 +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: movt.d $f0, $f2, $fcc0 +; +; MIPS32-LABEL: maximum_double_nnan: +; MIPS32: # %bb.0: +; MIPS32-NEXT: mov.d $f0, $f14 +; MIPS32-NEXT: c.ole.d $f12, $f14 +; MIPS32-NEXT: movf.d $f0, $f12, $fcc0 +; MIPS32-NEXT: cvt.s.d $f2, $f12 +; MIPS32-NEXT: mfc1 $1, $f2 +; MIPS32-NEXT: slti $1, $1, 0 +; MIPS32-NEXT: mov.d $f2, $f0 +; MIPS32-NEXT: movz.d $f2, $f12, $1 +; MIPS32-NEXT: mtc1 $zero, $f4 +; MIPS32-NEXT: mtc1 $zero, $f5 +; MIPS32-NEXT: c.eq.d $f0, $f4 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: movt.d $f0, $f2, $fcc0 +; MIPS32R5-LABEL: maximum_double_nnan: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: mov.d $f0, $f14 +; MIPS32R5-NEXT: c.ule.d $f12, $f14 +; MIPS32R5-NEXT: movf.d $f0, $f12, $fcc0 +; MIPS32R5-NEXT: cvt.s.d $f1, $f12 +; MIPS32R5-NEXT: mfc1 $1, $f1 +; MIPS32R5-NEXT: mov.d $f1, $f0 +; MIPS32R5-NEXT: movz.d $f1, $f12, $1 +; MIPS32R5-NEXT: mtc1 $zero, $f2 +; MIPS32R5-NEXT: mthc1 $zero, $f2 +; MIPS32R5-NEXT: c.eq.d $f0, $f2 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: movt.d $f0, $f1, $fcc0 + %z = call nnan double @llvm.maximum.f64(double %x, double %y) + ret double %z +} + +define float @minimum_float(float %x, float %y) { +; MIPS32R6-LABEL: minimum_float: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: min.s $f1, $f12, $f14 +; MIPS32R6-NEXT: cmp.un.s $f0, $f12, $f14 +; MIPS32R6-NEXT: lui $1, %hi($CPI6_0) +; MIPS32R6-NEXT: lwc1 $f2, %lo($CPI6_0)($1) +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: sel.s $f0, $f1, $f2 +; +; MIPS64R2-LABEL: minimum_float: +; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: mov.s $f0, $f13 +; MIPS64R2-NEXT: c.un.s $f13, $f13 +; MIPS64R2-NEXT: movt.s $f12, $f13, $fcc0 +; MIPS64R2-NEXT: c.ult.s $f12, $f13 +; MIPS64R2-NEXT: movt.s $f0, $f12, $fcc0 +; MIPS64R2-NEXT: mfc1 $1, $f12 +; MIPS64R2-NEXT: slti $1, $1, 0 +; MIPS64R2-NEXT: mov.s $f1, $f0 +; MIPS64R2-NEXT: movn.s $f1, $f12, $1 +; MIPS64R2-NEXT: mtc1 $zero, $f2 +; MIPS64R2-NEXT: c.eq.s $f0, $f2 +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: movt.s $f0, $f1, $fcc0 +; +; MIPS64-LABEL: minimum_float: +; MIPS64: # %bb.0: +; MIPS64-NEXT: mov.s $f0, $f13 +; MIPS64-NEXT: c.un.s $f13, $f13 +; MIPS64-NEXT: movt.s $f12, $f13, $fcc0 +; MIPS64-NEXT: c.ult.s $f12, $f13 +; MIPS64-NEXT: movt.s $f0, $f12, $fcc0 +; MIPS64-NEXT: mfc1 $1, $f12 +; MIPS64-NEXT: slti $1, $1, 0 +; MIPS64-NEXT: mov.s $f1, $f0 +; MIPS64-NEXT: movn.s $f1, $f12, $1 +; MIPS64-NEXT: mtc1 $zero, $f2 +; MIPS64-NEXT: c.eq.s $f0, $f2 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: movt.s $f0, $f1, $fcc0 +; +; MIPS32R2-LABEL: minimum_float: +; MIPS32R2: # %bb.0: +; MIPS32R2-NEXT: mov.s $f0, $f14 +; MIPS32R2-NEXT: c.un.s $f14, $f14 +; MIPS32R2-NEXT: movt.s $f12, $f14, $fcc0 +; MIPS32R2-NEXT: c.ult.s $f12, $f14 +; MIPS32R2-NEXT: movt.s $f0, $f12, $fcc0 +; 
MIPS32R2-NEXT: mfc1 $1, $f12 +; MIPS32R2-NEXT: slti $1, $1, 0 +; MIPS32R2-NEXT: mov.s $f1, $f0 +; MIPS32R2-NEXT: movn.s $f1, $f12, $1 +; MIPS32R2-NEXT: mtc1 $zero, $f2 +; MIPS32R2-NEXT: c.eq.s $f0, $f2 +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: movt.s $f0, $f1, $fcc0 +; +; MIPS32-LABEL: minimum_float: +; MIPS32: # %bb.0: +; MIPS32-NEXT: mov.s $f0, $f14 +; MIPS32-NEXT: c.un.s $f14, $f14 +; MIPS32-NEXT: movt.s $f12, $f14, $fcc0 +; MIPS32-NEXT: c.ult.s $f12, $f14 +; MIPS32-NEXT: movt.s $f0, $f12, $fcc0 +; MIPS32-NEXT: mfc1 $1, $f12 +; MIPS32-NEXT: slti $1, $1, 0 +; MIPS32-NEXT: mov.s $f1, $f0 +; MIPS32-NEXT: movn.s $f1, $f12, $1 +; MIPS32-NEXT: mtc1 $zero, $f2 +; MIPS32-NEXT: c.eq.s $f0, $f2 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: movt.s $f0, $f1, $fcc0 +; MIPS32R5-LABEL: minimum_float: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: mov.s $f0, $f14 +; MIPS32R5-NEXT: c.un.s $f12, $f12 +; MIPS32R5-NEXT: movt.s $f12, $f14, $fcc0 +; MIPS32R5-NEXT: c.un.s $f14, $f14 +; MIPS32R5-NEXT: movt.s $f0, $f12, $fcc0 +; MIPS32R5-NEXT: c.olt.s $f12, $f0 +; MIPS32R5-NEXT: movt.s $f0, $f12, $fcc0 +; MIPS32R5-NEXT: mfc1 $1, $f12 +; MIPS32R5-NEXT: lui $2, 32768 +; MIPS32R5-NEXT: xor $1, $1, $2 +; MIPS32R5-NEXT: mov.s $f1, $f0 +; MIPS32R5-NEXT: movz.s $f1, $f12, $1 +; MIPS32R5-NEXT: mtc1 $zero, $f2 +; MIPS32R5-NEXT: c.eq.s $f0, $f2 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: movt.s $f0, $f1, $fcc0 + %z = call float @llvm.minimum.f32(float %x, float %y) + ret float %z +} + +define float @minimum_float_nsz(float %x, float %y) { +; MIPS32R6-LABEL: minimum_float_nsz: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: min.s $f1, $f12, $f14 +; MIPS32R6-NEXT: cmp.un.s $f0, $f12, $f14 +; MIPS32R6-NEXT: lui $1, %hi($CPI7_0) +; MIPS32R6-NEXT: lwc1 $f2, %lo($CPI7_0)($1) +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: sel.s $f0, $f1, $f2 +; +; MIPS64R2-LABEL: minimum_float_nsz: +; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: mov.s $f0, $f13 +; MIPS64R2-NEXT: c.un.s $f13, $f13 +; MIPS64R2-NEXT: movt.s $f12, $f13, $fcc0 +; MIPS64R2-NEXT: c.ult.s $f12, $f13 +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: movt.s $f0, $f12, $fcc0 +; +; MIPS64-LABEL: minimum_float_nsz: +; MIPS64: # %bb.0: +; MIPS64-NEXT: mov.s $f0, $f13 +; MIPS64-NEXT: c.un.s $f13, $f13 +; MIPS64-NEXT: movt.s $f12, $f13, $fcc0 +; MIPS64-NEXT: c.ult.s $f12, $f13 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: movt.s $f0, $f12, $fcc0 +; +; MIPS32R2-LABEL: minimum_float_nsz: +; MIPS32R2: # %bb.0: +; MIPS32R2-NEXT: mov.s $f0, $f14 +; MIPS32R2-NEXT: c.un.s $f14, $f14 +; MIPS32R2-NEXT: movt.s $f12, $f14, $fcc0 +; MIPS32R2-NEXT: c.ult.s $f12, $f14 +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: movt.s $f0, $f12, $fcc0 +; +; MIPS32-LABEL: minimum_float_nsz: +; MIPS32: # %bb.0: +; MIPS32-NEXT: mov.s $f0, $f14 +; MIPS32-NEXT: c.un.s $f14, $f14 +; MIPS32-NEXT: movt.s $f12, $f14, $fcc0 +; MIPS32-NEXT: c.ult.s $f12, $f14 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: movt.s $f0, $f12, $fcc0 +; MIPS32R5-LABEL: minimum_float_nsz: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: mov.s $f0, $f14 +; MIPS32R5-NEXT: c.un.s $f12, $f12 +; MIPS32R5-NEXT: movt.s $f12, $f14, $fcc0 +; MIPS32R5-NEXT: c.un.s $f14, $f14 +; MIPS32R5-NEXT: movt.s $f0, $f12, $fcc0 +; MIPS32R5-NEXT: c.olt.s $f12, $f0 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: movt.s $f0, $f12, $fcc0 + %z = call nsz float @llvm.minimum.f32(float %x, float %y) + ret float %z +} + +define float @minimum_float_nnan(float %x, float %y) { +; MIPS32R6-LABEL: minimum_float_nnan: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: min.s $f0, $f12, $f14 +; +; MIPS64R2-LABEL: minimum_float_nnan: 
+; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: mov.s $f0, $f13 +; MIPS64R2-NEXT: c.ult.s $f12, $f13 +; MIPS64R2-NEXT: movt.s $f0, $f12, $fcc0 +; MIPS64R2-NEXT: mfc1 $1, $f12 +; MIPS64R2-NEXT: slti $1, $1, 0 +; MIPS64R2-NEXT: mov.s $f1, $f0 +; MIPS64R2-NEXT: movn.s $f1, $f12, $1 +; MIPS64R2-NEXT: mtc1 $zero, $f2 +; MIPS64R2-NEXT: c.eq.s $f0, $f2 +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: movt.s $f0, $f1, $fcc0 +; +; MIPS64-LABEL: minimum_float_nnan: +; MIPS64: # %bb.0: +; MIPS64-NEXT: mov.s $f0, $f13 +; MIPS64-NEXT: c.ult.s $f12, $f13 +; MIPS64-NEXT: movt.s $f0, $f12, $fcc0 +; MIPS64-NEXT: mfc1 $1, $f12 +; MIPS64-NEXT: slti $1, $1, 0 +; MIPS64-NEXT: mov.s $f1, $f0 +; MIPS64-NEXT: movn.s $f1, $f12, $1 +; MIPS64-NEXT: mtc1 $zero, $f2 +; MIPS64-NEXT: c.eq.s $f0, $f2 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: movt.s $f0, $f1, $fcc0 +; +; MIPS32R2-LABEL: minimum_float_nnan: +; MIPS32R2: # %bb.0: +; MIPS32R2-NEXT: mov.s $f0, $f14 +; MIPS32R2-NEXT: c.ult.s $f12, $f14 +; MIPS32R2-NEXT: movt.s $f0, $f12, $fcc0 +; MIPS32R2-NEXT: mfc1 $1, $f12 +; MIPS32R2-NEXT: slti $1, $1, 0 +; MIPS32R2-NEXT: mov.s $f1, $f0 +; MIPS32R2-NEXT: movn.s $f1, $f12, $1 +; MIPS32R2-NEXT: mtc1 $zero, $f2 +; MIPS32R2-NEXT: c.eq.s $f0, $f2 +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: movt.s $f0, $f1, $fcc0 +; +; MIPS32-LABEL: minimum_float_nnan: +; MIPS32: # %bb.0: +; MIPS32-NEXT: mov.s $f0, $f14 +; MIPS32-NEXT: c.ult.s $f12, $f14 +; MIPS32-NEXT: movt.s $f0, $f12, $fcc0 +; MIPS32-NEXT: mfc1 $1, $f12 +; MIPS32-NEXT: slti $1, $1, 0 +; MIPS32-NEXT: mov.s $f1, $f0 +; MIPS32-NEXT: movn.s $f1, $f12, $1 +; MIPS32-NEXT: mtc1 $zero, $f2 +; MIPS32-NEXT: c.eq.s $f0, $f2 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: movt.s $f0, $f1, $fcc0 +; MIPS32R5-LABEL: minimum_float_nnan: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: mov.s $f0, $f14 +; MIPS32R5-NEXT: c.olt.s $f12, $f14 +; MIPS32R5-NEXT: movt.s $f0, $f12, $fcc0 +; MIPS32R5-NEXT: mfc1 $1, $f12 +; MIPS32R5-NEXT: lui $2, 32768 +; MIPS32R5-NEXT: xor $1, $1, $2 +; MIPS32R5-NEXT: mov.s $f1, $f0 +; MIPS32R5-NEXT: movz.s $f1, $f12, $1 +; MIPS32R5-NEXT: mtc1 $zero, $f2 +; MIPS32R5-NEXT: c.eq.s $f0, $f2 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: movt.s $f0, $f1, $fcc0 + %z = call nnan float @llvm.minimum.f32(float %x, float %y) + ret float %z +} + +define double @minimum_double(double %x, double %y) { +; MIPS32R6-LABEL: minimum_double: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: min.d $f1, $f12, $f14 +; MIPS32R6-NEXT: cmp.un.d $f0, $f12, $f14 +; MIPS32R6-NEXT: lui $1, %hi($CPI9_0) +; MIPS32R6-NEXT: ldc1 $f2, %lo($CPI9_0)($1) +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: sel.d $f0, $f1, $f2 +; +; MIPS64R2-LABEL: minimum_double: +; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: mov.d $f0, $f13 +; MIPS64R2-NEXT: c.un.d $f13, $f13 +; MIPS64R2-NEXT: movt.d $f12, $f13, $fcc0 +; MIPS64R2-NEXT: c.ult.d $f12, $f13 +; MIPS64R2-NEXT: movt.d $f0, $f12, $fcc0 +; MIPS64R2-NEXT: dmfc1 $1, $f12 +; MIPS64R2-NEXT: slti $1, $1, 0 +; MIPS64R2-NEXT: mov.d $f1, $f0 +; MIPS64R2-NEXT: movn.d $f1, $f12, $1 +; MIPS64R2-NEXT: dmtc1 $zero, $f2 +; MIPS64R2-NEXT: c.eq.d $f0, $f2 +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: movt.d $f0, $f1, $fcc0 +; +; MIPS64-LABEL: minimum_double: +; MIPS64: # %bb.0: +; MIPS64-NEXT: mov.d $f0, $f13 +; MIPS64-NEXT: c.un.d $f13, $f13 +; MIPS64-NEXT: movt.d $f12, $f13, $fcc0 +; MIPS64-NEXT: c.ult.d $f12, $f13 +; MIPS64-NEXT: movt.d $f0, $f12, $fcc0 +; MIPS64-NEXT: dmfc1 $1, $f12 +; MIPS64-NEXT: slti $1, $1, 0 +; MIPS64-NEXT: mov.d $f1, $f0 +; MIPS64-NEXT: movn.d $f1, $f12, $1 +; MIPS64-NEXT: dmtc1 $zero, $f2 +; MIPS64-NEXT: c.eq.d $f0, 
$f2 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: movt.d $f0, $f1, $fcc0 +; +; MIPS32R2-LABEL: minimum_double: +; MIPS32R2: # %bb.0: +; MIPS32R2-NEXT: mov.d $f0, $f14 +; MIPS32R2-NEXT: c.un.d $f14, $f14 +; MIPS32R2-NEXT: movt.d $f12, $f14, $fcc0 +; MIPS32R2-NEXT: c.ult.d $f12, $f14 +; MIPS32R2-NEXT: movt.d $f0, $f12, $fcc0 +; MIPS32R2-NEXT: cvt.s.d $f2, $f12 +; MIPS32R2-NEXT: mfc1 $1, $f2 +; MIPS32R2-NEXT: slti $1, $1, 0 +; MIPS32R2-NEXT: mov.d $f2, $f0 +; MIPS32R2-NEXT: movn.d $f2, $f12, $1 +; MIPS32R2-NEXT: mtc1 $zero, $f4 +; MIPS32R2-NEXT: mthc1 $zero, $f4 +; MIPS32R2-NEXT: c.eq.d $f0, $f4 +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: movt.d $f0, $f2, $fcc0 +; +; MIPS32-LABEL: minimum_double: +; MIPS32: # %bb.0: +; MIPS32-NEXT: mov.d $f0, $f14 +; MIPS32-NEXT: c.un.d $f14, $f14 +; MIPS32-NEXT: movt.d $f12, $f14, $fcc0 +; MIPS32-NEXT: c.ult.d $f12, $f14 +; MIPS32-NEXT: movt.d $f0, $f12, $fcc0 +; MIPS32-NEXT: cvt.s.d $f2, $f12 +; MIPS32-NEXT: mfc1 $1, $f2 +; MIPS32-NEXT: slti $1, $1, 0 +; MIPS32-NEXT: mov.d $f2, $f0 +; MIPS32-NEXT: movn.d $f2, $f12, $1 +; MIPS32-NEXT: mtc1 $zero, $f4 +; MIPS32-NEXT: mtc1 $zero, $f5 +; MIPS32-NEXT: c.eq.d $f0, $f4 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: movt.d $f0, $f2, $fcc0 +; MIPS32R5-LABEL: minimum_double: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: mov.d $f0, $f14 +; MIPS32R5-NEXT: c.un.d $f12, $f12 +; MIPS32R5-NEXT: movt.d $f12, $f14, $fcc0 +; MIPS32R5-NEXT: c.un.d $f14, $f14 +; MIPS32R5-NEXT: movt.d $f0, $f12, $fcc0 +; MIPS32R5-NEXT: c.olt.d $f12, $f0 +; MIPS32R5-NEXT: movt.d $f0, $f12, $fcc0 +; MIPS32R5-NEXT: cvt.s.d $f1, $f12 +; MIPS32R5-NEXT: mfc1 $1, $f1 +; MIPS32R5-NEXT: lui $2, 32768 +; MIPS32R5-NEXT: xor $1, $1, $2 +; MIPS32R5-NEXT: mov.d $f1, $f0 +; MIPS32R5-NEXT: movz.d $f1, $f12, $1 +; MIPS32R5-NEXT: mtc1 $zero, $f2 +; MIPS32R5-NEXT: mthc1 $zero, $f2 +; MIPS32R5-NEXT: c.eq.d $f0, $f2 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: movt.d $f0, $f1, $fcc0 + %z = call double @llvm.minimum.f64(double %x, double %y) + ret double %z +} + +define double @minimum_double_nsz(double %x, double %y) { +; MIPS32R6-LABEL: minimum_double_nsz: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: min.d $f1, $f12, $f14 +; MIPS32R6-NEXT: cmp.un.d $f0, $f12, $f14 +; MIPS32R6-NEXT: lui $1, %hi($CPI10_0) +; MIPS32R6-NEXT: ldc1 $f2, %lo($CPI10_0)($1) +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: sel.d $f0, $f1, $f2 +; +; MIPS64R2-LABEL: minimum_double_nsz: +; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: mov.d $f0, $f13 +; MIPS64R2-NEXT: c.un.d $f13, $f13 +; MIPS64R2-NEXT: movt.d $f12, $f13, $fcc0 +; MIPS64R2-NEXT: c.ult.d $f12, $f13 +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: movt.d $f0, $f12, $fcc0 +; +; MIPS64-LABEL: minimum_double_nsz: +; MIPS64: # %bb.0: +; MIPS64-NEXT: mov.d $f0, $f13 +; MIPS64-NEXT: c.un.d $f13, $f13 +; MIPS64-NEXT: movt.d $f12, $f13, $fcc0 +; MIPS64-NEXT: c.ult.d $f12, $f13 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: movt.d $f0, $f12, $fcc0 +; +; MIPS32R2-LABEL: minimum_double_nsz: +; MIPS32R2: # %bb.0: +; MIPS32R2-NEXT: mov.d $f0, $f14 +; MIPS32R2-NEXT: c.un.d $f14, $f14 +; MIPS32R2-NEXT: movt.d $f12, $f14, $fcc0 +; MIPS32R2-NEXT: c.ult.d $f12, $f14 +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: movt.d $f0, $f12, $fcc0 +; +; MIPS32-LABEL: minimum_double_nsz: +; MIPS32: # %bb.0: +; MIPS32-NEXT: mov.d $f0, $f14 +; MIPS32-NEXT: c.un.d $f14, $f14 +; MIPS32-NEXT: movt.d $f12, $f14, $fcc0 +; MIPS32-NEXT: c.ult.d $f12, $f14 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: movt.d $f0, $f12, $fcc0 +; MIPS32R5-LABEL: minimum_double_nsz: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: mov.d $f0, $f14 +; MIPS32R5-NEXT: 
c.un.d $f12, $f12 +; MIPS32R5-NEXT: movt.d $f12, $f14, $fcc0 +; MIPS32R5-NEXT: c.un.d $f14, $f14 +; MIPS32R5-NEXT: movt.d $f0, $f12, $fcc0 +; MIPS32R5-NEXT: c.olt.d $f12, $f0 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: movt.d $f0, $f12, $fcc0 + %z = call nsz double @llvm.minimum.f64(double %x, double %y) + ret double %z +} + +define double @minimum_double_nnan(double %x, double %y) { +; MIPS32R6-LABEL: minimum_double_nnan: +; MIPS32R6: # %bb.0: +; MIPS32R6-NEXT: jr $ra +; MIPS32R6-NEXT: min.d $f0, $f12, $f14 +; +; MIPS64R2-LABEL: minimum_double_nnan: +; MIPS64R2: # %bb.0: +; MIPS64R2-NEXT: mov.d $f0, $f13 +; MIPS64R2-NEXT: c.ult.d $f12, $f13 +; MIPS64R2-NEXT: movt.d $f0, $f12, $fcc0 +; MIPS64R2-NEXT: dmfc1 $1, $f12 +; MIPS64R2-NEXT: slti $1, $1, 0 +; MIPS64R2-NEXT: mov.d $f1, $f0 +; MIPS64R2-NEXT: movn.d $f1, $f12, $1 +; MIPS64R2-NEXT: dmtc1 $zero, $f2 +; MIPS64R2-NEXT: c.eq.d $f0, $f2 +; MIPS64R2-NEXT: jr $ra +; MIPS64R2-NEXT: movt.d $f0, $f1, $fcc0 +; +; MIPS64-LABEL: minimum_double_nnan: +; MIPS64: # %bb.0: +; MIPS64-NEXT: mov.d $f0, $f13 +; MIPS64-NEXT: c.ult.d $f12, $f13 +; MIPS64-NEXT: movt.d $f0, $f12, $fcc0 +; MIPS64-NEXT: dmfc1 $1, $f12 +; MIPS64-NEXT: slti $1, $1, 0 +; MIPS64-NEXT: mov.d $f1, $f0 +; MIPS64-NEXT: movn.d $f1, $f12, $1 +; MIPS64-NEXT: dmtc1 $zero, $f2 +; MIPS64-NEXT: c.eq.d $f0, $f2 +; MIPS64-NEXT: jr $ra +; MIPS64-NEXT: movt.d $f0, $f1, $fcc0 +; +; MIPS32R2-LABEL: minimum_double_nnan: +; MIPS32R2: # %bb.0: +; MIPS32R2-NEXT: mov.d $f0, $f14 +; MIPS32R2-NEXT: c.ult.d $f12, $f14 +; MIPS32R2-NEXT: movt.d $f0, $f12, $fcc0 +; MIPS32R2-NEXT: cvt.s.d $f2, $f12 +; MIPS32R2-NEXT: mfc1 $1, $f2 +; MIPS32R2-NEXT: slti $1, $1, 0 +; MIPS32R2-NEXT: mov.d $f2, $f0 +; MIPS32R2-NEXT: movn.d $f2, $f12, $1 +; MIPS32R2-NEXT: mtc1 $zero, $f4 +; MIPS32R2-NEXT: mthc1 $zero, $f4 +; MIPS32R2-NEXT: c.eq.d $f0, $f4 +; MIPS32R2-NEXT: jr $ra +; MIPS32R2-NEXT: movt.d $f0, $f2, $fcc0 +; +; MIPS32-LABEL: minimum_double_nnan: +; MIPS32: # %bb.0: +; MIPS32-NEXT: mov.d $f0, $f14 +; MIPS32-NEXT: c.ult.d $f12, $f14 +; MIPS32-NEXT: movt.d $f0, $f12, $fcc0 +; MIPS32-NEXT: cvt.s.d $f2, $f12 +; MIPS32-NEXT: mfc1 $1, $f2 +; MIPS32-NEXT: slti $1, $1, 0 +; MIPS32-NEXT: mov.d $f2, $f0 +; MIPS32-NEXT: movn.d $f2, $f12, $1 +; MIPS32-NEXT: mtc1 $zero, $f4 +; MIPS32-NEXT: mtc1 $zero, $f5 +; MIPS32-NEXT: c.eq.d $f0, $f4 +; MIPS32-NEXT: jr $ra +; MIPS32-NEXT: movt.d $f0, $f2, $fcc0 +; MIPS32R5-LABEL: minimum_double_nnan: +; MIPS32R5: # %bb.0: +; MIPS32R5-NEXT: mov.d $f0, $f14 +; MIPS32R5-NEXT: c.olt.d $f12, $f14 +; MIPS32R5-NEXT: movt.d $f0, $f12, $fcc0 +; MIPS32R5-NEXT: cvt.s.d $f1, $f12 +; MIPS32R5-NEXT: mfc1 $1, $f1 +; MIPS32R5-NEXT: lui $2, 32768 +; MIPS32R5-NEXT: xor $1, $1, $2 +; MIPS32R5-NEXT: mov.d $f1, $f0 +; MIPS32R5-NEXT: movz.d $f1, $f12, $1 +; MIPS32R5-NEXT: mtc1 $zero, $f2 +; MIPS32R5-NEXT: mthc1 $zero, $f2 +; MIPS32R5-NEXT: c.eq.d $f0, $f2 +; MIPS32R5-NEXT: jr $ra +; MIPS32R5-NEXT: movt.d $f0, $f1, $fcc0 + %z = call nnan double @llvm.minimum.f64(double %x, double %y) + ret double %z +} diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index 41f77b5337e6d..979e47cd84f76 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -1558,8 +1558,8 @@ define bfloat @test_roundeven(bfloat %a) { define bfloat @test_maximum(bfloat %a, bfloat %b) { ; SM70-LABEL: test_maximum( ; SM70: { -; SM70-NEXT: .reg .pred %p<6>; -; SM70-NEXT: .reg .b16 %rs<8>; +; SM70-NEXT: .reg .pred %p<5>; +; SM70-NEXT: .reg .b16 
%rs<7>; ; SM70-NEXT: .reg .b32 %r<7>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -1567,21 +1567,19 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) { ; SM70-NEXT: ld.param.b16 %rs2, [test_maximum_param_1]; ; SM70-NEXT: cvt.u32.u16 %r1, %rs2; ; SM70-NEXT: shl.b32 %r2, %r1, 16; -; SM70-NEXT: cvt.u32.u16 %r3, %rs1; +; SM70-NEXT: setp.nan.f32 %p1, %r2, %r2; +; SM70-NEXT: selp.b16 %rs3, %rs2, %rs1, %p1; +; SM70-NEXT: cvt.u32.u16 %r3, %rs3; ; SM70-NEXT: shl.b32 %r4, %r3, 16; -; SM70-NEXT: setp.gt.f32 %p1, %r4, %r2; -; SM70-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; -; SM70-NEXT: setp.nan.f32 %p2, %r4, %r2; -; SM70-NEXT: selp.b16 %rs4, 0x7FC0, %rs3, %p2; -; SM70-NEXT: setp.eq.b16 %p3, %rs1, 0; -; SM70-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3; -; SM70-NEXT: setp.eq.b16 %p4, %rs2, 0; -; SM70-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4; +; SM70-NEXT: setp.gtu.f32 %p2, %r4, %r2; +; SM70-NEXT: selp.b16 %rs4, %rs3, %rs2, %p2; +; SM70-NEXT: setp.gt.s16 %p3, %rs3, -1; +; SM70-NEXT: selp.b16 %rs5, %rs3, %rs4, %p3; ; SM70-NEXT: cvt.u32.u16 %r5, %rs4; ; SM70-NEXT: shl.b32 %r6, %r5, 16; -; SM70-NEXT: setp.eq.f32 %p5, %r6, 0f00000000; -; SM70-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5; -; SM70-NEXT: st.param.b16 [func_retval0], %rs7; +; SM70-NEXT: setp.eq.f32 %p4, %r6, 0f00000000; +; SM70-NEXT: selp.b16 %rs6, %rs5, %rs4, %p4; +; SM70-NEXT: st.param.b16 [func_retval0], %rs6; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_maximum( @@ -1703,46 +1701,44 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) { define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM70-LABEL: test_maximum_v2( ; SM70: { -; SM70-NEXT: .reg .pred %p<11>; +; SM70-NEXT: .reg .pred %p<9>; ; SM70-NEXT: .reg .b16 %rs<15>; -; SM70-NEXT: .reg .b32 %r<13>; +; SM70-NEXT: .reg .b32 %r<14>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: ; SM70-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_maximum_v2_param_0]; ; SM70-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_maximum_v2_param_1]; ; SM70-NEXT: cvt.u32.u16 %r1, %rs4; ; SM70-NEXT: shl.b32 %r2, %r1, 16; -; SM70-NEXT: cvt.u32.u16 %r3, %rs2; +; SM70-NEXT: setp.nan.f32 %p1, %r2, %r2; +; SM70-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; SM70-NEXT: cvt.u32.u16 %r3, %rs5; ; SM70-NEXT: shl.b32 %r4, %r3, 16; -; SM70-NEXT: setp.gt.f32 %p1, %r4, %r2; -; SM70-NEXT: selp.b16 %rs5, %rs2, %rs4, %p1; -; SM70-NEXT: setp.nan.f32 %p2, %r4, %r2; -; SM70-NEXT: selp.b16 %rs6, 0x7FC0, %rs5, %p2; -; SM70-NEXT: setp.eq.b16 %p3, %rs2, 0; -; SM70-NEXT: selp.b16 %rs7, %rs2, %rs6, %p3; -; SM70-NEXT: setp.eq.b16 %p4, %rs4, 0; -; SM70-NEXT: selp.b16 %rs8, %rs4, %rs7, %p4; -; SM70-NEXT: cvt.u32.u16 %r5, %rs6; +; SM70-NEXT: setp.gtu.f32 %p2, %r4, %r2; +; SM70-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; SM70-NEXT: cvt.u32.u16 %r5, %rs3; ; SM70-NEXT: shl.b32 %r6, %r5, 16; -; SM70-NEXT: setp.eq.f32 %p5, %r6, 0f00000000; -; SM70-NEXT: selp.b16 %rs9, %rs8, %rs6, %p5; -; SM70-NEXT: cvt.u32.u16 %r7, %rs3; -; SM70-NEXT: shl.b32 %r8, %r7, 16; -; SM70-NEXT: cvt.u32.u16 %r9, %rs1; -; SM70-NEXT: shl.b32 %r10, %r9, 16; -; SM70-NEXT: setp.gt.f32 %p6, %r10, %r8; -; SM70-NEXT: selp.b16 %rs10, %rs1, %rs3, %p6; -; SM70-NEXT: setp.nan.f32 %p7, %r10, %r8; -; SM70-NEXT: selp.b16 %rs11, 0x7FC0, %rs10, %p7; -; SM70-NEXT: setp.eq.b16 %p8, %rs1, 0; -; SM70-NEXT: selp.b16 %rs12, %rs1, %rs11, %p8; -; SM70-NEXT: setp.eq.b16 %p9, %rs3, 0; -; SM70-NEXT: selp.b16 %rs13, %rs3, %rs12, %p9; -; SM70-NEXT: cvt.u32.u16 %r11, %rs11; -; SM70-NEXT: shl.b32 %r12, %r11, 16; -; SM70-NEXT: setp.eq.f32 %p10, %r12, 0f00000000; -; SM70-NEXT: selp.b16 %rs14, %rs13, %rs11, %p10; -; SM70-NEXT: st.param.v2.b16 
[func_retval0], {%rs14, %rs9}; +; SM70-NEXT: setp.nan.f32 %p3, %r6, %r6; +; SM70-NEXT: selp.b16 %rs7, %rs3, %rs1, %p3; +; SM70-NEXT: mov.b32 %r7, {%rs7, %rs5}; +; SM70-NEXT: mov.b32 {%rs8, %rs9}, %r7; +; SM70-NEXT: setp.gt.s16 %p4, %rs9, -1; +; SM70-NEXT: selp.b16 %rs10, %rs5, %rs6, %p4; +; SM70-NEXT: cvt.u32.u16 %r8, %rs6; +; SM70-NEXT: shl.b32 %r9, %r8, 16; +; SM70-NEXT: setp.eq.f32 %p5, %r9, 0f00000000; +; SM70-NEXT: selp.b16 %rs11, %rs10, %rs6, %p5; +; SM70-NEXT: cvt.u32.u16 %r10, %rs7; +; SM70-NEXT: shl.b32 %r11, %r10, 16; +; SM70-NEXT: setp.gtu.f32 %p6, %r11, %r6; +; SM70-NEXT: selp.b16 %rs12, %rs7, %rs3, %p6; +; SM70-NEXT: setp.gt.s16 %p7, %rs8, -1; +; SM70-NEXT: selp.b16 %rs13, %rs7, %rs12, %p7; +; SM70-NEXT: cvt.u32.u16 %r12, %rs12; +; SM70-NEXT: shl.b32 %r13, %r12, 16; +; SM70-NEXT: setp.eq.f32 %p8, %r13, 0f00000000; +; SM70-NEXT: selp.b16 %rs14, %rs13, %rs12, %p8; +; SM70-NEXT: st.param.v2.b16 [func_retval0], {%rs14, %rs11}; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_maximum_v2( diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll index 1ed296269c521..d41a890b0cd88 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll @@ -616,27 +616,25 @@ define <2 x half> @minnum_v2half(<2 x half> %a, <2 x half> %b) { define half @minimum_half(half %a, half %b) { ; CHECK-NOF16-LABEL: minimum_half( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<6>; -; CHECK-NOF16-NEXT: .reg .b16 %rs<8>; +; CHECK-NOF16-NEXT: .reg .pred %p<5>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; ; CHECK-NOF16-NEXT: .reg .b32 %r<4>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b16 %rs1, [minimum_half_param_0]; ; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [minimum_half_param_1]; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; -; CHECK-NOF16-NEXT: setp.lt.f32 %p1, %r2, %r1; -; CHECK-NOF16-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; -; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r2, %r1; -; CHECK-NOF16-NEXT: selp.b16 %rs4, 0x7E00, %rs3, %p2; -; CHECK-NOF16-NEXT: setp.eq.b16 %p3, %rs1, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3; -; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs2, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4; +; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-NOF16-NEXT: selp.b16 %rs3, %rs2, %rs1, %p1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs3; +; CHECK-NOF16-NEXT: setp.ltu.f32 %p2, %r2, %r1; +; CHECK-NOF16-NEXT: selp.b16 %rs4, %rs3, %rs2, %p2; +; CHECK-NOF16-NEXT: setp.lt.s16 %p3, %rs3, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs3, %rs4, %p3; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %r3, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5; -; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs7; +; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %r3, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p4; +; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs6; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: minimum_half( @@ -652,27 +650,25 @@ define half @minimum_half(half %a, half %b) { ; ; CHECK-SM80-NOF16-LABEL: minimum_half( ; CHECK-SM80-NOF16: { -; CHECK-SM80-NOF16-NEXT: .reg .pred %p<6>; -; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<8>; +; CHECK-SM80-NOF16-NEXT: .reg .pred %p<5>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<7>; ; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: ; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [minimum_half_param_0]; ; 
CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [minimum_half_param_1]; ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; -; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p1, %r2, %r1; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r2, %r1; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs4, 0x7E00, %rs3, %p2; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p3, %rs1, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs2, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs3, %rs2, %rs1, %p1; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs3; +; CHECK-SM80-NOF16-NEXT: setp.ltu.f32 %p2, %r2, %r1; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs4, %rs3, %rs2, %p2; +; CHECK-SM80-NOF16-NEXT: setp.lt.s16 %p3, %rs3, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs3, %rs4, %p3; ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %r3, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5; -; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs7; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p4, %r3, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p4; +; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs6; ; CHECK-SM80-NOF16-NEXT: ret; %x = call half @llvm.minimum.f16(half %a, half %b) ret half %x @@ -681,8 +677,8 @@ define half @minimum_half(half %a, half %b) { define float @minimum_float(float %a, float %b) { ; CHECK-NOF16-LABEL: minimum_float( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<5>; -; CHECK-NOF16-NEXT: .reg .b32 %r<8>; +; CHECK-NOF16-NEXT: .reg .pred %p<2>; +; CHECK-NOF16-NEXT: .reg .b32 %r<5>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [minimum_float_param_0]; @@ -690,13 +686,7 @@ define float @minimum_float(float %a, float %b) { ; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r2; ; CHECK-NOF16-NEXT: min.f32 %r3, %r1, %r2; ; CHECK-NOF16-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p1; -; CHECK-NOF16-NEXT: setp.eq.b32 %p2, %r1, -2147483648; -; CHECK-NOF16-NEXT: selp.f32 %r5, %r1, %r4, %p2; -; CHECK-NOF16-NEXT: setp.eq.b32 %p3, %r2, -2147483648; -; CHECK-NOF16-NEXT: selp.f32 %r6, %r2, %r5, %p3; -; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %r4, 0f00000000; -; CHECK-NOF16-NEXT: selp.f32 %r7, %r6, %r4, %p4; -; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: minimum_float( @@ -727,19 +717,15 @@ define float @minimum_float(float %a, float %b) { define float @minimum_imm1(float %a) { ; CHECK-NOF16-LABEL: minimum_imm1( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<4>; -; CHECK-NOF16-NEXT: .reg .b32 %r<6>; +; CHECK-NOF16-NEXT: .reg .pred %p<2>; +; CHECK-NOF16-NEXT: .reg .b32 %r<4>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [minimum_imm1_param_0]; ; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; ; CHECK-NOF16-NEXT: min.f32 %r2, %r1, 0f00000000; ; CHECK-NOF16-NEXT: selp.f32 %r3, 0f7FC00000, %r2, %p1; -; CHECK-NOF16-NEXT: setp.eq.b32 %p2, %r1, -2147483648; -; CHECK-NOF16-NEXT: selp.f32 %r4, %r1, %r3, %p2; -; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %r3, 0f00000000; -; CHECK-NOF16-NEXT: selp.f32 %r5, %r4, %r3, %p3; -; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r3; ; 
CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: minimum_imm1( @@ -768,19 +754,15 @@ define float @minimum_imm1(float %a) { define float @minimum_imm2(float %a) { ; CHECK-NOF16-LABEL: minimum_imm2( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<4>; -; CHECK-NOF16-NEXT: .reg .b32 %r<6>; +; CHECK-NOF16-NEXT: .reg .pred %p<2>; +; CHECK-NOF16-NEXT: .reg .b32 %r<4>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [minimum_imm2_param_0]; ; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; ; CHECK-NOF16-NEXT: min.f32 %r2, %r1, 0f00000000; ; CHECK-NOF16-NEXT: selp.f32 %r3, 0f7FC00000, %r2, %p1; -; CHECK-NOF16-NEXT: setp.eq.b32 %p2, %r1, -2147483648; -; CHECK-NOF16-NEXT: selp.f32 %r4, %r1, %r3, %p2; -; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %r3, 0f00000000; -; CHECK-NOF16-NEXT: selp.f32 %r5, %r4, %r3, %p3; -; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r5; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: minimum_imm2( @@ -809,8 +791,8 @@ define float @minimum_imm2(float %a) { define float @minimum_float_ftz(float %a, float %b) #1 { ; CHECK-NOF16-LABEL: minimum_float_ftz( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<5>; -; CHECK-NOF16-NEXT: .reg .b32 %r<8>; +; CHECK-NOF16-NEXT: .reg .pred %p<2>; +; CHECK-NOF16-NEXT: .reg .b32 %r<5>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [minimum_float_ftz_param_0]; @@ -818,13 +800,7 @@ define float @minimum_float_ftz(float %a, float %b) #1 { ; CHECK-NOF16-NEXT: setp.nan.ftz.f32 %p1, %r1, %r2; ; CHECK-NOF16-NEXT: min.ftz.f32 %r3, %r1, %r2; ; CHECK-NOF16-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p1; -; CHECK-NOF16-NEXT: setp.eq.b32 %p2, %r1, -2147483648; -; CHECK-NOF16-NEXT: selp.f32 %r5, %r1, %r4, %p2; -; CHECK-NOF16-NEXT: setp.eq.b32 %p3, %r2, -2147483648; -; CHECK-NOF16-NEXT: selp.f32 %r6, %r2, %r5, %p3; -; CHECK-NOF16-NEXT: setp.eq.ftz.f32 %p4, %r4, 0f00000000; -; CHECK-NOF16-NEXT: selp.f32 %r7, %r6, %r4, %p4; -; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: minimum_float_ftz( @@ -855,8 +831,8 @@ define float @minimum_float_ftz(float %a, float %b) #1 { define double @minimum_double(double %a, double %b) { ; CHECK-LABEL: minimum_double( ; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [minimum_double_param_0]; @@ -864,13 +840,7 @@ define double @minimum_double(double %a, double %b) { ; CHECK-NEXT: setp.nan.f64 %p1, %rd1, %rd2; ; CHECK-NEXT: min.f64 %rd3, %rd1, %rd2; ; CHECK-NEXT: selp.f64 %rd4, 0d7FF8000000000000, %rd3, %p1; -; CHECK-NEXT: setp.eq.b64 %p2, %rd1, -9223372036854775808; -; CHECK-NEXT: selp.f64 %rd5, %rd1, %rd4, %p2; -; CHECK-NEXT: setp.eq.b64 %p3, %rd2, -9223372036854775808; -; CHECK-NEXT: selp.f64 %rd6, %rd2, %rd5, %p3; -; CHECK-NEXT: setp.eq.f64 %p4, %rd4, 0d0000000000000000; -; CHECK-NEXT: selp.f64 %rd7, %rd6, %rd4, %p4; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd7; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-NEXT: ret; %x = call double @llvm.minimum.f64(double %a, double %b) ret double %x @@ -879,40 +849,38 @@ define double @minimum_double(double %a, double %b) { define <2 x half> @minimum_v2half(<2 x half> %a, <2 x half> %b) { ; CHECK-NOF16-LABEL: minimum_v2half( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<11>; 
+; CHECK-NOF16-NEXT: .reg .pred %p<9>; ; CHECK-NOF16-NEXT: .reg .b16 %rs<15>; -; CHECK-NOF16-NEXT: .reg .b32 %r<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimum_v2half_param_0]; ; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimum_v2half_param_1]; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; -; CHECK-NOF16-NEXT: setp.lt.f32 %p1, %r2, %r1; -; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs2, %rs4, %p1; -; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r2, %r1; -; CHECK-NOF16-NEXT: selp.b16 %rs6, 0x7E00, %rs5, %p2; -; CHECK-NOF16-NEXT: setp.eq.b16 %p3, %rs2, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs2, %rs6, %p3; -; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs4, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs4, %rs7, %p4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs6; -; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %r3, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs6, %p5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: setp.lt.f32 %p6, %r5, %r4; -; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs1, %rs3, %p6; -; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r5, %r4; -; CHECK-NOF16-NEXT: selp.b16 %rs11, 0x7E00, %rs10, %p7; -; CHECK-NOF16-NEXT: setp.eq.b16 %p8, %rs1, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs1, %rs11, %p8; -; CHECK-NOF16-NEXT: setp.eq.b16 %p9, %rs3, -32768; -; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs3, %rs12, %p9; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs11; -; CHECK-NOF16-NEXT: setp.eq.f32 %p10, %r6, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs13, %rs11, %p10; -; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs14, %rs9}; +; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-NOF16-NEXT: setp.ltu.f32 %p2, %r2, %r1; +; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs3; +; CHECK-NOF16-NEXT: setp.nan.f32 %p3, %r3, %r3; +; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs3, %rs1, %p3; +; CHECK-NOF16-NEXT: mov.b32 %r4, {%rs7, %rs5}; +; CHECK-NOF16-NEXT: mov.b32 {%rs8, %rs9}, %r4; +; CHECK-NOF16-NEXT: setp.lt.s16 %p4, %rs9, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs5, %rs6, %p4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs6; +; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %r5, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs10, %rs6, %p5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs7; +; CHECK-NOF16-NEXT: setp.ltu.f32 %p6, %r6, %r3; +; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs7, %rs3, %p6; +; CHECK-NOF16-NEXT: setp.lt.s16 %p7, %rs8, 0; +; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs7, %rs12, %p7; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs12; +; CHECK-NOF16-NEXT: setp.eq.f32 %p8, %r7, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs13, %rs12, %p8; +; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs14, %rs11}; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: minimum_v2half( @@ -928,40 +896,38 @@ define <2 x half> @minimum_v2half(<2 x half> %a, <2 x half> %b) { ; ; CHECK-SM80-NOF16-LABEL: minimum_v2half( ; CHECK-SM80-NOF16: { -; CHECK-SM80-NOF16-NEXT: .reg .pred %p<11>; +; CHECK-SM80-NOF16-NEXT: .reg .pred %p<9>; ; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<15>; -; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<7>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: ; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [minimum_v2half_param_0]; ; 
CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [minimum_v2half_param_1]; ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs4; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; -; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p1, %r2, %r1; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs2, %rs4, %p1; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r2, %r1; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, 0x7E00, %rs5, %p2; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p3, %rs2, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs2, %rs6, %p3; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs4, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs4, %rs7, %p4; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs6; -; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %r3, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs6, %p5; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs3; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-SM80-NOF16-NEXT: setp.lt.f32 %p6, %r5, %r4; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs1, %rs3, %p6; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r5, %r4; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, 0x7E00, %rs10, %p7; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p8, %rs1, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs1, %rs11, %p8; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p9, %rs3, -32768; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs3, %rs12, %p9; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs11; -; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p10, %r6, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs13, %rs11, %p10; -; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs14, %rs9}; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-SM80-NOF16-NEXT: setp.ltu.f32 %p2, %r2, %r1; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs3; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p3, %r3, %r3; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs3, %rs1, %p3; +; CHECK-SM80-NOF16-NEXT: mov.b32 %r4, {%rs7, %rs5}; +; CHECK-SM80-NOF16-NEXT: mov.b32 {%rs8, %rs9}, %r4; +; CHECK-SM80-NOF16-NEXT: setp.lt.s16 %p4, %rs9, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs5, %rs6, %p4; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs6; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %r5, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs10, %rs6, %p5; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs7; +; CHECK-SM80-NOF16-NEXT: setp.ltu.f32 %p6, %r6, %r3; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs7, %rs3, %p6; +; CHECK-SM80-NOF16-NEXT: setp.lt.s16 %p7, %rs8, 0; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs7, %rs12, %p7; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs12; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p8, %r7, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs13, %rs12, %p8; +; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs14, %rs11}; ; CHECK-SM80-NOF16-NEXT: ret; %x = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %x @@ -1147,27 +1113,25 @@ define <2 x half> @maxnum_v2half(<2 x half> %a, <2 x half> %b) { define half @maximum_half(half %a, half %b) { ; CHECK-NOF16-LABEL: maximum_half( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<6>; -; CHECK-NOF16-NEXT: .reg .b16 %rs<8>; +; CHECK-NOF16-NEXT: .reg .pred %p<5>; +; CHECK-NOF16-NEXT: .reg .b16 %rs<7>; ; CHECK-NOF16-NEXT: .reg .b32 %r<4>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b16 %rs1, 
[maximum_half_param_0]; ; CHECK-NOF16-NEXT: ld.param.b16 %rs2, [maximum_half_param_1]; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; -; CHECK-NOF16-NEXT: setp.gt.f32 %p1, %r2, %r1; -; CHECK-NOF16-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; -; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r2, %r1; -; CHECK-NOF16-NEXT: selp.b16 %rs4, 0x7E00, %rs3, %p2; -; CHECK-NOF16-NEXT: setp.eq.b16 %p3, %rs1, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3; -; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs2, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4; +; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-NOF16-NEXT: selp.b16 %rs3, %rs2, %rs1, %p1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs3; +; CHECK-NOF16-NEXT: setp.gtu.f32 %p2, %r2, %r1; +; CHECK-NOF16-NEXT: selp.b16 %rs4, %rs3, %rs2, %p2; +; CHECK-NOF16-NEXT: setp.gt.s16 %p3, %rs3, -1; +; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs3, %rs4, %p3; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %r3, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5; -; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs7; +; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %r3, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p4; +; CHECK-NOF16-NEXT: st.param.b16 [func_retval0], %rs6; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: maximum_half( @@ -1183,27 +1147,25 @@ define half @maximum_half(half %a, half %b) { ; ; CHECK-SM80-NOF16-LABEL: maximum_half( ; CHECK-SM80-NOF16: { -; CHECK-SM80-NOF16-NEXT: .reg .pred %p<6>; -; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<8>; +; CHECK-SM80-NOF16-NEXT: .reg .pred %p<5>; +; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<7>; ; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<4>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: ; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs1, [maximum_half_param_0]; ; CHECK-SM80-NOF16-NEXT: ld.param.b16 %rs2, [maximum_half_param_1]; ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs2; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs1; -; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p1, %r2, %r1; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r2, %r1; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs4, 0x7E00, %rs3, %p2; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p3, %rs1, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs1, %rs4, %p3; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs2, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs2, %rs5, %p4; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs3, %rs2, %rs1, %p1; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs3; +; CHECK-SM80-NOF16-NEXT: setp.gtu.f32 %p2, %r2, %r1; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs4, %rs3, %rs2, %p2; +; CHECK-SM80-NOF16-NEXT: setp.gt.s16 %p3, %rs3, -1; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs3, %rs4, %p3; ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %r3, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs6, %rs4, %p5; -; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs7; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p4, %r3, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p4; +; CHECK-SM80-NOF16-NEXT: st.param.b16 [func_retval0], %rs6; ; CHECK-SM80-NOF16-NEXT: ret; %x = call half @llvm.maximum.f16(half %a, half %b) ret half %x @@ -1212,17 +1174,15 @@ define half @maximum_half(half %a, half %b) { define float @maximum_imm1(float %a) { ; CHECK-NOF16-LABEL: maximum_imm1( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<3>; -; CHECK-NOF16-NEXT: 
.reg .b32 %r<5>; +; CHECK-NOF16-NEXT: .reg .pred %p<2>; +; CHECK-NOF16-NEXT: .reg .b32 %r<4>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [maximum_imm1_param_0]; ; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; ; CHECK-NOF16-NEXT: max.f32 %r2, %r1, 0f00000000; ; CHECK-NOF16-NEXT: selp.f32 %r3, 0f7FC00000, %r2, %p1; -; CHECK-NOF16-NEXT: setp.eq.f32 %p2, %r3, 0f00000000; -; CHECK-NOF16-NEXT: selp.f32 %r4, 0f00000000, %r3, %p2; -; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: maximum_imm1( @@ -1251,17 +1211,15 @@ define float @maximum_imm1(float %a) { define float @maximum_imm2(float %a) { ; CHECK-NOF16-LABEL: maximum_imm2( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<3>; -; CHECK-NOF16-NEXT: .reg .b32 %r<5>; +; CHECK-NOF16-NEXT: .reg .pred %p<2>; +; CHECK-NOF16-NEXT: .reg .b32 %r<4>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [maximum_imm2_param_0]; ; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; ; CHECK-NOF16-NEXT: max.f32 %r2, %r1, 0f00000000; ; CHECK-NOF16-NEXT: selp.f32 %r3, 0f7FC00000, %r2, %p1; -; CHECK-NOF16-NEXT: setp.eq.f32 %p2, %r3, 0f00000000; -; CHECK-NOF16-NEXT: selp.f32 %r4, 0f00000000, %r3, %p2; -; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: maximum_imm2( @@ -1290,8 +1248,8 @@ define float @maximum_imm2(float %a) { define float @maximum_float(float %a, float %b) { ; CHECK-NOF16-LABEL: maximum_float( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<5>; -; CHECK-NOF16-NEXT: .reg .b32 %r<8>; +; CHECK-NOF16-NEXT: .reg .pred %p<2>; +; CHECK-NOF16-NEXT: .reg .b32 %r<5>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [maximum_float_param_0]; @@ -1299,13 +1257,7 @@ define float @maximum_float(float %a, float %b) { ; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r2; ; CHECK-NOF16-NEXT: max.f32 %r3, %r1, %r2; ; CHECK-NOF16-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p1; -; CHECK-NOF16-NEXT: setp.eq.b32 %p2, %r1, 0; -; CHECK-NOF16-NEXT: selp.f32 %r5, %r1, %r4, %p2; -; CHECK-NOF16-NEXT: setp.eq.b32 %p3, %r2, 0; -; CHECK-NOF16-NEXT: selp.f32 %r6, %r2, %r5, %p3; -; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %r4, 0f00000000; -; CHECK-NOF16-NEXT: selp.f32 %r7, %r6, %r4, %p4; -; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: maximum_float( @@ -1336,8 +1288,8 @@ define float @maximum_float(float %a, float %b) { define float @maximum_float_ftz(float %a, float %b) #1 { ; CHECK-NOF16-LABEL: maximum_float_ftz( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<5>; -; CHECK-NOF16-NEXT: .reg .b32 %r<8>; +; CHECK-NOF16-NEXT: .reg .pred %p<2>; +; CHECK-NOF16-NEXT: .reg .b32 %r<5>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [maximum_float_ftz_param_0]; @@ -1345,13 +1297,7 @@ define float @maximum_float_ftz(float %a, float %b) #1 { ; CHECK-NOF16-NEXT: setp.nan.ftz.f32 %p1, %r1, %r2; ; CHECK-NOF16-NEXT: max.ftz.f32 %r3, %r1, %r2; ; CHECK-NOF16-NEXT: selp.f32 %r4, 0f7FC00000, %r3, %p1; -; CHECK-NOF16-NEXT: setp.eq.b32 %p2, %r1, 0; -; CHECK-NOF16-NEXT: selp.f32 %r5, %r1, %r4, %p2; -; CHECK-NOF16-NEXT: setp.eq.b32 %p3, %r2, 0; -; CHECK-NOF16-NEXT: selp.f32 %r6, %r2, %r5, %p3; -; CHECK-NOF16-NEXT: setp.eq.ftz.f32 %p4, %r4, 
0f00000000; -; CHECK-NOF16-NEXT: selp.f32 %r7, %r6, %r4, %p4; -; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r7; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: maximum_float_ftz( @@ -1382,8 +1328,8 @@ define float @maximum_float_ftz(float %a, float %b) #1 { define double @maximum_double(double %a, double %b) { ; CHECK-LABEL: maximum_double( ; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b64 %rd<8>; +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [maximum_double_param_0]; @@ -1391,13 +1337,7 @@ define double @maximum_double(double %a, double %b) { ; CHECK-NEXT: setp.nan.f64 %p1, %rd1, %rd2; ; CHECK-NEXT: max.f64 %rd3, %rd1, %rd2; ; CHECK-NEXT: selp.f64 %rd4, 0d7FF8000000000000, %rd3, %p1; -; CHECK-NEXT: setp.eq.b64 %p2, %rd1, 0; -; CHECK-NEXT: selp.f64 %rd5, %rd1, %rd4, %p2; -; CHECK-NEXT: setp.eq.b64 %p3, %rd2, 0; -; CHECK-NEXT: selp.f64 %rd6, %rd2, %rd5, %p3; -; CHECK-NEXT: setp.eq.f64 %p4, %rd4, 0d0000000000000000; -; CHECK-NEXT: selp.f64 %rd7, %rd6, %rd4, %p4; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd7; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-NEXT: ret; %x = call double @llvm.maximum.f64(double %a, double %b) ret double %x @@ -1406,40 +1346,38 @@ define double @maximum_double(double %a, double %b) { define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) { ; CHECK-NOF16-LABEL: maximum_v2half( ; CHECK-NOF16: { -; CHECK-NOF16-NEXT: .reg .pred %p<11>; +; CHECK-NOF16-NEXT: .reg .pred %p<9>; ; CHECK-NOF16-NEXT: .reg .b16 %rs<15>; -; CHECK-NOF16-NEXT: .reg .b32 %r<7>; +; CHECK-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximum_v2half_param_0]; ; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximum_v2half_param_1]; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r1, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; -; CHECK-NOF16-NEXT: setp.gt.f32 %p1, %r2, %r1; -; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs2, %rs4, %p1; -; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r2, %r1; -; CHECK-NOF16-NEXT: selp.b16 %rs6, 0x7E00, %rs5, %p2; -; CHECK-NOF16-NEXT: setp.eq.b16 %p3, %rs2, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs2, %rs6, %p3; -; CHECK-NOF16-NEXT: setp.eq.b16 %p4, %rs4, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs8, %rs4, %rs7, %p4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs6; -; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %r3, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs6, %p5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: setp.gt.f32 %p6, %r5, %r4; -; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs1, %rs3, %p6; -; CHECK-NOF16-NEXT: setp.nan.f32 %p7, %r5, %r4; -; CHECK-NOF16-NEXT: selp.b16 %rs11, 0x7E00, %rs10, %p7; -; CHECK-NOF16-NEXT: setp.eq.b16 %p8, %rs1, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs1, %rs11, %p8; -; CHECK-NOF16-NEXT: setp.eq.b16 %p9, %rs3, 0; -; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs3, %rs12, %p9; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs11; -; CHECK-NOF16-NEXT: setp.eq.f32 %p10, %r6, 0f00000000; -; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs13, %rs11, %p10; -; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs14, %rs9}; +; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-NOF16-NEXT: setp.gtu.f32 %p2, %r2, %r1; +; CHECK-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; 
CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs3; +; CHECK-NOF16-NEXT: setp.nan.f32 %p3, %r3, %r3; +; CHECK-NOF16-NEXT: selp.b16 %rs7, %rs3, %rs1, %p3; +; CHECK-NOF16-NEXT: mov.b32 %r4, {%rs7, %rs5}; +; CHECK-NOF16-NEXT: mov.b32 {%rs8, %rs9}, %r4; +; CHECK-NOF16-NEXT: setp.gt.s16 %p4, %rs9, -1; +; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs5, %rs6, %p4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs6; +; CHECK-NOF16-NEXT: setp.eq.f32 %p5, %r5, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs11, %rs10, %rs6, %p5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs7; +; CHECK-NOF16-NEXT: setp.gtu.f32 %p6, %r6, %r3; +; CHECK-NOF16-NEXT: selp.b16 %rs12, %rs7, %rs3, %p6; +; CHECK-NOF16-NEXT: setp.gt.s16 %p7, %rs8, -1; +; CHECK-NOF16-NEXT: selp.b16 %rs13, %rs7, %rs12, %p7; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs12; +; CHECK-NOF16-NEXT: setp.eq.f32 %p8, %r7, 0f00000000; +; CHECK-NOF16-NEXT: selp.b16 %rs14, %rs13, %rs12, %p8; +; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs14, %rs11}; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: maximum_v2half( @@ -1455,40 +1393,38 @@ define <2 x half> @maximum_v2half(<2 x half> %a, <2 x half> %b) { ; ; CHECK-SM80-NOF16-LABEL: maximum_v2half( ; CHECK-SM80-NOF16: { -; CHECK-SM80-NOF16-NEXT: .reg .pred %p<11>; +; CHECK-SM80-NOF16-NEXT: .reg .pred %p<9>; ; CHECK-SM80-NOF16-NEXT: .reg .b16 %rs<15>; -; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<7>; +; CHECK-SM80-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: ; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [maximum_v2half_param_0]; ; CHECK-SM80-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [maximum_v2half_param_1]; ; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r1, %rs4; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; -; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p1, %r2, %r1; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs2, %rs4, %p1; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p2, %r2, %r1; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, 0x7E00, %rs5, %p2; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p3, %rs2, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, %rs2, %rs6, %p3; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p4, %rs4, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs8, %rs4, %rs7, %p4; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs6; -; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %r3, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs6, %p5; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r4, %rs3; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-SM80-NOF16-NEXT: setp.gt.f32 %p6, %r5, %r4; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs1, %rs3, %p6; -; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p7, %r5, %r4; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, 0x7E00, %rs10, %p7; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p8, %rs1, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs1, %rs11, %p8; -; CHECK-SM80-NOF16-NEXT: setp.eq.b16 %p9, %rs3, 0; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs3, %rs12, %p9; -; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs11; -; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p10, %r6, 0f00000000; -; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs13, %rs11, %p10; -; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs14, %rs9}; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p1, %r1, %r1; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p1; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r2, %rs5; +; CHECK-SM80-NOF16-NEXT: setp.gtu.f32 %p2, %r2, %r1; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs6, %rs5, %rs4, %p2; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r3, %rs3; +; CHECK-SM80-NOF16-NEXT: setp.nan.f32 %p3, %r3, %r3; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs7, 
%rs3, %rs1, %p3; +; CHECK-SM80-NOF16-NEXT: mov.b32 %r4, {%rs7, %rs5}; +; CHECK-SM80-NOF16-NEXT: mov.b32 {%rs8, %rs9}, %r4; +; CHECK-SM80-NOF16-NEXT: setp.gt.s16 %p4, %rs9, -1; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs10, %rs5, %rs6, %p4; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r5, %rs6; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p5, %r5, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs11, %rs10, %rs6, %p5; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r6, %rs7; +; CHECK-SM80-NOF16-NEXT: setp.gtu.f32 %p6, %r6, %r3; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs12, %rs7, %rs3, %p6; +; CHECK-SM80-NOF16-NEXT: setp.gt.s16 %p7, %rs8, -1; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs13, %rs7, %rs12, %p7; +; CHECK-SM80-NOF16-NEXT: cvt.f32.f16 %r7, %rs12; +; CHECK-SM80-NOF16-NEXT: setp.eq.f32 %p8, %r7, 0f00000000; +; CHECK-SM80-NOF16-NEXT: selp.b16 %rs14, %rs13, %rs12, %p8; +; CHECK-SM80-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs14, %rs11}; ; CHECK-SM80-NOF16-NEXT: ret; %x = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %x diff --git a/llvm/test/CodeGen/PowerPC/fminimum-fmaximum-f128.ll b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum-f128.ll index 6d9eb13376827..17b7f42d8c3ce 100644 --- a/llvm/test/CodeGen/PowerPC/fminimum-fmaximum-f128.ll +++ b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum-f128.ll @@ -4,44 +4,37 @@ define fp128 @f128_minimum(fp128 %a, fp128 %b) { ; CHECK-LABEL: f128_minimum: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xscmpuqp 0, 2, 3 +; CHECK-NEXT: xscmpuqp 0, 3, 3 ; CHECK-NEXT: vmr 4, 2 -; CHECK-NEXT: bge 0, .LBB0_8 +; CHECK-NEXT: vmr 2, 3 +; CHECK-NEXT: bun 0, .LBB0_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bun 0, .LBB0_9 +; CHECK-NEXT: vmr 2, 4 ; CHECK-NEXT: .LBB0_2: # %entry -; CHECK-NEXT: xststdcqp 0, 2, 4 -; CHECK-NEXT: bc 4, 2, .LBB0_10 -; CHECK-NEXT: .LBB0_3: # %entry -; CHECK-NEXT: xststdcqp 0, 3, 4 -; CHECK-NEXT: bc 12, 2, .LBB0_5 -; CHECK-NEXT: .LBB0_4: # %entry -; CHECK-NEXT: vmr 3, 2 -; CHECK-NEXT: .LBB0_5: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l -; CHECK-NEXT: lxv 34, 0(3) -; CHECK-NEXT: xscmpuqp 0, 4, 2 -; CHECK-NEXT: beq 0, .LBB0_7 -; CHECK-NEXT: # %bb.6: # %entry -; CHECK-NEXT: vmr 3, 4 -; CHECK-NEXT: .LBB0_7: # %entry -; CHECK-NEXT: vmr 2, 3 -; CHECK-NEXT: blr -; CHECK-NEXT: .LBB0_8: # %entry +; CHECK-NEXT: xscmpuqp 0, 2, 3 +; CHECK-NEXT: vmr 4, 2 +; CHECK-NEXT: cror 20, 0, 3 +; CHECK-NEXT: bc 12, 20, .LBB0_4 +; CHECK-NEXT: # %bb.3: # %entry ; CHECK-NEXT: vmr 4, 3 -; CHECK-NEXT: bnu 0, .LBB0_2 -; CHECK-NEXT: .LBB0_9: +; CHECK-NEXT: .LBB0_4: # %entry +; CHECK-NEXT: xscvqpdpo 3, 2 +; CHECK-NEXT: xsrsp 0, 35 +; CHECK-NEXT: xscvdpspn 0, 0 +; CHECK-NEXT: mffprwz 3, 0 +; CHECK-NEXT: cmpwi 3, 0 +; CHECK-NEXT: blt 0, .LBB0_6 +; CHECK-NEXT: # %bb.5: # %entry +; CHECK-NEXT: vmr 2, 4 +; CHECK-NEXT: .LBB0_6: # %entry ; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha ; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l -; CHECK-NEXT: lxv 36, 0(3) -; CHECK-NEXT: xststdcqp 0, 2, 4 -; CHECK-NEXT: bc 12, 2, .LBB0_3 -; CHECK-NEXT: .LBB0_10: # %entry +; CHECK-NEXT: lxv 35, 0(3) +; CHECK-NEXT: xscmpuqp 0, 4, 3 +; CHECK-NEXT: beqlr 0 +; CHECK-NEXT: # %bb.7: # %entry ; CHECK-NEXT: vmr 2, 4 -; CHECK-NEXT: xststdcqp 0, 3, 4 -; CHECK-NEXT: bc 4, 2, .LBB0_4 -; CHECK-NEXT: b .LBB0_5 +; CHECK-NEXT: blr entry: %m = call fp128 @llvm.minimum.f128(fp128 %a, fp128 %b) ret fp128 %m @@ -50,44 +43,37 @@ entry: define fp128 @f128_maximum(fp128 %a, fp128 %b) { ; CHECK-LABEL: f128_maximum: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xscmpuqp 0, 2, 3 +; 
CHECK-NEXT: xscmpuqp 0, 3, 3 ; CHECK-NEXT: vmr 4, 2 -; CHECK-NEXT: ble 0, .LBB1_8 +; CHECK-NEXT: vmr 2, 3 +; CHECK-NEXT: bun 0, .LBB1_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bun 0, .LBB1_9 +; CHECK-NEXT: vmr 2, 4 ; CHECK-NEXT: .LBB1_2: # %entry -; CHECK-NEXT: xststdcqp 0, 2, 8 -; CHECK-NEXT: bc 4, 2, .LBB1_10 -; CHECK-NEXT: .LBB1_3: # %entry -; CHECK-NEXT: xststdcqp 0, 3, 8 -; CHECK-NEXT: bc 12, 2, .LBB1_5 -; CHECK-NEXT: .LBB1_4: # %entry -; CHECK-NEXT: vmr 3, 2 -; CHECK-NEXT: .LBB1_5: # %entry -; CHECK-NEXT: addis 3, 2, .LCPI1_1@toc@ha -; CHECK-NEXT: addi 3, 3, .LCPI1_1@toc@l -; CHECK-NEXT: lxv 34, 0(3) -; CHECK-NEXT: xscmpuqp 0, 4, 2 -; CHECK-NEXT: beq 0, .LBB1_7 -; CHECK-NEXT: # %bb.6: # %entry -; CHECK-NEXT: vmr 3, 4 -; CHECK-NEXT: .LBB1_7: # %entry -; CHECK-NEXT: vmr 2, 3 -; CHECK-NEXT: blr -; CHECK-NEXT: .LBB1_8: # %entry +; CHECK-NEXT: xscmpuqp 0, 2, 3 +; CHECK-NEXT: vmr 4, 2 +; CHECK-NEXT: cror 20, 1, 3 +; CHECK-NEXT: bc 12, 20, .LBB1_4 +; CHECK-NEXT: # %bb.3: # %entry ; CHECK-NEXT: vmr 4, 3 -; CHECK-NEXT: bnu 0, .LBB1_2 -; CHECK-NEXT: .LBB1_9: +; CHECK-NEXT: .LBB1_4: # %entry +; CHECK-NEXT: xscvqpdpo 3, 2 +; CHECK-NEXT: xsrsp 0, 35 +; CHECK-NEXT: xscvdpspn 0, 0 +; CHECK-NEXT: mffprwz 3, 0 +; CHECK-NEXT: cmpwi 3, -1 +; CHECK-NEXT: bgt 0, .LBB1_6 +; CHECK-NEXT: # %bb.5: # %entry +; CHECK-NEXT: vmr 2, 4 +; CHECK-NEXT: .LBB1_6: # %entry ; CHECK-NEXT: addis 3, 2, .LCPI1_0@toc@ha ; CHECK-NEXT: addi 3, 3, .LCPI1_0@toc@l -; CHECK-NEXT: lxv 36, 0(3) -; CHECK-NEXT: xststdcqp 0, 2, 8 -; CHECK-NEXT: bc 12, 2, .LBB1_3 -; CHECK-NEXT: .LBB1_10: # %entry +; CHECK-NEXT: lxv 35, 0(3) +; CHECK-NEXT: xscmpuqp 0, 4, 3 +; CHECK-NEXT: beqlr 0 +; CHECK-NEXT: # %bb.7: # %entry ; CHECK-NEXT: vmr 2, 4 -; CHECK-NEXT: xststdcqp 0, 3, 8 -; CHECK-NEXT: bc 4, 2, .LBB1_4 -; CHECK-NEXT: b .LBB1_5 +; CHECK-NEXT: blr entry: %m = call fp128 @llvm.maximum.f128(fp128 %a, fp128 %b) ret fp128 %m diff --git a/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll index 39cf136e10d77..209daa090019e 100644 --- a/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/PowerPC/fminimum-fmaximum.ll @@ -6,41 +6,31 @@ define float @f32_minimum(float %a, float %b) { ; NOVSX-LABEL: f32_minimum: ; NOVSX: # %bb.0: # %entry +; NOVSX-NEXT: fcmpu 0, 2, 2 +; NOVSX-NEXT: fmr 0, 1 +; NOVSX-NEXT: fmr 1, 2 +; NOVSX-NEXT: bc 12, 3, .LBB0_2 +; NOVSX-NEXT: # %bb.1: # %entry +; NOVSX-NEXT: fmr 1, 0 +; NOVSX-NEXT: .LBB0_2: # %entry ; NOVSX-NEXT: fcmpu 0, 1, 2 ; NOVSX-NEXT: fmr 0, 1 -; NOVSX-NEXT: stfs 2, -8(1) ; NOVSX-NEXT: stfs 1, -4(1) -; NOVSX-NEXT: bc 12, 0, .LBB0_2 -; NOVSX-NEXT: # %bb.1: # %entry +; NOVSX-NEXT: cror 20, 0, 3 +; NOVSX-NEXT: bc 12, 20, .LBB0_4 +; NOVSX-NEXT: # %bb.3: # %entry ; NOVSX-NEXT: fmr 0, 2 -; NOVSX-NEXT: .LBB0_2: # %entry -; NOVSX-NEXT: lwz 3, -4(1) -; NOVSX-NEXT: bc 4, 3, .LBB0_4 -; NOVSX-NEXT: # %bb.3: -; NOVSX-NEXT: addis 4, 2, .LCPI0_0@toc@ha -; NOVSX-NEXT: lfs 0, .LCPI0_0@toc@l(4) ; NOVSX-NEXT: .LBB0_4: # %entry -; NOVSX-NEXT: xoris 3, 3, 32768 -; NOVSX-NEXT: lwz 4, -8(1) -; NOVSX-NEXT: cmplwi 3, 0 -; NOVSX-NEXT: bc 12, 2, .LBB0_6 +; NOVSX-NEXT: addis 3, 2, .LCPI0_0@toc@ha +; NOVSX-NEXT: lfs 2, .LCPI0_0@toc@l(3) +; NOVSX-NEXT: lwz 3, -4(1) +; NOVSX-NEXT: fcmpu 0, 0, 2 +; NOVSX-NEXT: bc 4, 2, .LBB0_6 ; NOVSX-NEXT: # %bb.5: # %entry -; NOVSX-NEXT: fmr 1, 0 +; NOVSX-NEXT: cmpwi 3, 0 +; NOVSX-NEXT: bclr 12, 0, 0 ; NOVSX-NEXT: .LBB0_6: # %entry -; NOVSX-NEXT: xoris 3, 4, 32768 -; NOVSX-NEXT: cmplwi 3, 0 -; NOVSX-NEXT: bc 12, 2, .LBB0_8 -; NOVSX-NEXT: # 
%bb.7: # %entry -; NOVSX-NEXT: fmr 2, 1 -; NOVSX-NEXT: .LBB0_8: # %entry -; NOVSX-NEXT: addis 3, 2, .LCPI0_1@toc@ha -; NOVSX-NEXT: lfs 1, .LCPI0_1@toc@l(3) -; NOVSX-NEXT: fcmpu 0, 0, 1 -; NOVSX-NEXT: bc 12, 2, .LBB0_10 -; NOVSX-NEXT: # %bb.9: # %entry -; NOVSX-NEXT: fmr 2, 0 -; NOVSX-NEXT: .LBB0_10: # %entry -; NOVSX-NEXT: fmr 1, 2 +; NOVSX-NEXT: fmr 1, 0 ; NOVSX-NEXT: blr ; ; VSX-LABEL: f32_minimum: @@ -74,39 +64,31 @@ entry: define float @f32_maximum(float %a, float %b) { ; NOVSX-LABEL: f32_maximum: ; NOVSX: # %bb.0: # %entry +; NOVSX-NEXT: fcmpu 0, 2, 2 +; NOVSX-NEXT: fmr 0, 1 +; NOVSX-NEXT: fmr 1, 2 +; NOVSX-NEXT: bc 12, 3, .LBB1_2 +; NOVSX-NEXT: # %bb.1: # %entry +; NOVSX-NEXT: fmr 1, 0 +; NOVSX-NEXT: .LBB1_2: # %entry ; NOVSX-NEXT: fcmpu 0, 1, 2 ; NOVSX-NEXT: fmr 0, 1 -; NOVSX-NEXT: stfs 2, -8(1) ; NOVSX-NEXT: stfs 1, -4(1) -; NOVSX-NEXT: bc 12, 1, .LBB1_2 -; NOVSX-NEXT: # %bb.1: # %entry +; NOVSX-NEXT: cror 20, 1, 3 +; NOVSX-NEXT: bc 12, 20, .LBB1_4 +; NOVSX-NEXT: # %bb.3: # %entry ; NOVSX-NEXT: fmr 0, 2 -; NOVSX-NEXT: .LBB1_2: # %entry -; NOVSX-NEXT: lwz 3, -4(1) -; NOVSX-NEXT: bc 4, 3, .LBB1_4 -; NOVSX-NEXT: # %bb.3: -; NOVSX-NEXT: addis 4, 2, .LCPI1_0@toc@ha -; NOVSX-NEXT: lfs 0, .LCPI1_0@toc@l(4) ; NOVSX-NEXT: .LBB1_4: # %entry -; NOVSX-NEXT: cmpwi 3, 0 -; NOVSX-NEXT: lwz 4, -8(1) -; NOVSX-NEXT: bc 12, 2, .LBB1_6 +; NOVSX-NEXT: addis 3, 2, .LCPI1_0@toc@ha +; NOVSX-NEXT: lfs 2, .LCPI1_0@toc@l(3) +; NOVSX-NEXT: lwz 3, -4(1) +; NOVSX-NEXT: fcmpu 0, 0, 2 +; NOVSX-NEXT: bc 4, 2, .LBB1_6 ; NOVSX-NEXT: # %bb.5: # %entry -; NOVSX-NEXT: fmr 1, 0 +; NOVSX-NEXT: cmpwi 3, -1 +; NOVSX-NEXT: bclr 12, 1, 0 ; NOVSX-NEXT: .LBB1_6: # %entry -; NOVSX-NEXT: cmpwi 4, 0 -; NOVSX-NEXT: bc 12, 2, .LBB1_8 -; NOVSX-NEXT: # %bb.7: # %entry -; NOVSX-NEXT: fmr 2, 1 -; NOVSX-NEXT: .LBB1_8: # %entry -; NOVSX-NEXT: addis 3, 2, .LCPI1_1@toc@ha -; NOVSX-NEXT: lfs 1, .LCPI1_1@toc@l(3) -; NOVSX-NEXT: fcmpu 0, 0, 1 -; NOVSX-NEXT: bc 12, 2, .LBB1_10 -; NOVSX-NEXT: # %bb.9: # %entry -; NOVSX-NEXT: fmr 2, 0 -; NOVSX-NEXT: .LBB1_10: # %entry -; NOVSX-NEXT: fmr 1, 2 +; NOVSX-NEXT: fmr 1, 0 ; NOVSX-NEXT: blr ; ; VSX-LABEL: f32_maximum: @@ -140,41 +122,31 @@ entry: define double @f64_minimum(double %a, double %b) { ; NOVSX-LABEL: f64_minimum: ; NOVSX: # %bb.0: # %entry +; NOVSX-NEXT: fcmpu 0, 2, 2 +; NOVSX-NEXT: fmr 0, 1 +; NOVSX-NEXT: fmr 1, 2 +; NOVSX-NEXT: bc 12, 3, .LBB2_2 +; NOVSX-NEXT: # %bb.1: # %entry +; NOVSX-NEXT: fmr 1, 0 +; NOVSX-NEXT: .LBB2_2: # %entry ; NOVSX-NEXT: fcmpu 0, 1, 2 ; NOVSX-NEXT: fmr 0, 1 -; NOVSX-NEXT: stfd 2, -16(1) ; NOVSX-NEXT: stfd 1, -8(1) -; NOVSX-NEXT: bc 12, 0, .LBB2_2 -; NOVSX-NEXT: # %bb.1: # %entry +; NOVSX-NEXT: cror 20, 0, 3 +; NOVSX-NEXT: bc 12, 20, .LBB2_4 +; NOVSX-NEXT: # %bb.3: # %entry ; NOVSX-NEXT: fmr 0, 2 -; NOVSX-NEXT: .LBB2_2: # %entry -; NOVSX-NEXT: ld 3, -8(1) -; NOVSX-NEXT: bc 4, 3, .LBB2_4 -; NOVSX-NEXT: # %bb.3: -; NOVSX-NEXT: addis 4, 2, .LCPI2_0@toc@ha -; NOVSX-NEXT: lfs 0, .LCPI2_0@toc@l(4) ; NOVSX-NEXT: .LBB2_4: # %entry -; NOVSX-NEXT: li 5, 1 -; NOVSX-NEXT: ld 4, -16(1) -; NOVSX-NEXT: rldic 5, 5, 63, 0 -; NOVSX-NEXT: cmpd 3, 5 -; NOVSX-NEXT: bc 12, 2, .LBB2_6 +; NOVSX-NEXT: addis 3, 2, .LCPI2_0@toc@ha +; NOVSX-NEXT: lfs 2, .LCPI2_0@toc@l(3) +; NOVSX-NEXT: ld 3, -8(1) +; NOVSX-NEXT: fcmpu 0, 0, 2 +; NOVSX-NEXT: bc 4, 2, .LBB2_6 ; NOVSX-NEXT: # %bb.5: # %entry -; NOVSX-NEXT: fmr 1, 0 +; NOVSX-NEXT: cmpdi 3, 0 +; NOVSX-NEXT: bclr 12, 0, 0 ; NOVSX-NEXT: .LBB2_6: # %entry -; NOVSX-NEXT: cmpd 4, 5 -; NOVSX-NEXT: bc 12, 2, .LBB2_8 -; NOVSX-NEXT: # %bb.7: # %entry -; 
NOVSX-NEXT: fmr 2, 1 -; NOVSX-NEXT: .LBB2_8: # %entry -; NOVSX-NEXT: addis 3, 2, .LCPI2_1@toc@ha -; NOVSX-NEXT: lfs 1, .LCPI2_1@toc@l(3) -; NOVSX-NEXT: fcmpu 0, 0, 1 -; NOVSX-NEXT: bc 12, 2, .LBB2_10 -; NOVSX-NEXT: # %bb.9: # %entry -; NOVSX-NEXT: fmr 2, 0 -; NOVSX-NEXT: .LBB2_10: # %entry -; NOVSX-NEXT: fmr 1, 2 +; NOVSX-NEXT: fmr 1, 0 ; NOVSX-NEXT: blr ; ; VSX-LABEL: f64_minimum: @@ -208,39 +180,31 @@ entry: define double @f64_maximum(double %a, double %b) { ; NOVSX-LABEL: f64_maximum: ; NOVSX: # %bb.0: # %entry +; NOVSX-NEXT: fcmpu 0, 2, 2 +; NOVSX-NEXT: fmr 0, 1 +; NOVSX-NEXT: fmr 1, 2 +; NOVSX-NEXT: bc 12, 3, .LBB3_2 +; NOVSX-NEXT: # %bb.1: # %entry +; NOVSX-NEXT: fmr 1, 0 +; NOVSX-NEXT: .LBB3_2: # %entry ; NOVSX-NEXT: fcmpu 0, 1, 2 ; NOVSX-NEXT: fmr 0, 1 -; NOVSX-NEXT: stfd 2, -16(1) ; NOVSX-NEXT: stfd 1, -8(1) -; NOVSX-NEXT: bc 12, 1, .LBB3_2 -; NOVSX-NEXT: # %bb.1: # %entry +; NOVSX-NEXT: cror 20, 1, 3 +; NOVSX-NEXT: bc 12, 20, .LBB3_4 +; NOVSX-NEXT: # %bb.3: # %entry ; NOVSX-NEXT: fmr 0, 2 -; NOVSX-NEXT: .LBB3_2: # %entry -; NOVSX-NEXT: ld 3, -8(1) -; NOVSX-NEXT: bc 4, 3, .LBB3_4 -; NOVSX-NEXT: # %bb.3: -; NOVSX-NEXT: addis 4, 2, .LCPI3_0@toc@ha -; NOVSX-NEXT: lfs 0, .LCPI3_0@toc@l(4) ; NOVSX-NEXT: .LBB3_4: # %entry -; NOVSX-NEXT: cmpdi 3, 0 -; NOVSX-NEXT: ld 4, -16(1) -; NOVSX-NEXT: bc 12, 2, .LBB3_6 +; NOVSX-NEXT: addis 3, 2, .LCPI3_0@toc@ha +; NOVSX-NEXT: lfs 2, .LCPI3_0@toc@l(3) +; NOVSX-NEXT: ld 3, -8(1) +; NOVSX-NEXT: fcmpu 0, 0, 2 +; NOVSX-NEXT: bc 4, 2, .LBB3_6 ; NOVSX-NEXT: # %bb.5: # %entry -; NOVSX-NEXT: fmr 1, 0 +; NOVSX-NEXT: cmpdi 3, -1 +; NOVSX-NEXT: bclr 12, 1, 0 ; NOVSX-NEXT: .LBB3_6: # %entry -; NOVSX-NEXT: cmpdi 4, 0 -; NOVSX-NEXT: bc 12, 2, .LBB3_8 -; NOVSX-NEXT: # %bb.7: # %entry -; NOVSX-NEXT: fmr 2, 1 -; NOVSX-NEXT: .LBB3_8: # %entry -; NOVSX-NEXT: addis 3, 2, .LCPI3_1@toc@ha -; NOVSX-NEXT: lfs 1, .LCPI3_1@toc@l(3) -; NOVSX-NEXT: fcmpu 0, 0, 1 -; NOVSX-NEXT: bc 12, 2, .LBB3_10 -; NOVSX-NEXT: # %bb.9: # %entry -; NOVSX-NEXT: fmr 2, 0 -; NOVSX-NEXT: .LBB3_10: # %entry -; NOVSX-NEXT: fmr 1, 2 +; NOVSX-NEXT: fmr 1, 0 ; NOVSX-NEXT: blr ; ; VSX-LABEL: f64_maximum: @@ -274,25 +238,39 @@ entry: define <4 x float> @v4f32_minimum(<4 x float> %a, <4 x float> %b) { ; NOVSX-LABEL: v4f32_minimum: ; NOVSX: # %bb.0: # %entry -; NOVSX-NEXT: vcmpeqfp 0, 3, 3 -; NOVSX-NEXT: vcmpeqfp 1, 2, 2 -; NOVSX-NEXT: addis 3, 2, .LCPI4_0@toc@ha -; NOVSX-NEXT: addi 3, 3, .LCPI4_0@toc@l -; NOVSX-NEXT: vnot 0, 0 -; NOVSX-NEXT: vnot 1, 1 -; NOVSX-NEXT: vspltisb 4, -1 -; NOVSX-NEXT: vcmpgtfp 5, 3, 2 -; NOVSX-NEXT: vslw 4, 4, 4 -; NOVSX-NEXT: vor 0, 1, 0 -; NOVSX-NEXT: lvx 1, 0, 3 -; NOVSX-NEXT: vsel 5, 3, 2, 5 -; NOVSX-NEXT: vsel 5, 5, 1, 0 -; NOVSX-NEXT: vcmpequw 0, 2, 4 -; NOVSX-NEXT: vcmpequw 4, 3, 4 -; NOVSX-NEXT: vsel 2, 5, 2, 0 +; NOVSX-NEXT: vcmpeqfp 4, 3, 3 +; NOVSX-NEXT: addi 3, 1, -48 +; NOVSX-NEXT: vnot 4, 4 +; NOVSX-NEXT: stvx 3, 0, 3 +; NOVSX-NEXT: addi 3, 1, -32 ; NOVSX-NEXT: vsel 2, 2, 3, 4 ; NOVSX-NEXT: vxor 3, 3, 3 +; NOVSX-NEXT: stvx 2, 0, 3 +; NOVSX-NEXT: vcmpgtsw 4, 3, 2 +; NOVSX-NEXT: lwz 3, -36(1) +; NOVSX-NEXT: lwz 4, -20(1) +; NOVSX-NEXT: cmplw 4, 3 +; NOVSX-NEXT: isellt 3, 4, 3 +; NOVSX-NEXT: lwz 4, -24(1) +; NOVSX-NEXT: stw 3, -4(1) +; NOVSX-NEXT: lwz 3, -40(1) +; NOVSX-NEXT: cmplw 4, 3 +; NOVSX-NEXT: isellt 3, 4, 3 +; NOVSX-NEXT: lwz 4, -28(1) +; NOVSX-NEXT: stw 3, -8(1) +; NOVSX-NEXT: lwz 3, -44(1) +; NOVSX-NEXT: cmplw 4, 3 +; NOVSX-NEXT: isellt 3, 4, 3 +; NOVSX-NEXT: lwz 4, -32(1) +; NOVSX-NEXT: stw 3, -12(1) +; NOVSX-NEXT: lwz 3, -48(1) +; NOVSX-NEXT: cmplw 4, 3 
+; NOVSX-NEXT: isellt 3, 4, 3 +; NOVSX-NEXT: stw 3, -16(1) +; NOVSX-NEXT: addi 3, 1, -16 +; NOVSX-NEXT: lvx 5, 0, 3 ; NOVSX-NEXT: vcmpeqfp 3, 5, 3 +; NOVSX-NEXT: vsel 2, 5, 2, 4 ; NOVSX-NEXT: vsel 2, 5, 2, 3 ; NOVSX-NEXT: blr ; @@ -330,24 +308,41 @@ entry: define <4 x float> @v4f32_maximum(<4 x float> %a, <4 x float> %b) { ; NOVSX-LABEL: v4f32_maximum: ; NOVSX: # %bb.0: # %entry -; NOVSX-NEXT: vcmpeqfp 5, 3, 3 -; NOVSX-NEXT: vcmpeqfp 0, 2, 2 -; NOVSX-NEXT: addis 3, 2, .LCPI5_0@toc@ha -; NOVSX-NEXT: addi 3, 3, .LCPI5_0@toc@l -; NOVSX-NEXT: vnot 5, 5 -; NOVSX-NEXT: vnot 0, 0 -; NOVSX-NEXT: vcmpgtfp 4, 2, 3 -; NOVSX-NEXT: vor 5, 0, 5 -; NOVSX-NEXT: lvx 0, 0, 3 -; NOVSX-NEXT: vsel 4, 3, 2, 4 -; NOVSX-NEXT: vsel 4, 4, 0, 5 -; NOVSX-NEXT: vxor 5, 5, 5 -; NOVSX-NEXT: vcmpequw 0, 2, 5 -; NOVSX-NEXT: vsel 2, 4, 2, 0 -; NOVSX-NEXT: vcmpequw 0, 3, 5 -; NOVSX-NEXT: vsel 2, 2, 3, 0 -; NOVSX-NEXT: vcmpeqfp 3, 4, 5 -; NOVSX-NEXT: vsel 2, 4, 2, 3 +; NOVSX-NEXT: vcmpeqfp 4, 3, 3 +; NOVSX-NEXT: addi 3, 1, -48 +; NOVSX-NEXT: vnot 4, 4 +; NOVSX-NEXT: stvx 3, 0, 3 +; NOVSX-NEXT: addi 3, 1, -32 +; NOVSX-NEXT: vsel 2, 2, 3, 4 +; NOVSX-NEXT: vxor 3, 3, 3 +; NOVSX-NEXT: stvx 2, 0, 3 +; NOVSX-NEXT: vcmpgtsw 4, 3, 2 +; NOVSX-NEXT: lwz 3, -36(1) +; NOVSX-NEXT: lwz 4, -20(1) +; NOVSX-NEXT: vnot 4, 4 +; NOVSX-NEXT: cmplw 4, 3 +; NOVSX-NEXT: iselgt 3, 4, 3 +; NOVSX-NEXT: lwz 4, -24(1) +; NOVSX-NEXT: stw 3, -4(1) +; NOVSX-NEXT: lwz 3, -40(1) +; NOVSX-NEXT: cmplw 4, 3 +; NOVSX-NEXT: iselgt 3, 4, 3 +; NOVSX-NEXT: lwz 4, -28(1) +; NOVSX-NEXT: stw 3, -8(1) +; NOVSX-NEXT: lwz 3, -44(1) +; NOVSX-NEXT: cmplw 4, 3 +; NOVSX-NEXT: iselgt 3, 4, 3 +; NOVSX-NEXT: lwz 4, -32(1) +; NOVSX-NEXT: stw 3, -12(1) +; NOVSX-NEXT: lwz 3, -48(1) +; NOVSX-NEXT: cmplw 4, 3 +; NOVSX-NEXT: iselgt 3, 4, 3 +; NOVSX-NEXT: stw 3, -16(1) +; NOVSX-NEXT: addi 3, 1, -16 +; NOVSX-NEXT: lvx 5, 0, 3 +; NOVSX-NEXT: vcmpeqfp 3, 5, 3 +; NOVSX-NEXT: vsel 2, 5, 2, 4 +; NOVSX-NEXT: vsel 2, 5, 2, 3 ; NOVSX-NEXT: blr ; ; VSX-LABEL: v4f32_maximum: @@ -384,79 +379,56 @@ entry: define <2 x double> @v2f64_minimum(<2 x double> %a, <2 x double> %b) { ; NOVSX-LABEL: v2f64_minimum: ; NOVSX: # %bb.0: # %entry -; NOVSX-NEXT: fcmpu 0, 1, 3 -; NOVSX-NEXT: fmr 6, 1 -; NOVSX-NEXT: stfd 4, -16(1) -; NOVSX-NEXT: stfd 2, -8(1) -; NOVSX-NEXT: stfd 3, -32(1) -; NOVSX-NEXT: stfd 1, -24(1) -; NOVSX-NEXT: bc 12, 0, .LBB6_2 +; NOVSX-NEXT: fcmpu 0, 3, 3 +; NOVSX-NEXT: fmr 0, 2 +; NOVSX-NEXT: fmr 2, 1 +; NOVSX-NEXT: fmr 1, 3 +; NOVSX-NEXT: bc 12, 3, .LBB6_2 ; NOVSX-NEXT: # %bb.1: # %entry -; NOVSX-NEXT: fmr 6, 3 +; NOVSX-NEXT: fmr 1, 2 ; NOVSX-NEXT: .LBB6_2: # %entry -; NOVSX-NEXT: addis 3, 2, .LCPI6_0@toc@ha -; NOVSX-NEXT: ld 4, -24(1) -; NOVSX-NEXT: lfs 0, .LCPI6_0@toc@l(3) -; NOVSX-NEXT: fmr 5, 0 +; NOVSX-NEXT: fcmpu 0, 4, 4 +; NOVSX-NEXT: fmr 2, 4 +; NOVSX-NEXT: stfd 1, -8(1) ; NOVSX-NEXT: bc 12, 3, .LBB6_4 ; NOVSX-NEXT: # %bb.3: # %entry -; NOVSX-NEXT: fmr 5, 6 +; NOVSX-NEXT: fmr 2, 0 ; NOVSX-NEXT: .LBB6_4: # %entry -; NOVSX-NEXT: li 3, 1 -; NOVSX-NEXT: ld 5, -32(1) -; NOVSX-NEXT: rldic 3, 3, 63, 0 -; NOVSX-NEXT: cmpd 4, 3 -; NOVSX-NEXT: bc 12, 2, .LBB6_6 +; NOVSX-NEXT: fcmpu 0, 1, 3 +; NOVSX-NEXT: fmr 5, 1 +; NOVSX-NEXT: stfd 2, -16(1) +; NOVSX-NEXT: cror 20, 0, 3 +; NOVSX-NEXT: bc 12, 20, .LBB6_6 ; NOVSX-NEXT: # %bb.5: # %entry -; NOVSX-NEXT: fmr 1, 5 +; NOVSX-NEXT: fmr 5, 3 ; NOVSX-NEXT: .LBB6_6: # %entry -; NOVSX-NEXT: cmpd 5, 3 -; NOVSX-NEXT: bc 12, 2, .LBB6_8 +; NOVSX-NEXT: addis 3, 2, .LCPI6_0@toc@ha +; NOVSX-NEXT: lfs 0, .LCPI6_0@toc@l(3) +; NOVSX-NEXT: ld 3, -8(1) +; NOVSX-NEXT: 
fcmpu 0, 5, 0 +; NOVSX-NEXT: bc 4, 2, .LBB6_8 ; NOVSX-NEXT: # %bb.7: # %entry -; NOVSX-NEXT: fmr 3, 1 +; NOVSX-NEXT: cmpdi 3, 0 +; NOVSX-NEXT: bc 12, 0, .LBB6_9 ; NOVSX-NEXT: .LBB6_8: # %entry -; NOVSX-NEXT: addis 4, 2, .LCPI6_1@toc@ha -; NOVSX-NEXT: lfs 1, .LCPI6_1@toc@l(4) -; NOVSX-NEXT: fcmpu 0, 5, 1 -; NOVSX-NEXT: bc 12, 2, .LBB6_10 -; NOVSX-NEXT: # %bb.9: # %entry -; NOVSX-NEXT: fmr 3, 5 -; NOVSX-NEXT: .LBB6_10: # %entry +; NOVSX-NEXT: fmr 1, 5 +; NOVSX-NEXT: .LBB6_9: # %entry ; NOVSX-NEXT: fcmpu 0, 2, 4 -; NOVSX-NEXT: fmr 5, 2 -; NOVSX-NEXT: bc 12, 0, .LBB6_12 -; NOVSX-NEXT: # %bb.11: # %entry -; NOVSX-NEXT: fmr 5, 4 -; NOVSX-NEXT: .LBB6_12: # %entry -; NOVSX-NEXT: ld 5, -8(1) -; NOVSX-NEXT: bc 12, 3, .LBB6_14 -; NOVSX-NEXT: # %bb.13: # %entry -; NOVSX-NEXT: fmr 0, 5 -; NOVSX-NEXT: .LBB6_14: # %entry -; NOVSX-NEXT: cmpd 5, 3 -; NOVSX-NEXT: ld 4, -16(1) -; NOVSX-NEXT: bc 4, 2, .LBB6_19 -; NOVSX-NEXT: # %bb.15: # %entry -; NOVSX-NEXT: cmpd 4, 3 -; NOVSX-NEXT: bc 4, 2, .LBB6_20 -; NOVSX-NEXT: .LBB6_16: # %entry -; NOVSX-NEXT: fcmpu 0, 0, 1 -; NOVSX-NEXT: bc 12, 2, .LBB6_18 -; NOVSX-NEXT: .LBB6_17: # %entry -; NOVSX-NEXT: fmr 4, 0 -; NOVSX-NEXT: .LBB6_18: # %entry -; NOVSX-NEXT: fmr 1, 3 -; NOVSX-NEXT: fmr 2, 4 +; NOVSX-NEXT: fmr 3, 2 +; NOVSX-NEXT: cror 20, 0, 3 +; NOVSX-NEXT: bc 12, 20, .LBB6_11 +; NOVSX-NEXT: # %bb.10: # %entry +; NOVSX-NEXT: fmr 3, 4 +; NOVSX-NEXT: .LBB6_11: # %entry +; NOVSX-NEXT: fcmpu 0, 3, 0 +; NOVSX-NEXT: ld 3, -16(1) +; NOVSX-NEXT: bc 4, 2, .LBB6_13 +; NOVSX-NEXT: # %bb.12: # %entry +; NOVSX-NEXT: cmpdi 3, 0 +; NOVSX-NEXT: bclr 12, 0, 0 +; NOVSX-NEXT: .LBB6_13: # %entry +; NOVSX-NEXT: fmr 2, 3 ; NOVSX-NEXT: blr -; NOVSX-NEXT: .LBB6_19: # %entry -; NOVSX-NEXT: fmr 2, 0 -; NOVSX-NEXT: cmpd 4, 3 -; NOVSX-NEXT: bc 12, 2, .LBB6_16 -; NOVSX-NEXT: .LBB6_20: # %entry -; NOVSX-NEXT: fmr 4, 2 -; NOVSX-NEXT: fcmpu 0, 0, 1 -; NOVSX-NEXT: bc 4, 2, .LBB6_17 -; NOVSX-NEXT: b .LBB6_18 ; ; VSX-LABEL: v2f64_minimum: ; VSX: # %bb.0: # %entry @@ -492,77 +464,56 @@ entry: define <2 x double> @v2f64_maximum(<2 x double> %a, <2 x double> %b) { ; NOVSX-LABEL: v2f64_maximum: ; NOVSX: # %bb.0: # %entry -; NOVSX-NEXT: fcmpu 0, 1, 3 -; NOVSX-NEXT: fmr 6, 1 -; NOVSX-NEXT: stfd 4, -16(1) -; NOVSX-NEXT: stfd 2, -8(1) -; NOVSX-NEXT: stfd 3, -32(1) -; NOVSX-NEXT: stfd 1, -24(1) -; NOVSX-NEXT: bc 12, 1, .LBB7_2 +; NOVSX-NEXT: fcmpu 0, 3, 3 +; NOVSX-NEXT: fmr 0, 2 +; NOVSX-NEXT: fmr 2, 1 +; NOVSX-NEXT: fmr 1, 3 +; NOVSX-NEXT: bc 12, 3, .LBB7_2 ; NOVSX-NEXT: # %bb.1: # %entry -; NOVSX-NEXT: fmr 6, 3 +; NOVSX-NEXT: fmr 1, 2 ; NOVSX-NEXT: .LBB7_2: # %entry -; NOVSX-NEXT: addis 4, 2, .LCPI7_0@toc@ha -; NOVSX-NEXT: ld 3, -24(1) -; NOVSX-NEXT: lfs 0, .LCPI7_0@toc@l(4) -; NOVSX-NEXT: fmr 5, 0 +; NOVSX-NEXT: fcmpu 0, 4, 4 +; NOVSX-NEXT: fmr 2, 4 +; NOVSX-NEXT: stfd 1, -8(1) ; NOVSX-NEXT: bc 12, 3, .LBB7_4 ; NOVSX-NEXT: # %bb.3: # %entry -; NOVSX-NEXT: fmr 5, 6 +; NOVSX-NEXT: fmr 2, 0 ; NOVSX-NEXT: .LBB7_4: # %entry -; NOVSX-NEXT: cmpdi 3, 0 -; NOVSX-NEXT: ld 4, -32(1) -; NOVSX-NEXT: bc 12, 2, .LBB7_6 +; NOVSX-NEXT: fcmpu 0, 1, 3 +; NOVSX-NEXT: fmr 5, 1 +; NOVSX-NEXT: stfd 2, -16(1) +; NOVSX-NEXT: cror 20, 1, 3 +; NOVSX-NEXT: bc 12, 20, .LBB7_6 ; NOVSX-NEXT: # %bb.5: # %entry -; NOVSX-NEXT: fmr 1, 5 +; NOVSX-NEXT: fmr 5, 3 ; NOVSX-NEXT: .LBB7_6: # %entry -; NOVSX-NEXT: cmpdi 4, 0 -; NOVSX-NEXT: bc 12, 2, .LBB7_8 +; NOVSX-NEXT: addis 3, 2, .LCPI7_0@toc@ha +; NOVSX-NEXT: lfs 0, .LCPI7_0@toc@l(3) +; NOVSX-NEXT: ld 3, -8(1) +; NOVSX-NEXT: fcmpu 0, 5, 0 +; NOVSX-NEXT: bc 4, 2, .LBB7_8 ; NOVSX-NEXT: # %bb.7: # 
%entry -; NOVSX-NEXT: fmr 3, 1 +; NOVSX-NEXT: cmpdi 3, -1 +; NOVSX-NEXT: bc 12, 1, .LBB7_9 ; NOVSX-NEXT: .LBB7_8: # %entry -; NOVSX-NEXT: addis 3, 2, .LCPI7_1@toc@ha -; NOVSX-NEXT: lfs 1, .LCPI7_1@toc@l(3) -; NOVSX-NEXT: fcmpu 0, 5, 1 -; NOVSX-NEXT: bc 12, 2, .LBB7_10 -; NOVSX-NEXT: # %bb.9: # %entry -; NOVSX-NEXT: fmr 3, 5 -; NOVSX-NEXT: .LBB7_10: # %entry +; NOVSX-NEXT: fmr 1, 5 +; NOVSX-NEXT: .LBB7_9: # %entry ; NOVSX-NEXT: fcmpu 0, 2, 4 -; NOVSX-NEXT: fmr 5, 2 -; NOVSX-NEXT: bc 12, 1, .LBB7_12 -; NOVSX-NEXT: # %bb.11: # %entry -; NOVSX-NEXT: fmr 5, 4 -; NOVSX-NEXT: .LBB7_12: # %entry -; NOVSX-NEXT: ld 4, -8(1) -; NOVSX-NEXT: bc 12, 3, .LBB7_14 -; NOVSX-NEXT: # %bb.13: # %entry -; NOVSX-NEXT: fmr 0, 5 -; NOVSX-NEXT: .LBB7_14: # %entry -; NOVSX-NEXT: cmpdi 4, 0 +; NOVSX-NEXT: fmr 3, 2 +; NOVSX-NEXT: cror 20, 1, 3 +; NOVSX-NEXT: bc 12, 20, .LBB7_11 +; NOVSX-NEXT: # %bb.10: # %entry +; NOVSX-NEXT: fmr 3, 4 +; NOVSX-NEXT: .LBB7_11: # %entry +; NOVSX-NEXT: fcmpu 0, 3, 0 ; NOVSX-NEXT: ld 3, -16(1) -; NOVSX-NEXT: bc 4, 2, .LBB7_19 -; NOVSX-NEXT: # %bb.15: # %entry -; NOVSX-NEXT: cmpdi 3, 0 -; NOVSX-NEXT: bc 4, 2, .LBB7_20 -; NOVSX-NEXT: .LBB7_16: # %entry -; NOVSX-NEXT: fcmpu 0, 0, 1 -; NOVSX-NEXT: bc 12, 2, .LBB7_18 -; NOVSX-NEXT: .LBB7_17: # %entry -; NOVSX-NEXT: fmr 4, 0 -; NOVSX-NEXT: .LBB7_18: # %entry -; NOVSX-NEXT: fmr 1, 3 -; NOVSX-NEXT: fmr 2, 4 +; NOVSX-NEXT: bc 4, 2, .LBB7_13 +; NOVSX-NEXT: # %bb.12: # %entry +; NOVSX-NEXT: cmpdi 3, -1 +; NOVSX-NEXT: bclr 12, 1, 0 +; NOVSX-NEXT: .LBB7_13: # %entry +; NOVSX-NEXT: fmr 2, 3 ; NOVSX-NEXT: blr -; NOVSX-NEXT: .LBB7_19: # %entry -; NOVSX-NEXT: fmr 2, 0 -; NOVSX-NEXT: cmpdi 3, 0 -; NOVSX-NEXT: bc 12, 2, .LBB7_16 -; NOVSX-NEXT: .LBB7_20: # %entry -; NOVSX-NEXT: fmr 4, 2 -; NOVSX-NEXT: fcmpu 0, 0, 1 -; NOVSX-NEXT: bc 4, 2, .LBB7_17 -; NOVSX-NEXT: b .LBB7_18 ; ; VSX-LABEL: v2f64_maximum: ; VSX: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll index dedfe22240201..3d6299068050e 100644 --- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll @@ -1786,358 +1786,521 @@ define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind { ; ; AVX512F-LABEL: test_fmaximum_v4f16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: pushq %rbp -; AVX512F-NEXT: pushq %r15 -; AVX512F-NEXT: pushq %r14 -; AVX512F-NEXT: pushq %r13 -; AVX512F-NEXT: pushq %r12 -; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512F-NEXT: vmovdqa %xmm1, %xmm2 +; AVX512F-NEXT: vmovdqa %xmm0, %xmm4 +; AVX512F-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vucomiss %xmm0, %xmm0 +; AVX512F-NEXT: setp %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512F-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k1} +; AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm1 +; AVX512F-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm3 +; AVX512F-NEXT: vucomiss %xmm3, %xmm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm1 +; AVX512F-NEXT: vmovdqa %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vucomiss %xmm0, %xmm0 +; AVX512F-NEXT: setp %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] +; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512F-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k1} +; AVX512F-NEXT: vcvtps2ph $4, %xmm3, %xmm6 +; AVX512F-NEXT: vcvtph2ps %xmm6, %xmm3 +; AVX512F-NEXT: vucomiss %xmm3, %xmm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512F-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vucomiss %xmm0, %xmm0 +; AVX512F-NEXT: setp %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX512F-NEXT: vmovss %xmm0, %xmm5, %xmm5 {%k1} +; AVX512F-NEXT: vcvtps2ph $4, %xmm5, %xmm7 +; AVX512F-NEXT: vcvtph2ps %xmm7, %xmm5 +; AVX512F-NEXT: vucomiss %xmm5, %xmm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm5 +; AVX512F-NEXT: vshufpd {{.*#+}} xmm0 = xmm2[1,0] +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vucomiss %xmm0, %xmm0 +; AVX512F-NEXT: setp %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vshufpd {{.*#+}} xmm8 = xmm4[1,0] +; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm8 +; AVX512F-NEXT: vmovss %xmm0, %xmm8, %xmm8 {%k1} +; AVX512F-NEXT: vcvtps2ph $4, %xmm8, %xmm9 +; AVX512F-NEXT: vcvtph2ps %xmm9, %xmm8 +; AVX512F-NEXT: vucomiss %xmm8, %xmm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm8 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; AVX512F-NEXT: vpsrlq $48, %xmm2, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm12 +; AVX512F-NEXT: vucomiss %xmm12, %xmm12 +; AVX512F-NEXT: setp %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpsrlq $48, %xmm4, %xmm0 +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vmovss %xmm12, %xmm0, %xmm0 {%k1} +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm11 +; AVX512F-NEXT: vcvtph2ps %xmm11, %xmm0 +; AVX512F-NEXT: vucomiss %xmm0, %xmm12 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovss %xmm0, %xmm12, %xmm12 {%k1} +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm13 +; AVX512F-NEXT: vucomiss %xmm13, %xmm13 +; AVX512F-NEXT: setp %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] +; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512F-NEXT: vmovss %xmm13, %xmm0, %xmm0 {%k1} +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm14 +; AVX512F-NEXT: vcvtph2ps %xmm14, %xmm0 +; AVX512F-NEXT: vucomiss %xmm0, %xmm13 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovss %xmm0, %xmm13, %xmm13 {%k1} +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm0 +; 
AVX512F-NEXT: vucomiss %xmm0, %xmm0 +; AVX512F-NEXT: setp %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm15 +; AVX512F-NEXT: vmovss %xmm0, %xmm15, %xmm15 {%k1} +; AVX512F-NEXT: vcvtps2ph $4, %xmm15, %xmm15 +; AVX512F-NEXT: vcvtph2ps %xmm15, %xmm1 +; AVX512F-NEXT: vucomiss %xmm1, %xmm0 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512F-NEXT: vpsrld $16, %xmm2, %xmm1 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512F-NEXT: vucomiss %xmm1, %xmm1 +; AVX512F-NEXT: setp %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpsrld $16, %xmm4, %xmm2 +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512F-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512F-NEXT: vcvtps2ph $4, %xmm2, %xmm4 +; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm2 +; AVX512F-NEXT: vucomiss %xmm2, %xmm1 +; AVX512F-NEXT: setb %al +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512F-NEXT: vcvtps2ph $4, %xmm12, %xmm2 +; AVX512F-NEXT: vcvtps2ph $4, %xmm13, %xmm12 +; AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm13 +; AVX512F-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512F-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload +; AVX512F-NEXT: # xmm3 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] +; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512F-NEXT: vpcmpgtw %xmm3, %xmm4, %xmm4 +; AVX512F-NEXT: vpblendvb %xmm4, %xmm0, %xmm3, %xmm4 +; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512F-NEXT: xorl %eax, %eax -; AVX512F-NEXT: vucomiss %xmm2, %xmm3 +; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512F-NEXT: vucomiss %xmm3, %xmm1 ; AVX512F-NEXT: movl $65535, %ecx # imm = 0xFFFF ; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovpl %ecx, %edx -; AVX512F-NEXT: movl $0, %edi -; AVX512F-NEXT: cmoval %ecx, %edi -; AVX512F-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512F-NEXT: vucomiss %xmm2, %xmm3 +; AVX512F-NEXT: cmovel %ecx, %edx +; AVX512F-NEXT: vcvtph2ps %xmm13, %xmm1 +; AVX512F-NEXT: vucomiss %xmm3, %xmm1 ; AVX512F-NEXT: movl $0, %esi -; AVX512F-NEXT: cmovpl %ecx, %esi -; AVX512F-NEXT: movl $0, %r9d -; AVX512F-NEXT: cmoval %ecx, %r9d -; AVX512F-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512F-NEXT: vucomiss 
%xmm2, %xmm3 -; AVX512F-NEXT: movl $0, %r8d -; AVX512F-NEXT: cmovpl %ecx, %r8d -; AVX512F-NEXT: movl $0, %r11d -; AVX512F-NEXT: cmoval %ecx, %r11d -; AVX512F-NEXT: vpsrlq $48, %xmm1, %xmm2 -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vpsrlq $48, %xmm0, %xmm3 -; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512F-NEXT: vucomiss %xmm2, %xmm3 -; AVX512F-NEXT: movl $0, %r10d -; AVX512F-NEXT: cmovpl %ecx, %r10d -; AVX512F-NEXT: movl $0, %ebp -; AVX512F-NEXT: cmoval %ecx, %ebp -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512F-NEXT: vucomiss %xmm2, %xmm3 -; AVX512F-NEXT: movl $0, %ebx -; AVX512F-NEXT: cmovpl %ecx, %ebx -; AVX512F-NEXT: movl $0, %r14d -; AVX512F-NEXT: cmoval %ecx, %r14d -; AVX512F-NEXT: vpsrld $16, %xmm1, %xmm2 -; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512F-NEXT: vpsrld $16, %xmm0, %xmm3 -; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512F-NEXT: vucomiss %xmm2, %xmm3 -; AVX512F-NEXT: movl $0, %r15d -; AVX512F-NEXT: cmovpl %ecx, %r15d -; AVX512F-NEXT: movl $0, %r12d -; AVX512F-NEXT: cmoval %ecx, %r12d -; AVX512F-NEXT: vcvtph2ps %xmm1, %xmm2 -; AVX512F-NEXT: vcvtph2ps %xmm0, %xmm3 -; AVX512F-NEXT: vucomiss %xmm2, %xmm3 -; AVX512F-NEXT: movl $0, %r13d -; AVX512F-NEXT: cmoval %ecx, %r13d -; AVX512F-NEXT: vmovd %r13d, %xmm2 -; AVX512F-NEXT: vpinsrw $1, %r12d, %xmm2, %xmm2 -; AVX512F-NEXT: vpinsrw $2, %r14d, %xmm2, %xmm2 -; AVX512F-NEXT: vpinsrw $3, %ebp, %xmm2, %xmm2 -; AVX512F-NEXT: vpinsrw $4, %r11d, %xmm2, %xmm2 -; AVX512F-NEXT: vpinsrw $5, %r9d, %xmm2, %xmm2 -; AVX512F-NEXT: vpinsrw $6, %edi, %xmm2, %xmm2 +; AVX512F-NEXT: cmovel %ecx, %esi +; AVX512F-NEXT: vcvtph2ps %xmm12, %xmm1 +; AVX512F-NEXT: vucomiss %xmm3, %xmm1 ; AVX512F-NEXT: movl $0, %edi -; AVX512F-NEXT: cmovpl %ecx, %edi -; AVX512F-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512F-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512F-NEXT: vucomiss %xmm3, %xmm4 +; AVX512F-NEXT: cmovel %ecx, %edi +; AVX512F-NEXT: vcvtph2ps %xmm2, %xmm1 +; AVX512F-NEXT: vucomiss %xmm3, %xmm1 +; AVX512F-NEXT: movl $0, %r8d +; AVX512F-NEXT: cmovel %ecx, %r8d +; AVX512F-NEXT: vcvtph2ps %xmm8, %xmm1 +; AVX512F-NEXT: vucomiss %xmm3, %xmm1 ; AVX512F-NEXT: movl $0, %r9d -; AVX512F-NEXT: cmoval %ecx, %r9d -; AVX512F-NEXT: vpinsrw $7, %r9d, %xmm2, %xmm2 -; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm2 -; AVX512F-NEXT: vmovd %edi, %xmm3 -; AVX512F-NEXT: vpinsrw $1, %r15d, %xmm3, %xmm3 -; AVX512F-NEXT: vpinsrw $2, %ebx, %xmm3, %xmm3 -; AVX512F-NEXT: vpinsrw $3, %r10d, %xmm3, %xmm3 -; AVX512F-NEXT: vpinsrw $4, %r8d, %xmm3, %xmm3 -; AVX512F-NEXT: vpinsrw $5, %esi, %xmm3, %xmm3 -; AVX512F-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3 -; AVX512F-NEXT: movl $0, %edx -; AVX512F-NEXT: cmovpl %ecx, %edx -; AVX512F-NEXT: vpinsrw $7, %edx, %xmm3, %xmm3 -; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; AVX512F-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 -; AVX512F-NEXT: vpsrld $16, %xmm2, %xmm3 -; AVX512F-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512F-NEXT: vucomiss %xmm4, %xmm3 -; AVX512F-NEXT: movl $65535, %edx # imm = 0xFFFF -; AVX512F-NEXT: cmovnel %eax, %edx -; AVX512F-NEXT: cmovpl %eax, %edx -; AVX512F-NEXT: 
vcvtph2ps %xmm2, %xmm3 -; AVX512F-NEXT: vucomiss %xmm4, %xmm3 -; AVX512F-NEXT: movl $65535, %esi # imm = 0xFFFF -; AVX512F-NEXT: cmovnel %eax, %esi -; AVX512F-NEXT: cmovpl %eax, %esi -; AVX512F-NEXT: vmovd %esi, %xmm3 -; AVX512F-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512F-NEXT: vucomiss %xmm4, %xmm5 -; AVX512F-NEXT: movl $65535, %edx # imm = 0xFFFF -; AVX512F-NEXT: cmovnel %eax, %edx -; AVX512F-NEXT: cmovpl %eax, %edx -; AVX512F-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 -; AVX512F-NEXT: vpsrlq $48, %xmm2, %xmm5 -; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512F-NEXT: vucomiss %xmm4, %xmm5 -; AVX512F-NEXT: movl $65535, %edx # imm = 0xFFFF -; AVX512F-NEXT: cmovnel %eax, %edx -; AVX512F-NEXT: cmovpl %eax, %edx -; AVX512F-NEXT: vpinsrw $3, %edx, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] -; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512F-NEXT: vucomiss %xmm4, %xmm5 -; AVX512F-NEXT: movl $65535, %edx # imm = 0xFFFF -; AVX512F-NEXT: cmovnel %eax, %edx -; AVX512F-NEXT: cmovpl %eax, %edx -; AVX512F-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 -; AVX512F-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512F-NEXT: vucomiss %xmm4, %xmm5 -; AVX512F-NEXT: movl $65535, %edx # imm = 0xFFFF -; AVX512F-NEXT: cmovnel %eax, %edx -; AVX512F-NEXT: cmovpl %eax, %edx -; AVX512F-NEXT: vpinsrw $5, %edx, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,3,3,3] -; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512F-NEXT: vucomiss %xmm4, %xmm5 -; AVX512F-NEXT: movl $65535, %edx # imm = 0xFFFF -; AVX512F-NEXT: cmovnel %eax, %edx -; AVX512F-NEXT: cmovpl %eax, %edx -; AVX512F-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3 -; AVX512F-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512F-NEXT: vucomiss %xmm4, %xmm5 -; AVX512F-NEXT: cmovnel %eax, %ecx -; AVX512F-NEXT: cmovpl %eax, %ecx -; AVX512F-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm3 -; AVX512F-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512F-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm5 -; AVX512F-NEXT: vpblendvb %xmm5, %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm4 -; AVX512F-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vpblendvb %xmm3, %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: popq %rbx -; AVX512F-NEXT: popq %r12 -; AVX512F-NEXT: popq %r13 -; AVX512F-NEXT: popq %r14 -; AVX512F-NEXT: popq %r15 -; AVX512F-NEXT: popq %rbp +; AVX512F-NEXT: cmovel %ecx, %r9d +; AVX512F-NEXT: vcvtph2ps %xmm5, %xmm1 +; AVX512F-NEXT: vucomiss %xmm3, %xmm1 +; AVX512F-NEXT: movl $0, %r10d +; AVX512F-NEXT: cmovel %ecx, %r10d +; AVX512F-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512F-NEXT: vucomiss %xmm3, %xmm1 +; AVX512F-NEXT: movl $0, %r11d +; AVX512F-NEXT: cmovel %ecx, %r11d +; AVX512F-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512F-NEXT: vucomiss %xmm3, %xmm1 +; AVX512F-NEXT: vmovd %esi, %xmm1 +; AVX512F-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1 +; AVX512F-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1 +; AVX512F-NEXT: vpinsrw $3, %r8d, %xmm1, %xmm1 +; AVX512F-NEXT: vpinsrw $4, %r9d, %xmm1, %xmm1 +; AVX512F-NEXT: vpinsrw $5, %r10d, %xmm1, %xmm1 +; AVX512F-NEXT: vpinsrw $6, %r11d, %xmm1, %xmm1 +; AVX512F-NEXT: cmovel %ecx, %eax +; AVX512F-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendvb %xmm1, %xmm4, 
%xmm0, %xmm0 ; AVX512F-NEXT: retq ; ; AVX512DQ-LABEL: test_fmaximum_v4f16: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: pushq %rbp -; AVX512DQ-NEXT: pushq %r15 -; AVX512DQ-NEXT: pushq %r14 -; AVX512DQ-NEXT: pushq %r13 -; AVX512DQ-NEXT: pushq %r12 -; AVX512DQ-NEXT: pushq %rbx -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX512DQ-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm2 +; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm4 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512DQ-NEXT: vucomiss %xmm0, %xmm0 +; AVX512DQ-NEXT: setp %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm3 = xmm4[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512DQ-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k1} +; AVX512DQ-NEXT: vcvtps2ph $4, %xmm3, %xmm1 +; AVX512DQ-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vcvtph2ps %xmm1, %xmm3 +; AVX512DQ-NEXT: vucomiss %xmm3, %xmm0 +; AVX512DQ-NEXT: setb %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512DQ-NEXT: vcvtps2ph $4, %xmm0, %xmm1 +; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,3,3,3] +; AVX512DQ-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512DQ-NEXT: vucomiss %xmm0, %xmm0 +; AVX512DQ-NEXT: setp %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] ; AVX512DQ-NEXT: vcvtph2ps %xmm3, %xmm3 +; AVX512DQ-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k1} +; AVX512DQ-NEXT: vcvtps2ph $4, %xmm3, %xmm6 +; AVX512DQ-NEXT: vcvtph2ps %xmm6, %xmm3 +; AVX512DQ-NEXT: vucomiss %xmm3, %xmm0 +; AVX512DQ-NEXT: setb %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} +; AVX512DQ-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512DQ-NEXT: vucomiss %xmm0, %xmm0 +; AVX512DQ-NEXT: setp %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm5 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX512DQ-NEXT: vmovss %xmm0, %xmm5, %xmm5 {%k1} +; AVX512DQ-NEXT: vcvtps2ph $4, %xmm5, %xmm7 +; AVX512DQ-NEXT: vcvtph2ps %xmm7, %xmm5 +; AVX512DQ-NEXT: vucomiss %xmm5, %xmm0 +; AVX512DQ-NEXT: setb %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} +; AVX512DQ-NEXT: vcvtps2ph $4, %xmm0, %xmm5 +; AVX512DQ-NEXT: vshufpd {{.*#+}} xmm0 = xmm2[1,0] +; AVX512DQ-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512DQ-NEXT: vucomiss %xmm0, %xmm0 +; AVX512DQ-NEXT: setp %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vshufpd {{.*#+}} xmm8 = xmm4[1,0] +; AVX512DQ-NEXT: vcvtph2ps %xmm8, %xmm8 +; AVX512DQ-NEXT: vmovss %xmm0, %xmm8, %xmm8 {%k1} +; AVX512DQ-NEXT: vcvtps2ph $4, %xmm8, %xmm9 +; AVX512DQ-NEXT: vcvtph2ps %xmm9, %xmm8 +; AVX512DQ-NEXT: vucomiss %xmm8, %xmm0 +; AVX512DQ-NEXT: setb %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} +; 
AVX512DQ-NEXT: vcvtps2ph $4, %xmm0, %xmm8 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] +; AVX512DQ-NEXT: vpsrlq $48, %xmm2, %xmm0 +; AVX512DQ-NEXT: vcvtph2ps %xmm0, %xmm12 +; AVX512DQ-NEXT: vucomiss %xmm12, %xmm12 +; AVX512DQ-NEXT: setp %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vpsrlq $48, %xmm4, %xmm0 +; AVX512DQ-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovss %xmm12, %xmm0, %xmm0 {%k1} +; AVX512DQ-NEXT: vcvtps2ph $4, %xmm0, %xmm11 +; AVX512DQ-NEXT: vcvtph2ps %xmm11, %xmm0 +; AVX512DQ-NEXT: vucomiss %xmm0, %xmm12 +; AVX512DQ-NEXT: setb %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovss %xmm0, %xmm12, %xmm12 {%k1} +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] +; AVX512DQ-NEXT: vcvtph2ps %xmm0, %xmm13 +; AVX512DQ-NEXT: vucomiss %xmm13, %xmm13 +; AVX512DQ-NEXT: setp %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] +; AVX512DQ-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512DQ-NEXT: vmovss %xmm13, %xmm0, %xmm0 {%k1} +; AVX512DQ-NEXT: vcvtps2ph $4, %xmm0, %xmm14 +; AVX512DQ-NEXT: vcvtph2ps %xmm14, %xmm0 +; AVX512DQ-NEXT: vucomiss %xmm0, %xmm13 +; AVX512DQ-NEXT: setb %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovss %xmm0, %xmm13, %xmm13 {%k1} +; AVX512DQ-NEXT: vcvtph2ps %xmm2, %xmm0 +; AVX512DQ-NEXT: vucomiss %xmm0, %xmm0 +; AVX512DQ-NEXT: setp %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vcvtph2ps %xmm4, %xmm15 +; AVX512DQ-NEXT: vmovss %xmm0, %xmm15, %xmm15 {%k1} +; AVX512DQ-NEXT: vcvtps2ph $4, %xmm15, %xmm15 +; AVX512DQ-NEXT: vcvtph2ps %xmm15, %xmm1 +; AVX512DQ-NEXT: vucomiss %xmm1, %xmm0 +; AVX512DQ-NEXT: setb %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm1 +; AVX512DQ-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512DQ-NEXT: vucomiss %xmm1, %xmm1 +; AVX512DQ-NEXT: setp %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vpsrld $16, %xmm4, %xmm2 +; AVX512DQ-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512DQ-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} +; AVX512DQ-NEXT: vcvtps2ph $4, %xmm2, %xmm4 +; AVX512DQ-NEXT: vcvtph2ps %xmm4, %xmm2 +; AVX512DQ-NEXT: vucomiss %xmm2, %xmm1 +; AVX512DQ-NEXT: setb %al +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512DQ-NEXT: vcvtps2ph $4, %xmm12, %xmm2 +; AVX512DQ-NEXT: vcvtps2ph $4, %xmm13, %xmm12 +; AVX512DQ-NEXT: vcvtps2ph $4, %xmm0, %xmm13 +; AVX512DQ-NEXT: vcvtps2ph $4, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm10[0] +; AVX512DQ-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm3 # 16-byte Folded Reload +; AVX512DQ-NEXT: # xmm3 = xmm6[0],mem[0],xmm6[1],mem[1],xmm6[2],mem[2],xmm6[3],mem[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = 
xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] +; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] +; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] +; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpcmpgtw %xmm3, %xmm4, %xmm4 +; AVX512DQ-NEXT: vpblendvb %xmm4, %xmm0, %xmm3, %xmm4 +; AVX512DQ-NEXT: vcvtph2ps %xmm1, %xmm1 ; AVX512DQ-NEXT: xorl %eax, %eax -; AVX512DQ-NEXT: vucomiss %xmm2, %xmm3 +; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1 ; AVX512DQ-NEXT: movl $65535, %ecx # imm = 0xFFFF ; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovpl %ecx, %edx -; AVX512DQ-NEXT: movl $0, %edi -; AVX512DQ-NEXT: cmoval %ecx, %edi -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512DQ-NEXT: vucomiss %xmm2, %xmm3 +; AVX512DQ-NEXT: cmovel %ecx, %edx +; AVX512DQ-NEXT: vcvtph2ps %xmm13, %xmm1 +; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1 ; AVX512DQ-NEXT: movl $0, %esi -; AVX512DQ-NEXT: cmovpl %ecx, %esi -; AVX512DQ-NEXT: movl $0, %r9d -; AVX512DQ-NEXT: cmoval %ecx, %r9d -; AVX512DQ-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX512DQ-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512DQ-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX512DQ-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512DQ-NEXT: vucomiss %xmm2, %xmm3 -; AVX512DQ-NEXT: movl $0, %r8d -; AVX512DQ-NEXT: cmovpl %ecx, %r8d -; AVX512DQ-NEXT: movl $0, %r11d -; AVX512DQ-NEXT: cmoval %ecx, %r11d -; AVX512DQ-NEXT: vpsrlq $48, %xmm1, %xmm2 -; AVX512DQ-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512DQ-NEXT: vpsrlq $48, %xmm0, %xmm3 -; AVX512DQ-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512DQ-NEXT: vucomiss %xmm2, %xmm3 -; AVX512DQ-NEXT: movl $0, %r10d -; AVX512DQ-NEXT: cmovpl %ecx, %r10d -; AVX512DQ-NEXT: movl $0, %ebp -; AVX512DQ-NEXT: cmoval %ecx, %ebp -; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX512DQ-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] -; AVX512DQ-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512DQ-NEXT: vucomiss %xmm2, %xmm3 -; AVX512DQ-NEXT: movl $0, %ebx -; AVX512DQ-NEXT: cmovpl %ecx, %ebx -; AVX512DQ-NEXT: movl $0, %r14d -; AVX512DQ-NEXT: cmoval %ecx, %r14d -; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm2 -; AVX512DQ-NEXT: vcvtph2ps %xmm2, %xmm2 -; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm3 -; AVX512DQ-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512DQ-NEXT: vucomiss %xmm2, %xmm3 -; AVX512DQ-NEXT: movl $0, %r15d -; AVX512DQ-NEXT: cmovpl %ecx, %r15d -; AVX512DQ-NEXT: movl $0, %r12d -; AVX512DQ-NEXT: cmoval %ecx, %r12d -; AVX512DQ-NEXT: vcvtph2ps %xmm1, %xmm2 -; AVX512DQ-NEXT: vcvtph2ps %xmm0, %xmm3 -; AVX512DQ-NEXT: vucomiss %xmm2, %xmm3 -; AVX512DQ-NEXT: movl $0, %r13d -; AVX512DQ-NEXT: cmoval %ecx, %r13d -; AVX512DQ-NEXT: vmovd %r13d, %xmm2 -; AVX512DQ-NEXT: vpinsrw $1, %r12d, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpinsrw $2, %r14d, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpinsrw $3, %ebp, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpinsrw $4, %r11d, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpinsrw $5, %r9d, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpinsrw $6, %edi, %xmm2, %xmm2 +; AVX512DQ-NEXT: cmovel %ecx, %esi +; AVX512DQ-NEXT: vcvtph2ps %xmm12, %xmm1 +; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1 ; AVX512DQ-NEXT: movl $0, %edi -; AVX512DQ-NEXT: cmovpl %ecx, %edi -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm3 = 
xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vcvtph2ps %xmm4, %xmm4 -; AVX512DQ-NEXT: vucomiss %xmm3, %xmm4 +; AVX512DQ-NEXT: cmovel %ecx, %edi +; AVX512DQ-NEXT: vcvtph2ps %xmm2, %xmm1 +; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1 +; AVX512DQ-NEXT: movl $0, %r8d +; AVX512DQ-NEXT: cmovel %ecx, %r8d +; AVX512DQ-NEXT: vcvtph2ps %xmm8, %xmm1 +; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1 ; AVX512DQ-NEXT: movl $0, %r9d -; AVX512DQ-NEXT: cmoval %ecx, %r9d -; AVX512DQ-NEXT: vpinsrw $7, %r9d, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm2 -; AVX512DQ-NEXT: vmovd %edi, %xmm3 -; AVX512DQ-NEXT: vpinsrw $1, %r15d, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpinsrw $2, %ebx, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpinsrw $3, %r10d, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpinsrw $4, %r8d, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpinsrw $5, %esi, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3 -; AVX512DQ-NEXT: movl $0, %edx -; AVX512DQ-NEXT: cmovpl %ecx, %edx -; AVX512DQ-NEXT: vpinsrw $7, %edx, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; AVX512DQ-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm3 -; AVX512DQ-NEXT: vcvtph2ps %xmm3, %xmm3 -; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512DQ-NEXT: vucomiss %xmm4, %xmm3 -; AVX512DQ-NEXT: movl $65535, %edx # imm = 0xFFFF -; AVX512DQ-NEXT: cmovnel %eax, %edx -; AVX512DQ-NEXT: cmovpl %eax, %edx -; AVX512DQ-NEXT: vcvtph2ps %xmm2, %xmm3 -; AVX512DQ-NEXT: vucomiss %xmm4, %xmm3 -; AVX512DQ-NEXT: movl $65535, %esi # imm = 0xFFFF -; AVX512DQ-NEXT: cmovnel %eax, %esi -; AVX512DQ-NEXT: cmovpl %eax, %esi -; AVX512DQ-NEXT: vmovd %esi, %xmm3 -; AVX512DQ-NEXT: vpinsrw $1, %edx, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; AVX512DQ-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512DQ-NEXT: vucomiss %xmm4, %xmm5 -; AVX512DQ-NEXT: movl $65535, %edx # imm = 0xFFFF -; AVX512DQ-NEXT: cmovnel %eax, %edx -; AVX512DQ-NEXT: cmovpl %eax, %edx -; AVX512DQ-NEXT: vpinsrw $2, %edx, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpsrlq $48, %xmm2, %xmm5 -; AVX512DQ-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512DQ-NEXT: vucomiss %xmm4, %xmm5 -; AVX512DQ-NEXT: movl $65535, %edx # imm = 0xFFFF -; AVX512DQ-NEXT: cmovnel %eax, %edx -; AVX512DQ-NEXT: cmovpl %eax, %edx -; AVX512DQ-NEXT: vpinsrw $3, %edx, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] -; AVX512DQ-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512DQ-NEXT: vucomiss %xmm4, %xmm5 -; AVX512DQ-NEXT: movl $65535, %edx # imm = 0xFFFF -; AVX512DQ-NEXT: cmovnel %eax, %edx -; AVX512DQ-NEXT: cmovpl %eax, %edx -; AVX512DQ-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512DQ-NEXT: vucomiss %xmm4, %xmm5 -; AVX512DQ-NEXT: movl $65535, %edx # imm = 0xFFFF -; AVX512DQ-NEXT: cmovnel %eax, %edx -; AVX512DQ-NEXT: cmovpl %eax, %edx -; AVX512DQ-NEXT: vpinsrw $5, %edx, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,3,3,3] -; AVX512DQ-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512DQ-NEXT: vucomiss %xmm4, %xmm5 -; AVX512DQ-NEXT: movl $65535, %edx # imm = 0xFFFF -; AVX512DQ-NEXT: cmovnel %eax, %edx -; AVX512DQ-NEXT: cmovpl %eax, %edx -; AVX512DQ-NEXT: vpinsrw $6, %edx, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpsrldq 
{{.*#+}} xmm5 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-NEXT: vcvtph2ps %xmm5, %xmm5 -; AVX512DQ-NEXT: vucomiss %xmm4, %xmm5 -; AVX512DQ-NEXT: cmovnel %eax, %ecx -; AVX512DQ-NEXT: cmovpl %eax, %ecx -; AVX512DQ-NEXT: vpinsrw $7, %ecx, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512DQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm5 -; AVX512DQ-NEXT: vpblendvb %xmm5, %xmm0, %xmm2, %xmm0 -; AVX512DQ-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm4 -; AVX512DQ-NEXT: vpblendvb %xmm4, %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendvb %xmm3, %xmm0, %xmm2, %xmm0 -; AVX512DQ-NEXT: popq %rbx -; AVX512DQ-NEXT: popq %r12 -; AVX512DQ-NEXT: popq %r13 -; AVX512DQ-NEXT: popq %r14 -; AVX512DQ-NEXT: popq %r15 -; AVX512DQ-NEXT: popq %rbp +; AVX512DQ-NEXT: cmovel %ecx, %r9d +; AVX512DQ-NEXT: vcvtph2ps %xmm5, %xmm1 +; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1 +; AVX512DQ-NEXT: movl $0, %r10d +; AVX512DQ-NEXT: cmovel %ecx, %r10d +; AVX512DQ-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1 +; AVX512DQ-NEXT: movl $0, %r11d +; AVX512DQ-NEXT: cmovel %ecx, %r11d +; AVX512DQ-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; AVX512DQ-NEXT: vucomiss %xmm3, %xmm1 +; AVX512DQ-NEXT: vmovd %esi, %xmm1 +; AVX512DQ-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrw $3, %r8d, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrw $4, %r9d, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrw $5, %r10d, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrw $6, %r11d, %xmm1, %xmm1 +; AVX512DQ-NEXT: cmovel %ecx, %eax +; AVX512DQ-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 ; AVX512DQ-NEXT: retq ; ; AVX512BF16-LABEL: test_fmaximum_v4f16: ; AVX512BF16: # %bb.0: -; AVX512BF16-NEXT: vcvtph2ps %xmm0, %ymm3 -; AVX512BF16-NEXT: vcvtph2ps %xmm1, %ymm4 -; AVX512BF16-NEXT: vcmpltps %ymm3, %ymm4, %k1 -; AVX512BF16-NEXT: vpblendmw %xmm0, %xmm1, %xmm2 {%k1} -; AVX512BF16-NEXT: vcmpunordps %ymm4, %ymm3, %k1 -; AVX512BF16-NEXT: vpbroadcastw {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; AVX512BF16-NEXT: vmovdqu16 %xmm3, %xmm2 {%k1} -; AVX512BF16-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; AVX512BF16-NEXT: vpblendmw %xmm0, %xmm2, %xmm0 {%k1} -; AVX512BF16-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; AVX512BF16-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} -; AVX512BF16-NEXT: vcvtph2ps %xmm2, %ymm1 +; AVX512BF16-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BF16-NEXT: vcvtph2ps %xmm2, %xmm3 +; AVX512BF16-NEXT: vucomiss %xmm3, %xmm3 +; AVX512BF16-NEXT: setp %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BF16-NEXT: vcvtph2ps %xmm2, %xmm2 +; AVX512BF16-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm2, %xmm2 +; AVX512BF16-NEXT: vcvtph2ps %xmm2, %xmm4 +; AVX512BF16-NEXT: vucomiss %xmm4, %xmm3 +; AVX512BF16-NEXT: setb %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vmovss %xmm4, %xmm3, %xmm3 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm3, %xmm4 +; AVX512BF16-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] +; AVX512BF16-NEXT: vcvtph2ps %xmm3, %xmm5 +; AVX512BF16-NEXT: vucomiss %xmm5, %xmm5 +; AVX512BF16-NEXT: setp %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512BF16-NEXT: vcvtph2ps %xmm3, 
%xmm3 +; AVX512BF16-NEXT: vmovss %xmm5, %xmm3, %xmm3 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm3, %xmm3 +; AVX512BF16-NEXT: vcvtph2ps %xmm3, %xmm6 +; AVX512BF16-NEXT: vucomiss %xmm6, %xmm5 +; AVX512BF16-NEXT: setb %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vmovss %xmm6, %xmm5, %xmm5 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm5, %xmm5 +; AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512BF16-NEXT: vpsrldq {{.*#+}} xmm4 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BF16-NEXT: vcvtph2ps %xmm4, %xmm5 +; AVX512BF16-NEXT: vucomiss %xmm5, %xmm5 +; AVX512BF16-NEXT: setp %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BF16-NEXT: vcvtph2ps %xmm4, %xmm4 +; AVX512BF16-NEXT: vmovss %xmm5, %xmm4, %xmm4 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm4, %xmm4 +; AVX512BF16-NEXT: vcvtph2ps %xmm4, %xmm7 +; AVX512BF16-NEXT: vucomiss %xmm7, %xmm5 +; AVX512BF16-NEXT: setb %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vmovss %xmm7, %xmm5, %xmm5 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm5, %xmm7 +; AVX512BF16-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0] +; AVX512BF16-NEXT: vcvtph2ps %xmm5, %xmm8 +; AVX512BF16-NEXT: vucomiss %xmm8, %xmm8 +; AVX512BF16-NEXT: setp %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0] +; AVX512BF16-NEXT: vcvtph2ps %xmm5, %xmm5 +; AVX512BF16-NEXT: vmovss %xmm8, %xmm5, %xmm5 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm5, %xmm5 +; AVX512BF16-NEXT: vcvtph2ps %xmm5, %xmm9 +; AVX512BF16-NEXT: vucomiss %xmm9, %xmm8 +; AVX512BF16-NEXT: setb %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vmovss %xmm9, %xmm8, %xmm8 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm8, %xmm8 +; AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512BF16-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] +; AVX512BF16-NEXT: vpsrlq $48, %xmm1, %xmm7 +; AVX512BF16-NEXT: vcvtph2ps %xmm7, %xmm8 +; AVX512BF16-NEXT: vucomiss %xmm8, %xmm8 +; AVX512BF16-NEXT: setp %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vpsrlq $48, %xmm0, %xmm7 +; AVX512BF16-NEXT: vcvtph2ps %xmm7, %xmm7 +; AVX512BF16-NEXT: vmovss %xmm8, %xmm7, %xmm7 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm7, %xmm7 +; AVX512BF16-NEXT: vcvtph2ps %xmm7, %xmm9 +; AVX512BF16-NEXT: vucomiss %xmm9, %xmm8 +; AVX512BF16-NEXT: setb %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vmovss %xmm9, %xmm8, %xmm8 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm8, %xmm8 +; AVX512BF16-NEXT: vmovshdup {{.*#+}} xmm9 = xmm1[1,1,3,3] +; AVX512BF16-NEXT: vcvtph2ps %xmm9, %xmm9 +; AVX512BF16-NEXT: vucomiss %xmm9, %xmm9 +; AVX512BF16-NEXT: setp %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vmovshdup {{.*#+}} xmm10 = xmm0[1,1,3,3] +; AVX512BF16-NEXT: vcvtph2ps %xmm10, %xmm10 +; AVX512BF16-NEXT: vmovss %xmm9, %xmm10, %xmm10 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm10, %xmm10 +; AVX512BF16-NEXT: vcvtph2ps %xmm10, %xmm11 +; AVX512BF16-NEXT: vucomiss %xmm11, %xmm9 +; AVX512BF16-NEXT: setb %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vmovss %xmm11, %xmm9, %xmm9 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm9, %xmm9 +; AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; AVX512BF16-NEXT: vcvtph2ps %xmm1, %xmm9 +; 
AVX512BF16-NEXT: vucomiss %xmm9, %xmm9 +; AVX512BF16-NEXT: setp %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vcvtph2ps %xmm0, %xmm11 +; AVX512BF16-NEXT: vmovss %xmm9, %xmm11, %xmm11 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm11, %xmm11 +; AVX512BF16-NEXT: vcvtph2ps %xmm11, %xmm12 +; AVX512BF16-NEXT: vucomiss %xmm12, %xmm9 +; AVX512BF16-NEXT: setb %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vmovss %xmm12, %xmm9, %xmm9 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm9, %xmm9 +; AVX512BF16-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512BF16-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX512BF16-NEXT: vucomiss %xmm1, %xmm1 +; AVX512BF16-NEXT: setp %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512BF16-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX512BF16-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm0, %xmm12 +; AVX512BF16-NEXT: vcvtph2ps %xmm12, %xmm0 +; AVX512BF16-NEXT: vucomiss %xmm0, %xmm1 +; AVX512BF16-NEXT: setb %al +; AVX512BF16-NEXT: kmovd %eax, %k1 +; AVX512BF16-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} +; AVX512BF16-NEXT: vcvtps2ph $4, %xmm1, %xmm0 +; AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; AVX512BF16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; AVX512BF16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] +; AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512BF16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] +; AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] +; AVX512BF16-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX512BF16-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512BF16-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BF16-NEXT: vpcmpnltw %xmm2, %xmm1, %k1 +; AVX512BF16-NEXT: vpblendmw %xmm1, %xmm0, %xmm1 {%k1} +; AVX512BF16-NEXT: vcvtph2ps %xmm0, %ymm2 ; AVX512BF16-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512BF16-NEXT: vcmpeqps %ymm3, %ymm1, %k1 -; AVX512BF16-NEXT: vmovdqu16 %xmm0, %xmm2 {%k1} -; AVX512BF16-NEXT: vmovdqa %xmm2, %xmm0 +; AVX512BF16-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; AVX512BF16-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} ; AVX512BF16-NEXT: vzeroupper ; AVX512BF16-NEXT: retq ; @@ -2813,24 +2976,158 @@ define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; ; AVX512BF16-LABEL: test_fmaximum_v4bf16: ; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rbp +; AVX512BF16-NEXT: .cfi_def_cfa_offset 16 +; AVX512BF16-NEXT: pushq %r15 +; AVX512BF16-NEXT: .cfi_def_cfa_offset 24 +; AVX512BF16-NEXT: pushq %r14 +; AVX512BF16-NEXT: .cfi_def_cfa_offset 32 +; AVX512BF16-NEXT: pushq %r13 +; AVX512BF16-NEXT: .cfi_def_cfa_offset 40 +; AVX512BF16-NEXT: pushq %r12 +; AVX512BF16-NEXT: .cfi_def_cfa_offset 48 +; AVX512BF16-NEXT: pushq %rbx +; AVX512BF16-NEXT: .cfi_def_cfa_offset 56 +; AVX512BF16-NEXT: .cfi_offset %rbx, -56 +; AVX512BF16-NEXT: .cfi_offset %r12, -48 +; AVX512BF16-NEXT: .cfi_offset %r13, -40 +; AVX512BF16-NEXT: .cfi_offset %r14, -32 +; AVX512BF16-NEXT: .cfi_offset %r15, -24 +; AVX512BF16-NEXT: .cfi_offset %rbp, -16 +; AVX512BF16-NEXT: vpextrw $7, %xmm0, %r10d +; AVX512BF16-NEXT: vpextrw $7, %xmm1, %ecx 
+; AVX512BF16-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BF16-NEXT: movl %ecx, %eax +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm2 +; AVX512BF16-NEXT: vucomiss %xmm2, %xmm2 +; AVX512BF16-NEXT: cmovpl %ecx, %r10d +; AVX512BF16-NEXT: vpextrw $6, %xmm0, %ecx +; AVX512BF16-NEXT: vpextrw $6, %xmm1, %edx +; AVX512BF16-NEXT: movl %edx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVX512BF16-NEXT: movl %edx, %eax +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm3 +; AVX512BF16-NEXT: vucomiss %xmm3, %xmm3 +; AVX512BF16-NEXT: cmovpl %edx, %ecx +; AVX512BF16-NEXT: vpextrw $5, %xmm0, %edx +; AVX512BF16-NEXT: vpextrw $5, %xmm1, %ebx +; AVX512BF16-NEXT: movl %ebx, %eax +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm4 +; AVX512BF16-NEXT: vucomiss %xmm4, %xmm4 +; AVX512BF16-NEXT: cmovpl %ebx, %edx +; AVX512BF16-NEXT: vpextrw $4, %xmm0, %esi +; AVX512BF16-NEXT: vpextrw $4, %xmm1, %ebp +; AVX512BF16-NEXT: movl %ebp, %eax +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm5 +; AVX512BF16-NEXT: vucomiss %xmm5, %xmm5 +; AVX512BF16-NEXT: cmovpl %ebp, %esi +; AVX512BF16-NEXT: vpextrw $3, %xmm0, %edi +; AVX512BF16-NEXT: vpextrw $3, %xmm1, %r14d +; AVX512BF16-NEXT: movl %r14d, %eax +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm6 +; AVX512BF16-NEXT: vucomiss %xmm6, %xmm6 +; AVX512BF16-NEXT: cmovpl %r14d, %edi +; AVX512BF16-NEXT: vpextrw $2, %xmm0, %r8d +; AVX512BF16-NEXT: vpextrw $2, %xmm1, %r15d +; AVX512BF16-NEXT: movl %r15d, %eax +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm7 +; AVX512BF16-NEXT: vucomiss %xmm7, %xmm7 +; AVX512BF16-NEXT: cmovpl %r15d, %r8d +; AVX512BF16-NEXT: vpextrw $1, %xmm0, %r11d +; AVX512BF16-NEXT: vpextrw $1, %xmm1, %r12d +; AVX512BF16-NEXT: movl %r12d, %eax +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm8 +; AVX512BF16-NEXT: vucomiss %xmm8, %xmm8 +; AVX512BF16-NEXT: cmovpl %r12d, %r11d +; AVX512BF16-NEXT: vmovd %xmm1, %r13d +; AVX512BF16-NEXT: movl %r13d, %eax +; AVX512BF16-NEXT: shll $16, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm1 +; AVX512BF16-NEXT: vucomiss %xmm1, %xmm1 +; AVX512BF16-NEXT: vmovd %xmm0, %eax +; AVX512BF16-NEXT: cmovpl %r13d, %eax +; AVX512BF16-NEXT: movl %r10d, %r9d +; AVX512BF16-NEXT: shll $16, %r9d +; AVX512BF16-NEXT: vmovd %r9d, %xmm0 +; AVX512BF16-NEXT: vmovd %eax, %xmm9 +; AVX512BF16-NEXT: vpinsrw $1, %r11d, %xmm9, %xmm9 +; AVX512BF16-NEXT: vpinsrw $2, %r8d, %xmm9, %xmm9 +; AVX512BF16-NEXT: vpinsrw $3, %edi, %xmm9, %xmm9 +; AVX512BF16-NEXT: vpinsrw $4, %esi, %xmm9, %xmm9 +; AVX512BF16-NEXT: vpinsrw $5, %edx, %xmm9, %xmm9 +; AVX512BF16-NEXT: vpinsrw $6, %ecx, %xmm9, %xmm9 +; AVX512BF16-NEXT: vpinsrw $7, %r10d, %xmm9, %xmm9 +; AVX512BF16-NEXT: vucomiss %xmm0, %xmm2 +; AVX512BF16-NEXT: cmovael {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 4-byte Folded Reload +; AVX512BF16-NEXT: movl %ecx, %r9d +; AVX512BF16-NEXT: shll $16, %r9d +; AVX512BF16-NEXT: vmovd %r9d, %xmm0 +; AVX512BF16-NEXT: vucomiss %xmm0, %xmm3 +; AVX512BF16-NEXT: cmovael {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 4-byte Folded Reload +; AVX512BF16-NEXT: movl %edx, %r9d +; AVX512BF16-NEXT: shll $16, %r9d +; AVX512BF16-NEXT: vmovd %r9d, %xmm0 +; AVX512BF16-NEXT: vucomiss %xmm0, %xmm4 +; AVX512BF16-NEXT: cmovael %ebx, %edx +; AVX512BF16-NEXT: movl %esi, %r9d +; AVX512BF16-NEXT: shll $16, %r9d +; AVX512BF16-NEXT: vmovd %r9d, %xmm0 +; AVX512BF16-NEXT: vucomiss %xmm0, %xmm5 +; AVX512BF16-NEXT: cmovael %ebp, %esi +; AVX512BF16-NEXT: movl %edi, %r9d 
+; AVX512BF16-NEXT: shll $16, %r9d +; AVX512BF16-NEXT: vmovd %r9d, %xmm0 +; AVX512BF16-NEXT: vucomiss %xmm0, %xmm6 +; AVX512BF16-NEXT: cmovael %r14d, %edi +; AVX512BF16-NEXT: movl %r8d, %r9d +; AVX512BF16-NEXT: shll $16, %r9d +; AVX512BF16-NEXT: vmovd %r9d, %xmm0 +; AVX512BF16-NEXT: vucomiss %xmm0, %xmm7 +; AVX512BF16-NEXT: cmovael %r15d, %r8d +; AVX512BF16-NEXT: movl %r11d, %r9d +; AVX512BF16-NEXT: shll $16, %r9d +; AVX512BF16-NEXT: vmovd %r9d, %xmm0 +; AVX512BF16-NEXT: vucomiss %xmm0, %xmm8 +; AVX512BF16-NEXT: cmovael %r12d, %r11d +; AVX512BF16-NEXT: movl %eax, %r9d +; AVX512BF16-NEXT: shll $16, %r9d +; AVX512BF16-NEXT: vmovd %r9d, %xmm0 +; AVX512BF16-NEXT: vucomiss %xmm0, %xmm1 +; AVX512BF16-NEXT: cmovael %r13d, %eax +; AVX512BF16-NEXT: vmovd %eax, %xmm0 +; AVX512BF16-NEXT: vpinsrw $1, %r11d, %xmm0, %xmm0 +; AVX512BF16-NEXT: vpinsrw $2, %r8d, %xmm0, %xmm0 +; AVX512BF16-NEXT: vpinsrw $3, %edi, %xmm0, %xmm0 +; AVX512BF16-NEXT: vpinsrw $4, %esi, %xmm0, %xmm0 +; AVX512BF16-NEXT: vpinsrw $5, %edx, %xmm0, %xmm0 +; AVX512BF16-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0 +; AVX512BF16-NEXT: vpinsrw $7, %r10d, %xmm0, %xmm0 +; AVX512BF16-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BF16-NEXT: vpcmpnltw %xmm1, %xmm9, %k1 +; AVX512BF16-NEXT: vpblendmw %xmm9, %xmm0, %xmm1 {%k1} ; AVX512BF16-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BF16-NEXT: vpslld $16, %ymm2, %ymm3 -; AVX512BF16-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512BF16-NEXT: vpslld $16, %ymm2, %ymm4 -; AVX512BF16-NEXT: vcmpltps %ymm3, %ymm4, %k1 -; AVX512BF16-NEXT: vpblendmw %xmm0, %xmm1, %xmm2 {%k1} -; AVX512BF16-NEXT: vcmpunordps %ymm4, %ymm3, %k1 -; AVX512BF16-NEXT: vmovdqu16 {{.*#+}} xmm2 {%k1} = [32704,32704,32704,32704,32704,32704,32704,32704] -; AVX512BF16-NEXT: vptestnmw %xmm0, %xmm0, %k1 -; AVX512BF16-NEXT: vpblendmw %xmm0, %xmm2, %xmm0 {%k1} -; AVX512BF16-NEXT: vptestnmw %xmm1, %xmm1, %k1 -; AVX512BF16-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} -; AVX512BF16-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512BF16-NEXT: vpslld $16, %ymm1, %ymm1 +; AVX512BF16-NEXT: vpslld $16, %ymm2, %ymm2 ; AVX512BF16-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX512BF16-NEXT: vcmpeqps %ymm3, %ymm1, %k1 -; AVX512BF16-NEXT: vmovdqu16 %xmm0, %xmm2 {%k1} -; AVX512BF16-NEXT: vmovdqa %xmm2, %xmm0 +; AVX512BF16-NEXT: vcmpeqps %ymm3, %ymm2, %k1 +; AVX512BF16-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} +; AVX512BF16-NEXT: popq %rbx +; AVX512BF16-NEXT: .cfi_def_cfa_offset 48 +; AVX512BF16-NEXT: popq %r12 +; AVX512BF16-NEXT: .cfi_def_cfa_offset 40 +; AVX512BF16-NEXT: popq %r13 +; AVX512BF16-NEXT: .cfi_def_cfa_offset 32 +; AVX512BF16-NEXT: popq %r14 +; AVX512BF16-NEXT: .cfi_def_cfa_offset 24 +; AVX512BF16-NEXT: popq %r15 +; AVX512BF16-NEXT: .cfi_def_cfa_offset 16 +; AVX512BF16-NEXT: popq %rbp +; AVX512BF16-NEXT: .cfi_def_cfa_offset 8 ; AVX512BF16-NEXT: vzeroupper ; AVX512BF16-NEXT: retq ;