Skip to content

Commit 53ddeb4

Browse files
whytolearnwhyuuwangRKSimon
authored
[X86] Add MMX/SSE/AVX PHADD/SUB & HADDPS/D intrinsics to be used in constexpr (#156822)
[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - add MMX/SSE/AVX PHADD/SUB & HADDPS/D intrinsics to be used in constexpr Fixes #155395 cover func: _mm_hadd_pi16 _mm_hadd_epi16 _mm256_hadd_epi16 _mm_hadd_pi32 _mm_hadd_epi32 _mm256_hadd_epi32 _mm_hadds_pi16 _mm_hadds_epi16 _mm256_hadds_epi16 _mm_hsub_pi16 _mm_hsub_epi16 _mm256_hsub_epi16 _mm_hsub_pi32 _mm_hsub_epi32 _mm256_hsub_epi32 _mm_hsubs_pi16 _mm_hsubs_epi16 _mm256_hsubs_epi16 _mm_hadd_pd _mm256_hadd_pd _mm_hadd_ps _mm256_hadd_ps _mm_hsub_pd _mm256_hsub_pd _mm_hsub_ps _mm256_hsub_ps --------- Co-authored-by: whyuuwang <[email protected]> Co-authored-by: Simon Pilgrim <[email protected]> Co-authored-by: Simon Pilgrim <[email protected]>
1 parent 69e0fd6 commit 53ddeb4

File tree

12 files changed

+448
-122
lines changed

12 files changed

+448
-122
lines changed

clang/include/clang/Basic/BuiltinsX86.td

Lines changed: 29 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -110,19 +110,20 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
110110
}
111111

112112
let Features = "sse3" in {
113-
foreach Op = ["addsub", "hadd", "hsub"] in {
113+
foreach Op = ["addsub"] in {
114114
def Op#ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>)">;
115115
def Op#pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>)">;
116116
}
117117
}
118118

119-
let Features = "ssse3" in {
120-
foreach Op = ["phadd", "phsub"] in {
121-
def Op#w128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
122-
def Op#sw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
123-
def Op#d128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
119+
let Features = "sse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
120+
foreach Op = ["hadd", "hsub"] in {
121+
def Op#ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>)">;
122+
def Op#pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>)">;
124123
}
124+
}
125125

126+
let Features = "ssse3" in {
126127
def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
127128
def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
128129
def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
@@ -137,7 +138,7 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
137138

138139
// AVX
139140
let Attributes = [Const, NoThrow, RequiredVectorWidth<256>], Features = "avx" in {
140-
foreach Op = ["addsub", "hadd", "hsub", "max", "min"] in {
141+
foreach Op = ["addsub", "max", "min"] in {
141142
def Op#pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>)">;
142143
def Op#ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>)">;
143144
}
@@ -316,6 +317,14 @@ let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
316317
def palignr128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant int)">;
317318
}
318319

320+
let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
321+
foreach Op = ["phadd", "phsub"] in {
322+
def Op#w128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
323+
def Op#sw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
324+
def Op#d128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
325+
}
326+
}
327+
319328
let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
320329
def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
321330
def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
@@ -515,6 +524,11 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid
515524
def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
516525
def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
517526
def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
527+
528+
foreach Op = ["hadd", "hsub"] in {
529+
def Op#pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>)">;
530+
def Op#ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>)">;
531+
}
518532
}
519533

520534
let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
@@ -592,12 +606,7 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid
592606
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
593607
def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
594608
def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
595-
def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
596-
def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
597-
def phaddsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
598-
def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
599-
def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
600-
def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
609+
601610
def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
602611
def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
603612
def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
@@ -666,6 +675,13 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
666675
def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
667676
def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
668677

678+
def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
679+
def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
680+
def phaddsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
681+
def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
682+
def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
683+
def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
684+
669685
def pshuflw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">;
670686
def pshufhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">;
671687
def pshufd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">;

clang/lib/AST/ByteCode/InterpBuiltin.cpp

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2587,6 +2587,82 @@ static bool interp__builtin_ia32_pmul(
25872587
return true;
25882588
}
25892589

2590+
static bool interp_builtin_horizontal_int_binop(
2591+
InterpState &S, CodePtr OpPC, const CallExpr *Call,
2592+
llvm::function_ref<APInt(const APSInt &, const APSInt &)> Fn) {
2593+
const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
2594+
PrimType ElemT = *S.getContext().classify(VT->getElementType());
2595+
bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();
2596+
2597+
const Pointer &RHS = S.Stk.pop<Pointer>();
2598+
const Pointer &LHS = S.Stk.pop<Pointer>();
2599+
const Pointer &Dst = S.Stk.peek<Pointer>();
2600+
unsigned NumElts = VT->getNumElements();
2601+
unsigned EltBits = S.getASTContext().getIntWidth(VT->getElementType());
2602+
unsigned EltsPerLane = 128 / EltBits;
2603+
unsigned Lanes = NumElts * EltBits / 128;
2604+
unsigned DestIndex = 0;
2605+
2606+
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2607+
unsigned LaneStart = Lane * EltsPerLane;
2608+
for (unsigned I = 0; I < EltsPerLane; I += 2) {
2609+
INT_TYPE_SWITCH_NO_BOOL(ElemT, {
2610+
APSInt Elem1 = LHS.elem<T>(LaneStart + I).toAPSInt();
2611+
APSInt Elem2 = LHS.elem<T>(LaneStart + I + 1).toAPSInt();
2612+
APSInt ResL = APSInt(Fn(Elem1, Elem2), DestUnsigned);
2613+
Dst.elem<T>(DestIndex++) = static_cast<T>(ResL);
2614+
});
2615+
}
2616+
2617+
for (unsigned I = 0; I < EltsPerLane; I += 2) {
2618+
INT_TYPE_SWITCH_NO_BOOL(ElemT, {
2619+
APSInt Elem1 = RHS.elem<T>(LaneStart + I).toAPSInt();
2620+
APSInt Elem2 = RHS.elem<T>(LaneStart + I + 1).toAPSInt();
2621+
APSInt ResR = APSInt(Fn(Elem1, Elem2), DestUnsigned);
2622+
Dst.elem<T>(DestIndex++) = static_cast<T>(ResR);
2623+
});
2624+
}
2625+
}
2626+
Dst.initializeAllElements();
2627+
return true;
2628+
}
2629+
2630+
static bool interp_builtin_horizontal_fp_binop(
2631+
InterpState &S, CodePtr OpPC, const CallExpr *Call,
2632+
llvm::function_ref<APFloat(const APFloat &, const APFloat &,
2633+
llvm::RoundingMode)>
2634+
Fn) {
2635+
const Pointer &RHS = S.Stk.pop<Pointer>();
2636+
const Pointer &LHS = S.Stk.pop<Pointer>();
2637+
const Pointer &Dst = S.Stk.peek<Pointer>();
2638+
FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts());
2639+
llvm::RoundingMode RM = getRoundingMode(FPO);
2640+
const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
2641+
2642+
unsigned NumElts = VT->getNumElements();
2643+
unsigned EltBits = S.getASTContext().getTypeSize(VT->getElementType());
2644+
unsigned NumLanes = NumElts * EltBits / 128;
2645+
unsigned NumElemsPerLane = NumElts / NumLanes;
2646+
unsigned HalfElemsPerLane = NumElemsPerLane / 2;
2647+
2648+
for (unsigned L = 0; L != NumElts; L += NumElemsPerLane) {
2649+
using T = PrimConv<PT_Float>::T;
2650+
for (unsigned E = 0; E != HalfElemsPerLane; ++E) {
2651+
APFloat Elem1 = LHS.elem<T>(L + (2 * E) + 0).getAPFloat();
2652+
APFloat Elem2 = LHS.elem<T>(L + (2 * E) + 1).getAPFloat();
2653+
Dst.elem<T>(L + E) = static_cast<T>(Fn(Elem1, Elem2, RM));
2654+
}
2655+
for (unsigned E = 0; E != HalfElemsPerLane; ++E) {
2656+
APFloat Elem1 = RHS.elem<T>(L + (2 * E) + 0).getAPFloat();
2657+
APFloat Elem2 = RHS.elem<T>(L + (2 * E) + 1).getAPFloat();
2658+
Dst.elem<T>(L + E + HalfElemsPerLane) =
2659+
static_cast<T>(Fn(Elem1, Elem2, RM));
2660+
}
2661+
}
2662+
Dst.initializeAllElements();
2663+
return true;
2664+
}
2665+
25902666
static bool interp__builtin_elementwise_triop_fp(
25912667
InterpState &S, CodePtr OpPC, const CallExpr *Call,
25922668
llvm::function_ref<APFloat(const APFloat &, const APFloat &,
@@ -3665,6 +3741,53 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
36653741
case Builtin::BI__builtin_elementwise_min:
36663742
return interp__builtin_elementwise_maxmin(S, OpPC, Call, BuiltinID);
36673743

3744+
case clang::X86::BI__builtin_ia32_phaddw128:
3745+
case clang::X86::BI__builtin_ia32_phaddw256:
3746+
case clang::X86::BI__builtin_ia32_phaddd128:
3747+
case clang::X86::BI__builtin_ia32_phaddd256:
3748+
return interp_builtin_horizontal_int_binop(
3749+
S, OpPC, Call,
3750+
[](const APSInt &LHS, const APSInt &RHS) { return LHS + RHS; });
3751+
case clang::X86::BI__builtin_ia32_phaddsw128:
3752+
case clang::X86::BI__builtin_ia32_phaddsw256:
3753+
return interp_builtin_horizontal_int_binop(
3754+
S, OpPC, Call,
3755+
[](const APSInt &LHS, const APSInt &RHS) { return LHS.sadd_sat(RHS); });
3756+
case clang::X86::BI__builtin_ia32_phsubw128:
3757+
case clang::X86::BI__builtin_ia32_phsubw256:
3758+
case clang::X86::BI__builtin_ia32_phsubd128:
3759+
case clang::X86::BI__builtin_ia32_phsubd256:
3760+
return interp_builtin_horizontal_int_binop(
3761+
S, OpPC, Call,
3762+
[](const APSInt &LHS, const APSInt &RHS) { return LHS - RHS; });
3763+
case clang::X86::BI__builtin_ia32_phsubsw128:
3764+
case clang::X86::BI__builtin_ia32_phsubsw256:
3765+
return interp_builtin_horizontal_int_binop(
3766+
S, OpPC, Call,
3767+
[](const APSInt &LHS, const APSInt &RHS) { return LHS.ssub_sat(RHS); });
3768+
case clang::X86::BI__builtin_ia32_haddpd:
3769+
case clang::X86::BI__builtin_ia32_haddps:
3770+
case clang::X86::BI__builtin_ia32_haddpd256:
3771+
case clang::X86::BI__builtin_ia32_haddps256:
3772+
return interp_builtin_horizontal_fp_binop(
3773+
S, OpPC, Call,
3774+
[](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
3775+
APFloat F = LHS;
3776+
F.add(RHS, RM);
3777+
return F;
3778+
});
3779+
case clang::X86::BI__builtin_ia32_hsubpd:
3780+
case clang::X86::BI__builtin_ia32_hsubps:
3781+
case clang::X86::BI__builtin_ia32_hsubpd256:
3782+
case clang::X86::BI__builtin_ia32_hsubps256:
3783+
return interp_builtin_horizontal_fp_binop(
3784+
S, OpPC, Call,
3785+
[](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
3786+
APFloat F = LHS;
3787+
F.subtract(RHS, RM);
3788+
return F;
3789+
});
3790+
36683791
case clang::X86::BI__builtin_ia32_pmuldq128:
36693792
case clang::X86::BI__builtin_ia32_pmuldq256:
36703793
case clang::X86::BI__builtin_ia32_pmuldq512:

clang/lib/AST/ExprConstant.cpp

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12381,6 +12381,169 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
1238112381
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
1238212382
}
1238312383

12384+
case clang::X86::BI__builtin_ia32_phaddw128:
12385+
case clang::X86::BI__builtin_ia32_phaddw256:
12386+
case clang::X86::BI__builtin_ia32_phaddd128:
12387+
case clang::X86::BI__builtin_ia32_phaddd256:
12388+
case clang::X86::BI__builtin_ia32_phaddsw128:
12389+
case clang::X86::BI__builtin_ia32_phaddsw256:
12390+
12391+
case clang::X86::BI__builtin_ia32_phsubw128:
12392+
case clang::X86::BI__builtin_ia32_phsubw256:
12393+
case clang::X86::BI__builtin_ia32_phsubd128:
12394+
case clang::X86::BI__builtin_ia32_phsubd256:
12395+
case clang::X86::BI__builtin_ia32_phsubsw128:
12396+
case clang::X86::BI__builtin_ia32_phsubsw256: {
12397+
APValue SourceLHS, SourceRHS;
12398+
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
12399+
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
12400+
return false;
12401+
QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
12402+
bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
12403+
12404+
unsigned NumElts = SourceLHS.getVectorLength();
12405+
unsigned EltBits = Info.Ctx.getIntWidth(DestEltTy);
12406+
unsigned EltsPerLane = 128 / EltBits;
12407+
SmallVector<APValue, 4> ResultElements;
12408+
ResultElements.reserve(NumElts);
12409+
12410+
for (unsigned LaneStart = 0; LaneStart != NumElts;
12411+
LaneStart += EltsPerLane) {
12412+
for (unsigned I = 0; I != EltsPerLane; I += 2) {
12413+
APSInt LHSA = SourceLHS.getVectorElt(LaneStart + I).getInt();
12414+
APSInt LHSB = SourceLHS.getVectorElt(LaneStart + I + 1).getInt();
12415+
switch (E->getBuiltinCallee()) {
12416+
case clang::X86::BI__builtin_ia32_phaddw128:
12417+
case clang::X86::BI__builtin_ia32_phaddw256:
12418+
case clang::X86::BI__builtin_ia32_phaddd128:
12419+
case clang::X86::BI__builtin_ia32_phaddd256: {
12420+
APSInt Res(LHSA + LHSB, DestUnsigned);
12421+
ResultElements.push_back(APValue(Res));
12422+
break;
12423+
}
12424+
case clang::X86::BI__builtin_ia32_phaddsw128:
12425+
case clang::X86::BI__builtin_ia32_phaddsw256: {
12426+
APSInt Res(LHSA.sadd_sat(LHSB));
12427+
ResultElements.push_back(APValue(Res));
12428+
break;
12429+
}
12430+
case clang::X86::BI__builtin_ia32_phsubw128:
12431+
case clang::X86::BI__builtin_ia32_phsubw256:
12432+
case clang::X86::BI__builtin_ia32_phsubd128:
12433+
case clang::X86::BI__builtin_ia32_phsubd256: {
12434+
APSInt Res(LHSA - LHSB, DestUnsigned);
12435+
ResultElements.push_back(APValue(Res));
12436+
break;
12437+
}
12438+
case clang::X86::BI__builtin_ia32_phsubsw128:
12439+
case clang::X86::BI__builtin_ia32_phsubsw256: {
12440+
APSInt Res(LHSA.ssub_sat(LHSB));
12441+
ResultElements.push_back(APValue(Res));
12442+
break;
12443+
}
12444+
}
12445+
}
12446+
for (unsigned I = 0; I != EltsPerLane; I += 2) {
12447+
APSInt RHSA = SourceRHS.getVectorElt(LaneStart + I).getInt();
12448+
APSInt RHSB = SourceRHS.getVectorElt(LaneStart + I + 1).getInt();
12449+
switch (E->getBuiltinCallee()) {
12450+
case clang::X86::BI__builtin_ia32_phaddw128:
12451+
case clang::X86::BI__builtin_ia32_phaddw256:
12452+
case clang::X86::BI__builtin_ia32_phaddd128:
12453+
case clang::X86::BI__builtin_ia32_phaddd256: {
12454+
APSInt Res(RHSA + RHSB, DestUnsigned);
12455+
ResultElements.push_back(APValue(Res));
12456+
break;
12457+
}
12458+
case clang::X86::BI__builtin_ia32_phaddsw128:
12459+
case clang::X86::BI__builtin_ia32_phaddsw256: {
12460+
APSInt Res(RHSA.sadd_sat(RHSB));
12461+
ResultElements.push_back(APValue(Res));
12462+
break;
12463+
}
12464+
case clang::X86::BI__builtin_ia32_phsubw128:
12465+
case clang::X86::BI__builtin_ia32_phsubw256:
12466+
case clang::X86::BI__builtin_ia32_phsubd128:
12467+
case clang::X86::BI__builtin_ia32_phsubd256: {
12468+
APSInt Res(RHSA - RHSB, DestUnsigned);
12469+
ResultElements.push_back(APValue(Res));
12470+
break;
12471+
}
12472+
case clang::X86::BI__builtin_ia32_phsubsw128:
12473+
case clang::X86::BI__builtin_ia32_phsubsw256: {
12474+
APSInt Res(RHSA.ssub_sat(RHSB));
12475+
ResultElements.push_back(APValue(Res));
12476+
break;
12477+
}
12478+
}
12479+
}
12480+
}
12481+
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
12482+
}
12483+
case clang::X86::BI__builtin_ia32_haddpd:
12484+
case clang::X86::BI__builtin_ia32_haddps:
12485+
case clang::X86::BI__builtin_ia32_haddps256:
12486+
case clang::X86::BI__builtin_ia32_haddpd256:
12487+
case clang::X86::BI__builtin_ia32_hsubpd:
12488+
case clang::X86::BI__builtin_ia32_hsubps:
12489+
case clang::X86::BI__builtin_ia32_hsubps256:
12490+
case clang::X86::BI__builtin_ia32_hsubpd256: {
12491+
APValue SourceLHS, SourceRHS;
12492+
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
12493+
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
12494+
return false;
12495+
unsigned NumElts = SourceLHS.getVectorLength();
12496+
SmallVector<APValue, 4> ResultElements;
12497+
ResultElements.reserve(NumElts);
12498+
llvm::RoundingMode RM = getActiveRoundingMode(getEvalInfo(), E);
12499+
QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
12500+
unsigned EltBits = Info.Ctx.getTypeSize(DestEltTy);
12501+
unsigned NumLanes = NumElts * EltBits / 128;
12502+
unsigned NumElemsPerLane = NumElts / NumLanes;
12503+
unsigned HalfElemsPerLane = NumElemsPerLane / 2;
12504+
12505+
for (unsigned L = 0; L != NumElts; L += NumElemsPerLane) {
12506+
for (unsigned I = 0; I != HalfElemsPerLane; ++I) {
12507+
APFloat LHSA = SourceLHS.getVectorElt(L + (2 * I) + 0).getFloat();
12508+
APFloat LHSB = SourceLHS.getVectorElt(L + (2 * I) + 1).getFloat();
12509+
switch (E->getBuiltinCallee()) {
12510+
case clang::X86::BI__builtin_ia32_haddpd:
12511+
case clang::X86::BI__builtin_ia32_haddps:
12512+
case clang::X86::BI__builtin_ia32_haddps256:
12513+
case clang::X86::BI__builtin_ia32_haddpd256:
12514+
LHSA.add(LHSB, RM);
12515+
break;
12516+
case clang::X86::BI__builtin_ia32_hsubpd:
12517+
case clang::X86::BI__builtin_ia32_hsubps:
12518+
case clang::X86::BI__builtin_ia32_hsubps256:
12519+
case clang::X86::BI__builtin_ia32_hsubpd256:
12520+
LHSA.subtract(LHSB, RM);
12521+
break;
12522+
}
12523+
ResultElements.push_back(APValue(LHSA));
12524+
}
12525+
for (unsigned I = 0; I != HalfElemsPerLane; ++I) {
12526+
APFloat RHSA = SourceRHS.getVectorElt(L + (2 * I) + 0).getFloat();
12527+
APFloat RHSB = SourceRHS.getVectorElt(L + (2 * I) + 1).getFloat();
12528+
switch (E->getBuiltinCallee()) {
12529+
case clang::X86::BI__builtin_ia32_haddpd:
12530+
case clang::X86::BI__builtin_ia32_haddps:
12531+
case clang::X86::BI__builtin_ia32_haddps256:
12532+
case clang::X86::BI__builtin_ia32_haddpd256:
12533+
RHSA.add(RHSB, RM);
12534+
break;
12535+
case clang::X86::BI__builtin_ia32_hsubpd:
12536+
case clang::X86::BI__builtin_ia32_hsubps:
12537+
case clang::X86::BI__builtin_ia32_hsubps256:
12538+
case clang::X86::BI__builtin_ia32_hsubpd256:
12539+
RHSA.subtract(RHSB, RM);
12540+
break;
12541+
}
12542+
ResultElements.push_back(APValue(RHSA));
12543+
}
12544+
}
12545+
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
12546+
}
1238412547
case Builtin::BI__builtin_elementwise_fshl:
1238512548
case Builtin::BI__builtin_elementwise_fshr: {
1238612549
APValue SourceHi, SourceLo, SourceShift;

0 commit comments

Comments
 (0)