Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
a81c406
deal this issues 155395
Sep 4, 2025
2fadf3f
deal issues 15595
Sep 4, 2025
f8362b4
Merge branch 'llvm:main' into main
whytolearn Sep 4, 2025
ed4a09f
constexpr deal
Sep 11, 2025
df6242e
adjust unit test #146940
Sep 13, 2025
9f2fb43
Merge remote-tracking branch 'upstream/main'
Sep 13, 2025
929d7c0
Merge branch 'main' into main
whytolearn Sep 13, 2025
f91aa21
adjust test case and function
Sep 26, 2025
4f5fb87
undo the unintentional formatting of the code
Sep 26, 2025
2422cd4
Merge branch 'main' into main
whytolearn Sep 26, 2025
a3575c5
Merge branch 'main' into main
whytolearn Sep 26, 2025
b2cac3e
adjust code
Sep 26, 2025
197123a
adjust code for mm256
Sep 28, 2025
b733157
format code
Sep 28, 2025
1ce4883
Merge branch 'main' into main
whytolearn Sep 29, 2025
9a7c138
deal all 256 double pane ins
Oct 2, 2025
a65f4fc
deal all 256 double pane ins
Oct 2, 2025
9877317
adjust for 128 and 256 oprand
Oct 7, 2025
404d261
Merge branch 'main' into main
whytolearn Oct 7, 2025
1d61bf2
undo some bad format for .td file
Oct 7, 2025
b25aa5e
Merge branch 'main' into main
whytolearn Oct 9, 2025
4bc2341
merge disperse operation
Oct 10, 2025
242165a
Merge remote-tracking branch 'upstream/main'
Oct 10, 2025
d2e5d43
Merge remote-tracking branch 'upstream/main'
Oct 10, 2025
6d57df0
Merge branch 'main' into main
whytolearn Oct 11, 2025
03e4db0
Merge branch 'main' into main
RKSimon Oct 13, 2025
c2117f6
Update clang/lib/AST/ByteCode/InterpBuiltin.cpp
whytolearn Oct 13, 2025
5c7412f
Update clang/lib/AST/ByteCode/InterpBuiltin.cpp
whytolearn Oct 13, 2025
90200be
Merge branch 'main' into main
whytolearn Oct 13, 2025
5df6aff
Update clang/lib/AST/ExprConstant.cpp
whytolearn Oct 13, 2025
202c165
bad merger delate and code format
Oct 13, 2025
34ee8ed
Merge branch 'main' into main
whytolearn Oct 13, 2025
9ec2672
Merge branch 'main' into main
whytolearn Oct 13, 2025
7e15580
Merge branch 'main' into main
RKSimon Oct 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 129 additions & 0 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2742,6 +2742,86 @@ static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC,
return true;
}

static bool interp_builtin_horizontal_int_binop(
InterpState &S, CodePtr OpPC, const CallExpr *Call,
llvm::function_ref<APInt(const APSInt &, const APSInt &)> Fn) {
assert(Call->getNumArgs() == 2);

assert(Call->getArg(0)->getType()->isVectorType() &&
Call->getArg(1)->getType()->isVectorType());
const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
assert(VT->getElementType()->isIntegralOrEnumerationType());
PrimType ElemT = *S.getContext().classify(VT->getElementType());
bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType();

const Pointer &RHS = S.Stk.pop<Pointer>();
const Pointer &LHS = S.Stk.pop<Pointer>();
const Pointer &Dst = S.Stk.peek<Pointer>();

unsigned SourceLen = VT->getNumElements();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  unsigned NumElts= VT->getNumElements();
  unsigned EltBits = ASTCtx.getIntWidth(VT->getElementType());
  unsigned EltsPerLane = 128 / SrcBits;
  unsigned Lanes = NumElts * EltBits / 128;

assert(SourceLen % 2 == 0 &&
Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
SourceLen);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

don't waste your time with asserts like this - the defs in BuiltsX86.td mean that the types should be correct, else Sema will catch it.

unsigned DstElem = 0;

for (unsigned I = 0; I != SourceLen; I += 2) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for (unsigned Lane = 0; Lane != NumElts; Lane += EltsPerLane) {
  for (unsigned I = 0; I != EltsPerLane; I += 2) {
    INT_TYPE_SWITCH_NO_BOOL(ElemT, {
      APSInt Elem1 = LHS.elem<T>(Lane + I).toAPSInt();
      APSInt Elem2 = LHS.elem<T>(Lane + I + 1).toAPSInt();
      Dst.elem<T>(Lane + I) = static_cast<T>(APSInt(Fn(Elem1, Elem2), DestUnsigned));
    });
  }
  for (unsigned I = 0; I != SrcPerLane; I += 2) {
    INT_TYPE_SWITCH_NO_BOOL(ElemT, {
      APSInt Elem1 = RLHS.elem<T>(Lane + I).toAPSInt();
      APSInt Elem2 = RHS.elem<T>(Lane + I + 1).toAPSInt();
      Dst.elem<T>(Lane + EltsPerLane + I) = static_cast<T>(APSInt(Fn(Elem1, Elem2), DestUnsigned));
    });
  }
}

INT_TYPE_SWITCH_NO_BOOL(ElemT, {
APSInt Elem1 = LHS.elem<T>(I).toAPSInt();
APSInt Elem2 = LHS.elem<T>(I + 1).toAPSInt();
Dst.elem<T>(DstElem) =
static_cast<T>(APSInt(Fn(Elem1, Elem2), DestUnsigned));
});
++DstElem;
}
for (unsigned I = 0; I != SourceLen; I += 2) {
INT_TYPE_SWITCH_NO_BOOL(ElemT, {
APSInt Elem1 = RHS.elem<T>(I).toAPSInt();
APSInt Elem2 = RHS.elem<T>(I + 1).toAPSInt();
Dst.elem<T>(DstElem) =
static_cast<T>(APSInt(Fn(Elem1, Elem2), DestUnsigned));
});
++DstElem;
}
Dst.initializeAllElements();
return true;
}

static bool interp_builtin_horizontal_fp_binop(
InterpState &S, CodePtr OpPC, const CallExpr *Call,
llvm::function_ref<APFloat(const APFloat &, const APFloat &,
llvm::RoundingMode)>
Fn) {
assert(Call->getNumArgs() == 2);
assert(Call->getArg(0)->getType()->isVectorType() &&
Call->getArg(1)->getType()->isVectorType());
const Pointer &RHS = S.Stk.pop<Pointer>();
const Pointer &LHS = S.Stk.pop<Pointer>();
const Pointer &Dst = S.Stk.peek<Pointer>();

FPOptions FPO = Call->getFPFeaturesInEffect(S.Ctx.getLangOpts());
llvm::RoundingMode RM = getRoundingMode(FPO);
const auto *VT = Call->getArg(0)->getType()->castAs<VectorType>();
unsigned SourceLen = VT->getNumElements();
assert(SourceLen % 2 == 0 &&
Call->getArg(1)->getType()->castAs<VectorType>()->getNumElements() ==
SourceLen);
unsigned DstElem = 0;
for (unsigned I = 0; I != SourceLen; I += 2) {
using T = PrimConv<PT_Float>::T;
APFloat Elem1 = LHS.elem<T>(I).getAPFloat();
APFloat Elem2 = LHS.elem<T>(I + 1).getAPFloat();
Dst.elem<T>(DstElem++) = static_cast<T>(APFloat(Fn(Elem1, Elem2, RM)));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the extra APFloat() here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same with the integer cases above.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i have do some reference func "interp__builtin_elementwise_int_binop" to deal horizontal add/sub。 So APInt APFloat like same level class。 and when deal flaot, i must consider rounding off mode, So like func "interp__builtin_elementwise_triop_fp" and func "interp__builtin_elementwise_triop", i split it into 2 case

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fn already returns an APFloat, doesn't it? Why the extra constructor call here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thank you remind, i have adjust the code

}
for (unsigned I = 0; I != SourceLen; I += 2) {
using T = PrimConv<PT_Float>::T;
APFloat Elem1 = RHS.elem<T>(I).getAPFloat();
APFloat Elem2 = RHS.elem<T>(I + 1).getAPFloat();
Dst.elem<T>(DstElem++) = static_cast<T>(APFloat(Fn(Elem1, Elem2, RM)));
}
Dst.initializeAllElements();
return true;
}

static bool interp__builtin_elementwise_triop_fp(
InterpState &S, CodePtr OpPC, const CallExpr *Call,
llvm::function_ref<APFloat(const APFloat &, const APFloat &,
Expand Down Expand Up @@ -3453,6 +3533,55 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case Builtin::BI__builtin_elementwise_min:
return interp__builtin_elementwise_maxmin(S, OpPC, Call, BuiltinID);

case clang::X86::BI__builtin_ia32_phaddw128:
case clang::X86::BI__builtin_ia32_phaddw256:
case clang::X86::BI__builtin_ia32_phaddd128:
case clang::X86::BI__builtin_ia32_phaddd256:
return interp_builtin_horizontal_int_binop(
S, OpPC, Call,
[](const APSInt &LHS, const APSInt &RHS) { return LHS + RHS; });
case clang::X86::BI__builtin_ia32_phaddsw128:
case clang::X86::BI__builtin_ia32_phaddsw256:
return interp_builtin_horizontal_int_binop(
S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) {
return LHS.isSigned() ? LHS.sadd_sat(RHS) : LHS.uadd_sat(RHS);
});
case clang::X86::BI__builtin_ia32_phsubw128:
case clang::X86::BI__builtin_ia32_phsubw256:
case clang::X86::BI__builtin_ia32_phsubd128:
case clang::X86::BI__builtin_ia32_phsubd256:
return interp_builtin_horizontal_int_binop(
S, OpPC, Call,
[](const APSInt &LHS, const APSInt &RHS) { return LHS - RHS; });
case clang::X86::BI__builtin_ia32_phsubsw128:
case clang::X86::BI__builtin_ia32_phsubsw256:
return interp_builtin_horizontal_int_binop(
S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) {
return LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS);
});
case clang::X86::BI__builtin_ia32_haddpd:
case clang::X86::BI__builtin_ia32_haddpd256:
case clang::X86::BI__builtin_ia32_haddps:
case clang::X86::BI__builtin_ia32_haddps256:
return interp_builtin_horizontal_fp_binop(
S, OpPC, Call,
[](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
APFloat F = LHS;
F.add(RHS, RM);
return F;
});
case clang::X86::BI__builtin_ia32_hsubpd:
case clang::X86::BI__builtin_ia32_hsubpd256:
case clang::X86::BI__builtin_ia32_hsubps:
case clang::X86::BI__builtin_ia32_hsubps256:
return interp_builtin_horizontal_fp_binop(
S, OpPC, Call,
[](const APFloat &LHS, const APFloat &RHS, llvm::RoundingMode RM) {
APFloat F = LHS;
F.subtract(RHS, RM);
return F;
});

case clang::X86::BI__builtin_ia32_pmuldq128:
case clang::X86::BI__builtin_ia32_pmuldq256:
case clang::X86::BI__builtin_ia32_pmuldq512:
Expand Down
140 changes: 140 additions & 0 deletions clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
#include "clang/Basic/TargetBuiltins.h"
#include "clang/Basic/TargetInfo.h"
#include "llvm/ADT/APFixedPoint.h"
#include "llvm/ADT/APInt.h"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

unnecessary

#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/StringExtras.h"
Expand Down Expand Up @@ -12067,6 +12068,145 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}

case clang::X86::BI__builtin_ia32_phaddw128:
case clang::X86::BI__builtin_ia32_phaddw256:
case clang::X86::BI__builtin_ia32_phaddd128:
case clang::X86::BI__builtin_ia32_phaddd256:
case clang::X86::BI__builtin_ia32_phaddsw128:
case clang::X86::BI__builtin_ia32_phaddsw256:

case clang::X86::BI__builtin_ia32_phsubw128:
case clang::X86::BI__builtin_ia32_phsubw256:
case clang::X86::BI__builtin_ia32_phsubd128:
case clang::X86::BI__builtin_ia32_phsubd256:
case clang::X86::BI__builtin_ia32_phsubsw128:
case clang::X86::BI__builtin_ia32_phsubsw256:{
APValue SourceLHS, SourceRHS;
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
return false;
QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();

unsigned SourceLen = SourceLHS.getVectorLength();
SmallVector<APValue, 4> ResultElements;
ResultElements.reserve(SourceLen);
for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
APSInt LHSA = SourceLHS.getVectorElt(EltNum).getInt();
APSInt LHSB = SourceLHS.getVectorElt(EltNum + 1).getInt();

switch (E->getBuiltinCallee()) {
case clang::X86::BI__builtin_ia32_phaddw128:
case clang::X86::BI__builtin_ia32_phaddw256:
case clang::X86::BI__builtin_ia32_phaddd128:
case clang::X86::BI__builtin_ia32_phaddd256:
ResultElements.push_back(
APValue(APSInt(LHSA+LHSB, DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_phaddsw128:
case clang::X86::BI__builtin_ia32_phaddsw256:
ResultElements.push_back(APValue(APSInt(
LHSA.isSigned() ? LHSA.sadd_sat(LHSB) : LHSA.uadd_sat(LHSB),
DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_phsubw128:
case clang::X86::BI__builtin_ia32_phsubw256:
case clang::X86::BI__builtin_ia32_phsubd128:
case clang::X86::BI__builtin_ia32_phsubd256:
ResultElements.push_back(APValue(APSInt(LHSA - LHSB, DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_phsubsw128:
case clang::X86::BI__builtin_ia32_phsubsw256:
ResultElements.push_back(APValue(APSInt(
LHSA.isSigned() ? LHSA.ssub_sat(LHSB) : LHSA.usub_sat(LHSB),
DestUnsigned)));
break;
}
}
for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
APSInt RHSA = SourceRHS.getVectorElt(EltNum).getInt();
APSInt RHSB = SourceRHS.getVectorElt(EltNum + 1).getInt();

switch (E->getBuiltinCallee()) {
case clang::X86::BI__builtin_ia32_phaddw128:
case clang::X86::BI__builtin_ia32_phaddw256:
case clang::X86::BI__builtin_ia32_phaddd128:
case clang::X86::BI__builtin_ia32_phaddd256:
ResultElements.push_back(APValue(APSInt(RHSA + RHSB, DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_phaddsw128:
case clang::X86::BI__builtin_ia32_phaddsw256:
ResultElements.push_back(APValue(
APSInt(RHSA.isSigned() ? RHSA.sadd_sat(RHSB) : RHSA.uadd_sat(RHSB),
DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_phsubw128:
case clang::X86::BI__builtin_ia32_phsubw256:
case clang::X86::BI__builtin_ia32_phsubd128:
case clang::X86::BI__builtin_ia32_phsubd256:
ResultElements.push_back(APValue(APSInt(RHSA - RHSB, DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_phsubsw128:
case clang::X86::BI__builtin_ia32_phsubsw256:
ResultElements.push_back(APValue(
APSInt(RHSA.isSigned() ? RHSA.ssub_sat(RHSB) : RHSA.usub_sat(RHSB),
DestUnsigned)));
break;
}
}
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
case clang::X86::BI__builtin_ia32_haddpd:
case clang::X86::BI__builtin_ia32_haddpd256:
case clang::X86::BI__builtin_ia32_haddps:
case clang::X86::BI__builtin_ia32_haddps256: {
APValue SourceLHS, SourceRHS;
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
return false;
unsigned SourceLen = SourceLHS.getVectorLength();
SmallVector<APValue, 4> ResultElements;
ResultElements.reserve(SourceLen);
for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
APFloat LHSA = SourceLHS.getVectorElt(EltNum).getFloat();
APFloat LHSB = SourceLHS.getVectorElt(EltNum + 1).getFloat();
LHSA.add(LHSB, APFloat::rmNearestTiesToEven);
ResultElements.push_back(APValue(LHSA));
}
for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
APFloat RHSA = SourceRHS.getVectorElt(EltNum).getFloat();
APFloat RHSB = SourceRHS.getVectorElt(EltNum + 1).getFloat();
RHSA.add(RHSB, APFloat::rmNearestTiesToEven);
ResultElements.push_back(APValue(RHSA));
}
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
case clang::X86::BI__builtin_ia32_hsubpd:
case clang::X86::BI__builtin_ia32_hsubpd256:
case clang::X86::BI__builtin_ia32_hsubps:
case clang::X86::BI__builtin_ia32_hsubps256: {
APValue SourceLHS, SourceRHS;
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
return false;
unsigned SourceLen = SourceLHS.getVectorLength();
SmallVector<APValue, 4> ResultElements;
ResultElements.reserve(SourceLen);
for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
APFloat LHSA = SourceLHS.getVectorElt(EltNum).getFloat();
APFloat LHSB = SourceLHS.getVectorElt(EltNum + 1).getFloat();
LHSA.subtract(LHSB, APFloat::rmNearestTiesToEven);
ResultElements.push_back(APValue(LHSA));
}
for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) {
APFloat RHSA = SourceRHS.getVectorElt(EltNum).getFloat();
APFloat RHSB = SourceRHS.getVectorElt(EltNum + 1).getFloat();
RHSA.subtract(RHSB, APFloat::rmNearestTiesToEven);
ResultElements.push_back(APValue(RHSA));
}
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}

case Builtin::BI__builtin_elementwise_fshl:
case Builtin::BI__builtin_elementwise_fshr: {
APValue SourceHi, SourceLo, SourceShift;
Expand Down
42 changes: 18 additions & 24 deletions clang/lib/Headers/avx2intrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -842,10 +842,9 @@ _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hadd_epi16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_hadd_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
}

/// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
Expand Down Expand Up @@ -874,10 +873,9 @@ _mm256_hadd_epi16(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector of [8 x i32] containing one of the source operands.
/// \returns A 256-bit vector of [8 x i32] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hadd_epi32(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_hadd_epi32(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
}

/// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
Expand Down Expand Up @@ -909,10 +907,9 @@ _mm256_hadd_epi32(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the sums.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hadds_epi16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_hadds_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
}

/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
Expand Down Expand Up @@ -945,10 +942,9 @@ _mm256_hadds_epi16(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the differences.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hsub_epi16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_hsub_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
}

/// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
Expand Down Expand Up @@ -977,10 +973,9 @@ _mm256_hsub_epi16(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector of [8 x i32] containing one of the source operands.
/// \returns A 256-bit vector of [8 x i32] containing the differences.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hsub_epi32(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_hsub_epi32(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
}

/// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
Expand Down Expand Up @@ -1013,10 +1008,9 @@ _mm256_hsub_epi32(__m256i __a, __m256i __b)
/// \param __b
/// A 256-bit vector of [16 x i16] containing one of the source operands.
/// \returns A 256-bit vector of [16 x i16] containing the differences.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_hsubs_epi16(__m256i __a, __m256i __b)
{
return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_hsubs_epi16(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
}

/// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
Expand Down
Loading