diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 40ea3cb76bae4..e2bc118e4f1af 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -57397,14 +57397,18 @@ static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, +static SDValue rebuildGatherScatter(SelectionDAG &DAG, + MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, - SelectionDAG &DAG) { + SDValue Mask = SDValue()) { SDLoc DL(GorS); + if (!Mask.getNode()) + Mask = GorS->getMask(); + if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { - SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(), - Gather->getMask(), Base, Index, Scale } ; + SDValue Ops[] = { + Gather->getChain(), Gather->getPassThru(), Mask, Base, Index, Scale}; return DAG.getMaskedGather(Gather->getVTList(), Gather->getMemoryVT(), DL, Ops, Gather->getMemOperand(), @@ -57412,8 +57416,8 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, Gather->getExtensionType()); } auto *Scatter = cast<MaskedScatterSDNode>(GorS); - SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(), - Scatter->getMask(), Base, Index, Scale }; + SDValue Ops[] = { + Scatter->getChain(), Scatter->getValue(), Mask, Base, Index, Scale}; return DAG.getMaskedScatter(Scatter->getVTList(), Scatter->getMemoryVT(), DL, Ops, Scatter->getMemOperand(), @@ -57422,7 +57426,8 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, } static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDLoc DL(N); auto *GorS = cast<MaskedGatherScatterSDNode>(N); SDValue Index = GorS->getIndex(); @@ -57460,7 +57465,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, Index.getOperand(0), NewShAmt); SDValue NewScale = DAG.getConstant(ScaleAmt * 2, DL, 
Scale.getValueType()); - return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG); + return rebuildGatherScatter(DAG, GorS, NewIndex, Base, NewScale); } } } @@ -57478,7 +57483,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, // a split. if (SDValue TruncIndex = DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index)) - return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG); + return rebuildGatherScatter(DAG, GorS, TruncIndex, Base, Scale); // Shrink any sign/zero extends from 32 or smaller to larger than 32 if // there are sufficient sign bits. Only do this before legalize types to @@ -57487,13 +57492,13 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, Index.getOpcode() == ISD::ZERO_EXTEND) && Index.getOperand(0).getScalarValueSizeInBits() <= 32) { Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); - return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); + return rebuildGatherScatter(DAG, GorS, Index, Base, Scale); } // Shrink if we remove an illegal type. if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) { Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); - return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); + return rebuildGatherScatter(DAG, GorS, Index, Base, Scale); } } } @@ -57518,13 +57523,13 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, DAG.getConstant(Adder, DL, PtrVT)); SDValue NewIndex = Index.getOperand(1 - I); - return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG); + return rebuildGatherScatter(DAG, GorS, NewIndex, NewBase, Scale); } // For non-constant cases, limit this to non-scaled cases. 
if (ScaleAmt == 1) { SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat); SDValue NewIndex = Index.getOperand(1 - I); - return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG); + return rebuildGatherScatter(DAG, GorS, NewIndex, NewBase, Scale); } } } @@ -57539,7 +57544,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(1 - I), Splat); SDValue NewBase = DAG.getConstant(0, DL, PtrVT); - return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG); + return rebuildGatherScatter(DAG, GorS, NewIndex, NewBase, Scale); } } } @@ -57550,12 +57555,67 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32; IndexVT = IndexVT.changeVectorElementType(*DAG.getContext(), EltVT); Index = DAG.getSExtOrTrunc(Index, DL, IndexVT); - return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); + return rebuildGatherScatter(DAG, GorS, Index, Base, Scale); } } // With vector masks we only demand the upper bit of the mask. SDValue Mask = GorS->getMask(); + + // When the target does not have avx512 (which has special mask registers), + // replace a mask that looks like: + // + // t9: v4i1 = bitcast t8 + // + // With one that looks like: + // + // t25: i32 = zero_extend t8 + // t26: v4i32 = X86ISD::VBROADCAST t25 + // t32: v4i32 = and t26, t31 + // t33: v4i32 = X86ISD::PCMPEQ t32, t31 + // + // The t31 vector has the values 1 << 0, 1 << 1, 1 << 2, etc. + // + // The default expansion from an integer to a mask vector generates a lot more + // instructions. 
+ if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512()) { + EVT MaskVT = Mask.getValueType(); + + if (MaskVT.isVector() && MaskVT.getVectorElementType() == MVT::i1 && + Mask.getOpcode() == ISD::BITCAST) { + + SDValue Bits = Mask.getOperand(0); + if (Bits.getValueType().isScalarInteger()) { + unsigned NumElts = MaskVT.getVectorNumElements(); + if (NumElts == 4 || NumElts == 8) { + + EVT ValueVT = N->getValueType(0); + EVT IntMaskVT = ValueVT.changeVectorElementTypeToInteger(); + + MVT MaskVecVT = IntMaskVT.getSimpleVT(); + MVT MaskEltVT = MaskVecVT.getVectorElementType(); + + SDValue BitsElt = DAG.getZExtOrTrunc(Bits, DL, MaskEltVT); + SDValue Bc = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVecVT, BitsElt); + + SmallVector<SDValue, 8> Lanes; + Lanes.reserve(NumElts); + for (unsigned i = 0; i < NumElts; ++i) { + uint64_t Bit = 1ull << i; + Lanes.push_back(DAG.getConstant(Bit, DL, MaskEltVT)); + } + + SDValue LaneBits = DAG.getBuildVector(MaskVecVT, DL, Lanes); + SDValue And = DAG.getNode(ISD::AND, DL, MaskVecVT, Bc, LaneBits); + SDValue NewMask = + DAG.getNode(X86ISD::PCMPEQ, DL, MaskVecVT, And, LaneBits); + + return rebuildGatherScatter(DAG, GorS, Index, Base, Scale, NewMask); + } + } + } + } + if (Mask.getScalarValueSizeInBits() != 1) { APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) { @@ -61700,7 +61760,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::MGATHER: case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI); case ISD::MGATHER: - case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget); case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); case X86ISD::PMULDQ: diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_portable.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_portable.ll new file mode 100644 index 0000000000000..016137ed7cc86 --- 
/dev/null +++ b/llvm/test/CodeGen/X86/masked_gather_scatter_portable.ll @@ -0,0 +1,600 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=x86_64-unknown-unknown -O3 -mattr=+avx2 -mcpu=skylake < %s | FileCheck %s --check-prefix=AVX2 + +define <4 x i32> @gather_avx_dd_128(<4 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_avx_dd_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps %xmm0, %xmm1 +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: movq %rsi, %rdi +; AVX2-NEXT: movl $4, %esi +; AVX2-NEXT: jmp llvm.x86.avx2.gather.d.d.128@PLT # TAILCALL + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %m32 = sext <4 x i1> %m to <4 x i32> + %res = tail call <4 x i32> @llvm.x86.avx2.gather.d.d.128(<4 x i32> zeroinitializer, ptr %data, <4 x i32> %indices, <4 x i32> %m32, i8 4) + ret <4 x i32> %res +} + +define <4 x i32> @gather_portable_dd_128(<4 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_dd_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqd %xmm2, (%rsi,%ymm1,4), %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %ptrs = getelementptr i32, ptr %data, <4 x i64> %idx64 + %res = tail call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, 
<4 x i1> %m, <4 x i32> zeroinitializer) + ret <4 x i32> %res +} + +define <8 x i32> @gather_avx_dd_256(<8 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_avx_dd_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpgatherdd %ymm2, (%rsi,%ymm0,4), %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-NEXT: retq + %m = bitcast i8 %maskbits to <8 x i1> + %m32 = sext <8 x i1> %m to <8 x i32> + %res = tail call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> zeroinitializer, ptr %data, <8 x i32> %indices, <8 x i32> %m32, i8 4) + ret <8 x i32> %res +} + +define <8 x i32> @gather_portable_dd_256(<8 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_dd_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpgatherqd %xmm3, (%rsi,%ymm0,4), %xmm5 +; AVX2-NEXT: vpgatherqd %xmm2, (%rsi,%ymm1,4), %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm0 +; AVX2-NEXT: retq + %m = bitcast i8 %maskbits to <8 x i1> + %idx64 = zext <8 x i32> %indices to <8 x i64> + %ptrs = getelementptr i32, ptr %data, <8 x i64> %idx64 + %res = tail call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x 
i32> zeroinitializer) + ret <8 x i32> %res +} + +define <2 x i32> @gather_avx_qd_128(<2 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_avx_qd_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,2,1,2] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: movq %rsi, %rdi +; AVX2-NEXT: movl $4, %esi +; AVX2-NEXT: jmp llvm.x86.avx2.gather.q.d.128@PLT # TAILCALL + %m2 = trunc i8 %maskbits to i2 + %m = bitcast i2 %m2 to <2 x i1> + %idx64 = zext <2 x i32> %indices to <2 x i64> + %m32 = sext <2 x i1> %m to <2 x i32> + %res = tail call <2 x i32> @llvm.x86.avx2.gather.q.d.128(<2 x i32> zeroinitializer, ptr %data, <2 x i64> %idx64, <2 x i32> %m32, i8 4) + ret <2 x i32> %res +} + +define <2 x i32> @gather_portable_qd_128(<2 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_qd_128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: andb $2, %al +; AVX2-NEXT: shrb %al +; AVX2-NEXT: andb $1, %dil +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; AVX2-NEXT: vpslld $31, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqd %xmm1, (%rsi,%xmm2,4), %xmm0 +; AVX2-NEXT: retq + %m2 = trunc i8 %maskbits to i2 + %m = bitcast i2 %m2 to <2 x i1> + %idx64 = zext <2 x i32> %indices to <2 x i64> + %ptrs = getelementptr i32, ptr %data, <2 x i64> %idx64 + %res = tail call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %m, <2 x i32> zeroinitializer) + ret <2 x i32> %res +} + +define <4 x i32> @gather_avx_qd_256(<4 x i32> %indices, i8 %maskbits, ptr noundef 
readonly %data) nounwind { +; AVX2-LABEL: gather_avx_qd_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqd %xmm2, (%rsi,%ymm1,4), %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %m32 = sext <4 x i1> %m to <4 x i32> + %res = tail call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> zeroinitializer, ptr %data, <4 x i64> %idx64, <4 x i32> %m32, i8 4) + ret <4 x i32> %res +} + +define <4 x i32> @gather_portable_qd_256(<4 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_qd_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqd %xmm2, (%rsi,%ymm1,4), %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %ptrs = getelementptr i32, ptr %data, <4 x i64> %idx64 + %res = tail call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %m, <4 x i32> zeroinitializer) + ret <4 x i32> %res +} + +define <2 x i64> @gather_avx_dq_128(<2 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_avx_dq_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps %xmm0, %xmm1 +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: 
vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: movq %rsi, %rdi +; AVX2-NEXT: movl $8, %esi +; AVX2-NEXT: jmp llvm.x86.avx2.gather.d.q.128@PLT # TAILCALL + %m2 = trunc i8 %maskbits to i2 + %m = bitcast i2 %m2 to <2 x i1> + %m64 = sext <2 x i1> %m to <2 x i64> + %res = tail call <2 x i64> @llvm.x86.avx2.gather.d.q.128(<2 x i64> zeroinitializer, ptr %data, <2 x i32> %indices, <2 x i64> %m64, i8 8) + ret <2 x i64> %res +} + +define <2 x i64> @gather_portable_dq_128(<2 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_dq_128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: negq %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: andb $2, %dil +; AVX2-NEXT: shrb %dil +; AVX2-NEXT: movzbl %dil, %eax +; AVX2-NEXT: negq %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqq %xmm1, (%rsi,%xmm2,8), %xmm0 +; AVX2-NEXT: retq + %m2 = trunc i8 %maskbits to i2 + %m = bitcast i2 %m2 to <2 x i1> + %idx64 = zext <2 x i32> %indices to <2 x i64> + %ptrs = getelementptr i64, ptr %data, <2 x i64> %idx64 + %res = tail call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> %ptrs, i32 8, <2 x i1> %m, <2 x i64> zeroinitializer) + ret <2 x i64> %res +} + +define <4 x i64> @gather_avx_dq_256(<4 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_avx_dq_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpgatherdq 
%ymm2, (%rsi,%xmm0,8), %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %m64 = sext <4 x i1> %m to <4 x i64> + %res = tail call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> zeroinitializer, ptr %data, <4 x i32> %indices, <4 x i64> %m64, i8 8) + ret <4 x i64> %res +} + +define <4 x i64> @gather_portable_dq_256(<4 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_dq_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqq %ymm2, (%rsi,%ymm1,8), %ymm0 +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %ptrs = getelementptr i64, ptr %data, <4 x i64> %idx64 + %res = tail call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %ptrs, i32 8, <4 x i1> %m, <4 x i64> zeroinitializer) + ret <4 x i64> %res +} + +define <2 x i64> @gather_avx_qq_128(<2 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_avx_qq_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: movq %rsi, %rdi +; AVX2-NEXT: movl $8, %esi +; AVX2-NEXT: jmp llvm.x86.avx2.gather.q.q.128@PLT # TAILCALL + %m2 = trunc i8 %maskbits to i2 + %m = bitcast i2 %m2 to <2 x i1> + %idx64 = zext <2 x i32> %indices to <2 x i64> + %m64 = sext <2 x i1> %m to <2 x i64> + 
%res = tail call <2 x i64> @llvm.x86.avx2.gather.q.q.128(<2 x i64> zeroinitializer, ptr %data, <2 x i64> %idx64, <2 x i64> %m64, i8 8) + ret <2 x i64> %res +} + +define <2 x i64> @gather_portable_qq_128(<2 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_qq_128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: negq %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: andb $2, %dil +; AVX2-NEXT: shrb %dil +; AVX2-NEXT: movzbl %dil, %eax +; AVX2-NEXT: negq %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqq %xmm1, (%rsi,%xmm2,8), %xmm0 +; AVX2-NEXT: retq + %m2 = trunc i8 %maskbits to i2 + %m = bitcast i2 %m2 to <2 x i1> + %idx64 = zext <2 x i32> %indices to <2 x i64> + %ptrs = getelementptr i64, ptr %data, <2 x i64> %idx64 + %res = tail call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> %ptrs, i32 8, <2 x i1> %m, <2 x i64> zeroinitializer) + ret <2 x i64> %res +} + +define <4 x i64> @gather_avx_qq_256(<4 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_avx_qq_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqq %ymm2, (%rsi,%ymm1,8), %ymm0 +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %m64 = sext <4 x i1> %m to <4 x i64> + %res = tail call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> zeroinitializer, ptr %data, <4 x i64> %idx64, <4 x i64> 
%m64, i8 8) + ret <4 x i64> %res +} + +define <4 x i64> @gather_portable_qq_256(<4 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_qq_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqq %ymm2, (%rsi,%ymm1,8), %ymm0 +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %ptrs = getelementptr i64, ptr %data, <4 x i64> %idx64 + %res = tail call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %ptrs, i32 8, <4 x i1> %m, <4 x i64> zeroinitializer) + ret <4 x i64> %res +} + +declare <4 x i32> @llvm.x86.avx2.gather.d.d.128(<4 x i32>, ptr, <4 x i32>, <4 x i32>, i8) +declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, ptr, <8 x i32>, <8 x i32>, i8) + +declare <2 x i32> @llvm.x86.avx2.gather.q.d.128(<2 x i32>, ptr, <2 x i64>, <2 x i32>, i8) +declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, ptr, <4 x i64>, <4 x i32>, i8) + +declare <2 x i64> @llvm.x86.avx2.gather.d.q.128(<2 x i64>, ptr, <2 x i32>, <2 x i64>, i8) +declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, ptr, <4 x i32>, <4 x i64>, i8) + +declare <2 x i64> @llvm.x86.avx2.gather.q.q.128(<2 x i64>, ptr, <2 x i64>, <2 x i64>, i8) +declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, ptr, <4 x i64>, <4 x i64>, i8) + +declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x i32>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32 immarg, <4 x i1>, <4 x i32>) +declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x i32>) + +declare <2 x i64> 
@llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x i64>) +declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32 immarg, <4 x i1>, <4 x i64>) + +define void @scatter_portable_i32_4(<4 x i32> %values, <4 x i32> %indices, i8 %maskbits, ptr noundef %data) nounwind { +; AVX2-LABEL: scatter_portable_i32_4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpsllq $2, %ymm1, %ymm1 +; AVX2-NEXT: vmovq %rsi, %xmm2 +; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: jne .LBB16_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: jne .LBB16_3 +; AVX2-NEXT: .LBB16_4: # %else2 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: testb $4, %dil +; AVX2-NEXT: jne .LBB16_5 +; AVX2-NEXT: .LBB16_6: # %else4 +; AVX2-NEXT: testb $8, %dil +; AVX2-NEXT: jne .LBB16_7 +; AVX2-NEXT: .LBB16_8: # %else6 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB16_1: # %cond.store +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vmovss %xmm0, (%rax) +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: je .LBB16_4 +; AVX2-NEXT: .LBB16_3: # %cond.store1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vextractps $1, %xmm0, (%rax) +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: testb $4, %dil +; AVX2-NEXT: je .LBB16_6 +; AVX2-NEXT: .LBB16_5: # %cond.store3 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vextractps $2, %xmm0, (%rax) +; AVX2-NEXT: testb $8, %dil +; AVX2-NEXT: je .LBB16_8 +; AVX2-NEXT: .LBB16_7: # %cond.store5 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vextractps $3, %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %ptrs = getelementptr i32, ptr %data, <4 x i64> %idx64 + tail call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %values, <4 x ptr> 
%ptrs, i32 4, <4 x i1> %m) + ret void +} + +define void @scatter_portable_i32_8(<8 x i32> %values, <8 x i32> %indices, i8 %maskbits, ptr noundef %data) nounwind { +; AVX2-LABEL: scatter_portable_i32_8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vmovq %rsi, %xmm2 +; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-NEXT: vpsllq $2, %ymm3, %ymm3 +; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: jne .LBB17_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: jne .LBB17_3 +; AVX2-NEXT: .LBB17_4: # %else2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-NEXT: testb $4, %dil +; AVX2-NEXT: jne .LBB17_5 +; AVX2-NEXT: .LBB17_6: # %else4 +; AVX2-NEXT: vpsllq $2, %ymm4, %ymm3 +; AVX2-NEXT: testb $8, %dil +; AVX2-NEXT: jne .LBB17_7 +; AVX2-NEXT: .LBB17_8: # %else6 +; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testb $16, %dil +; AVX2-NEXT: jne .LBB17_9 +; AVX2-NEXT: .LBB17_10: # %else8 +; AVX2-NEXT: testb $32, %dil +; AVX2-NEXT: jne .LBB17_11 +; AVX2-NEXT: .LBB17_12: # %else10 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: testb $64, %dil +; AVX2-NEXT: jne .LBB17_13 +; AVX2-NEXT: .LBB17_14: # %else12 +; AVX2-NEXT: testb $-128, %dil +; AVX2-NEXT: jne .LBB17_15 +; AVX2-NEXT: .LBB17_16: # %else14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB17_1: # %cond.store +; AVX2-NEXT: vmovq %xmm3, %rax +; AVX2-NEXT: vmovss %xmm0, (%rax) +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: je .LBB17_4 +; AVX2-NEXT: .LBB17_3: # %cond.store1 +; AVX2-NEXT: vpextrq $1, %xmm3, %rax +; AVX2-NEXT: vextractps $1, %xmm0, (%rax) +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; 
AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-NEXT: testb $4, %dil +; AVX2-NEXT: je .LBB17_6 +; AVX2-NEXT: .LBB17_5: # %cond.store3 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vextractps $2, %xmm0, (%rax) +; AVX2-NEXT: vpsllq $2, %ymm4, %ymm3 +; AVX2-NEXT: testb $8, %dil +; AVX2-NEXT: je .LBB17_8 +; AVX2-NEXT: .LBB17_7: # %cond.store5 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vextractps $3, %xmm0, (%rax) +; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testb $16, %dil +; AVX2-NEXT: je .LBB17_10 +; AVX2-NEXT: .LBB17_9: # %cond.store7 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vmovss %xmm0, (%rax) +; AVX2-NEXT: testb $32, %dil +; AVX2-NEXT: je .LBB17_12 +; AVX2-NEXT: .LBB17_11: # %cond.store9 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vextractps $1, %xmm0, (%rax) +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: testb $64, %dil +; AVX2-NEXT: je .LBB17_14 +; AVX2-NEXT: .LBB17_13: # %cond.store11 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vextractps $2, %xmm0, (%rax) +; AVX2-NEXT: testb $-128, %dil +; AVX2-NEXT: je .LBB17_16 +; AVX2-NEXT: .LBB17_15: # %cond.store13 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vextractps $3, %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %m = bitcast i8 %maskbits to <8 x i1> + %idx64 = zext <8 x i32> %indices to <8 x i64> + %ptrs = getelementptr i32, ptr %data, <8 x i64> %idx64 + tail call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %values, <8 x ptr> %ptrs, i32 4, <8 x i1> %m) + ret void +} + +define void @scatter_portable_i64_2(<2 x i64> %values, <2 x i32> %indices, i8 %maskbits, ptr noundef %data) nounwind { +; AVX2-LABEL: scatter_portable_i64_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpsllq $3, %xmm1, %xmm1 +; AVX2-NEXT: vmovq %rsi, %xmm2 +; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2 +; AVX2-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: jne 
.LBB18_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: jne .LBB18_3 +; AVX2-NEXT: .LBB18_4: # %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB18_1: # %cond.store +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: je .LBB18_4 +; AVX2-NEXT: .LBB18_3: # %cond.store1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, (%rax) +; AVX2-NEXT: retq + %m2 = trunc i8 %maskbits to i2 + %m = bitcast i2 %m2 to <2 x i1> + %idx64 = zext <2 x i32> %indices to <2 x i64> + %ptrs = getelementptr i64, ptr %data, <2 x i64> %idx64 + tail call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> %values, <2 x ptr> %ptrs, i32 8, <2 x i1> %m) + ret void +} + +define void @scatter_portable_i64_4(<4 x i64> %values, <4 x i32> %indices, i8 %maskbits, ptr noundef %data) nounwind { +; AVX2-LABEL: scatter_portable_i64_4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpsllq $3, %ymm1, %ymm1 +; AVX2-NEXT: vmovq %rsi, %xmm2 +; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: jne .LBB19_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: jne .LBB19_3 +; AVX2-NEXT: .LBB19_4: # %else2 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: testb $4, %dil +; AVX2-NEXT: jne .LBB19_5 +; AVX2-NEXT: .LBB19_6: # %else4 +; AVX2-NEXT: testb $8, %dil +; AVX2-NEXT: jne .LBB19_7 +; AVX2-NEXT: .LBB19_8: # %else6 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB19_1: # %cond.store +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: je .LBB19_4 +; AVX2-NEXT: .LBB19_3: # %cond.store1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, (%rax) +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, 
%xmm1 +; AVX2-NEXT: testb $4, %dil +; AVX2-NEXT: je .LBB19_6 +; AVX2-NEXT: .LBB19_5: # %cond.store3 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: testb $8, %dil +; AVX2-NEXT: je .LBB19_8 +; AVX2-NEXT: .LBB19_7: # %cond.store5 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %ptrs = getelementptr i64, ptr %data, <4 x i64> %idx64 + tail call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %values, <4 x ptr> %ptrs, i32 8, <4 x i1> %m) + ret void +} + + +declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32 immarg, <4 x i1>) +declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32 immarg, <8 x i1>) +declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32 immarg, <2 x i1>) +declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32 immarg, <4 x i1>)