diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 40ea3cb76bae4..e2bc118e4f1af 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -57397,14 +57397,18 @@ static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, +static SDValue rebuildGatherScatter(SelectionDAG &DAG, + MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, - SelectionDAG &DAG) { + SDValue Mask = SDValue()) { SDLoc DL(GorS); + if (!Mask.getNode()) + Mask = GorS->getMask(); + if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { - SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(), - Gather->getMask(), Base, Index, Scale } ; + SDValue Ops[] = { + Gather->getChain(), Gather->getPassThru(), Mask, Base, Index, Scale}; return DAG.getMaskedGather(Gather->getVTList(), Gather->getMemoryVT(), DL, Ops, Gather->getMemOperand(), @@ -57412,8 +57416,8 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, Gather->getExtensionType()); } auto *Scatter = cast<MaskedScatterSDNode>(GorS); - SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(), - Scatter->getMask(), Base, Index, Scale }; + SDValue Ops[] = { + Scatter->getChain(), Scatter->getValue(), Mask, Base, Index, Scale}; return DAG.getMaskedScatter(Scatter->getVTList(), Scatter->getMemoryVT(), DL, Ops, Scatter->getMemOperand(), @@ -57422,7 +57426,8 @@ static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, } static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDLoc DL(N); auto *GorS = cast<MaskedGatherScatterSDNode>(N); SDValue Index = GorS->getIndex(); @@ -57460,7 +57465,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, Index.getOperand(0), NewShAmt); SDValue NewScale = DAG.getConstant(ScaleAmt * 2, DL, 
Scale.getValueType()); - return rebuildGatherScatter(GorS, NewIndex, Base, NewScale, DAG); + return rebuildGatherScatter(DAG, GorS, NewIndex, Base, NewScale); } } } @@ -57478,7 +57483,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, // a split. if (SDValue TruncIndex = DAG.FoldConstantArithmetic(ISD::TRUNCATE, DL, NewVT, Index)) - return rebuildGatherScatter(GorS, TruncIndex, Base, Scale, DAG); + return rebuildGatherScatter(DAG, GorS, TruncIndex, Base, Scale); // Shrink any sign/zero extends from 32 or smaller to larger than 32 if // there are sufficient sign bits. Only do this before legalize types to @@ -57487,13 +57492,13 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, Index.getOpcode() == ISD::ZERO_EXTEND) && Index.getOperand(0).getScalarValueSizeInBits() <= 32) { Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); - return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); + return rebuildGatherScatter(DAG, GorS, Index, Base, Scale); } // Shrink if we remove an illegal type. if (!TLI.isTypeLegal(Index.getValueType()) && TLI.isTypeLegal(NewVT)) { Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); - return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); + return rebuildGatherScatter(DAG, GorS, Index, Base, Scale); } } } @@ -57518,13 +57523,13 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, DAG.getConstant(Adder, DL, PtrVT)); SDValue NewIndex = Index.getOperand(1 - I); - return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG); + return rebuildGatherScatter(DAG, GorS, NewIndex, NewBase, Scale); } // For non-constant cases, limit this to non-scaled cases. 
if (ScaleAmt == 1) { SDValue NewBase = DAG.getNode(ISD::ADD, DL, PtrVT, Base, Splat); SDValue NewIndex = Index.getOperand(1 - I); - return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG); + return rebuildGatherScatter(DAG, GorS, NewIndex, NewBase, Scale); } } } @@ -57539,7 +57544,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, SDValue NewIndex = DAG.getNode(ISD::ADD, DL, IndexVT, Index.getOperand(1 - I), Splat); SDValue NewBase = DAG.getConstant(0, DL, PtrVT); - return rebuildGatherScatter(GorS, NewIndex, NewBase, Scale, DAG); + return rebuildGatherScatter(DAG, GorS, NewIndex, NewBase, Scale); } } } @@ -57550,12 +57555,67 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32; IndexVT = IndexVT.changeVectorElementType(*DAG.getContext(), EltVT); Index = DAG.getSExtOrTrunc(Index, DL, IndexVT); - return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); + return rebuildGatherScatter(DAG, GorS, Index, Base, Scale); } } // With vector masks we only demand the upper bit of the mask. SDValue Mask = GorS->getMask(); + + // When the target does not have avx512 (which has special mask registers), + // replace a mask that looks like: + // + // t9: v4i1 = bitcast t8 + // + // With one that looks like: + // + // t25: i32 = zero_extend t8 + // t26: v4i32 = X86ISD::VBROADCAST t25 + // t32: v4i32 = and t26, t31 + // t33: v4i32 = X86ISD::PCMPEQ t32, t31 + // + // The t31 vector has the values 1 << 0, 1 << 1, 1 << 2, etc. + // + // The default expansion from an integer to a mask vector generates a lot more + // instructions. 
+ if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512()) { + EVT MaskVT = Mask.getValueType(); + + if (MaskVT.isVector() && MaskVT.getVectorElementType() == MVT::i1 && + Mask.getOpcode() == ISD::BITCAST) { + + SDValue Bits = Mask.getOperand(0); + if (Bits.getValueType().isScalarInteger()) { + unsigned NumElts = MaskVT.getVectorNumElements(); + if (NumElts == 4 || NumElts == 8) { + + EVT ValueVT = N->getValueType(0); + EVT IntMaskVT = ValueVT.changeVectorElementTypeToInteger(); + + MVT MaskVecVT = IntMaskVT.getSimpleVT(); + MVT MaskEltVT = MaskVecVT.getVectorElementType(); + + SDValue BitsElt = DAG.getZExtOrTrunc(Bits, DL, MaskEltVT); + SDValue Bc = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVecVT, BitsElt); + + SmallVector<SDValue, 8> Lanes; + Lanes.reserve(NumElts); + for (unsigned i = 0; i < NumElts; ++i) { + uint64_t Bit = 1ull << i; + Lanes.push_back(DAG.getConstant(Bit, DL, MaskEltVT)); + } + + SDValue LaneBits = DAG.getBuildVector(MaskVecVT, DL, Lanes); + SDValue And = DAG.getNode(ISD::AND, DL, MaskVecVT, Bc, LaneBits); + SDValue NewMask = + DAG.getNode(X86ISD::PCMPEQ, DL, MaskVecVT, And, LaneBits); + + return rebuildGatherScatter(DAG, GorS, Index, Base, Scale, NewMask); + } + } + } + } + if (Mask.getScalarValueSizeInBits() != 1) { APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) { @@ -61700,7 +61760,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::MGATHER: case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI); case ISD::MGATHER: - case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget); case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); case X86ISD::PMULDQ: diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_portable.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_portable.ll new file mode 100644 index 0000000000000..016137ed7cc86 --- 
/dev/null +++ b/llvm/test/CodeGen/X86/masked_gather_scatter_portable.ll @@ -0,0 +1,600 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=x86_64-unknown-unknown -O3 -mattr=+avx2 -mcpu=skylake < %s | FileCheck %s --check-prefix=AVX2 + +define <4 x i32> @gather_avx_dd_128(<4 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_avx_dd_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps %xmm0, %xmm1 +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: movq %rsi, %rdi +; AVX2-NEXT: movl $4, %esi +; AVX2-NEXT: jmp llvm.x86.avx2.gather.d.d.128@PLT # TAILCALL + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %m32 = sext <4 x i1> %m to <4 x i32> + %res = tail call <4 x i32> @llvm.x86.avx2.gather.d.d.128(<4 x i32> zeroinitializer, ptr %data, <4 x i32> %indices, <4 x i32> %m32, i8 4) + ret <4 x i32> %res +} + +define <4 x i32> @gather_portable_dd_128(<4 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_dd_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqd %xmm2, (%rsi,%ymm1,4), %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %ptrs = getelementptr i32, ptr %data, <4 x i64> %idx64 + %res = tail call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, 
<4 x i1> %m, <4 x i32> zeroinitializer) + ret <4 x i32> %res +} + +define <8 x i32> @gather_avx_dd_256(<8 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_avx_dd_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpgatherdd %ymm2, (%rsi,%ymm0,4), %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-NEXT: retq + %m = bitcast i8 %maskbits to <8 x i1> + %m32 = sext <8 x i1> %m to <8 x i32> + %res = tail call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> zeroinitializer, ptr %data, <8 x i32> %indices, <8 x i32> %m32, i8 4) + ret <8 x i32> %res +} + +define <8 x i32> @gather_portable_dd_256(<8 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_dd_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpgatherqd %xmm3, (%rsi,%ymm0,4), %xmm5 +; AVX2-NEXT: vpgatherqd %xmm2, (%rsi,%ymm1,4), %xmm4 +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm0 +; AVX2-NEXT: retq + %m = bitcast i8 %maskbits to <8 x i1> + %idx64 = zext <8 x i32> %indices to <8 x i64> + %ptrs = getelementptr i32, ptr %data, <8 x i64> %idx64 + %res = tail call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %ptrs, i32 4, <8 x i1> %m, <8 x 
i32> zeroinitializer) + ret <8 x i32> %res +} + +define <2 x i32> @gather_avx_qd_128(<2 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_avx_qd_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,2,1,2] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: movq %rsi, %rdi +; AVX2-NEXT: movl $4, %esi +; AVX2-NEXT: jmp llvm.x86.avx2.gather.q.d.128@PLT # TAILCALL + %m2 = trunc i8 %maskbits to i2 + %m = bitcast i2 %m2 to <2 x i1> + %idx64 = zext <2 x i32> %indices to <2 x i64> + %m32 = sext <2 x i1> %m to <2 x i32> + %res = tail call <2 x i32> @llvm.x86.avx2.gather.q.d.128(<2 x i32> zeroinitializer, ptr %data, <2 x i64> %idx64, <2 x i32> %m32, i8 4) + ret <2 x i32> %res +} + +define <2 x i32> @gather_portable_qd_128(<2 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_qd_128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: andb $2, %al +; AVX2-NEXT: shrb %al +; AVX2-NEXT: andb $1, %dil +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; AVX2-NEXT: vpslld $31, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqd %xmm1, (%rsi,%xmm2,4), %xmm0 +; AVX2-NEXT: retq + %m2 = trunc i8 %maskbits to i2 + %m = bitcast i2 %m2 to <2 x i1> + %idx64 = zext <2 x i32> %indices to <2 x i64> + %ptrs = getelementptr i32, ptr %data, <2 x i64> %idx64 + %res = tail call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %ptrs, i32 4, <2 x i1> %m, <2 x i32> zeroinitializer) + ret <2 x i32> %res +} + +define <4 x i32> @gather_avx_qd_256(<4 x i32> %indices, i8 %maskbits, ptr noundef 
readonly %data) nounwind { +; AVX2-LABEL: gather_avx_qd_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqd %xmm2, (%rsi,%ymm1,4), %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %m32 = sext <4 x i1> %m to <4 x i32> + %res = tail call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> zeroinitializer, ptr %data, <4 x i64> %idx64, <4 x i32> %m32, i8 4) + ret <4 x i32> %res +} + +define <4 x i32> @gather_portable_qd_256(<4 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_qd_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqd %xmm2, (%rsi,%ymm1,4), %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %ptrs = getelementptr i32, ptr %data, <4 x i64> %idx64 + %res = tail call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 4, <4 x i1> %m, <4 x i32> zeroinitializer) + ret <4 x i32> %res +} + +define <2 x i64> @gather_avx_dq_128(<2 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_avx_dq_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps %xmm0, %xmm1 +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: 
vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: movq %rsi, %rdi +; AVX2-NEXT: movl $8, %esi +; AVX2-NEXT: jmp llvm.x86.avx2.gather.d.q.128@PLT # TAILCALL + %m2 = trunc i8 %maskbits to i2 + %m = bitcast i2 %m2 to <2 x i1> + %m64 = sext <2 x i1> %m to <2 x i64> + %res = tail call <2 x i64> @llvm.x86.avx2.gather.d.q.128(<2 x i64> zeroinitializer, ptr %data, <2 x i32> %indices, <2 x i64> %m64, i8 8) + ret <2 x i64> %res +} + +define <2 x i64> @gather_portable_dq_128(<2 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_dq_128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: negq %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: andb $2, %dil +; AVX2-NEXT: shrb %dil +; AVX2-NEXT: movzbl %dil, %eax +; AVX2-NEXT: negq %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqq %xmm1, (%rsi,%xmm2,8), %xmm0 +; AVX2-NEXT: retq + %m2 = trunc i8 %maskbits to i2 + %m = bitcast i2 %m2 to <2 x i1> + %idx64 = zext <2 x i32> %indices to <2 x i64> + %ptrs = getelementptr i64, ptr %data, <2 x i64> %idx64 + %res = tail call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> %ptrs, i32 8, <2 x i1> %m, <2 x i64> zeroinitializer) + ret <2 x i64> %res +} + +define <4 x i64> @gather_avx_dq_256(<4 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_avx_dq_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8] +; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpgatherdq 
%ymm2, (%rsi,%xmm0,8), %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %m64 = sext <4 x i1> %m to <4 x i64> + %res = tail call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> zeroinitializer, ptr %data, <4 x i32> %indices, <4 x i64> %m64, i8 8) + ret <4 x i64> %res +} + +define <4 x i64> @gather_portable_dq_256(<4 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_dq_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqq %ymm2, (%rsi,%ymm1,8), %ymm0 +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %ptrs = getelementptr i64, ptr %data, <4 x i64> %idx64 + %res = tail call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %ptrs, i32 8, <4 x i1> %m, <4 x i64> zeroinitializer) + ret <4 x i64> %res +} + +define <2 x i64> @gather_avx_qq_128(<2 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_avx_qq_128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: movq %rsi, %rdi +; AVX2-NEXT: movl $8, %esi +; AVX2-NEXT: jmp llvm.x86.avx2.gather.q.q.128@PLT # TAILCALL + %m2 = trunc i8 %maskbits to i2 + %m = bitcast i2 %m2 to <2 x i1> + %idx64 = zext <2 x i32> %indices to <2 x i64> + %m64 = sext <2 x i1> %m to <2 x i64> + 
%res = tail call <2 x i64> @llvm.x86.avx2.gather.q.q.128(<2 x i64> zeroinitializer, ptr %data, <2 x i64> %idx64, <2 x i64> %m64, i8 8) + ret <2 x i64> %res +} + +define <2 x i64> @gather_portable_qq_128(<2 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_qq_128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %edi, %eax +; AVX2-NEXT: andl $1, %eax +; AVX2-NEXT: negq %rax +; AVX2-NEXT: vmovq %rax, %xmm1 +; AVX2-NEXT: andb $2, %dil +; AVX2-NEXT: shrb %dil +; AVX2-NEXT: movzbl %dil, %eax +; AVX2-NEXT: negq %rax +; AVX2-NEXT: vmovq %rax, %xmm2 +; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqq %xmm1, (%rsi,%xmm2,8), %xmm0 +; AVX2-NEXT: retq + %m2 = trunc i8 %maskbits to i2 + %m = bitcast i2 %m2 to <2 x i1> + %idx64 = zext <2 x i32> %indices to <2 x i64> + %ptrs = getelementptr i64, ptr %data, <2 x i64> %idx64 + %res = tail call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> %ptrs, i32 8, <2 x i1> %m, <2 x i64> zeroinitializer) + ret <2 x i64> %res +} + +define <4 x i64> @gather_avx_qq_256(<4 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_avx_qq_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqq %ymm2, (%rsi,%ymm1,8), %ymm0 +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %m64 = sext <4 x i1> %m to <4 x i64> + %res = tail call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> zeroinitializer, ptr %data, <4 x i64> %idx64, <4 x i64> 
%m64, i8 8) + ret <4 x i64> %res +} + +define <4 x i64> @gather_portable_qq_256(<4 x i32> %indices, i8 %maskbits, ptr noundef readonly %data) nounwind { +; AVX2-LABEL: gather_portable_qq_256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpgatherqq %ymm2, (%rsi,%ymm1,8), %ymm0 +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %ptrs = getelementptr i64, ptr %data, <4 x i64> %idx64 + %res = tail call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %ptrs, i32 8, <4 x i1> %m, <4 x i64> zeroinitializer) + ret <4 x i64> %res +} + +declare <4 x i32> @llvm.x86.avx2.gather.d.d.128(<4 x i32>, ptr, <4 x i32>, <4 x i32>, i8) +declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, ptr, <8 x i32>, <8 x i32>, i8) + +declare <2 x i32> @llvm.x86.avx2.gather.q.d.128(<2 x i32>, ptr, <2 x i64>, <2 x i32>, i8) +declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, ptr, <4 x i64>, <4 x i32>, i8) + +declare <2 x i64> @llvm.x86.avx2.gather.d.q.128(<2 x i64>, ptr, <2 x i32>, <2 x i64>, i8) +declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, ptr, <4 x i32>, <4 x i64>, i8) + +declare <2 x i64> @llvm.x86.avx2.gather.q.q.128(<2 x i64>, ptr, <2 x i64>, <2 x i64>, i8) +declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, ptr, <4 x i64>, <4 x i64>, i8) + +declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x i32>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32 immarg, <4 x i1>, <4 x i32>) +declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x i32>) + +declare <2 x i64> 
@llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x i64>) +declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32 immarg, <4 x i1>, <4 x i64>) + +define void @scatter_portable_i32_4(<4 x i32> %values, <4 x i32> %indices, i8 %maskbits, ptr noundef %data) nounwind { +; AVX2-LABEL: scatter_portable_i32_4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpsllq $2, %ymm1, %ymm1 +; AVX2-NEXT: vmovq %rsi, %xmm2 +; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: jne .LBB16_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: jne .LBB16_3 +; AVX2-NEXT: .LBB16_4: # %else2 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: testb $4, %dil +; AVX2-NEXT: jne .LBB16_5 +; AVX2-NEXT: .LBB16_6: # %else4 +; AVX2-NEXT: testb $8, %dil +; AVX2-NEXT: jne .LBB16_7 +; AVX2-NEXT: .LBB16_8: # %else6 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB16_1: # %cond.store +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vmovss %xmm0, (%rax) +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: je .LBB16_4 +; AVX2-NEXT: .LBB16_3: # %cond.store1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vextractps $1, %xmm0, (%rax) +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: testb $4, %dil +; AVX2-NEXT: je .LBB16_6 +; AVX2-NEXT: .LBB16_5: # %cond.store3 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vextractps $2, %xmm0, (%rax) +; AVX2-NEXT: testb $8, %dil +; AVX2-NEXT: je .LBB16_8 +; AVX2-NEXT: .LBB16_7: # %cond.store5 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vextractps $3, %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %ptrs = getelementptr i32, ptr %data, <4 x i64> %idx64 + tail call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %values, <4 x ptr> 
%ptrs, i32 4, <4 x i1> %m) + ret void +} + +define void @scatter_portable_i32_8(<8 x i32> %values, <8 x i32> %indices, i8 %maskbits, ptr noundef %data) nounwind { +; AVX2-LABEL: scatter_portable_i32_8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vmovq %rsi, %xmm2 +; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-NEXT: vpsllq $2, %ymm3, %ymm3 +; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: jne .LBB17_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: jne .LBB17_3 +; AVX2-NEXT: .LBB17_4: # %else2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-NEXT: testb $4, %dil +; AVX2-NEXT: jne .LBB17_5 +; AVX2-NEXT: .LBB17_6: # %else4 +; AVX2-NEXT: vpsllq $2, %ymm4, %ymm3 +; AVX2-NEXT: testb $8, %dil +; AVX2-NEXT: jne .LBB17_7 +; AVX2-NEXT: .LBB17_8: # %else6 +; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testb $16, %dil +; AVX2-NEXT: jne .LBB17_9 +; AVX2-NEXT: .LBB17_10: # %else8 +; AVX2-NEXT: testb $32, %dil +; AVX2-NEXT: jne .LBB17_11 +; AVX2-NEXT: .LBB17_12: # %else10 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: testb $64, %dil +; AVX2-NEXT: jne .LBB17_13 +; AVX2-NEXT: .LBB17_14: # %else12 +; AVX2-NEXT: testb $-128, %dil +; AVX2-NEXT: jne .LBB17_15 +; AVX2-NEXT: .LBB17_16: # %else14 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB17_1: # %cond.store +; AVX2-NEXT: vmovq %xmm3, %rax +; AVX2-NEXT: vmovss %xmm0, (%rax) +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: je .LBB17_4 +; AVX2-NEXT: .LBB17_3: # %cond.store1 +; AVX2-NEXT: vpextrq $1, %xmm3, %rax +; AVX2-NEXT: vextractps $1, %xmm0, (%rax) +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; 
AVX2-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-NEXT: testb $4, %dil +; AVX2-NEXT: je .LBB17_6 +; AVX2-NEXT: .LBB17_5: # %cond.store3 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vextractps $2, %xmm0, (%rax) +; AVX2-NEXT: vpsllq $2, %ymm4, %ymm3 +; AVX2-NEXT: testb $8, %dil +; AVX2-NEXT: je .LBB17_8 +; AVX2-NEXT: .LBB17_7: # %cond.store5 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vextractps $3, %xmm0, (%rax) +; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX2-NEXT: testb $16, %dil +; AVX2-NEXT: je .LBB17_10 +; AVX2-NEXT: .LBB17_9: # %cond.store7 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vmovss %xmm0, (%rax) +; AVX2-NEXT: testb $32, %dil +; AVX2-NEXT: je .LBB17_12 +; AVX2-NEXT: .LBB17_11: # %cond.store9 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vextractps $1, %xmm0, (%rax) +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: testb $64, %dil +; AVX2-NEXT: je .LBB17_14 +; AVX2-NEXT: .LBB17_13: # %cond.store11 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vextractps $2, %xmm0, (%rax) +; AVX2-NEXT: testb $-128, %dil +; AVX2-NEXT: je .LBB17_16 +; AVX2-NEXT: .LBB17_15: # %cond.store13 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vextractps $3, %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %m = bitcast i8 %maskbits to <8 x i1> + %idx64 = zext <8 x i32> %indices to <8 x i64> + %ptrs = getelementptr i32, ptr %data, <8 x i64> %idx64 + tail call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %values, <8 x ptr> %ptrs, i32 4, <8 x i1> %m) + ret void +} + +define void @scatter_portable_i64_2(<2 x i64> %values, <2 x i32> %indices, i8 %maskbits, ptr noundef %data) nounwind { +; AVX2-LABEL: scatter_portable_i64_2: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX2-NEXT: vpsllq $3, %xmm1, %xmm1 +; AVX2-NEXT: vmovq %rsi, %xmm2 +; AVX2-NEXT: vpbroadcastq %xmm2, %xmm2 +; AVX2-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: jne 
.LBB18_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: jne .LBB18_3 +; AVX2-NEXT: .LBB18_4: # %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB18_1: # %cond.store +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: je .LBB18_4 +; AVX2-NEXT: .LBB18_3: # %cond.store1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, (%rax) +; AVX2-NEXT: retq + %m2 = trunc i8 %maskbits to i2 + %m = bitcast i2 %m2 to <2 x i1> + %idx64 = zext <2 x i32> %indices to <2 x i64> + %ptrs = getelementptr i64, ptr %data, <2 x i64> %idx64 + tail call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> %values, <2 x ptr> %ptrs, i32 8, <2 x i1> %m) + ret void +} + +define void @scatter_portable_i64_4(<4 x i64> %values, <4 x i32> %indices, i8 %maskbits, ptr noundef %data) nounwind { +; AVX2-LABEL: scatter_portable_i64_4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpsllq $3, %ymm1, %ymm1 +; AVX2-NEXT: vmovq %rsi, %xmm2 +; AVX2-NEXT: vpbroadcastq %xmm2, %ymm2 +; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: testb $1, %dil +; AVX2-NEXT: jne .LBB19_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: jne .LBB19_3 +; AVX2-NEXT: .LBB19_4: # %else2 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-NEXT: testb $4, %dil +; AVX2-NEXT: jne .LBB19_5 +; AVX2-NEXT: .LBB19_6: # %else4 +; AVX2-NEXT: testb $8, %dil +; AVX2-NEXT: jne .LBB19_7 +; AVX2-NEXT: .LBB19_8: # %else6 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB19_1: # %cond.store +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: testb $2, %dil +; AVX2-NEXT: je .LBB19_4 +; AVX2-NEXT: .LBB19_3: # %cond.store1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, (%rax) +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm1, 
%xmm1 +; AVX2-NEXT: testb $4, %dil +; AVX2-NEXT: je .LBB19_6 +; AVX2-NEXT: .LBB19_5: # %cond.store3 +; AVX2-NEXT: vmovq %xmm1, %rax +; AVX2-NEXT: vmovq %xmm0, (%rax) +; AVX2-NEXT: testb $8, %dil +; AVX2-NEXT: je .LBB19_8 +; AVX2-NEXT: .LBB19_7: # %cond.store5 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq + %m4 = trunc i8 %maskbits to i4 + %m = bitcast i4 %m4 to <4 x i1> + %idx64 = zext <4 x i32> %indices to <4 x i64> + %ptrs = getelementptr i64, ptr %data, <4 x i64> %idx64 + tail call void @llvm.masked.scatter.v4i64.v4p0(<4 x i64> %values, <4 x ptr> %ptrs, i32 8, <4 x i1> %m) + ret void +} + + +declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32 immarg, <4 x i1>) +declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32 immarg, <8 x i1>) +declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64>, <2 x ptr>, i32 immarg, <2 x i1>) +declare void @llvm.masked.scatter.v4i64.v4p0(<4 x i64>, <4 x ptr>, i32 immarg, <4 x i1>)