diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 45a22f416dce1..79ea01d959312 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -27809,6 +27809,55 @@ The '``llvm.masked.compressstore``' intrinsic is designed for compressing data i
 Other targets may support this intrinsic differently, for example, by lowering
 it into a sequence of branches that guard scalar store operations.
 
+.. _int_mloadff:
+
+'``llvm.masked.load.ff.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. The loaded data is a vector of any integer,
+floating-point or pointer data type.
+
+::
+
+      declare { <16 x float>, <16 x i1> } @llvm.masked.load.ff.v16f32.p0(ptr <ptr>, <16 x i1> <mask>)
+      declare { <2 x double>, <2 x i1> } @llvm.masked.load.ff.v2f64.p0(ptr <ptr>, <2 x i1> <mask>)
+      ;; The data is a vector of pointers
+      declare { <8 x ptr>, <8 x i1> } @llvm.masked.load.ff.v8p0.p0(ptr align 8 <ptr>, <8 x i1> <mask>)
+
+Overview:
+"""""""""
+
+Reads a vector from memory according to the provided mask, suppressing faults
+for any lane beyond the first. The mask holds a bit for each vector lane, and
+is used to prevent memory accesses to the masked-off lanes.
+
+Returns the loaded data and a mask indicating which lanes are valid. The
+returned mask may differ from the input mask if the processor encountered a
+reason, such as a potential fault, to avoid loading a lane beyond the first.
+Invalid lanes of the returned data contain poison values.
+
+Arguments:
+""""""""""
+
+The first argument is the base pointer for the load. The second argument,
+mask, is a vector of boolean values with the same number of elements as the
+return type.
+
+The :ref:`align <attr_align>` parameter attribute can be provided for the
+first argument.
+
+Semantics:
+""""""""""
+
+The '``llvm.masked.load.ff``' intrinsic is very similar to the
+'``llvm.vp.load.ff``' intrinsic, with the differences being the lack of an EVL
+parameter and the second returned value being a mask instead of an updated EVL
+value.
+
+If the processor suppresses a fault for any lane, the returned mask will mark
+that lane and all subsequent lanes as inactive.
+
 Memory Use Markers
 ------------------
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 4469ff155b854..1d9267487a886 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2598,6 +2598,13 @@ def int_masked_compressstore:
               [IntrWriteMem, IntrArgMemOnly, NoCapture<ArgIndex<1>>]>;
 
+def int_masked_load_ff:
+    DefaultAttrsIntrinsic<[llvm_anyvector_ty,
+                           LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                          [llvm_anyptr_ty,
+                           LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],
+                          [IntrReadMem, IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
+
 def int_experimental_vector_compress:
     DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                           [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                            LLVMMatchType<0>],
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index da9ceb4f440e5..276685ea041a6 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -1040,6 +1040,85 @@ static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI,
   ModifiedDT = true;
 }
 
+static void scalarizeMaskedFirstFaultingLoad(const DataLayout &DL, CallInst *CI,
+                                             DomTreeUpdater *DTU,
+                                             bool &ModifiedDT) {
+  // For a target without first-faulting load support, we can't actually
+  // scalarize accesses for all lanes.
+  // However, lanes beyond the first may be considered inactive for reasons
+  // other than a fault, so for generic 'scalarization' we can just load the
+  // first lane (if the corresponding input mask bit is active), then mark all
+  // other lanes as inactive in the output mask and embed the first lane into
+  // a vector of poison.
+  Value *Ptr = CI->getArgOperand(0);
+  MaybeAlign AlignVal = CI->getParamAlign(0);
+  Value *Mask = CI->getArgOperand(1);
+  StructType *RetTy = cast<StructType>(CI->getType());
+  VectorType *DataTy = cast<VectorType>(RetTy->getElementType(0));
+  VectorType *MaskTy = cast<VectorType>(RetTy->getElementType(1));
+  Type *ScalarTy = DataTy->getScalarType();
+
+  IRBuilder<> Builder(CI->getContext());
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(CI);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+  Value *EmptyMask = Constant::getNullValue(MaskTy);
+  Value *PoisonData = PoisonValue::get(DataTy);
+
+  // First create a check to determine whether the first lane is active
+  //
+  //  %first.active = extractelement <N x i1> %mask, i64 0
+  //  br i1 %first.active, label %load.ff.first.lane, label %load.ff.result
+  Value *FirstActive =
+      Builder.CreateExtractElement(Mask, uint64_t(0ull), Twine("first.active"));
+  Instruction *ThenTerm =
+      SplitBlockAndInsertIfThen(FirstActive, CI,
+                                /*Unreachable=*/false,
+                                /*BranchWeights=*/nullptr, DTU);
+
+  // If the first mask lane was active, then we want a real load of one element
+  // into the first element of a vector, with the rest being poison.
+  //
+  // load.ff.first.lane:
+  //  %ld.first = load ty, ptr %Ptr
+  //  %lane = insertelement <N x ty> poison, ty %ld.first, i64 0
+  //  br label %load.ff.result
+  BasicBlock *ThenBlock = ThenTerm->getParent();
+  ThenBlock->setName("load.ff.first.lane");
+  Builder.SetInsertPoint(ThenBlock->getTerminator());
+  LoadInst *Load = Builder.CreateAlignedLoad(ScalarTy, Ptr, AlignVal);
+  Value *OneLaneData =
+      Builder.CreateInsertElement(PoisonData, Load, uint64_t(0ull));
+  Value *OneLaneMask = Builder.CreateInsertElement(
+      EmptyMask, Constant::getAllOnesValue(MaskTy->getElementType()),
+      uint64_t(0ull));
+
+  // Now we just select between the two based on the check of the first lane
+  //
+  // load.ff.result:
+  //  %data.res = phi <N x ty> [ poison, %orig ], [ %lane, %load.ff.first.lane ]
+  //  %mask.res = phi <N x i1> [ false, %orig ], [ <i1 true, i1 false, ...>, %load.ff.first.lane ]
+  //  %ins = insertvalue { <N x ty>, <N x i1> } poison, %data.res, 0
+  //  %first.lane.only = insertvalue { <N x ty>, <N x i1> } %ins, ..., 1
+  //  ... replace all intrinsic uses with %first.lane.only
+  Builder.SetInsertPoint(CI);
+  PHINode *ResData = Builder.CreatePHI(DataTy, 2);
+  ResData->addIncoming(PoisonData, IfBlock);
+  ResData->addIncoming(OneLaneData, ThenBlock);
+  PHINode *ResMask = Builder.CreatePHI(MaskTy, 2);
+  ResMask->addIncoming(EmptyMask, IfBlock);
+  ResMask->addIncoming(OneLaneMask, ThenBlock);
+
+  Value *Result = PoisonValue::get(RetTy);
+  Result = Builder.CreateInsertValue(Result, ResData, 0ul);
+  Result = Builder.CreateInsertValue(Result, ResMask, 1ul);
+  if (CI->hasName())
+    Result->setName(CI->getName() + ".first.lane.only");
+  CI->getParent()->setName("load.ff.result");
+  CI->replaceAllUsesWith(Result);
+  CI->eraseFromParent();
+  ModifiedDT = true;
+}
+
 static bool runImpl(Function &F, const TargetTransformInfo &TTI,
                     DominatorTree *DT) {
   std::optional<DomTreeUpdater> DTU;
@@ -1110,11 +1189,18 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
                              DomTreeUpdater *DTU) {
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
   if (II) {
-    // The scalarization code below does not work for scalable vectors.
+    // The scalarization code below does not work for scalable vectors, except
+    // for first-faulting loads, which only need to deal with the first element.
     if (isa<ScalableVectorType>(II->getType()) ||
-        any_of(II->args(),
-               [](Value *V) { return isa<ScalableVectorType>(V->getType()); }))
+        any_of(II->args(), [](Value *V) {
+          return isa<ScalableVectorType>(V->getType());
+        })) {
+      if (II->getIntrinsicID() == Intrinsic::masked_load_ff) {
+        scalarizeMaskedFirstFaultingLoad(DL, CI, DTU, ModifiedDT);
+        return true;
+      }
       return false;
+    }
     switch (II->getIntrinsicID()) {
     default:
       break;
@@ -1185,6 +1271,10 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
       scalarizeMaskedCompressStore(DL, HasBranchDivergence, CI, DTU,
                                    ModifiedDT);
       return true;
+    case Intrinsic::masked_load_ff: {
+      scalarizeMaskedFirstFaultingLoad(DL, CI, DTU, ModifiedDT);
+      return true;
+    }
     }
   }
diff --git a/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll b/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll
new file mode 100644
index 0000000000000..0b8f831480f94
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/masked-load-first-faulting.ll
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O3 -mtriple=aarch64-linux-gnu < %s | FileCheck %s --check-prefix=NEON
+; RUN: llc -O3 -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefix=SVE
+; RUN: llc -O3 -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefix=SME_STREAMING
+
+define { <4 x i32>, <4 x i1> } @load_ff_v4i32(ptr %p, <4 x i1> %mask) {
+; NEON-LABEL: load_ff_v4i32:
+; NEON:       // %bb.0:
+; NEON-NEXT:    // kill: def $d0 killed $d0 def $q0
+; NEON-NEXT:    umov w8, v0.h[0]
+; NEON-NEXT:    tbz w8, #0, .LBB0_2
+; NEON-NEXT:  // %bb.1: // %load.ff.first.lane
+; NEON-NEXT:    mov w8, #1 // =0x1
+; NEON-NEXT:    ldr s0, [x0]
+; NEON-NEXT:    fmov d1, x8
+; NEON-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; NEON-NEXT:    ret
+; NEON-NEXT:  .LBB0_2:
+; NEON-NEXT:    movi v1.2d, #0000000000000000
+; NEON-NEXT:    // implicit-def: $q0
+; NEON-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: load_ff_v4i32:
+; SVE:       // %bb.0:
+; SVE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; SVE-NEXT:    umov w8, v0.h[0]
+; SVE-NEXT:    tbz w8, #0, .LBB0_2
+; SVE-NEXT:  // %bb.1: // %load.ff.first.lane
+; SVE-NEXT:    mov w8, #1 // =0x1
+; SVE-NEXT:    ldr s0, [x0]
+; SVE-NEXT:    fmov d1, x8
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; SVE-NEXT:    ret
+; SVE-NEXT:  .LBB0_2:
+; SVE-NEXT:    movi v1.2d, #0000000000000000
+; SVE-NEXT:    // implicit-def: $q0
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $q1
+; SVE-NEXT:    ret
+;
+; SME_STREAMING-LABEL: load_ff_v4i32:
+; SME_STREAMING:       // %bb.0:
+; SME_STREAMING-NEXT:    fmov w8, s0
+; SME_STREAMING-NEXT:    tbz w8, #0, .LBB0_2
+; SME_STREAMING-NEXT:  // %bb.1: // %load.ff.first.lane
+; SME_STREAMING-NEXT:    ptrue p0.s
+; SME_STREAMING-NEXT:    adrp x8, .LCPI0_1
+; SME_STREAMING-NEXT:    ldr d1, [x8, :lo12:.LCPI0_1]
+; SME_STREAMING-NEXT:    ld1rw { z0.s }, p0/z, [x0]
+; SME_STREAMING-NEXT:    ret
+; SME_STREAMING-NEXT:  .LBB0_2:
+; SME_STREAMING-NEXT:    mov z1.h, #0 // =0x0
+; SME_STREAMING-NEXT:    adrp x8, .LCPI0_0
+; SME_STREAMING-NEXT:    ldr q0, [x8, :lo12:.LCPI0_0]
+; SME_STREAMING-NEXT:    ret
+  %res = call { <4 x i32>, <4 x i1> } @llvm.masked.load.ff(ptr align 16 %p, <4 x i1> %mask)
+  ret { <4 x i32>, <4 x i1> } %res
+}
+
+define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true_fully_aligned(ptr %p) {
+; NEON-LABEL: load_ff_v2f64_all_true_fully_aligned:
+; NEON:       // %bb.0: // %load.ff.first.lane
+; NEON-NEXT:    mov w8, #1 // =0x1
+; NEON-NEXT:    ldr d0, [x0]
+; NEON-NEXT:    fmov d1, x8
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: load_ff_v2f64_all_true_fully_aligned:
+; SVE:       // %bb.0: // %load.ff.first.lane
+; SVE-NEXT:    ldr d0, [x0]
+; SVE-NEXT:    index z1.s, #1, #-1
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $z1
+; SVE-NEXT:    ret
+;
+; SME_STREAMING-LABEL: load_ff_v2f64_all_true_fully_aligned:
+; SME_STREAMING:       // %bb.0: // %load.ff.first.lane
+; SME_STREAMING-NEXT:    ptrue p0.d
+; SME_STREAMING-NEXT:    index z1.s, #1, #-1
+; SME_STREAMING-NEXT:    ld1rd { z0.d }, p0/z, [x0]
+; SME_STREAMING-NEXT:    ret
+  %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.ff(ptr align 16 %p, <2 x i1> <i1 true, i1 true>)
+  ret { <2 x double>, <2 x i1> } %res
+}
+
+define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true_partially_aligned(ptr %p) {
+; NEON-LABEL: load_ff_v2f64_all_true_partially_aligned:
+; NEON:       // %bb.0: // %load.ff.first.lane
+; NEON-NEXT:    mov w8, #1 // =0x1
+; NEON-NEXT:    ldr d0, [x0]
+; NEON-NEXT:    fmov d1, x8
+; NEON-NEXT:    ret
+;
+; SVE-LABEL: load_ff_v2f64_all_true_partially_aligned:
+; SVE:       // %bb.0: // %load.ff.first.lane
+; SVE-NEXT:    ldr d0, [x0]
+; SVE-NEXT:    index z1.s, #1, #-1
+; SVE-NEXT:    // kill: def $d1 killed $d1 killed $z1
+; SVE-NEXT:    ret
+;
+; SME_STREAMING-LABEL: load_ff_v2f64_all_true_partially_aligned:
+; SME_STREAMING:       // %bb.0: // %load.ff.first.lane
+; SME_STREAMING-NEXT:    ptrue p0.d
+; SME_STREAMING-NEXT:    index z1.s, #1, #-1
+; SME_STREAMING-NEXT:    ld1rd { z0.d }, p0/z, [x0]
+; SME_STREAMING-NEXT:    ret
+  %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.ff(ptr align 8 %p, <2 x i1> <i1 true, i1 true>)
+  ret { <2 x double>, <2 x i1> } %res
+}
diff --git a/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll
new file mode 100644
index 0000000000000..3c74ab15f8e23
--- /dev/null
+++ b/llvm/test/Transforms/ScalarizeMaskedMemIntrin/expand-masked-load-first-fault.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -p scalarize-masked-mem-intrin -S < %s | FileCheck %s
+
+define { <4 x i32>, <4 x i1> } @load_ff_v4i32(ptr %p, <4 x i1> %mask) {
+; CHECK-LABEL: define { <4 x i32>, <4 x i1> } @load_ff_v4i32(
+; CHECK-SAME: ptr [[P:%.*]], <4 x i1> [[MASK:%.*]]) {
+; CHECK-NEXT:    [[FIRST_ACTIVE:%.*]] = extractelement <4 x i1> [[MASK]], i64 0
+; CHECK-NEXT:    br i1 [[FIRST_ACTIVE]], label %[[LOAD_FF_FIRST_LANE:.*]], label %[[LOAD_FF_RESULT:.*]]
+; CHECK:       [[LOAD_FF_FIRST_LANE]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i64 0
+; CHECK-NEXT:    br label %[[LOAD_FF_RESULT]]
+; CHECK:       [[LOAD_FF_RESULT]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x i32> [ poison, [[TMP0:%.*]] ], [ [[TMP2]], %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <4 x i1> [ zeroinitializer, [[TMP0]] ], [ <i1 true, i1 false, i1 false, i1 false>, %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <4 x i32>, <4 x i1> } poison, <4 x i32> [[TMP3]], 0
+; CHECK-NEXT:    [[RES_FIRST_LANE_ONLY:%.*]] = insertvalue { <4 x i32>, <4 x i1> } [[TMP5]], <4 x i1> [[TMP4]], 1
+; CHECK-NEXT:    ret { <4 x i32>, <4 x i1> } [[RES_FIRST_LANE_ONLY]]
+;
+  %res = call { <4 x i32>, <4 x i1> } @llvm.masked.load.ff(ptr align 16 %p, <4 x i1> %mask)
+  ret { <4 x i32>, <4 x i1> } %res
+}
+
+;; We can 'scalarize' first-faulting loads for scalable vectors, since we only
+;; need to insert a single element into the start of a poison vector.
+define { <vscale x 4 x i32>, <vscale x 4 x i1> } @load_ff_nxv4i32(ptr %p, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: define { <vscale x 4 x i32>, <vscale x 4 x i1> } @load_ff_nxv4i32(
+; CHECK-SAME: ptr [[P:%.*]], <vscale x 4 x i1> [[MASK:%.*]]) {
+; CHECK-NEXT:    [[FIRST_ACTIVE:%.*]] = extractelement <vscale x 4 x i1> [[MASK]], i64 0
+; CHECK-NEXT:    br i1 [[FIRST_ACTIVE]], label %[[LOAD_FF_FIRST_LANE:.*]], label %[[LOAD_FF_RESULT:.*]]
+; CHECK:       [[LOAD_FF_FIRST_LANE]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP1]], i64 0
+; CHECK-NEXT:    br label %[[LOAD_FF_RESULT]]
+; CHECK:       [[LOAD_FF_RESULT]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <vscale x 4 x i32> [ poison, [[TMP0:%.*]] ], [ [[TMP2]], %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[TMP0]] ], [ insertelement (<vscale x 4 x i1> zeroinitializer, i1 true, i64 0), %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } poison, <vscale x 4 x i32> [[TMP3]], 0
+; CHECK-NEXT:    [[RES:%.*]] = insertvalue { <vscale x 4 x i32>, <vscale x 4 x i1> } [[TMP5]], <vscale x 4 x i1> [[TMP4]], 1
+; CHECK-NEXT:    ret { <vscale x 4 x i32>, <vscale x 4 x i1> } [[RES]]
+;
+  %res = call { <vscale x 4 x i32>, <vscale x 4 x i1> } @llvm.masked.load.ff(ptr align 16 %p, <vscale x 4 x i1> %mask)
+  ret { <vscale x 4 x i32>, <vscale x 4 x i1> } %res
+}
+
+define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true(ptr %p) {
+; CHECK-LABEL: define { <2 x double>, <2 x i1> } @load_ff_v2f64_all_true(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    br i1 true, label %[[LOAD_FF_FIRST_LANE:.*]], label %[[LOAD_FF_RESULT:.*]]
+; CHECK:       [[LOAD_FF_FIRST_LANE]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[P]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i64 0
+; CHECK-NEXT:    br label %[[LOAD_FF_RESULT]]
+; CHECK:       [[LOAD_FF_RESULT]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x double> [ poison, [[TMP0:%.*]] ], [ [[TMP2]], %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <2 x i1> [ zeroinitializer, [[TMP0]] ], [ <i1 true, i1 false>, %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <2 x double>, <2 x i1> } poison, <2 x double> [[TMP3]], 0
+; CHECK-NEXT:    [[RES_FIRST_LANE_ONLY:%.*]] = insertvalue { <2 x double>, <2 x i1> } [[TMP5]], <2 x i1> [[TMP4]], 1
+; CHECK-NEXT:    ret { <2 x double>, <2 x i1> } [[RES_FIRST_LANE_ONLY]]
+;
+  %res = call { <2 x double>, <2 x i1> } @llvm.masked.load.ff(ptr align 16 %p, <2 x i1> <i1 true, i1 true>)
+  ret { <2 x double>, <2 x i1> } %res
+}
+
+define { <16 x i16>, <16 x i1> } @load_ff_v16i16_all_false(ptr %p) {
+; CHECK-LABEL: define { <16 x i16>, <16 x i1> } @load_ff_v16i16_all_false(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    br i1 false, label %[[LOAD_FF_FIRST_LANE:.*]], label %[[LOAD_FF_RESULT:.*]]
+; CHECK:       [[LOAD_FF_FIRST_LANE]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[P]], align 32
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i16> poison, i16 [[TMP1]], i64 0
+; CHECK-NEXT:    br label %[[LOAD_FF_RESULT]]
+; CHECK:       [[LOAD_FF_RESULT]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <16 x i16> [ poison, [[TMP0:%.*]] ], [ [[TMP2]], %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <16 x i1> [ zeroinitializer, [[TMP0]] ], [ <i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, %[[LOAD_FF_FIRST_LANE]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = insertvalue { <16 x i16>, <16 x i1> } poison, <16 x i16> [[TMP3]], 0
+; CHECK-NEXT:    [[RES_FIRST_LANE_ONLY:%.*]] = insertvalue { <16 x i16>, <16 x i1> } [[TMP5]], <16 x i1> [[TMP4]], 1
+; CHECK-NEXT:    ret { <16 x i16>, <16 x i1> } [[RES_FIRST_LANE_ONLY]]
+;
+  %res = call { <16 x i16>, <16 x i1> } @llvm.masked.load.ff(ptr align 32 %p, <16 x i1> zeroinitializer)
+  ret { <16 x i16>, <16 x i1> } %res
+}
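
As additional context, here is a sketch of how a consumer might use the new
intrinsic once targets lower it natively. This is not part of the patch: the
function name @first_nul_byte and the loop structure are hypothetical, and the
sketch assumes only the signature documented in the LangRef hunk above. It
scans a NUL-terminated string in 16-byte chunks without risking a trap when a
chunk straddles an unmapped page: only lanes flagged valid in the returned
mask are inspected, and the offset advances by the number of valid lanes
rather than the full vector width.

define i64 @first_nul_byte(ptr %s) {
entry:
  br label %loop

loop:
  %off = phi i64 [ 0, %entry ], [ %off.next, %loop ]
  %p = getelementptr inbounds i8, ptr %s, i64 %off
  ; Faults for lanes past the first are suppressed; such lanes are instead
  ; reported as invalid in the returned mask.
  %ld = call { <16 x i8>, <16 x i1> } @llvm.masked.load.ff(ptr %p, <16 x i1> splat (i1 true))
  %data = extractvalue { <16 x i8>, <16 x i1> } %ld, 0
  %valid = extractvalue { <16 x i8>, <16 x i1> } %ld, 1
  ; Invalid data lanes are poison, so gate the comparison with a select
  ; (which blocks poison) rather than an 'and' (which propagates it).
  %is.nul = icmp eq <16 x i8> %data, zeroinitializer
  %hit = select <16 x i1> %valid, <16 x i1> %is.nul, <16 x i1> zeroinitializer
  %any = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> %hit)
  ; The valid lanes form a prefix, so the number of bytes actually loaded is
  ; the count of trailing ones in the valid mask.
  %valid.bits = bitcast <16 x i1> %valid to i16
  %not.valid = xor i16 %valid.bits, -1
  %nvalid = call i16 @llvm.cttz.i16(i16 %not.valid, i1 false)
  %adv = zext i16 %nvalid to i64
  %off.next = add i64 %off, %adv
  br i1 %any, label %found, label %loop

found:
  ; Index of the first NUL among the valid lanes.
  %hit.bits = bitcast <16 x i1> %hit to i16
  %tz = call i16 @llvm.cttz.i16(i16 %hit.bits, i1 true)
  %idx = zext i16 %tz to i64
  %len = add i64 %off, %idx
  ret i64 %len
}

The generic lowering added in this patch keeps such a loop correct on targets
without first-faulting loads, though it degrades to one element per iteration;
targets with native support, such as SVE (ldff1) or RISC-V Vector
(fault-only-first loads), can lower the intrinsic to a single instruction.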