diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index cfae4393f2fb2..6e5d53172a493 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -27689,6 +27689,130 @@ The '``llvm.masked.compressstore``' intrinsic is designed for compressing data i Other targets may support this intrinsic differently, for example, by lowering it into a sequence of branches that guard scalar store operations. +Speculative Load Intrinsics +--------------------------- + +LLVM provides intrinsics for speculatively loading memory that may be +out-of-bounds. These intrinsics enable optimizations such as early-exit loop +vectorization, where the vectorized loop may read beyond the end of an array, +provided the access is guaranteed not to trap by target-specific checks. + +.. _int_speculative_load: + +'``llvm.speculative.load``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <4 x float> @llvm.speculative.load.v4f32.p0(ptr <ptr>) + declare <8 x i32> @llvm.speculative.load.v8i32.p0(ptr <ptr>) + declare i64 @llvm.speculative.load.i64.p0(ptr <ptr>) + +Overview: +""""""""" + +The '``llvm.speculative.load``' intrinsic loads a value from memory. Unlike a +regular load, the memory access may extend beyond the bounds of the allocated +object, provided the pointer has been verified by +:ref:`llvm.can.load.speculatively <int_can_load_speculatively>` to ensure the +access cannot fault. + +Arguments: +"""""""""" + +The argument is a pointer to the memory location to load from. The return type +must have a power-of-2 size in bytes. + +Semantics: +"""""""""" + +The '``llvm.speculative.load``' intrinsic performs a load that may access +memory beyond what is accessible through the pointer. It must be used in +combination with :ref:`llvm.can.load.speculatively <int_can_load_speculatively>` +to ensure the access can be performed speculatively. + +A byte at ``ptr + i`` is *accessible through* ``ptr`` if both of the following +hold: + +1.
The byte lies within the bounds of an allocated object that ``ptr`` is + :ref:`based <pointeraliasing>` on. +2. Accessing the byte through ``ptr`` does not violate any ``noalias`` + constraints. + +For accessible bytes, the intrinsic returns the stored value. For inaccessible +bytes, the intrinsic returns ``poison`` and the bytes are not considered accessed +for the purpose of data races or ``noalias`` constraints. At least the first +byte must be accessible; otherwise the behavior is undefined. + +The behavior is undefined if program execution depends on any byte in the +result that may not be accessible. + +The behavior is undefined if this intrinsic is used to load from a pointer +for which ``llvm.can.load.speculatively`` would return false. + +.. _int_can_load_speculatively: + +'``llvm.can.load.speculatively``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i1 @llvm.can.load.speculatively.p0(ptr <ptr>, i64 <num_bytes>) + declare i1 @llvm.can.load.speculatively.p1(ptr addrspace(1) <ptr>, i64 <num_bytes>) + +Overview: +""""""""" + +The '``llvm.can.load.speculatively``' intrinsic returns ``true`` if it is safe +to speculatively load ``num_bytes`` bytes starting from ``ptr``, +even if the memory may be beyond the bounds of an allocated object. + +Arguments: +"""""""""" + +The first argument is a pointer to the memory location. + +The second argument is an i64 specifying the size in bytes of the load. +The size must be a positive power of 2. If the size is not a power of 2, the +result is ``poison``. + +Semantics: +"""""""""" + +This intrinsic has **target-dependent** semantics. It returns ``true`` if +``num_bytes`` bytes starting at ``ptr + I * num_bytes``, for all non-negative +integers ``I`` where the computed address does not wrap around the address +space, can be loaded speculatively, even if the memory is beyond the bounds of +an allocated object. It returns ``false`` otherwise.
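To make the strided guarantee above concrete, the following standalone model sketches one plausible target policy: answer true exactly when ``ptr`` is aligned to ``num_bytes``, so that every ``num_bytes``-sized chunk at ``ptr + I * num_bytes`` sits entirely inside a single page and therefore cannot fault partway through. The page size, function names, and the policy itself are illustrative assumptions for exposition, not part of LLVM.

```python
PAGE_SIZE = 4096  # illustrative; a real target would query its page/granule size

def can_load_speculatively(ptr: int, num_bytes: int) -> bool:
    # Model of an alignment-based check: a power-of-2-sized, size-aligned
    # chunk can never straddle a page boundary.
    assert num_bytes > 0 and (num_bytes & (num_bytes - 1)) == 0
    return num_bytes <= PAGE_SIZE and ptr % num_bytes == 0

def chunk_stays_on_one_page(ptr: int, num_bytes: int, i: int) -> bool:
    # The I-th strided chunk, as in the semantics: [ptr + I*num_bytes,
    # ptr + (I+1)*num_bytes), checked to lie within a single page.
    start = ptr + i * num_bytes
    end = start + num_bytes - 1
    return start // PAGE_SIZE == end // PAGE_SIZE

# Exhaustively confirm the guarantee for a few sizes: whenever the check
# passes, every strided chunk lies within a single page.
for size in (1, 2, 4, 8, 16):
    for ptr in range(0, 2 * PAGE_SIZE):
        if can_load_speculatively(ptr, size):
            assert all(chunk_stays_on_one_page(ptr, size, i) for i in range(16))
print("alignment check implies no page-boundary crossing")
```

Under this model, a chunk whose first byte is readable is readable in full, which is exactly the property the vectorizer relies on when it probes only the first element of each iteration's load.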
+ +The specific conditions under which this intrinsic returns ``true`` are +determined by the target. For example, a target may check whether the pointer +alignment guarantees all such loads cannot cross a page boundary. + +.. code-block:: llvm + + ; Check if we can safely load 16 bytes from %ptr + %can_load = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 16) + br i1 %can_load, label %speculative_path, label %safe_path + + speculative_path: + ; Safe to speculatively load from %ptr + %vec = call <4 x i32> @llvm.speculative.load.v4i32.p0(ptr %ptr) + ... + + safe_path: + ; Fall back to masked load or scalar operations + ... + + Memory Use Markers ------------------ diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 454be56aed6cc..e4be1daa2aa90 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -910,6 +910,8 @@ class TargetTransformInfoImplBase { switch (ICA.getID()) { default: break; + case Intrinsic::speculative_load: + return InstructionCost::getInvalid(); case Intrinsic::allow_runtime_check: case Intrinsic::allow_ubsan_check: case Intrinsic::annotation: diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 68874c59be4b8..85963bb5c12e8 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1994,6 +1994,9 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // The cost of materialising a constant integer vector. return TargetTransformInfo::TCC_Basic; } + case Intrinsic::speculative_load: + // Delegate to base; targets must opt-in with a valid cost. 
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind); case Intrinsic::vector_extract: { // FIXME: Handle case where a scalable vector is extracted from a scalable // vector diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 37002d3bc227f..d69b070aa0268 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2292,6 +2292,19 @@ class LLVM_ABI TargetLoweringBase { llvm_unreachable("Store conditional unimplemented on this target"); } + /// Emit code to check if a speculative load of the given size from Ptr is + /// safe. Returns a Value* representing the check result (i1), or nullptr + /// to use the default lowering (which returns false). Targets can override + /// to provide their own safety check (e.g., alignment-based page boundary + /// check). + /// \param Builder IRBuilder positioned at the intrinsic call site + /// \param Ptr the pointer operand + /// \param Size the size in bytes (constant or runtime value for scalable) + virtual Value *emitCanLoadSpeculatively(IRBuilderBase &Builder, Value *Ptr, + Value *Size) const { + return nullptr; + } + /// Perform a masked atomicrmw using a target-specific intrinsic. This /// represents the core LL/SC loop which will be lowered at a late stage by /// the backend. The target-specific intrinsic returns the loaded value and diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index a1c91486f7c3c..c417d5de1a97e 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2603,6 +2603,20 @@ def int_experimental_vector_compress: [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], [IntrNoMem]>; +// Speculatively load a value from memory; lowers to a regular aligned load. +// The loaded type must have a power-of-2 size. 
+def int_speculative_load: + DefaultAttrsIntrinsic<[llvm_any_ty], + [llvm_anyptr_ty], + [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>]>; + +// Returns true if it's safe to speculatively load 'num_bytes' from 'ptr'. +// The size can be a runtime value to support scalable vectors. +def int_can_load_speculatively: + DefaultAttrsIntrinsic<[llvm_i1_ty], + [llvm_anyptr_ty, llvm_i64_ty], + [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; + // Test whether a pointer is associated with a type metadata identifier. def int_type_test : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty], [IntrNoMem, IntrSpeculatable]>; diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 0544995f979f7..167db2a197095 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -136,6 +136,39 @@ static bool lowerLoadRelative(Function &F) { return Changed; } +/// Lower @llvm.can.load.speculatively using target-specific expansion. +/// Each target provides its own expansion via +/// TargetLowering::emitCanLoadSpeculatively. +/// The default expansion returns false (conservative).
+static bool lowerCanLoadSpeculatively(Function &F, const TargetMachine *TM) { + bool Changed = false; + + for (Use &U : llvm::make_early_inc_range(F.uses())) { + auto *CI = dyn_cast<CallInst>(U.getUser()); + if (!CI || CI->getCalledOperand() != &F) + continue; + + Function *ParentFunc = CI->getFunction(); + const TargetLowering *TLI = + TM->getSubtargetImpl(*ParentFunc)->getTargetLowering(); + + IRBuilder<> Builder(CI); + Value *Ptr = CI->getArgOperand(0); + Value *Size = CI->getArgOperand(1); + + // Ask target for expansion; nullptr means use default (return false) + Value *Result = TLI->emitCanLoadSpeculatively(Builder, Ptr, Size); + if (!Result) + Result = Builder.getFalse(); + + CI->replaceAllUsesWith(Result); + CI->eraseFromParent(); + Changed = true; + } + + return Changed; +} + // ObjCARC has knowledge about whether an obj-c runtime function needs to be // always tail-called or never tail-called. static CallInst::TailCallKind getOverridingTailCallKind(const Function &F) { @@ -692,6 +725,9 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const { case Intrinsic::load_relative: Changed |= lowerLoadRelative(F); break; + case Intrinsic::can_load_speculatively: + Changed |= lowerCanLoadSpeculatively(F, TM); + break; case Intrinsic::is_constant: case Intrinsic::objectsize: Changed |= forEachCall(F, [&](CallInst *CI) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 2d20fe5d48517..98e88c127fa51 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5128,6 +5128,33 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) { setValue(&I, Res); } +void SelectionDAGBuilder::visitSpeculativeLoad(const CallInst &I) { + SDLoc sdl = getCurSDLoc(); + Value *PtrOperand = I.getArgOperand(0); + SDValue Ptr = getValue(PtrOperand); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT
VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + Align Alignment = I.getParamAlign(0).valueOrOne(); + AAMDNodes AAInfo = I.getAAMetadata(); + TypeSize StoreSize = VT.getStoreSize(); + + SDValue InChain = DAG.getRoot(); + + // Use MOLoad but NOT MODereferenceable - the memory may not be + // fully dereferenceable. + MachineMemOperand::Flags MMOFlags = MachineMemOperand::MOLoad; + LocationSize LocSize = StoreSize.isScalable() + ? LocationSize::beforeOrAfterPointer() + : LocationSize::precise(StoreSize); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo(PtrOperand), MMOFlags, LocSize, Alignment, AAInfo); + + SDValue Load = DAG.getLoad(VT, sdl, InChain, Ptr, MMO); + PendingLoads.push_back(Load.getValue(1)); + setValue(&I, Load); +} + void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { SDLoc sdl = getCurSDLoc(); @@ -6883,6 +6910,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::masked_compressstore: visitMaskedStore(I, true /* IsCompressing */); return; + case Intrinsic::speculative_load: + visitSpeculativeLoad(I); + return; case Intrinsic::powi: setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), DAG)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index f8aecea25b3d6..dad406f48b77b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -619,6 +619,7 @@ class SelectionDAGBuilder { void visitStore(const StoreInst &I); void visitMaskedLoad(const CallInst &I, bool IsExpanding = false); void visitMaskedStore(const CallInst &I, bool IsCompressing = false); + void visitSpeculativeLoad(const CallInst &I); void visitMaskedGather(const CallInst &I); void visitMaskedScatter(const CallInst &I); void visitAtomicCmpXchg(const AtomicCmpXchgInst &I); diff --git a/llvm/lib/IR/Verifier.cpp 
b/llvm/lib/IR/Verifier.cpp index f986f5406b2b3..a65c662b03503 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6753,6 +6753,24 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { &Call); break; } + case Intrinsic::speculative_load: { + Type *LoadTy = Call.getType(); + TypeSize Size = DL.getTypeStoreSize(LoadTy); + // For scalable vectors, check the known minimum size is a power of 2. + Check(Size.getKnownMinValue() > 0 && isPowerOf2_64(Size.getKnownMinValue()), + "llvm.speculative.load type must have a power-of-2 size", &Call); + break; + } + case Intrinsic::can_load_speculatively: { + // If size is a constant, verify it's a positive power of 2. + if (auto *SizeCI = dyn_cast<ConstantInt>(Call.getArgOperand(1))) { + uint64_t Size = SizeCI->getZExtValue(); + Check(Size > 0 && isPowerOf2_64(Size), + "llvm.can.load.speculatively size must be a positive power of 2", + &Call); + } + break; + } case Intrinsic::vector_insert: { Value *Vec = Call.getArgOperand(0); Value *SubVec = Call.getArgOperand(1); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 713b40d97c9fd..5f682a1959152 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -30507,6 +30507,56 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, return CI; } +Value *AArch64TargetLowering::emitCanLoadSpeculatively(IRBuilderBase &Builder, + Value *Ptr, + Value *Size) const { + unsigned AS = cast<PointerType>(Ptr->getType())->getAddressSpace(); + // Conservatively only allow speculation for address space 0. + if (AS != 0) + return nullptr; + // For power-of-2 sizes <= 16, emit alignment check: (ptr & (size - 1)) == 0. + // If the pointer is aligned to at least 'size' bytes, loading 'size' bytes + // cannot cross a page boundary, so it's safe to speculate.
+ // The 16-byte limit ensures correctness with MTE (memory tagging), since + // MTE uses 16-byte tag granules. + // + // The alignment check only works for power-of-2 sizes. For non-power-of-2 + // sizes, we conservatively return false. + const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout(); + + unsigned PtrBits = DL.getPointerSizeInBits(AS); + Type *IntPtrTy = Builder.getIntNTy(PtrBits); + if (auto *CI = dyn_cast(Size)) { + uint64_t SizeVal = CI->getZExtValue(); + assert(isPowerOf2_64(SizeVal) && "size must be power-of-two"); + // For constant sizes > 16, return nullptr (default false). + if (SizeVal > 16) + return nullptr; + + // Power-of-2 constant size <= 16: use fast alignment check. + Value *PtrInt = Builder.CreatePtrToInt(Ptr, IntPtrTy); + Value *Mask = ConstantInt::get(IntPtrTy, SizeVal - 1); + Value *Masked = Builder.CreateAnd(PtrInt, Mask); + return Builder.CreateICmpEQ(Masked, ConstantInt::get(IntPtrTy, 0)); + } + + // Check power-of-2 size <= 16 and alignment. 
+ Value *PtrInt = Builder.CreatePtrToInt(Ptr, IntPtrTy); + Value *SizeExt = Builder.CreateZExtOrTrunc(Size, IntPtrTy); + + Value *SizeLE16 = + Builder.CreateICmpULE(SizeExt, ConstantInt::get(IntPtrTy, 16)); + + // alignment check: (ptr & (size - 1)) == 0 + Value *SizeMinusOne = + Builder.CreateSub(SizeExt, ConstantInt::get(IntPtrTy, 1)); + Value *Masked = Builder.CreateAnd(PtrInt, SizeMinusOne); + Value *AlignCheck = + Builder.CreateICmpEQ(Masked, ConstantInt::get(IntPtrTy, 0)); + + return Builder.CreateAnd(SizeLE16, AlignCheck); +} + bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 6ecea4f6e2d5e..00d57be1004f9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -350,6 +350,8 @@ class AArch64TargetLowering : public TargetLowering { AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; + Value *emitCanLoadSpeculatively(IRBuilderBase &Builder, Value *Ptr, + Value *Size) const override; void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 71f52ae55d3ec..9b1e4c50a86e2 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -628,6 +628,25 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return InstructionCost::getInvalid(); switch (ICA.getID()) { + case Intrinsic::speculative_load: { + // Speculative loads are only valid for types <= 16 bytes due to MTE + // (Memory Tagging Extension) using 16-byte tag granules. 
Loads larger + // than 16 bytes could cross a tag granule boundary. + auto LT = getTypeLegalizationCost(RetTy); + if (!LT.first.isValid()) + return InstructionCost::getInvalid(); + // For scalable vectors, check that we use a single register (which means + // <= 16 bytes at minimum vscale). For fixed types, compute the actual size. + if (isa<ScalableVectorType>(RetTy)) { + if (LT.first.getValue() != 1) + return InstructionCost::getInvalid(); + } else { + if (LT.first.getValue() * LT.second.getStoreSize() > 16) + return InstructionCost::getInvalid(); + } + // Return cost of a regular load. + return getMemoryOpCost(Instruction::Load, RetTy, Align(1), 0, CostKind); + } case Intrinsic::experimental_vector_histogram_add: { InstructionCost HistCost = getHistogramCost(ST, ICA); // If the cost isn't valid, we may still be able to scalarize diff --git a/llvm/test/Analysis/CostModel/AArch64/speculative-load.ll b/llvm/test/Analysis/CostModel/AArch64/speculative-load.ll new file mode 100644 index 0000000000000..7587c3ec77b40 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/speculative-load.ll @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 < %s | FileCheck %s --check-prefixes=COMMON,NEON +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s --check-prefixes=COMMON,SVE + +define void @speculative_load_cost_fixed(ptr %p) { + ; Scalar types - all valid (<= 16 bytes) +; COMMON-LABEL: 'speculative_load_cost_fixed' +; COMMON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call i8 @llvm.speculative.load.i8.p0(ptr %p) +; COMMON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call i16 @llvm.speculative.load.i16.p0(ptr %p) +; COMMON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call i32 @llvm.speculative.load.i32.p0(ptr %p) +; COMMON-NEXT: Cost Model: Found an estimated cost
of 1 for instruction: %4 = call i64 @llvm.speculative.load.i64.p0(ptr %p) +; COMMON-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = call i128 @llvm.speculative.load.i128.p0(ptr %p) +; COMMON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <2 x i32> @llvm.speculative.load.v2i32.p0(ptr %p) +; COMMON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <4 x i32> @llvm.speculative.load.v4i32.p0(ptr %p) +; COMMON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <2 x i64> @llvm.speculative.load.v2i64.p0(ptr %p) +; COMMON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <4 x float> @llvm.speculative.load.v4f32.p0(ptr %p) +; COMMON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <2 x double> @llvm.speculative.load.v2f64.p0(ptr %p) +; COMMON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <8 x i8> @llvm.speculative.load.v8i8.p0(ptr %p) +; COMMON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <16 x i8> @llvm.speculative.load.v16i8.p0(ptr %p) +; COMMON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <4 x i16> @llvm.speculative.load.v4i16.p0(ptr %p) +; COMMON-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <8 x i16> @llvm.speculative.load.v8i16.p0(ptr %p) +; COMMON-NEXT: Cost Model: Invalid cost for instruction: %15 = call <8 x i32> @llvm.speculative.load.v8i32.p0(ptr %p) +; COMMON-NEXT: Cost Model: Invalid cost for instruction: %16 = call <4 x i64> @llvm.speculative.load.v4i64.p0(ptr %p) +; COMMON-NEXT: Cost Model: Invalid cost for instruction: %17 = call <32 x i8> @llvm.speculative.load.v32i8.p0(ptr %p) +; COMMON-NEXT: Cost Model: Invalid cost for instruction: %18 = call <16 x i16> @llvm.speculative.load.v16i16.p0(ptr %p) +; COMMON-NEXT: Cost Model: Invalid cost for instruction: %19 = call <8 x float> 
@llvm.speculative.load.v8f32.p0(ptr %p) +; COMMON-NEXT: Cost Model: Invalid cost for instruction: %20 = call <4 x double> @llvm.speculative.load.v4f64.p0(ptr %p) +; COMMON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call i8 @llvm.speculative.load.i8.p0(ptr %p) + call i16 @llvm.speculative.load.i16.p0(ptr %p) + call i32 @llvm.speculative.load.i32.p0(ptr %p) + call i64 @llvm.speculative.load.i64.p0(ptr %p) + call i128 @llvm.speculative.load.i128.p0(ptr %p) + + ; Vector types <= 16 bytes - valid + call <2 x i32> @llvm.speculative.load.v2i32.p0(ptr %p) + call <4 x i32> @llvm.speculative.load.v4i32.p0(ptr %p) + call <2 x i64> @llvm.speculative.load.v2i64.p0(ptr %p) + call <4 x float> @llvm.speculative.load.v4f32.p0(ptr %p) + call <2 x double> @llvm.speculative.load.v2f64.p0(ptr %p) + call <8 x i8> @llvm.speculative.load.v8i8.p0(ptr %p) + call <16 x i8> @llvm.speculative.load.v16i8.p0(ptr %p) + call <4 x i16> @llvm.speculative.load.v4i16.p0(ptr %p) + call <8 x i16> @llvm.speculative.load.v8i16.p0(ptr %p) + + ; Vector types > 16 bytes - invalid + call <8 x i32> @llvm.speculative.load.v8i32.p0(ptr %p) + call <4 x i64> @llvm.speculative.load.v4i64.p0(ptr %p) + call <32 x i8> @llvm.speculative.load.v32i8.p0(ptr %p) + call <16 x i16> @llvm.speculative.load.v16i16.p0(ptr %p) + call <8 x float> @llvm.speculative.load.v8f32.p0(ptr %p) + call <4 x double> @llvm.speculative.load.v4f64.p0(ptr %p) + ret void +} + +define void @speculative_load_cost_scalable(ptr %p) { +; NEON-LABEL: 'speculative_load_cost_scalable' +; NEON-NEXT: Cost Model: Invalid cost for instruction: %1 = call <vscale x 2 x i64> @llvm.speculative.load.nxv2i64.p0(ptr %p) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %2 = call <vscale x 4 x i32> @llvm.speculative.load.nxv4i32.p0(ptr %p) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %3 = call <vscale x 8 x i16> @llvm.speculative.load.nxv8i16.p0(ptr %p) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %4 = call <vscale x 16 x i8> @llvm.speculative.load.nxv16i8.p0(ptr %p) +;
NEON-NEXT: Cost Model: Invalid cost for instruction: %5 = call <vscale x 2 x double> @llvm.speculative.load.nxv2f64.p0(ptr %p) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 4 x float> @llvm.speculative.load.nxv4f32.p0(ptr %p) +; NEON-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 8 x float> @llvm.speculative.load.nxv8f32.p0(ptr %p) +; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SVE-LABEL: 'speculative_load_cost_scalable' +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call <vscale x 2 x i64> @llvm.speculative.load.nxv2i64.p0(ptr %p) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <vscale x 4 x i32> @llvm.speculative.load.nxv4i32.p0(ptr %p) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <vscale x 8 x i16> @llvm.speculative.load.nxv8i16.p0(ptr %p) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <vscale x 16 x i8> @llvm.speculative.load.nxv16i8.p0(ptr %p) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <vscale x 2 x double> @llvm.speculative.load.nxv2f64.p0(ptr %p) +; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 4 x float> @llvm.speculative.load.nxv4f32.p0(ptr %p) +; SVE-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 8 x float> @llvm.speculative.load.nxv8f32.p0(ptr %p) +; SVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + ; Scalable vector types - invalid without SVE, valid with SVE if <= 16 bytes + call <vscale x 2 x i64> @llvm.speculative.load.nxv2i64.p0(ptr %p) + call <vscale x 4 x i32> @llvm.speculative.load.nxv4i32.p0(ptr %p) + call <vscale x 8 x i16> @llvm.speculative.load.nxv8i16.p0(ptr %p) + call <vscale x 16 x i8> @llvm.speculative.load.nxv16i8.p0(ptr %p) + call <vscale x 2 x double> @llvm.speculative.load.nxv2f64.p0(ptr %p) + call <vscale x 4 x float> @llvm.speculative.load.nxv4f32.p0(ptr %p) + call <vscale x 8 x float> @llvm.speculative.load.nxv8f32.p0(ptr %p) + + ret void +} diff --git a/llvm/test/Analysis/CostModel/X86/speculative-load.ll b/llvm/test/Analysis/CostModel/X86/speculative-load.ll new file mode 100644 index 0000000000000..edf6311d7cfc7 ---
/dev/null +++ b/llvm/test/Analysis/CostModel/X86/speculative-load.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt -passes="print<cost-model>" 2>&1 -disable-output -mtriple=x86_64 < %s | FileCheck %s + +; X86 does not implement emitCanLoadSpeculatively, so speculative_load +; should return invalid cost to prevent vectorizers from using it. + +define void @speculative_load_cost(ptr %p) { +; CHECK-LABEL: 'speculative_load_cost' +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %1 = call i8 @llvm.speculative.load.i8.p0(ptr %p) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %2 = call i16 @llvm.speculative.load.i16.p0(ptr %p) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %3 = call i32 @llvm.speculative.load.i32.p0(ptr %p) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %4 = call i64 @llvm.speculative.load.i64.p0(ptr %p) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %5 = call <4 x i32> @llvm.speculative.load.v4i32.p0(ptr %p) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <8 x i32> @llvm.speculative.load.v8i32.p0(ptr %p) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <2 x i64> @llvm.speculative.load.v2i64.p0(ptr %p) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <4 x float> @llvm.speculative.load.v4f32.p0(ptr %p) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <2 x double> @llvm.speculative.load.v2f64.p0(ptr %p) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call i8 @llvm.speculative.load.i8.p0(ptr %p) + call i16 @llvm.speculative.load.i16.p0(ptr %p) + call i32 @llvm.speculative.load.i32.p0(ptr %p) + call i64 @llvm.speculative.load.i64.p0(ptr %p) + call <4 x i32> @llvm.speculative.load.v4i32.p0(ptr %p) + call <8 x i32> @llvm.speculative.load.v8i32.p0(ptr %p) + call <2 x i64> @llvm.speculative.load.v2i64.p0(ptr %p) + call <4 x float>
@llvm.speculative.load.v4f32.p0(ptr %p) + call <2 x double> @llvm.speculative.load.v2f64.p0(ptr %p) + ret void +} diff --git a/llvm/test/CodeGen/AArch64/can-load-speculatively.ll b/llvm/test/CodeGen/AArch64/can-load-speculatively.ll new file mode 100644 index 0000000000000..b6679f22b0989 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/can-load-speculatively.ll @@ -0,0 +1,120 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=aarch64-unknown-linux-gnu -passes=pre-isel-intrinsic-lowering -S < %s | FileCheck %s + +; Test that @llvm.can.load.speculatively is lowered to an alignment check +; for power-of-2 sizes <= 16 bytes on AArch64, and returns false for larger sizes. +; The 16-byte limit ensures correctness with MTE (memory tagging). +; Note: non-power-of-2 constant sizes are rejected by the verifier. + +define i1 @can_load_speculatively_16(ptr %ptr) { +; CHECK-LABEL: @can_load_speculatively_16( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 15 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0 +; CHECK-NEXT: ret i1 [[TMP3]] +; + %can_load = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 16) + ret i1 %can_load +} + + +define i1 @can_load_speculatively_8_ptr_aligned(ptr align 8 %ptr) { +; CHECK-LABEL: @can_load_speculatively_8_ptr_aligned( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 15 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0 +; CHECK-NEXT: ret i1 [[TMP3]] +; + %can_load = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 16) + ret i1 %can_load +} + +define i1 @can_load_speculatively_16_ptr_aligned(ptr align 16 %ptr) { +; CHECK-LABEL: @can_load_speculatively_16_ptr_aligned( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 15 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0 +; CHECK-NEXT: ret i1 [[TMP3]] 
+; + %can_load = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 16) + ret i1 %can_load +} + +define i1 @can_load_speculatively_16_ptr_aligned2(ptr align 16 %ptr) { +; CHECK-LABEL: @can_load_speculatively_16_ptr_aligned2( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 15 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0 +; CHECK-NEXT: ret i1 [[TMP3]] +; + %can_load = call i1 @llvm.can.load.speculatively.p0(ptr align 16 %ptr, i64 16) + ret i1 %can_load +} + +define i1 @can_load_speculatively_32_ptr_aligned(ptr align 32 %ptr) { +; CHECK-LABEL: @can_load_speculatively_32_ptr_aligned( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 15 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0 +; CHECK-NEXT: ret i1 [[TMP3]] +; + %can_load = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 16) + ret i1 %can_load +} + +; Size > 16 - returns false (may cross MTE tag granule boundary) +define i1 @can_load_speculatively_32(ptr %ptr) { +; CHECK-LABEL: @can_load_speculatively_32( +; CHECK-NEXT: ret i1 false +; + %can_load = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 32) + ret i1 %can_load +} + +; Size > 16 - returns false (may cross MTE tag granule boundary) +define i1 @can_load_speculatively_64(ptr %ptr) { +; CHECK-LABEL: @can_load_speculatively_64( +; CHECK-NEXT: ret i1 false +; + %can_load = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 64) + ret i1 %can_load +} + +; Test with address space +define i1 @can_load_speculatively_addrspace1(ptr addrspace(1) %ptr) { +; CHECK-LABEL: @can_load_speculatively_addrspace1( +; CHECK-NEXT: ret i1 false +; + %can_load = call i1 @llvm.can.load.speculatively.p1(ptr addrspace(1) %ptr, i64 16) + ret i1 %can_load +} + +; Test size 8 (within limit, power-of-2) +define i1 @can_load_speculatively_8(ptr %ptr) { +; CHECK-LABEL: @can_load_speculatively_8( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr 
[[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP1]], 7 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0 +; CHECK-NEXT: ret i1 [[TMP3]] +; + %can_load = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 8) + ret i1 %can_load +} + +; Test with runtime size - checks size <= 16 and alignment +define i1 @can_load_speculatively_runtime(ptr %ptr, i64 %size) { +; CHECK-LABEL: @can_load_speculatively_runtime( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[SIZE:%.*]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = and i1 [[TMP2]], [[TMP5]] +; CHECK-NEXT: ret i1 [[TMP6]] +; + %can_load = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 %size) + ret i1 %can_load +} + +declare i1 @llvm.can.load.speculatively.p0(ptr, i64) +declare i1 @llvm.can.load.speculatively.p1(ptr addrspace(1), i64) diff --git a/llvm/test/CodeGen/AArch64/speculative-load-intrinsic-sve.ll b/llvm/test/CodeGen/AArch64/speculative-load-intrinsic-sve.ll new file mode 100644 index 0000000000000..78a56f3539d11 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/speculative-load-intrinsic-sve.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-unknown-linux-gnu -mattr=+sve < %s | FileCheck %s + +; Test that @llvm.speculative.load with scalable vectors is lowered to a +; regular load in SelectionDAG. 
+
+define <vscale x 4 x i32> @speculative_load_nxv4i32(ptr %ptr) {
+; CHECK-LABEL: speculative_load_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    ret
+  %load = call <vscale x 4 x i32> @llvm.speculative.load.nxv4i32.p0(ptr align 16 %ptr)
+  ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 2 x i64> @speculative_load_nxv2i64(ptr %ptr) {
+; CHECK-LABEL: speculative_load_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    ret
+  %load = call <vscale x 2 x i64> @llvm.speculative.load.nxv2i64.p0(ptr %ptr)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 8 x i16> @speculative_load_nxv8i16(ptr %ptr) {
+; CHECK-LABEL: speculative_load_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    ret
+  %load = call <vscale x 8 x i16> @llvm.speculative.load.nxv8i16.p0(ptr align 8 %ptr)
+  ret <vscale x 8 x i16> %load
+}
+
+define <vscale x 16 x i8> @speculative_load_nxv16i8(ptr %ptr) {
+; CHECK-LABEL: speculative_load_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    ret
+  %load = call <vscale x 16 x i8> @llvm.speculative.load.nxv16i8.p0(ptr %ptr)
+  ret <vscale x 16 x i8> %load
+}
+
+define <vscale x 4 x float> @speculative_load_nxv4f32(ptr %ptr) {
+; CHECK-LABEL: speculative_load_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    ret
+  %load = call <vscale x 4 x float> @llvm.speculative.load.nxv4f32.p0(ptr align 4 %ptr)
+  ret <vscale x 4 x float> %load
+}
+
+define <vscale x 2 x double> @speculative_load_nxv2f64(ptr %ptr) {
+; CHECK-LABEL: speculative_load_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr z0, [x0]
+; CHECK-NEXT:    ret
+  %load = call <vscale x 2 x double> @llvm.speculative.load.nxv2f64.p0(ptr align 16 %ptr)
+  ret <vscale x 2 x double> %load
+}
+
+declare <vscale x 4 x i32> @llvm.speculative.load.nxv4i32.p0(ptr)
+declare <vscale x 2 x i64> @llvm.speculative.load.nxv2i64.p0(ptr)
+declare <vscale x 8 x i16> @llvm.speculative.load.nxv8i16.p0(ptr)
+declare <vscale x 16 x i8> @llvm.speculative.load.nxv16i8.p0(ptr)
+declare <vscale x 4 x float> @llvm.speculative.load.nxv4f32.p0(ptr)
+declare <vscale x 2 x double> @llvm.speculative.load.nxv2f64.p0(ptr)
diff --git a/llvm/test/CodeGen/AArch64/speculative-load-intrinsic.ll b/llvm/test/CodeGen/AArch64/speculative-load-intrinsic.ll
new file mode 100644
index 0000000000000..8f9a17414ff05
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/speculative-load-intrinsic.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+; Test that @llvm.speculative.load is lowered to a regular load
+; in SelectionDAG, respecting the alignment attribute.
+
+define <4 x i32> @speculative_load_v4i32_align16(ptr %ptr) {
+; CHECK-LABEL: speculative_load_v4i32_align16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
+  %load = call <4 x i32> @llvm.speculative.load.v4i32.p0(ptr align 16 %ptr)
+  ret <4 x i32> %load
+}
+
+define <4 x i32> @speculative_load_v4i32_align4(ptr %ptr) {
+; CHECK-LABEL: speculative_load_v4i32_align4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
+  %load = call <4 x i32> @llvm.speculative.load.v4i32.p0(ptr align 4 %ptr)
+  ret <4 x i32> %load
+}
+
+define <4 x i32> @speculative_load_v4i32_noalign(ptr %ptr) {
+; CHECK-LABEL: speculative_load_v4i32_noalign:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
+  %load = call <4 x i32> @llvm.speculative.load.v4i32.p0(ptr %ptr)
+  ret <4 x i32> %load
+}
+
+define <8 x i32> @speculative_load_v8i32(ptr %ptr) {
+; CHECK-LABEL: speculative_load_v8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldp q0, q1, [x0]
+; CHECK-NEXT:    ret
+  %load = call <8 x i32> @llvm.speculative.load.v8i32.p0(ptr align 32 %ptr)
+  ret <8 x i32> %load
+}
+
+define <2 x i64> @speculative_load_v2i64(ptr %ptr) {
+; CHECK-LABEL: speculative_load_v2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
+  %load = call <2 x i64> @llvm.speculative.load.v2i64.p0(ptr %ptr)
+  ret <2 x i64> %load
+}
+
+define <4 x float> @speculative_load_v4f32(ptr %ptr) {
+; CHECK-LABEL: speculative_load_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
+  %load = call <4 x float> @llvm.speculative.load.v4f32.p0(ptr align 8 %ptr)
+  ret <4 x float> %load
+}
+
+define <2 x double> @speculative_load_v2f64(ptr %ptr) {
+; CHECK-LABEL: speculative_load_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr q0, [x0]
+; CHECK-NEXT:    ret
+  %load = call <2 x double> @llvm.speculative.load.v2f64.p0(ptr align 16 %ptr)
+  ret <2 x double> %load
+}
+
+declare <4 x i32> @llvm.speculative.load.v4i32.p0(ptr)
+declare <8 x i32> @llvm.speculative.load.v8i32.p0(ptr)
+declare <2 x i64> @llvm.speculative.load.v2i64.p0(ptr)
+declare <4 x float> @llvm.speculative.load.v4f32.p0(ptr)
+declare <2 x double> @llvm.speculative.load.v2f64.p0(ptr)
+
+; Scalar type tests
+
+define i32 @speculative_load_i32(ptr %ptr) {
+; CHECK-LABEL: speculative_load_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr w0, [x0]
+; CHECK-NEXT:    ret
+  %load = call i32 @llvm.speculative.load.i32.p0(ptr align 4 %ptr)
+  ret i32 %load
+}
+
+define i64 @speculative_load_i64(ptr %ptr) {
+; CHECK-LABEL: speculative_load_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr x0, [x0]
+; CHECK-NEXT:    ret
+  %load = call i64 @llvm.speculative.load.i64.p0(ptr %ptr)
+  ret i64 %load
+}
+
+define float @speculative_load_f32(ptr %ptr) {
+; CHECK-LABEL: speculative_load_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr s0, [x0]
+; CHECK-NEXT:    ret
+  %load = call float @llvm.speculative.load.f32.p0(ptr %ptr)
+  ret float %load
+}
+
+define double @speculative_load_f64(ptr %ptr) {
+; CHECK-LABEL: speculative_load_f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ret
+  %load = call double @llvm.speculative.load.f64.p0(ptr align 8 %ptr)
+  ret double %load
+}
+
+declare i32 @llvm.speculative.load.i32.p0(ptr)
+declare i64 @llvm.speculative.load.i64.p0(ptr)
+declare float @llvm.speculative.load.f32.p0(ptr)
+declare double @llvm.speculative.load.f64.p0(ptr)
diff --git a/llvm/test/CodeGen/X86/can-load-speculatively.ll b/llvm/test/CodeGen/X86/can-load-speculatively.ll
new file mode 100644
index 0000000000000..f51d3847e921d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/can-load-speculatively.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -passes=pre-isel-intrinsic-lowering -S < %s | FileCheck %s
+
+; Test that @llvm.can.load.speculatively returns false (default) on X86,
+; as X86 does not provide a target-specific expansion.
+
+define i1 @can_load_speculatively_16(ptr %ptr) {
+; CHECK-LABEL: @can_load_speculatively_16(
+; CHECK-NEXT:    ret i1 false
+;
+  %can_load = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 16)
+  ret i1 %can_load
+}
+
+define i1 @can_load_speculatively_32(ptr %ptr) {
+; CHECK-LABEL: @can_load_speculatively_32(
+; CHECK-NEXT:    ret i1 false
+;
+  %can_load = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 32)
+  ret i1 %can_load
+}
+
+define i1 @can_load_speculatively_8(ptr %ptr) {
+; CHECK-LABEL: @can_load_speculatively_8(
+; CHECK-NEXT:    ret i1 false
+;
+  %can_load = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 8)
+  ret i1 %can_load
+}
+
+declare i1 @llvm.can.load.speculatively.p0(ptr, i64)
+
diff --git a/llvm/test/CodeGen/X86/speculative-load-intrinsic.ll b/llvm/test/CodeGen/X86/speculative-load-intrinsic.ll
new file mode 100644
index 0000000000000..798ee2c03b1ab
--- /dev/null
+++ b/llvm/test/CodeGen/X86/speculative-load-intrinsic.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.1 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX
+
+; Test that @llvm.speculative.load is lowered to a regular load
+; in SelectionDAG.
+
+define <4 x i32> @speculative_load_v4i32(ptr %ptr) {
+; SSE-LABEL: speculative_load_v4i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: speculative_load_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %load = call <4 x i32> @llvm.speculative.load.v4i32.p0(ptr align 16 %ptr)
+  ret <4 x i32> %load
+}
+
+define <8 x i32> @speculative_load_v8i32(ptr %ptr) {
+; SSE-LABEL: speculative_load_v8i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    movaps 16(%rdi), %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: speculative_load_v8i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps (%rdi), %ymm0
+; AVX-NEXT:    retq
+  %load = call <8 x i32> @llvm.speculative.load.v8i32.p0(ptr align 32 %ptr)
+  ret <8 x i32> %load
+}
+
+define <2 x i64> @speculative_load_v2i64(ptr %ptr) {
+; SSE-LABEL: speculative_load_v2i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movups (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: speculative_load_v2i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovups (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %load = call <2 x i64> @llvm.speculative.load.v2i64.p0(ptr %ptr)
+  ret <2 x i64> %load
+}
+
+define <4 x float> @speculative_load_v4f32(ptr %ptr) {
+; SSE-LABEL: speculative_load_v4f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movups (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: speculative_load_v4f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovups (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %load = call <4 x float> @llvm.speculative.load.v4f32.p0(ptr align 8 %ptr)
+  ret <4 x float> %load
+}
+
+define <2 x double> @speculative_load_v2f64(ptr %ptr) {
+; SSE-LABEL: speculative_load_v2f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: speculative_load_v2f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovaps (%rdi), %xmm0
+; AVX-NEXT:    retq
+  %load = call <2 x double> @llvm.speculative.load.v2f64.p0(ptr align 16 %ptr)
+  ret <2 x double> %load
+}
+
+declare <4 x i32> @llvm.speculative.load.v4i32.p0(ptr)
+declare <8 x i32> @llvm.speculative.load.v8i32.p0(ptr)
+declare <2 x i64> @llvm.speculative.load.v2i64.p0(ptr)
+declare <4 x float> @llvm.speculative.load.v4f32.p0(ptr)
+declare <2 x double> @llvm.speculative.load.v2f64.p0(ptr)
+
+; Scalar type tests
+
+define i32 @speculative_load_i32(ptr %ptr) {
+; SSE-LABEL: speculative_load_i32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movl (%rdi), %eax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: speculative_load_i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movl (%rdi), %eax
+; AVX-NEXT:    retq
+  %load = call i32 @llvm.speculative.load.i32.p0(ptr %ptr)
+  ret i32 %load
+}
+
+define i64 @speculative_load_i64(ptr %ptr) {
+; SSE-LABEL: speculative_load_i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq (%rdi), %rax
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: speculative_load_i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq (%rdi), %rax
+; AVX-NEXT:    retq
+  %load = call i64 @llvm.speculative.load.i64.p0(ptr align 8 %ptr)
+  ret i64 %load
+}
+
+define float @speculative_load_f32(ptr %ptr) {
+; SSE-LABEL: speculative_load_f32:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: speculative_load_f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    retq
+  %load = call float @llvm.speculative.load.f32.p0(ptr align 4 %ptr)
+  ret float %load
+}
+
+define double @speculative_load_f64(ptr %ptr) {
+; SSE-LABEL: speculative_load_f64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: speculative_load_f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    retq
+  %load = call double @llvm.speculative.load.f64.p0(ptr %ptr)
+  ret double %load
+}
+
+declare i32 @llvm.speculative.load.i32.p0(ptr)
+declare i64 @llvm.speculative.load.i64.p0(ptr)
+declare float @llvm.speculative.load.f32.p0(ptr)
+declare double @llvm.speculative.load.f64.p0(ptr)
diff --git a/llvm/test/Verifier/can-load-speculatively.ll b/llvm/test/Verifier/can-load-speculatively.ll
new file mode 100644
index 0000000000000..d2d69f70cfb60
--- /dev/null
+++ b/llvm/test/Verifier/can-load-speculatively.ll
@@ -0,0 +1,19 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+
+declare i1 @llvm.can.load.speculatively.p0(ptr, i64)
+
+; Test that constant size must be a positive power of 2
+
+define i1 @test_size_zero(ptr %ptr) {
+; CHECK: llvm.can.load.speculatively size must be a positive power of 2
+; CHECK-NEXT: %res = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 0)
+  %res = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 0)
+  ret i1 %res
+}
+
+define i1 @test_non_power_of_2(ptr %ptr) {
+; CHECK: llvm.can.load.speculatively size must be a positive power of 2
+; CHECK-NEXT: %res = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 3)
+  %res = call i1 @llvm.can.load.speculatively.p0(ptr %ptr, i64 3)
+  ret i1 %res
+}
diff --git a/llvm/test/Verifier/speculative-load.ll b/llvm/test/Verifier/speculative-load.ll
new file mode 100644
index 0000000000000..def46d20799c5
--- /dev/null
+++ b/llvm/test/Verifier/speculative-load.ll
@@ -0,0 +1,18 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+
+declare <3 x i32> @llvm.speculative.load.v3i32.p0(ptr)
+declare <vscale x 3 x i32> @llvm.speculative.load.nxv3i32.p0(ptr)
+
+define <3 x i32> @test_non_power_of_2_fixed(ptr %ptr) {
+; CHECK: llvm.speculative.load type must have a power-of-2 size
+; CHECK-NEXT: %res = call <3 x i32> @llvm.speculative.load.v3i32.p0(ptr %ptr)
+  %res = call <3 x i32> @llvm.speculative.load.v3i32.p0(ptr %ptr)
+  ret <3 x i32> %res
+}
+
+define <vscale x 3 x i32> @test_non_power_of_2_scalable(ptr %ptr) {
+; CHECK: llvm.speculative.load type must have a power-of-2 size
+; CHECK-NEXT: %res = call <vscale x 3 x i32> @llvm.speculative.load.nxv3i32.p0(ptr %ptr)
+  %res = call <vscale x 3 x i32> @llvm.speculative.load.nxv3i32.p0(ptr %ptr)
+  ret <vscale x 3 x i32> %res
+}
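The tests above exercise each intrinsic in isolation. For reference, the intended combined usage pattern from the LangRef text (guard a wide speculative load with `llvm.can.load.speculatively`, fall back to safe scalar accesses when the check fails) looks roughly like the sketch below. This is an illustrative fragment, not part of the patch; the function name `@load_guarded` and the fallback strategy are hypothetical:

```llvm
define <4 x i32> @load_guarded(ptr %p) {
entry:
  ; Ask the target whether a 16-byte load at %p can be speculated.
  %ok = call i1 @llvm.can.load.speculatively.p0(ptr %p, i64 16)
  br i1 %ok, label %fast, label %slow

fast:
  ; Safe to read 16 bytes even if they extend past the object;
  ; out-of-bounds lanes come back as poison.
  %v = call <4 x i32> @llvm.speculative.load.v4i32.p0(ptr %p)
  ret <4 x i32> %v

slow:
  ; Fallback: only touch memory known to be in bounds (here, the
  ; first element), leaving the remaining lanes poison.
  %s = load i32, ptr %p
  %v0 = insertelement <4 x i32> poison, i32 %s, i64 0
  ret <4 x i32> %v0
}

declare i1 @llvm.can.load.speculatively.p0(ptr, i64)
declare <4 x i32> @llvm.speculative.load.v4i32.p0(ptr)
```

Note the fast path deliberately omits an `align` attribute on the pointer argument: the AArch64 expansion tested above proves alignment at runtime, and asserting `align 16` on an unaligned pointer would be immediate undefined behavior.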