15 changes: 12 additions & 3 deletions llvm/include/llvm/IR/VectorTypeUtils.h
@@ -14,6 +14,11 @@

namespace llvm {

/// Returns true if \p IID is a vector intrinsic that returns a struct with a
/// scalar element at index \p EleIdx.
LLVM_ABI bool isVectorIntrinsicWithStructReturnScalarAtField(unsigned IID,
unsigned EleIdx);

/// A helper function for converting Scalar types to vector types. If
/// the incoming type is void, we return void. If the EC represents a
/// scalar, we return the scalar type.
@@ -31,7 +36,11 @@ inline Type *toVectorTy(Type *Scalar, unsigned VF) {
/// Note:
/// - If \p EC is scalar, \p StructTy is returned unchanged
/// - Only unpacked literal struct types are supported
LLVM_ABI Type *toVectorizedStructTy(StructType *StructTy, ElementCount EC);
/// - If \p IID (Intrinsic ID) is provided, fields that the intrinsic returns
///   as scalars are kept unchanged; all other fields are widened to
///   vector types.
LLVM_ABI Type *toVectorizedStructTy(StructType *StructTy, ElementCount EC,
unsigned IID = 0);

/// A helper for converting structs of vector types to structs of scalar types.
/// Note: Only unpacked literal struct types are supported.
@@ -52,9 +61,9 @@ LLVM_ABI bool canVectorizeStructTy(StructType *StructTy);
/// - If the incoming type is void, we return void
/// - If \p EC is scalar, \p Ty is returned unchanged
/// - Only unpacked literal struct types are supported
inline Type *toVectorizedTy(Type *Ty, ElementCount EC) {
inline Type *toVectorizedTy(Type *Ty, ElementCount EC, unsigned IID = 0) {
if (StructType *StructTy = dyn_cast<StructType>(Ty))
return toVectorizedStructTy(StructTy, EC);
return toVectorizedStructTy(StructTy, EC, IID);
return toVectorTy(Ty, EC);
}
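A minimal sketch of the new IID parameter in action (not part of the patch; Ctx is an assumed LLVMContext): for the struct return of vp.load.ff, only the data field is widened, since field 1 is the scalar lane count.

Type *I32 = Type::getInt32Ty(Ctx);
// { i32, i32 } models the scalar return of vp.load.ff: loaded value + count.
StructType *STy = StructType::get(Ctx, {I32, I32});
Type *Wide = toVectorizedTy(STy, ElementCount::getScalable(4),
                            Intrinsic::vp_load_ff);
// Wide == { <vscale x 4 x i32>, i32 }: field 0 is widened, field 1 is kept
// scalar because isVectorIntrinsicWithStructReturnScalarAtField(IID, 1) holds.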

8 changes: 8 additions & 0 deletions llvm/lib/Analysis/VectorUtils.cpp
@@ -175,6 +175,8 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
return (ScalarOpdIdx == 2);
case Intrinsic::experimental_vp_splice:
return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
case Intrinsic::vp_load_ff:
return ScalarOpdIdx == 0 || ScalarOpdIdx == 2;
default:
return false;
}
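For context, these indices encode vp.load.ff's operand layout, in which only the mask is per-lane. A quick self-check (a sketch assuming the TTI-taking overload, passing a null TTI since vp.load.ff is target-independent):

assert(isVectorIntrinsicWithScalarOpAtArg(Intrinsic::vp_load_ff, 0, nullptr) &&
       "pointer operand stays scalar");
assert(!isVectorIntrinsicWithScalarOpAtArg(Intrinsic::vp_load_ff, 1, nullptr) &&
       "mask is widened per lane");
assert(isVectorIntrinsicWithScalarOpAtArg(Intrinsic::vp_load_ff, 2, nullptr) &&
       "EVL operand stays scalar");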
@@ -212,6 +214,8 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
case Intrinsic::powi:
case Intrinsic::ldexp:
return OpdIdx == -1 || OpdIdx == 1;
case Intrinsic::vp_load_ff:
return OpdIdx == 0;
default:
return OpdIdx == -1;
}
@@ -224,6 +228,10 @@ bool llvm::isVectorIntrinsicWithStructReturnOverloadAtField(
return TTI->isTargetIntrinsicWithStructReturnOverloadAtField(ID, RetIdx);

switch (ID) {
case Intrinsic::modf:
case Intrinsic::sincos:
case Intrinsic::sincospi:
return false;
case Intrinsic::frexp:
return RetIdx == 0 || RetIdx == 1;
default:
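Why frexp answers differently from the pairs above: both of its struct return fields participate in the vector name mangling, whereas sincos, sincospi, and modf mangle on a single type. A sketch (assuming a Module M and an LLVMContext Ctx):

auto *V4F32 = FixedVectorType::get(Type::getFloatTy(Ctx), 4);
auto *V4I32 = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
// frexp: both return fields are overload types.
Function *Frexp =
    Intrinsic::getOrInsertDeclaration(&M, Intrinsic::frexp, {V4F32, V4I32});
// Frexp->getName() == "llvm.frexp.v4f32.v4i32"
// sincos: a single overload type covers both fields.
Function *Sincos =
    Intrinsic::getOrInsertDeclaration(&M, Intrinsic::sincos, {V4F32});
// Sincos->getName() == "llvm.sincos.v4f32"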
16 changes: 14 additions & 2 deletions llvm/lib/IR/VectorTypeUtils.cpp
@@ -8,12 +8,21 @@

#include "llvm/IR/VectorTypeUtils.h"
#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

bool llvm::isVectorIntrinsicWithStructReturnScalarAtField(unsigned IID,
unsigned EleIdx) {
if (IID == Intrinsic::vp_load_ff)
return EleIdx == 1;
return false;
}

/// A helper for converting structs of scalar types to structs of vector types.
/// Note: Only unpacked literal struct types are supported.
Type *llvm::toVectorizedStructTy(StructType *StructTy, ElementCount EC) {
Type *llvm::toVectorizedStructTy(StructType *StructTy, ElementCount EC,
unsigned IID) {
if (EC.isScalar())
return StructTy;
assert(isUnpackedStructLiteral(StructTy) &&
@@ -22,7 +31,10 @@ Type *llvm::toVectorizedStructTy(StructType *StructTy, ElementCount EC) {
"expected all element types to be valid vector element types");
return StructType::get(
StructTy->getContext(),
map_to_vector(StructTy->elements(), [&](Type *ElTy) -> Type * {
map_to_vector(enumerate(StructTy->elements()), [&](auto It) -> Type * {
Type *ElTy = It.value();
if (isVectorIntrinsicWithStructReturnScalarAtField(IID, It.index()))
return ElTy;
return VectorType::get(ElTy, EC);
}));
}
61 changes: 57 additions & 4 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -399,6 +399,12 @@ static cl::opt<bool> EnableEarlyExitVectorization(
cl::desc(
"Enable vectorization of early exit loops with uncountable exits."));

static cl::opt<bool>
EnableEarlyExitWithFFLoads("enable-early-exit-with-ffload", cl::init(false),
Review comment (Contributor):
In the spirit of incremental development, can we remove this option and just have it on by default?

Review comment (Contributor):
Does it work for other targets besides RISCV? If so, the PR should have some tests for other backends too, or, even better, in the top-level Transforms/LoopVectorize directory.

cl::Hidden,
cl::desc("Enable vectorization of early-exit "
"loops with fault-only-first loads."));

static cl::opt<bool> ConsiderRegPressure(
"vectorizer-consider-reg-pressure", cl::init(false), cl::Hidden,
cl::desc("Discard VFs if their register pressure is too high."));
@@ -3551,6 +3557,15 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return FixedScalableVFPair::getNone();
}

if (!Legal->getPotentiallyFaultingLoads().empty() && UserIC > 1) {
reportVectorizationFailure("Auto-vectorization of loops with potentially "
"faulting loads is not supported when the "
"interleave count is more than 1",
"CantInterleaveLoopWithPotentiallyFaultingLoads",
ORE, TheLoop);
return FixedScalableVFPair::getNone();
}

ScalarEvolution *SE = PSE.getSE();
ElementCount TC = getSmallConstantTripCount(SE, TheLoop);
unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
@@ -4163,7 +4178,11 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
if (!Visited.insert({ScalarTy}).second)
continue;
Type *WideTy = toVectorizedTy(ScalarTy, VF);
unsigned IID = 0;
if (auto *WI = dyn_cast<VPWidenIntrinsicRecipe>(&R))
  IID = WI->getVectorIntrinsicID();
Type *WideTy = toVectorizedTy(ScalarTy, VF, IID);

if (any_of(getContainedTypes(WideTy), WillGenerateTargetVectors))
return true;
}
@@ -4626,6 +4645,10 @@ LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
if (!Legal->isSafeForAnyVectorWidth())
return 1;

// No interleaving for potentially faulting loads.
if (!Legal->getPotentiallyFaultingLoads().empty())
return 1;

// We don't attempt to perform interleaving for loops with uncountable early
// exits because the VPInstruction::AnyOf code cannot currently handle
// multiple parts.
@@ -7382,6 +7405,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// Regions are dissolved after optimizing for VF and UF, which completely
// removes unneeded loop regions first.
VPlanTransforms::dissolveLoopRegions(BestVPlan);

VPlanTransforms::convertFFLoadEarlyExitToVLStepping(BestVPlan);

// Canonicalize EVL loops after regions are dissolved.
VPlanTransforms::canonicalizeEVLLoops(BestVPlan);
VPlanTransforms::materializeBackedgeTakenCount(BestVPlan, VectorPH);
@@ -7610,8 +7636,8 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}

VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
VFRange &Range) {
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
VFRange &Range) {
assert((VPI->getOpcode() == Instruction::Load ||
VPI->getOpcode() == Instruction::Store) &&
"Must be called with either a load or store");
@@ -7672,6 +7698,23 @@ VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
Builder.insert(VectorPtr);
Ptr = VectorPtr;
}

if (Legal->getPotentiallyFaultingLoads().contains(I)) {
  // Widen to a fault-only-first load: vp.load.ff returns a struct of the
  // loaded vector and an i32 count of the lanes known not to fault.
  auto *I32Ty = IntegerType::getInt32Ty(Plan.getContext());
  auto *RetTy = StructType::get(I->getType(), I32Ty);
  DebugLoc DL = I->getDebugLoc();
  // vp.load.ff is always masked; use an all-true mask for unmasked loads.
  if (!Mask)
    Mask = Plan.getOrAddLiveIn(
        ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext())));
  auto *FFLoad = new VPWidenIntrinsicRecipe(Intrinsic::vp_load_ff,
                                            {Ptr, Mask, &Plan.getVF()}, RetTy,
                                            *VPI, *VPI, DL);
  Builder.insert(FFLoad);
  // Users of the original load consume field 0 (the data vector); the
  // faulting-lane count in field 1 is consumed by the early-exit rewrite.
  VPValue *Zero = Plan.getConstantInt(32, 0);
  return new VPInstruction(VPInstruction::ExtractVectorValue, {FFLoad, Zero},
                           {}, {}, DL);
}

if (VPI->getOpcode() == Instruction::Load) {
auto *Load = cast<LoadInst>(I);
return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, *VPI,
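To make the fault-only-first path above concrete: once the recipe pair executes, the emitted IR is, schematically, a masked vp.load.ff call plus an extractvalue of field 0. An IRBuilder equivalent (a sketch only; Builder, WideTy, Ptr, Mask, and EVL are assumed to be in scope, and the overload set {data type, pointer type} is inferred from vp.load.ff's mangling):

Value *FF = Builder.CreateIntrinsic(Intrinsic::vp_load_ff,
                                    {WideTy, Ptr->getType()},
                                    {Ptr, Mask, EVL});
Value *Data = Builder.CreateExtractValue(FF, 0);  // feeds the load's users
Value *Known = Builder.CreateExtractValue(FF, 1); // lanes known not to fault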
@@ -8617,6 +8660,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
if (!VPlanTransforms::runPass(VPlanTransforms::handleMultiUseReductions,
*Plan))
return nullptr;

VPlanTransforms::adjustFFLoadEarlyExitForPoisonSafety(*Plan);

// Apply mandatory transformation to handle FP maxnum/minnum reduction with
// NaNs if possible, bail out otherwise.
if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions,
@@ -9948,7 +9994,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}

if (!LVL.getPotentiallyFaultingLoads().empty()) {
if (EnableEarlyExitWithFFLoads) {
if (LVL.getPotentiallyFaultingLoads().size() > 1) {
reportVectorizationFailure("Auto-vectorization of loops with more than 1 "
"potentially faulting load is not enabled",
"MoreThanOnePotentiallyFaultingLoad", ORE, L);
return false;
}
} else if (!LVL.getPotentiallyFaultingLoads().empty()) {
reportVectorizationFailure("Auto-vectorization of loops with potentially "
"faulting load is not supported",
"PotentiallyFaultingLoadsNotSupported", ORE, L);
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -92,7 +92,7 @@ class VPRecipeBuilder {
/// Check if the load or store instruction \p VPI should be widened for \p
/// Range.Start and potentially masked. Such instructions are handled by a
/// recipe that takes an additional VPInstruction for the mask.
VPWidenMemoryRecipe *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);
VPRecipeBase *tryToWidenMemory(VPInstruction *VPI, VFRange &Range);

/// Check if an induction recipe should be constructed for \p VPI. If so build
/// and return it. If not, return null.
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1125,6 +1125,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
/// The lane specifies an index into a vector formed by combining all vector
/// operands (all operands after the first one).
ExtractLane,
/// Extracts a scalar value from an aggregate value.
ExtractScalarValue,
/// Extracts a vector value from an aggregate value.
ExtractVectorValue,
/// Explicit user for the resume phi of the canonical induction in the main
/// VPlan, used by the epilogue vector loop.
ResumeForEpilogue,
7 changes: 7 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -141,6 +141,13 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnCount:
return Type::getVoidTy(Ctx);
case VPInstruction::ExtractScalarValue:
case VPInstruction::ExtractVectorValue: {
assert(R->getNumOperands() == 2 && "expected single-level extractvalue");
auto *StructTy = cast<StructType>(inferScalarType(R->getOperand(0)));
auto *CI = cast<ConstantInt>(R->getOperand(1)->getLiveInIRValue());
return StructTy->getTypeAtIndex(CI->getZExtValue());
}
default:
break;
}
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanHelpers.h
@@ -237,7 +237,8 @@ struct VPTransformState {
set(Def, V, VPLane(0));
return;
}
assert((VF.isScalar() || isVectorizedTy(V->getType())) &&
assert((VF.isScalar() || isVectorizedTy(V->getType()) ||
V->getType()->isStructTy()) &&
"scalar values must be stored as (0, 0)");
Data.VPV2Vector[Def] = V;
}
30 changes: 28 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -453,6 +453,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case VPInstruction::BranchOnCount:
case VPInstruction::ComputeReductionResult:
case VPInstruction::ExtractLane:
case VPInstruction::ExtractScalarValue:
case VPInstruction::ExtractVectorValue:
case VPInstruction::FirstOrderRecurrenceSplice:
case VPInstruction::LogicalAnd:
case VPInstruction::PtrAdd:
@@ -832,6 +834,13 @@ Value *VPInstruction::generate(VPTransformState &State) {
Res->setName(Name);
return Res;
}
case VPInstruction::ExtractVectorValue:
case VPInstruction::ExtractScalarValue: {
assert(getNumOperands() == 2 && "expected single-level extractvalue");
Value *Op = State.get(getOperand(0));
auto *CI = cast<ConstantInt>(getOperand(1)->getLiveInIRValue());
return Builder.CreateExtractValue(Op, CI->getZExtValue());
}
case VPInstruction::LogicalAnd: {
Value *A = State.get(getOperand(0));
Value *B = State.get(getOperand(1));
@@ -1138,6 +1147,7 @@ bool VPInstruction::isVectorToScalar() const {
bool VPInstruction::isSingleScalar() const {
switch (getOpcode()) {
case Instruction::PHI:
case VPInstruction::ExtractScalarValue:
case VPInstruction::ExplicitVectorLength:
case VPInstruction::ResumeForEpilogue:
case VPInstruction::VScale:
@@ -1349,6 +1359,12 @@ void VPInstruction::printRecipe(raw_ostream &O, const Twine &Indent,
case VPInstruction::ExtractPenultimateElement:
O << "extract-penultimate-element";
break;
case VPInstruction::ExtractScalarValue:
O << "extract-scalar-value";
break;
case VPInstruction::ExtractVectorValue:
O << "extract-vector-value";
break;
case VPInstruction::ComputeAnyOfResult:
O << "compute-anyof-result";
break;
@@ -1695,7 +1711,16 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) {

SmallVector<Type *, 2> TysForDecl;
// Add return type if intrinsic is overloaded on it.
if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1, State.TTI))
if (ResultTy->isStructTy()) {
auto *StructTy = cast<StructType>(ResultTy);
for (unsigned I = 0, E = StructTy->getNumElements(); I != E; ++I) {
if (isVectorIntrinsicWithStructReturnOverloadAtField(VectorIntrinsicID, I,
State.TTI))
TysForDecl.push_back(
toVectorizedTy(StructTy->getStructElementType(I), State.VF));
}
} else if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1,
State.TTI))
TysForDecl.push_back(VectorType::get(getResultType(), State.VF));
SmallVector<Value *, 4> Args;
for (const auto &I : enumerate(operands())) {
@@ -1760,7 +1785,8 @@ static InstructionCost getCostForIntrinsics(Intrinsic::ID ID,
}

Type *ScalarRetTy = Ctx.Types.inferScalarType(&R);
Type *RetTy = VF.isVector() ? toVectorizedTy(ScalarRetTy, VF) : ScalarRetTy;
Type *RetTy =
VF.isVector() ? toVectorizedTy(ScalarRetTy, VF, ID) : ScalarRetTy;
SmallVector<Type *> ParamTys;
for (const VPValue *Op : Operands) {
ParamTys.push_back(VF.isVector()