Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions llvm/include/llvm/Analysis/LoopAccessAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -491,11 +491,12 @@ struct PointerDiffInfo {
const SCEV *SinkStart;
unsigned AccessSize;
bool NeedsFreeze;
bool WriteAfterRead;

PointerDiffInfo(const SCEV *SrcStart, const SCEV *SinkStart,
unsigned AccessSize, bool NeedsFreeze)
unsigned AccessSize, bool NeedsFreeze, bool WriteAfterRead)
: SrcStart(SrcStart), SinkStart(SinkStart), AccessSize(AccessSize),
NeedsFreeze(NeedsFreeze) {}
NeedsFreeze(NeedsFreeze), WriteAfterRead(WriteAfterRead) {}
};

/// Holds information about the memory runtime legality checks to verify
Expand Down
12 changes: 12 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,13 @@ enum class TailFoldingStyle {
DataWithEVL,
};

/// The style of runtime memory-overlap (aliasing) check to emit for a
/// vectorized loop.
enum class RTCheckStyle {
  /// Create runtime checks based on the difference between two pointers
  ScalarDifference,
  /// Form a mask based on elements which won't be a WAR or RAW hazard.
  UseSafeEltsMask,
};

struct TailFoldingInfo {
TargetLibraryInfo *TLI;
LoopVectorizationLegality *LVL;
Expand Down Expand Up @@ -1357,6 +1364,11 @@ class TargetTransformInfo {
PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
TTI::TargetCostKind CostKind) const;

/// \return true if a mask should be formed that disables lanes that could
/// alias between two pointers. The mask is created by the
/// loop_dependence_{war,raw}_mask intrinsics.
LLVM_ABI bool useSafeEltsMask() const;

/// \return The maximum interleave factor that any transform should try to
/// perform for this target. This number depends on the level of parallelism
/// and the number of execution units in the CPU.
Expand Down
2 changes: 2 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,8 @@ class TargetTransformInfoImplBase {
return InstructionCost::getInvalid();
}

/// Default implementation: no safe-elements alias mask is formed. Targets
/// that can lower the loop_dependence_{war,raw}_mask intrinsics cheaply
/// override this to return true.
virtual bool useSafeEltsMask() const { return false; }

virtual unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; }

virtual InstructionCost getArithmeticInstrCost(
Expand Down
47 changes: 47 additions & 0 deletions llvm/include/llvm/CodeGen/BasicTTIImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -2190,6 +2190,53 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
// Otherwise, fallback to default scalarization cost.
break;
}
case Intrinsic::loop_dependence_raw_mask:
case Intrinsic::loop_dependence_war_mask: {
InstructionCost Cost = 0;
Type *PtrTy = ICA.getArgTypes()[0];
bool IsReadAfterWrite = IID == Intrinsic::loop_dependence_raw_mask;

Cost +=
thisT()->getArithmeticInstrCost(Instruction::Sub, PtrTy, CostKind);
if (IsReadAfterWrite) {
IntrinsicCostAttributes AbsAttrs(Intrinsic::abs, PtrTy, {PtrTy}, {});
Cost += thisT()->getIntrinsicInstrCost(AbsAttrs, CostKind);
}

Cost +=
thisT()->getArithmeticInstrCost(Instruction::SDiv, PtrTy, CostKind);
Type *CmpTy =
getTLI()
->getSetCCResultType(
thisT()->getDataLayout(), RetTy->getContext(),
getTLI()->getValueType(thisT()->getDataLayout(), PtrTy))
.getTypeForEVT(RetTy->getContext());
Cost += thisT()->getCmpSelInstrCost(
BinaryOperator::ICmp, CmpTy, PtrTy,
IsReadAfterWrite ? CmpInst::ICMP_EQ : CmpInst::ICMP_SLE, CostKind);

// The deconstructed active lane mask
VectorType *RetTyVec = cast<VectorType>(RetTy);
VectorType *SplatTy = cast<VectorType>(RetTyVec->getWithNewType(PtrTy));
Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SplatTy, SplatTy, {},
CostKind, 0, nullptr);
IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, SplatTy, {},
FMF);
Cost += thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SplatTy,
SplatTy, CmpInst::ICMP_ULT, CostKind);

Cost +=
thisT()->getCastInstrCost(Instruction::CastOps::ZExt, RetTy, SplatTy,
TTI::CastContextHint::None, CostKind);
Cost += thisT()->getCastInstrCost(Instruction::CastOps::ZExt,
RetTyVec->getElementType(), CmpTy,
TTI::CastContextHint::None, CostKind);
Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, RetTyVec, RetTyVec, {},
CostKind, 0, nullptr);
Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
return Cost;
}
}

// Assume that we need to scalarize this intrinsic.
Expand Down
5 changes: 4 additions & 1 deletion llvm/lib/Analysis/LoopAccessAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -511,11 +511,14 @@ bool RuntimePointerChecking::tryToCreateDiffCheck(
}
}

bool WriteAfterRead = !Src->IsWritePtr && Sink->IsWritePtr;

LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
<< "SrcStart: " << *SrcStartInt << '\n'
<< "SinkStartInt: " << *SinkStartInt << '\n');
DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
Src->NeedsFreeze || Sink->NeedsFreeze);
Src->NeedsFreeze || Sink->NeedsFreeze,
WriteAfterRead);
return true;
}

Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -878,6 +878,10 @@ InstructionCost TargetTransformInfo::getPartialReductionCost(
BinOp, CostKind);
}

// Forward to the target's TTI implementation (default returns false).
bool TargetTransformInfo::useSafeEltsMask() const {
  return TTIImpl->useSafeEltsMask();
}

unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const {
return TTIImpl->getMaxInterleaveFactor(VF);
}
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Analysis/VectorUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,9 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
return (ScalarOpdIdx == 2);
case Intrinsic::experimental_vp_splice:
return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
case Intrinsic::loop_dependence_war_mask:
case Intrinsic::loop_dependence_raw_mask:
return true;
default:
return false;
}
Expand Down
39 changes: 39 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1032,6 +1032,40 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
case Intrinsic::loop_dependence_raw_mask:
case Intrinsic::loop_dependence_war_mask: {
auto *EltSize = cast<ConstantInt>(ICA.getArgs()[2]);
EVT VecVT = getTLI()->getValueType(DL, RetTy);
// An invalid element size and return type combination must be expanded.
bool MustBeExpanded = false;
switch (EltSize->getSExtValue()) {
case 1:
if (VecVT != MVT::v16i1 && VecVT != MVT::nxv16i1)
MustBeExpanded = true;
break;
case 2:
if (VecVT != MVT::v8i1 && VecVT != MVT::nxv8i1)
MustBeExpanded = true;
break;
case 4:
if (VecVT != MVT::v4i1 && VecVT != MVT::nxv4i1)
MustBeExpanded = true;
break;
case 8:
if (VecVT != MVT::v2i1 && VecVT != MVT::nxv2i1)
MustBeExpanded = true;
break;
default:
MustBeExpanded = true;
// Other element sizes are incompatible with whilewr/rw, so expand instead
break;
}

// The whilewr/rw instructions require SVE2 or SME
if (MustBeExpanded || (!ST->hasSVE2() && !ST->hasSME()))
break;
return 1;
}
case Intrinsic::experimental_vector_extract_last_active:
if (ST->isSVEorStreamingSVEAvailable()) {
auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
Expand Down Expand Up @@ -5846,6 +5880,11 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost(
return Cost + 2;
}

bool AArch64TTIImpl::useSafeEltsMask() const {
  // The whilewr/rw instructions require SVE2 or SME.
  return ST->hasSVE2() || ST->hasSME();
}

InstructionCost
AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
VectorType *SrcTy, ArrayRef<int> Mask,
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,8 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
TTI::TargetCostKind CostKind) const override;

bool useSafeEltsMask() const override;

bool enableOrderedReductions() const override { return true; }

InstructionCost getInterleavedMemoryOpCost(
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Utils/LoopUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2149,7 +2149,7 @@ Value *llvm::addDiffRuntimeChecks(
// Map to keep track of created compares, The key is the pair of operands for
// the compare, to allow detecting and re-using redundant compares.
DenseMap<std::pair<Value *, Value *>, Value *> SeenCompares;
for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze] : Checks) {
for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze, _] : Checks) {
Type *Ty = SinkStart->getType();
// Compute VF * IC * AccessSize.
auto *VFTimesICTimesSize =
Expand Down
107 changes: 105 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
STATISTIC(LoopsAliasMasked, "Number of loops predicated with an alias mask");

static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
Expand Down Expand Up @@ -1333,6 +1334,12 @@ class LoopVectorizationCostModel {
: ChosenTailFoldingStyle->second;
}

/// Choose how runtime aliasing checks should be generated: a mask of
/// hazard-free elements when the target supports it, otherwise scalar
/// pointer-difference checks.
RTCheckStyle getRTCheckStyle(const TargetTransformInfo &TTI) const {
  return TTI.useSafeEltsMask() ? RTCheckStyle::UseSafeEltsMask
                               : RTCheckStyle::ScalarDifference;
}

/// Selects and saves TailFoldingStyle for 2 options - if IV update may
/// overflow or not.
/// \param IsScalableVF true if scalable vector factors enabled.
Expand Down Expand Up @@ -8554,6 +8561,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
bool WithoutRuntimeCheck =
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;

VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
WithoutRuntimeCheck);
}
Expand Down Expand Up @@ -8974,11 +8982,104 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
assert((!CM.OptForSize ||
CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
"Cannot SCEV check stride or overflow when optimizing for size");
VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock,
VPlanTransforms::attachCheckBlock(Plan, Plan.getOrAddLiveIn(SCEVCheckCond),
Plan.createVPIRBasicBlock(SCEVCheckBlock),
HasBranchWeights);
}
const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) {
VPValue *MemCheckCondVPV = Plan.getOrAddLiveIn(MemCheckCond);
VPBasicBlock *MemCheckBlockVP = Plan.createVPIRBasicBlock(MemCheckBlock);
std::optional<ArrayRef<PointerDiffInfo>> ChecksOpt =
CM.Legal->getRuntimePointerChecking()->getDiffChecks();

// Create a mask enabling safe elements for each iteration.
if (CM.getRTCheckStyle(TTI) == RTCheckStyle::UseSafeEltsMask &&
ChecksOpt.has_value() && ChecksOpt->size() > 0) {
ArrayRef<PointerDiffInfo> Checks = *ChecksOpt;
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *LoopBody = LoopRegion->getEntryBasicBlock();
VPBuilder Builder(MemCheckBlockVP);

/// Create a mask for each possibly-aliasing pointer pair, ANDing them if
/// there's more than one pair.
VPValue *AliasMask = nullptr;
for (PointerDiffInfo Check : Checks) {
VPValue *Sink =
vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SinkStart);
VPValue *Src =
vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SrcStart);

Type *PtrType = PointerType::getUnqual(Plan.getContext());
Sink = Builder.createScalarCast(Instruction::CastOps::IntToPtr, Sink,
PtrType, DebugLoc());
Src = Builder.createScalarCast(Instruction::CastOps::IntToPtr, Src,
PtrType, DebugLoc());

SmallVector<VPValue *, 3> Ops{
Src, Sink,
Plan.getConstantInt(IntegerType::getInt64Ty(Plan.getContext()),
Check.AccessSize)};
VPWidenIntrinsicRecipe *M = new VPWidenIntrinsicRecipe(
Check.WriteAfterRead ? Intrinsic::loop_dependence_war_mask
: Intrinsic::loop_dependence_raw_mask,
Ops, IntegerType::getInt1Ty(Plan.getContext()));
MemCheckBlockVP->appendRecipe(M);
if (AliasMask)
AliasMask = Builder.createAnd(AliasMask, M);
else
AliasMask = M;
}
assert(AliasMask && "Expected an alias mask to have been created");

// Replace uses of the loop body's active lane mask phi with an AND of the
// phi and the alias mask.
for (VPRecipeBase &R : *LoopBody) {
auto *MaskPhi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
if (!MaskPhi)
continue;
VPInstruction *And = new VPInstruction(Instruction::BinaryOps::And,
{MaskPhi, AliasMask});
MaskPhi->replaceUsesWithIf(And, [And](VPUser &U, unsigned) {
auto *UR = dyn_cast<VPRecipeBase>(&U);
// If this is the first user, insert the AND.
if (UR && !And->getParent())
And->insertBefore(UR);
bool Replace = UR != And;
return Replace;
});
}

// An empty mask would cause an infinite loop since the induction variable
// is updated with the number of set elements in the mask. Make sure we
// don't execute the vector loop when the mask is empty.
VPInstruction *PopCount =
new VPInstruction(VPInstruction::PopCount, {AliasMask});
PopCount->insertAfter(AliasMask->getDefiningRecipe());
VPValue *Cmp =
Builder.createICmp(CmpInst::Predicate::ICMP_EQ, PopCount,
Plan.getOrAddLiveIn(ConstantInt::get(
IntegerType::get(Plan.getContext(), 64), 0)));
MemCheckCondVPV = Cmp;

// Update the IV by the number of active lanes in the mask.
auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
auto *CanonicalIVIncrement =
cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());

// Increment phi by correct amount.
VPValue *IncrementBy = PopCount;
Type *IVType = CanonicalIVPHI->getScalarType();

if (IVType->getScalarSizeInBits() < 64) {
Builder.setInsertPoint(CanonicalIVIncrement);
IncrementBy =
Builder.createScalarCast(Instruction::Trunc, IncrementBy, IVType,
CanonicalIVIncrement->getDebugLoc());
}
CanonicalIVIncrement->setOperand(1, IncrementBy);
}

// VPlan-native path does not do any analysis for runtime checks
// currently.
assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
Expand All @@ -8999,7 +9100,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks(
"(e.g., adding 'restrict').";
});
}
VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock,
VPlanTransforms::attachCheckBlock(Plan, MemCheckCondVPV, MemCheckBlockVP,
HasBranchWeights);
}
}
Expand Down Expand Up @@ -9966,6 +10067,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
// Optimistically generate runtime checks if they are needed. Drop them if
// they turn out to not be profitable.
if (VF.Width.isVector() || SelectedIC > 1) {
if (CM.getRTCheckStyle(*TTI) == RTCheckStyle::UseSafeEltsMask)
LoopsAliasMasked++;
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);

// Bail out early if either the SCEV or memory runtime checks are known to
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -1030,6 +1030,7 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
// during unrolling.
ExtractPenultimateElement,
LogicalAnd, // Non-poison propagating logical And.
PopCount,
// Add an offset in bytes (second operand) to a base pointer (first
// operand). Only generates scalar values (either for the first lane only or
// for all lanes, depending on its uses).
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
case VPInstruction::BranchOnCond:
case VPInstruction::BranchOnCount:
return Type::getVoidTy(Ctx);
case VPInstruction::PopCount:
return Type::getInt64Ty(Ctx);
default:
break;
}
Expand Down
6 changes: 2 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -640,11 +640,9 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) {
// including memory overlap checks block and wrapping/unit-stride checks block.
static constexpr uint32_t CheckBypassWeights[] = {1, 127};

void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
BasicBlock *CheckBlock,
void VPlanTransforms::attachCheckBlock(VPlan &Plan, VPValue *CondVPV,
VPBasicBlock *CheckBlockVPBB,
bool AddBranchWeights) {
VPValue *CondVPV = Plan.getOrAddLiveIn(Cond);
VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock);
VPBlockBase *VectorPH = Plan.getVectorPreheader();
VPBlockBase *ScalarPH = Plan.getScalarPreheader();
VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor();
Expand Down
Loading
Loading