Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -1211,7 +1211,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
ActiveLaneMask,
ExplicitVectorLength,
CalculateTripCountMinusVF,
// Increment the canonical IV separately for each unrolled part.
// Increment the canonical IV separately for each unrolled part. Meant to be
// constructed with two operands, namely the Start value and VF. Unrolling
// and conversion to concrete recipes add an extra Offset operand and the
// recipe produces `add Start, Offset`. The offset for unrolled part 0 is 0.
CanonicalIVIncrementForPart,
// Abstract instruction that compares two values and branches. This is
// lowered to ICmp + BranchOnCond during VPlan to VPlan transformation.
Expand Down Expand Up @@ -2192,8 +2195,9 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags {
};

/// A recipe to compute the pointers for widened memory accesses of \p
/// SourceElementTy. Unrolling adds an extra offset operand for unrolled parts >
/// 0 and it produces `GEP Ptr, Offset`. The offset for unrolled part 0 is 0.
/// SourceElementTy. Unrolling and conversion to concrete recipes add an extra
/// offset operand and the recipe produces `GEP Ptr, Offset`. The offset for
/// unrolled part 0 is 0.
class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
Type *SourceElementTy;

Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -866,6 +866,17 @@ inline auto m_c_LogicalOr(const Op0_t &Op0, const Op1_t &Op1) {
return m_c_Select(Op0, m_True(), Op1);
}

inline auto m_CanonicalIVIncrement() {
return VPInstruction_match<VPInstruction::CanonicalIVIncrementForPart>();
}

template <typename Op0_t, typename Op1_t, typename Op2_t>
inline auto m_CanonicalIVIncrement(const Op0_t &Op0, const Op1_t &Op1,
const Op2_t &Op2) {
return VPInstruction_match<VPInstruction::CanonicalIVIncrementForPart, Op0_t,
Op1_t, Op2_t>(Op0, Op1, Op2);
}

inline auto m_CanonicalIV() { return class_match<VPCanonicalIVPHIRecipe>(); }

template <typename Op0_t, typename Op1_t, typename Op2_t>
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -656,8 +656,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
return EVL;
}
case VPInstruction::CanonicalIVIncrementForPart: {
assert(getNumOperands() == 3 &&
"Expected prior simplification of recipe without offset");
auto *IV = State.get(getOperand(0), VPLane(0));
auto *VFxPart = State.get(getOperand(1), VPLane(0));
auto *VFxPart = State.get(getOperand(2), VPLane(0));
// The canonical IV is incremented by the vectorization factor (num of
// SIMD elements) times the unroll part.
return Builder.CreateAdd(IV, VFxPart, Name, hasNoUnsignedWrap(),
Expand Down
44 changes: 33 additions & 11 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1620,12 +1620,16 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
}
}

// Simplify unrolled VectorPointer without offset, or with zero offset, to
// just the pointer operand.
// Simplify unrolled VectorPointer with no offset or zero offset.
if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
return VPR->replaceAllUsesWith(VPR->getOperand(0));

// Simplify unrolled CanonicalIVIncrement with no offset or zero offset.
if (match(Def, m_CanonicalIVIncrement()))
if (Def->getNumOperands() == 2 || match(Def->getOperand(2), m_ZeroInt()))
return Def->replaceAllUsesWith(Def->getOperand(0));

// VPScalarIVSteps after unrolling can be replaced by their start value, if
// the start index is zero and only the first lane 0 is demanded.
if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
Expand Down Expand Up @@ -2064,18 +2068,18 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
assert(Index && "Expected index from ActiveLaneMask instruction");

uint64_t Part;
if (match(Index,
m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
Phis[Part] = Phi;
else {
// Anything other than a CanonicalIVIncrementForPart is part 0
assert(!match(
Index,
m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()));
m_CanonicalIVIncrement(m_VPValue(), m_VPValue(), m_ZeroInt()))) {
// Part 0 CanonicalIVIncrement.
Phis[0] = Phi;
continue;
}
uint64_t Part;
VPValue *VF;
assert(match(Index, m_CanonicalIVIncrement(
m_VPValue(), m_VPValue(VF),
m_c_Mul(m_Deferred(VF), m_ConstantInt(Part)))));
Phis[Part] = Phi;
}

assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
Expand Down Expand Up @@ -3986,6 +3990,24 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
ToRemove.push_back(Blend);
}

// Materialize Part0 offsets for various recipes when UF = 1. When UF > 1,
// the unroller would have added Part0 offsets.
if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(&R)) {
if (!VPR->getOffset()) {
assert(Plan.getConcreteUF() == 1 &&
"Expected unroller to have materialized offset for UF != 1");
VPR->addOperand(Plan.getZero(Plan.getDataLayout().getIndexType(
TypeInfo.inferScalarType(VPR))));
}
}
if (match(&R, m_CanonicalIVIncrement())) {
if (R.getNumOperands() == 2) {
assert(Plan.getConcreteUF() == 1 &&
"Expected unroller to have materialized offset for UF != 1");
R.addOperand(
Plan.getZero(Plan.getVectorLoopRegion()->getCanonicalIVType()));
}
}
if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
if (!VEPR->getOffset()) {
assert(Plan.getConcreteUF() == 1 &&
Expand Down
46 changes: 19 additions & 27 deletions llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -337,8 +337,8 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
}
if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(&R)) {
VPBuilder Builder(VPR);
const DataLayout &DL = Plan.getDataLayout();
Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(VPR));
Type *IndexTy =
Plan.getDataLayout().getIndexType(TypeInfo.inferScalarType(VPR));
Type *VFTy = TypeInfo.inferScalarType(&Plan.getVF());
VPValue *VF = Builder.createScalarZExtOrTrunc(
&Plan.getVF(), IndexTy, VFTy, DebugLoc::getUnknown());
Expand All @@ -350,6 +350,16 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
Copy->addOperand(VFxPart);
continue;
}
if (match(&R, m_CanonicalIVIncrement())) {
VPBuilder Builder(&R);
VPValue *VFxPart = Builder.createOverflowingOp(
Instruction::Mul, {&Plan.getVF(), getConstantInt(Part)},
{true, true});
Copy->setOperand(0, R.getOperand(0));
Copy->setOperand(1, R.getOperand(1));
Copy->addOperand(VFxPart);
continue;
}
if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
auto *Phi = dyn_cast<VPReductionPHIRecipe>(R.getOperand(0));
if (Phi && Phi->isOrdered()) {
Expand Down Expand Up @@ -379,19 +389,15 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
// requiring it.
if (isa<VPWidenCanonicalIVRecipe>(Copy))
Copy->addOperand(getConstantInt(Part));

if (match(Copy,
m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) {
VPBuilder Builder(Copy);
VPValue *ScaledByPart = Builder.createOverflowingOp(
Instruction::Mul, {Copy->getOperand(1), getConstantInt(Part)});
Copy->setOperand(1, ScaledByPart);
}
}
if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
// Materialize Part0 offset for VectorEndPointer.
// Materialize Part0 offset for various recipes.
if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(&R))
VPR->addOperand(Plan.getZero(
Plan.getDataLayout().getIndexType(TypeInfo.inferScalarType(VPR))));
if (match(&R, m_CanonicalIVIncrement()))
R.addOperand(getConstantInt(0));
if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R))
VEPR->materializeOffset();
}
}

void UnrollState::unrollBlock(VPBlockBase *VPB) {
Expand Down Expand Up @@ -488,20 +494,6 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
assert(UF > 0 && "Unroll factor must be positive");
Plan.setUF(UF);
llvm::scope_exit Cleanup([&Plan, UF]() {
auto Iter = vp_depth_first_deep(Plan.getEntry());
// Remove recipes that are redundant after unrolling.
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
auto *VPI = dyn_cast<VPInstruction>(&R);
if (VPI &&
VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
VPI->getOperand(1) == &Plan.getVF()) {
VPI->replaceAllUsesWith(VPI->getOperand(0));
VPI->eraseFromParent();
}
}
}

Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
Plan.getUF().replaceAllUsesWith(Plan.getConstantInt(TCTy, UF));
});
Expand Down
3 changes: 1 addition & 2 deletions llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -403,8 +403,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
VPlan *Plan = VPBB ? VPBB->getPlan() : nullptr;
if (VPBB &&
(VPBB == Plan->getVectorPreheader() || VPBB == Plan->getEntry())) {
if (match(V->getDefiningRecipe(),
m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()))
if (match(V->getDefiningRecipe(), m_CanonicalIVIncrement()))
return false;
return all_of(R->operands(), isUniformAcrossVFsAndUFs);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -304,10 +304,10 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
; CHECK-ORDERED-TF-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP1]], 1
; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = mul nuw nsw i64 [[TMP1]], 3
; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]]
; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1
; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP3]]
; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = mul i64 [[TMP1]], 3
; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
Expand Down Expand Up @@ -1320,10 +1320,10 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
; CHECK-ORDERED-TF-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP1]], 1
; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = mul nuw nsw i64 [[TMP1]], 3
; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]]
; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1
; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP3]]
; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = mul i64 [[TMP1]], 3
; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
Expand Down Expand Up @@ -1572,10 +1572,10 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
; CHECK-ORDERED-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-ORDERED-TF-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
; CHECK-ORDERED-TF-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP1]], 1
; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = mul nuw nsw i64 [[TMP1]], 3
; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]]
; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1
; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP3]]
; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = mul i64 [[TMP1]], 3
; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP4]]
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP61]], 2
; CHECK-NEXT: [[TMP62:%.*]] = shl nuw i64 [[TMP1]], 2
; CHECK-NEXT: [[TMP28:%.*]] = shl nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP30:%.*]] = mul nuw nsw i64 [[TMP1]], 3
; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]]
; CHECK-NEXT: [[TMP28:%.*]] = shl i64 [[TMP1]], 1
; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP1]], 3
; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
Expand Down Expand Up @@ -79,10 +79,10 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP5]], 2
; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP1]], 2
; CHECK-NEXT: [[TMP28:%.*]] = shl nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP30:%.*]] = mul nuw nsw i64 [[TMP1]], 3
; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]]
; CHECK-NEXT: [[TMP28:%.*]] = shl i64 [[TMP1]], 1
; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP1]], 3
; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
Expand Down