diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 80df058dfcf66..2c047a590e2be 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1211,7 +1211,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
     ActiveLaneMask,
     ExplicitVectorLength,
     CalculateTripCountMinusVF,
-    // Increment the canonical IV separately for each unrolled part.
+    // Increment the canonical IV separately for each unrolled part. Meant to be
+    // constructed with two operands, namely the Start value and VF. Unrolling
+    // and conversion to concrete recipes add an extra Offset operand and the
+    // recipe produces `add Start, Offset`. The offset for unrolled part 0 is 0.
     CanonicalIVIncrementForPart,
     // Abstract instruction that compares two values and branches. This is
     // lowered to ICmp + BranchOnCond during VPlan to VPlan transformation.
@@ -2192,8 +2195,9 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags {
 };
 
 /// A recipe to compute the pointers for widened memory accesses of \p
-/// SourceElementTy. Unrolling adds an extra offset operand for unrolled parts >
-/// 0 and it produces `GEP Ptr, Offset`. The offset for unrolled part 0 is 0.
+/// SourceElementTy. Unrolling and conversion to concrete recipes add an extra
+/// offset operand and the recipe produces `GEP Ptr, Offset`. The offset for
+/// unrolled part 0 is 0.
 class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
   Type *SourceElementTy;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 1205f04fb5c29..062782785c817 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -866,6 +866,17 @@ inline auto m_c_LogicalOr(const Op0_t &Op0, const Op1_t &Op1) {
   return m_c_Select(Op0, m_True(), Op1);
 }
 
+inline auto m_CanonicalIVIncrement() {
+  return VPInstruction_match<VPInstruction::CanonicalIVIncrementForPart>();
+}
+
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+inline auto m_CanonicalIVIncrement(const Op0_t &Op0, const Op1_t &Op1,
+                                   const Op2_t &Op2) {
+  return VPInstruction_match<VPInstruction::CanonicalIVIncrementForPart, Op0_t,
+                             Op1_t, Op2_t>(Op0, Op1, Op2);
+}
+
 inline auto m_CanonicalIV() { return class_match<VPCanonicalIVPHIRecipe>(); }
 
 template
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f8fa7d3c44ce1..0b04a8f19769b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -656,8 +656,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
     return EVL;
   }
   case VPInstruction::CanonicalIVIncrementForPart: {
+    assert(getNumOperands() == 3 &&
+           "Expected prior simplification of recipe without offset");
     auto *IV = State.get(getOperand(0), VPLane(0));
-    auto *VFxPart = State.get(getOperand(1), VPLane(0));
+    auto *VFxPart = State.get(getOperand(2), VPLane(0));
     // The canonical IV is incremented by the vectorization factor (num of
     // SIMD elements) times the unroll part.
     return Builder.CreateAdd(IV, VFxPart, Name, hasNoUnsignedWrap(),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 334280a7d80f5..1b14ef1a37b03 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1620,12 +1620,16 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
     }
   }
 
-  // Simplify unrolled VectorPointer without offset, or with zero offset, to
-  // just the pointer operand.
+  // Simplify unrolled VectorPointer with no offset or zero offset.
   if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
     if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
       return VPR->replaceAllUsesWith(VPR->getOperand(0));
 
+  // Simplify unrolled CanonicalIVIncrement with no offset or zero offset.
+  if (match(Def, m_CanonicalIVIncrement()))
+    if (Def->getNumOperands() == 2 || match(Def->getOperand(2), m_ZeroInt()))
+      return Def->replaceAllUsesWith(Def->getOperand(0));
+
   // VPScalarIVSteps after unrolling can be replaced by their start value, if
   // the start index is zero and only the first lane 0 is demanded.
   if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
@@ -2064,18 +2068,18 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
           m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
     assert(Index && "Expected index from ActiveLaneMask instruction");
 
-    uint64_t Part;
     if (match(Index,
-              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>(
-                  m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part)))))
-      Phis[Part] = Phi;
-    else {
-      // Anything other than a CanonicalIVIncrementForPart is part 0
-      assert(!match(
-          Index,
-          m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()));
+              m_CanonicalIVIncrement(m_VPValue(), m_VPValue(), m_ZeroInt()))) {
+      // Part 0 CanonicalIVIncrement.
       Phis[0] = Phi;
+      continue;
     }
+    uint64_t Part;
+    VPValue *VF;
+    assert(match(Index, m_CanonicalIVIncrement(
+                            m_VPValue(), m_VPValue(VF),
+                            m_c_Mul(m_Deferred(VF), m_ConstantInt(Part)))));
+    Phis[Part] = Phi;
   }
 
   assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
@@ -3986,6 +3990,24 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
         ToRemove.push_back(Blend);
       }
 
+      // Materialize Part0 offsets for various recipes when UF = 1. When UF > 1,
+      // the unroller would have added Part0 offsets.
+      if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(&R)) {
+        if (!VPR->getOffset()) {
+          assert(Plan.getConcreteUF() == 1 &&
+                 "Expected unroller to have materialized offset for UF != 1");
+          VPR->addOperand(Plan.getZero(Plan.getDataLayout().getIndexType(
+              TypeInfo.inferScalarType(VPR))));
+        }
+      }
+      if (match(&R, m_CanonicalIVIncrement())) {
+        if (R.getNumOperands() == 2) {
+          assert(Plan.getConcreteUF() == 1 &&
+                 "Expected unroller to have materialized offset for UF != 1");
+          R.addOperand(
+              Plan.getZero(Plan.getVectorLoopRegion()->getCanonicalIVType()));
+        }
+      }
       if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
         if (!VEPR->getOffset()) {
           assert(Plan.getConcreteUF() == 1 &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 22d4973a4c1d8..28c0d76fff582 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -337,8 +337,8 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
     }
     if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(&R)) {
       VPBuilder Builder(VPR);
-      const DataLayout &DL = Plan.getDataLayout();
-      Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(VPR));
+      Type *IndexTy =
+          Plan.getDataLayout().getIndexType(TypeInfo.inferScalarType(VPR));
       Type *VFTy = TypeInfo.inferScalarType(&Plan.getVF());
       VPValue *VF = Builder.createScalarZExtOrTrunc(
           &Plan.getVF(), IndexTy, VFTy, DebugLoc::getUnknown());
@@ -350,6 +350,16 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
       Copy->addOperand(VFxPart);
       continue;
     }
+    if (match(&R, m_CanonicalIVIncrement())) {
+      VPBuilder Builder(&R);
+      VPValue *VFxPart = Builder.createOverflowingOp(
+          Instruction::Mul, {&Plan.getVF(), getConstantInt(Part)},
+          {true, true});
+      Copy->setOperand(0, R.getOperand(0));
+      Copy->setOperand(1, R.getOperand(1));
+      Copy->addOperand(VFxPart);
+      continue;
+    }
     if (auto *Red = dyn_cast<VPReductionRecipe>(&R)) {
       auto *Phi = dyn_cast<VPReductionPHIRecipe>(R.getOperand(0));
       if (Phi && Phi->isOrdered()) {
@@ -379,19 +389,15 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
     // requiring it.
     if (isa(Copy))
      Copy->addOperand(getConstantInt(Part));
-
-    if (match(Copy,
-              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>())) {
-      VPBuilder Builder(Copy);
-      VPValue *ScaledByPart = Builder.createOverflowingOp(
-          Instruction::Mul, {Copy->getOperand(1), getConstantInt(Part)});
-      Copy->setOperand(1, ScaledByPart);
-    }
   }
-  if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R)) {
-    // Materialize Part0 offset for VectorEndPointer.
+  // Materialize Part0 offset for various recipes.
+  if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(&R))
+    VPR->addOperand(Plan.getZero(
+        Plan.getDataLayout().getIndexType(TypeInfo.inferScalarType(VPR))));
+  if (match(&R, m_CanonicalIVIncrement()))
+    R.addOperand(getConstantInt(0));
+  if (auto *VEPR = dyn_cast<VPVectorEndPointerRecipe>(&R))
     VEPR->materializeOffset();
-  }
 }
 
 void UnrollState::unrollBlock(VPBlockBase *VPB) {
@@ -488,20 +494,6 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
   assert(UF > 0 && "Unroll factor must be positive");
   Plan.setUF(UF);
   llvm::scope_exit Cleanup([&Plan, UF]() {
-    auto Iter = vp_depth_first_deep(Plan.getEntry());
-    // Remove recipes that are redundant after unrolling.
-    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock *>(Iter)) {
-      for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
-        auto *VPI = dyn_cast<VPInstruction>(&R);
-        if (VPI &&
-            VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart &&
-            VPI->getOperand(1) == &Plan.getVF()) {
-          VPI->replaceAllUsesWith(VPI->getOperand(0));
-          VPI->eraseFromParent();
-        }
-      }
-    }
-
     Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
     Plan.getUF().replaceAllUsesWith(Plan.getConstantInt(TCTy, UF));
   });
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index ae6fb8e3e9cf5..cee48dcb2ac64 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -403,8 +403,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
   VPlan *Plan = VPBB ? VPBB->getPlan() : nullptr;
   if (VPBB &&
       (VPBB == Plan->getVectorPreheader() || VPBB == Plan->getEntry())) {
-    if (match(V->getDefiningRecipe(),
-              m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()))
+    if (match(V->getDefiningRecipe(), m_CanonicalIVIncrement()))
       return false;
     return all_of(R->operands(), isUniformAcrossVFsAndUFs);
   }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
index b897982700400..b33f2c35b85cd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -304,10 +304,10 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
+; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul nuw nsw i64 [[TMP1]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP3]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP1]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
@@ -1320,10 +1320,10 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
+; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul nuw nsw i64 [[TMP1]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP3]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP1]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
@@ -1572,10 +1572,10 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2
+; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul nuw nsw i64 [[TMP1]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP3]]
-; CHECK-ORDERED-TF-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP1]], 3
 ; CHECK-ORDERED-TF-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP4]]
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
index 396f9b5a93ddb..6e4655d6fe773 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll
@@ -13,10 +13,10 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    [[TMP61:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP61]], 2
 ; CHECK-NEXT:    [[TMP62:%.*]] = shl nuw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP28:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP30:%.*]] = mul nuw nsw i64 [[TMP1]], 3
 ; CHECK-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]]
-; CHECK-NEXT:    [[TMP28:%.*]] = shl i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
-; CHECK-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP1]], 3
 ; CHECK-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
@@ -79,10 +79,10 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias %
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP28:%.*]] = shl nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP30:%.*]] = mul nuw nsw i64 [[TMP1]], 3
 ; CHECK-NEXT:    [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]]
-; CHECK-NEXT:    [[TMP28:%.*]] = shl i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]]
-; CHECK-NEXT:    [[TMP30:%.*]] = mul i64 [[TMP1]], 3
 ; CHECK-NEXT:    [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
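
Reviewer note (not part of the patch): a minimal usage sketch of the two m_CanonicalIVIncrement overloads added in VPlanPatternMatch.h above, mirroring how they are used in tryToReplaceALMWithWideALM. The helper matchIncrementPart and its surrounding framing are hypothetical illustrations; only the matchers themselves (m_CanonicalIVIncrement, m_VPValue, m_ZeroInt, m_c_Mul, m_Deferred, m_ConstantInt) come from this patch or from existing VPlanPatternMatch.h code.

// Sketch only, assuming the declarations this patch adds to
// VPlanPatternMatch.h; matchIncrementPart is a hypothetical helper.
#include "VPlanPatternMatch.h"
#include "VPlanValue.h"

using namespace llvm;
using namespace llvm::VPlanPatternMatch;

// Decode which unroll part an index feeding an ActiveLaneMask comes from,
// after unrolling/lowering has attached the explicit offset operand.
static bool matchIncrementPart(VPValue *Index, uint64_t &Part) {
  // Part-0 increments carry an explicit zero offset operand.
  if (match(Index,
            m_CanonicalIVIncrement(m_VPValue(), m_VPValue(), m_ZeroInt()))) {
    Part = 0;
    return true;
  }
  // Parts > 0 carry an offset of the form `mul VF, Part` added by unrolling.
  VPValue *VF;
  return match(Index, m_CanonicalIVIncrement(
                          m_VPValue(), m_VPValue(VF),
                          m_c_Mul(m_Deferred(VF), m_ConstantInt(Part))));
}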