From 7de749d12bb24005df8a8fe8c18a22aaeb1e6dfb Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Fri, 13 Mar 2026 09:57:20 +0000 Subject: [PATCH] [VPlan] Robustly handle Part0 for VecPointer, CanIVInc Add an extra Offset operand to CanonicalIVIncrement instead of conflating VF operand on the non-unrolled recipe with 1 * VF on Part 1 of the unrolled recipe, and create Part0 offsets for VectorPointer and CanonicalIVIncrement recipes, exactly as we do it for VectorEndPointer, to make the VPlan logic more robust. The patch is mostly non-functional, with the exception of some wrap-flags. --- llvm/lib/Transforms/Vectorize/VPlan.h | 10 ++-- .../Transforms/Vectorize/VPlanPatternMatch.h | 11 +++++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 44 +++++++++++++----- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 46 ++++++++----------- llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 3 +- .../AArch64/scalable-strict-fadd.ll | 12 ++--- .../AArch64/sve-tail-folding-unroll.ll | 8 ++-- 8 files changed, 84 insertions(+), 54 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 80df058dfcf66..2c047a590e2be 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1211,7 +1211,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, ActiveLaneMask, ExplicitVectorLength, CalculateTripCountMinusVF, - // Increment the canonical IV separately for each unrolled part. + // Increment the canonical IV separately for each unrolled part. Meant to be + // constructed with two operands, namely the Start value and VF. Unrolling + // and conversion to concrete recipes add an extra Offset operand and the + // recipe produces `add Start, Offset`. The offset for unrolled part 0 is 0. CanonicalIVIncrementForPart, // Abstract instruction that compares two values and branches. This is // lowered to ICmp + BranchOnCond during VPlan to VPlan transformation. @@ -2192,8 +2195,9 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags { }; /// A recipe to compute the pointers for widened memory accesses of \p -/// SourceElementTy. Unrolling adds an extra offset operand for unrolled parts > -/// 0 and it produces `GEP Ptr, Offset`. The offset for unrolled part 0 is 0. +/// SourceElementTy. Unrolling and conversion to concrete recipes add an extra +/// offset operand and the recipe produces `GEP Ptr, Offset`. The offset for +/// unrolled part 0 is 0. 
class VPVectorPointerRecipe : public VPRecipeWithIRFlags { Type *SourceElementTy; diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 1205f04fb5c29..062782785c817 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -866,6 +866,17 @@ inline auto m_c_LogicalOr(const Op0_t &Op0, const Op1_t &Op1) { return m_c_Select(Op0, m_True(), Op1); } +inline auto m_CanonicalIVIncrement() { + return VPInstruction_match(); +} + +template +inline auto m_CanonicalIVIncrement(const Op0_t &Op0, const Op1_t &Op1, + const Op2_t &Op2) { + return VPInstruction_match(Op0, Op1, Op2); +} + inline auto m_CanonicalIV() { return class_match(); } template diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index f8fa7d3c44ce1..0b04a8f19769b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -656,8 +656,10 @@ Value *VPInstruction::generate(VPTransformState &State) { return EVL; } case VPInstruction::CanonicalIVIncrementForPart: { + assert(getNumOperands() == 3 && + "Expected prior simplification of recipe without offset"); auto *IV = State.get(getOperand(0), VPLane(0)); - auto *VFxPart = State.get(getOperand(1), VPLane(0)); + auto *VFxPart = State.get(getOperand(2), VPLane(0)); // The canonical IV is incremented by the vectorization factor (num of // SIMD elements) times the unroll part. return Builder.CreateAdd(IV, VFxPart, Name, hasNoUnsignedWrap(), diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 334280a7d80f5..1b14ef1a37b03 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1620,12 +1620,16 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { } } - // Simplify unrolled VectorPointer without offset, or with zero offset, to - // just the pointer operand. + // Simplify unrolled VectorPointer with no offset or zero offset. if (auto *VPR = dyn_cast(Def)) if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt())) return VPR->replaceAllUsesWith(VPR->getOperand(0)); + // Simplify unrolled CanonicalIVIncrement with no offset or zero offset. + if (match(Def, m_CanonicalIVIncrement())) + if (Def->getNumOperands() == 2 || match(Def->getOperand(2), m_ZeroInt())) + return Def->replaceAllUsesWith(Def->getOperand(0)); + // VPScalarIVSteps after unrolling can be replaced by their start value, if // the start index is zero and only the first lane 0 is demanded. if (auto *Steps = dyn_cast(Def)) { @@ -2064,18 +2068,18 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue())); assert(Index && "Expected index from ActiveLaneMask instruction"); - uint64_t Part; if (match(Index, - m_VPInstruction( - m_VPValue(), m_Mul(m_VPValue(), m_ConstantInt(Part))))) - Phis[Part] = Phi; - else { - // Anything other than a CanonicalIVIncrementForPart is part 0 - assert(!match( - Index, - m_VPInstruction())); + m_CanonicalIVIncrement(m_VPValue(), m_VPValue(), m_ZeroInt()))) { + // Part 0 CanonicalIVIncrement. 
Phis[0] = Phi; + continue; } + uint64_t Part; + VPValue *VF; + assert(match(Index, m_CanonicalIVIncrement( + m_VPValue(), m_VPValue(VF), + m_c_Mul(m_Deferred(VF), m_ConstantInt(Part))))); + Phis[Part] = Phi; } assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) && @@ -3986,6 +3990,24 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) { ToRemove.push_back(Blend); } + // Materialize Part0 offsets for various recipes when UF = 1. When UF > 1, + // the unroller would have added Part0 offsets. + if (auto *VPR = dyn_cast(&R)) { + if (!VPR->getOffset()) { + assert(Plan.getConcreteUF() == 1 && + "Expected unroller to have materialized offset for UF != 1"); + VPR->addOperand(Plan.getZero(Plan.getDataLayout().getIndexType( + TypeInfo.inferScalarType(VPR)))); + } + } + if (match(&R, m_CanonicalIVIncrement())) { + if (R.getNumOperands() == 2) { + assert(Plan.getConcreteUF() == 1 && + "Expected unroller to have materialized offset for UF != 1"); + R.addOperand( + Plan.getZero(Plan.getVectorLoopRegion()->getCanonicalIVType())); + } + } if (auto *VEPR = dyn_cast(&R)) { if (!VEPR->getOffset()) { assert(Plan.getConcreteUF() == 1 && diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 22d4973a4c1d8..28c0d76fff582 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -337,8 +337,8 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { } if (auto *VPR = dyn_cast(&R)) { VPBuilder Builder(VPR); - const DataLayout &DL = Plan.getDataLayout(); - Type *IndexTy = DL.getIndexType(TypeInfo.inferScalarType(VPR)); + Type *IndexTy = + Plan.getDataLayout().getIndexType(TypeInfo.inferScalarType(VPR)); Type *VFTy = TypeInfo.inferScalarType(&Plan.getVF()); VPValue *VF = Builder.createScalarZExtOrTrunc( &Plan.getVF(), IndexTy, VFTy, DebugLoc::getUnknown()); @@ -350,6 +350,16 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { Copy->addOperand(VFxPart); continue; } + if (match(&R, m_CanonicalIVIncrement())) { + VPBuilder Builder(&R); + VPValue *VFxPart = Builder.createOverflowingOp( + Instruction::Mul, {&Plan.getVF(), getConstantInt(Part)}, + {true, true}); + Copy->setOperand(0, R.getOperand(0)); + Copy->setOperand(1, R.getOperand(1)); + Copy->addOperand(VFxPart); + continue; + } if (auto *Red = dyn_cast(&R)) { auto *Phi = dyn_cast(R.getOperand(0)); if (Phi && Phi->isOrdered()) { @@ -379,19 +389,15 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { // requiring it. if (isa(Copy)) Copy->addOperand(getConstantInt(Part)); - - if (match(Copy, - m_VPInstruction())) { - VPBuilder Builder(Copy); - VPValue *ScaledByPart = Builder.createOverflowingOp( - Instruction::Mul, {Copy->getOperand(1), getConstantInt(Part)}); - Copy->setOperand(1, ScaledByPart); - } } - if (auto *VEPR = dyn_cast(&R)) { - // Materialize Part0 offset for VectorEndPointer. + // Materialize Part0 offset for various recipes. 
+ if (auto *VPR = dyn_cast(&R)) + VPR->addOperand(Plan.getZero( + Plan.getDataLayout().getIndexType(TypeInfo.inferScalarType(VPR)))); + if (match(&R, m_CanonicalIVIncrement())) + R.addOperand(getConstantInt(0)); + if (auto *VEPR = dyn_cast(&R)) VEPR->materializeOffset(); - } } void UnrollState::unrollBlock(VPBlockBase *VPB) { @@ -488,20 +494,6 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) { assert(UF > 0 && "Unroll factor must be positive"); Plan.setUF(UF); llvm::scope_exit Cleanup([&Plan, UF]() { - auto Iter = vp_depth_first_deep(Plan.getEntry()); - // Remove recipes that are redundant after unrolling. - for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { - for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { - auto *VPI = dyn_cast(&R); - if (VPI && - VPI->getOpcode() == VPInstruction::CanonicalIVIncrementForPart && - VPI->getOperand(1) == &Plan.getVF()) { - VPI->replaceAllUsesWith(VPI->getOperand(0)); - VPI->eraseFromParent(); - } - } - } - Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount()); Plan.getUF().replaceAllUsesWith(Plan.getConstantInt(TCTy, UF)); }); diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index ae6fb8e3e9cf5..cee48dcb2ac64 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -403,8 +403,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) { VPlan *Plan = VPBB ? VPBB->getPlan() : nullptr; if (VPBB && (VPBB == Plan->getVectorPreheader() || VPBB == Plan->getEntry())) { - if (match(V->getDefiningRecipe(), - m_VPInstruction())) + if (match(V->getDefiningRecipe(), m_CanonicalIVIncrement())) return false; return all_of(R->operands(), isUniformAcrossVFsAndUFs); } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll index b897982700400..b33f2c35b85cd 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -304,10 +304,10 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 ; CHECK-ORDERED-TF-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2 +; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP1]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = mul nuw nsw i64 [[TMP1]], 3 ; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]] -; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 ; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP3]] -; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = mul i64 [[TMP1]], 3 ; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) @@ -1320,10 +1320,10 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 ; CHECK-ORDERED-TF-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2 +; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP1]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = mul nuw nsw i64 [[TMP1]], 3 ; 
CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]] -; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 ; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP3]] -; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = mul i64 [[TMP1]], 3 ; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) @@ -1572,10 +1572,10 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 ; CHECK-ORDERED-TF-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2 +; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP1]], 1 +; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = mul nuw nsw i64 [[TMP1]], 3 ; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]] -; CHECK-ORDERED-TF-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 1 ; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP3]] -; CHECK-ORDERED-TF-NEXT: [[TMP4:%.*]] = mul i64 [[TMP1]], 3 ; CHECK-ORDERED-TF-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP4]] ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) ; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll index 396f9b5a93ddb..6e4655d6fe773 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -13,10 +13,10 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP61]], 2 ; CHECK-NEXT: [[TMP62:%.*]] = shl nuw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP28:%.*]] = shl nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP30:%.*]] = mul nuw nsw i64 [[TMP1]], 3 ; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]] -; CHECK-NEXT: [[TMP28:%.*]] = shl i64 [[TMP1]], 1 ; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] -; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP1]], 3 ; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]]) @@ -79,10 +79,10 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP5]], 2 ; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP28:%.*]] = shl nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP30:%.*]] = mul nuw nsw i64 [[TMP1]], 3 ; CHECK-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP1]] -; CHECK-NEXT: [[TMP28:%.*]] = shl i64 [[TMP1]], 1 ; CHECK-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP28]] -; CHECK-NEXT: [[TMP30:%.*]] = mul i64 [[TMP1]], 3 ; CHECK-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP30]] ; CHECK-NEXT: 
[[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[UMAX]])
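
As a quick illustration of the semantics the new VPlan.h comment describes, here is a minimal standalone C++ sketch. It is illustrative only and does not use the VPlan API: the helper names `offsetForPart` and `canonicalIVIncrementForPart`, and the concrete VF value, are made up for the example. It models how each unrolled part gets an offset of `Part * VF`, how the increment produces `add Start, Offset`, and why the part-0 increment (zero offset) can be simplified away, as simplifyRecipe now does.

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>

// Illustrative model of VPInstruction::CanonicalIVIncrementForPart after this
// patch. The real recipe operates on VPValues inside a VPlan, not on integers.

// Offset materialized by the unroller for part `Part`: Part * VF.
// Part 0 therefore gets a zero offset.
std::uint64_t offsetForPart(std::uint64_t VF, unsigned Part) {
  return VF * Part;
}

// The recipe produces `add Start, Offset` (see the comment added to VPlan.h).
std::uint64_t canonicalIVIncrementForPart(std::uint64_t Start,
                                          std::uint64_t Offset) {
  return Start + Offset;
}

int main() {
  const std::uint64_t VF = 8;  // example vectorization factor (assumed value)
  const std::uint64_t Start = 0;  // canonical IV at the top of the vector loop

  for (unsigned Part = 0; Part < 4; ++Part) {
    std::uint64_t Inc =
        canonicalIVIncrementForPart(Start, offsetForPart(VF, Part));
    std::cout << "part " << Part << ": index = " << Inc << '\n';
  }

  // Part 0 has a zero offset, so its increment is just the start value;
  // simplifyRecipe replaces such recipes with their first operand.
  assert(canonicalIVIncrementForPart(Start, offsetForPart(VF, 0)) == Start);
  return 0;
}
```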