diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index cc72a37f68599..eaee1d177986b 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -4442,10 +4442,11 @@ elementtype may be any integer, floating-point, pointer type, or a sized
 target extension type that has the ``CanBeVectorElement`` property. Vectors
 of size zero are not allowed. For scalable vectors, the total number of
 elements is a constant multiple (called vscale) of the specified number
-of elements; vscale is a positive integer that is unknown at compile time
-and the same hardware-dependent constant for all scalable vectors at run
-time. The size of a specific scalable vector type is thus constant within
-IR, even if the exact size in bytes cannot be determined until run time.
+of elements; vscale is a positive power-of-two integer that is unknown
+at compile time and the same hardware-dependent constant for all scalable
+vectors at run time. The size of a specific scalable vector type is thus
+constant within IR, even if the exact size in bytes cannot be determined
+until run time.
 
 :Examples:
 
@@ -30398,8 +30399,8 @@ vectors such as ``<vscale x 4 x i32>``.
 Semantics:
 """"""""""
 
-``vscale`` is a positive value that is constant throughout program
-execution, but is unknown at compile time.
+``vscale`` is a positive power-of-two value that is constant throughout
+program execution, but is unknown at compile time.
 
 If the result value does not fit in the result type, then the result
 is a :ref:`poison value <poisonvalues>`.
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index ba47cef274bec..d322f8a3dbc03 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1220,9 +1220,6 @@ class TargetTransformInfo {
   /// \return the value of vscale to tune the cost model for.
   LLVM_ABI std::optional<unsigned> getVScaleForTuning() const;
 
-  /// \return true if vscale is known to be a power of 2
-  LLVM_ABI bool isVScaleKnownToBeAPowerOfTwo() const;
-
   /// \return True if the vectorization factor should be chosen to
   /// make the vector of the smallest element type match the size of a
   /// vector register. For wider element types, this could result in
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 640766cf8cd10..c5fde665274f3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -591,7 +591,6 @@ class TargetTransformInfoImplBase {
   virtual std::optional<unsigned> getVScaleForTuning() const {
     return std::nullopt;
   }
-  virtual bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
 
   virtual bool
   shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const {
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 90a75c3d352e4..5fac89d7c31c9 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -864,7 +864,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
   std::optional<unsigned> getVScaleForTuning() const override {
     return std::nullopt;
   }
-  bool isVScaleKnownToBeAPowerOfTwo() const override { return false; }
 
   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the demanded result elements need to be inserted and/or
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index dd44afd0855a5..e48e6e1fe2c19 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -623,9 +623,6 @@ class LLVM_ABI TargetLoweringBase {
     return BypassSlowDivWidths;
   }
 
-  /// Return true only if vscale must be a power of two.
-  virtual bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
-
   /// Return true if Flow Control is an expensive operation that should be
   /// avoided.
   bool isJumpExpensive() const { return JumpIsExpensive; }
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 8cc7f8a9d2ab2..ef26588d4f27a 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -795,10 +795,6 @@ std::optional<unsigned> TargetTransformInfo::getVScaleForTuning() const {
   return TTIImpl->getVScaleForTuning();
 }
 
-bool TargetTransformInfo::isVScaleKnownToBeAPowerOfTwo() const {
-  return TTIImpl->isVScaleKnownToBeAPowerOfTwo();
-}
-
 bool TargetTransformInfo::shouldMaximizeVectorBandwidth(
     TargetTransformInfo::RegisterKind K) const {
   return TTIImpl->shouldMaximizeVectorBandwidth(K);
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 73320b556f825..5252eef3b298f 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -2474,11 +2474,9 @@ bool llvm::isKnownToBeAPowerOfTwo(const Value *V, bool OrZero,
   if (!I)
     return false;
 
-  if (Q.CxtI && match(V, m_VScale())) {
-    const Function *F = Q.CxtI->getFunction();
-    // The vscale_range indicates vscale is a power-of-two.
-    return F->hasFnAttribute(Attribute::VScaleRange);
-  }
+  // vscale is a power-of-two by definition
+  if (match(V, m_VScale()))
+    return true;
 
   // 1 << X is clearly a power of two if the one is not shifted off the end. If
   // it is shifted off the end then the result is undefined.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 5d8db8be9731f..1568875cb37ce 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4660,7 +4660,6 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val, unsigned Depth) const {
 
   // vscale(power-of-two) is a power-of-two for some targets
   if (Val.getOpcode() == ISD::VSCALE &&
-      getTargetLoweringInfo().isVScaleKnownToBeAPowerOfTwo() &&
       isKnownToBeAPowerOfTwo(Val.getOperand(0), Depth + 1))
     return true;
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e0b6c1b8c0baf..6aa8d7afffc01 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -517,8 +517,6 @@ class AArch64TargetLowering : public TargetLowering {
                          SDValue Chain, SDValue InGlue, unsigned Condition,
                          SDValue PStateSM = SDValue()) const;
 
-  bool isVScaleKnownToBeAPowerOfTwo() const override { return true; }
-
   // Normally SVE is only used for byte size vectors that do not fit within a
   // NEON vector. This changes when OverrideNEON is true, allowing SVE to be
   // used for 64bit and 128bit vectors as well.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 470af01be3154..62c86db2fa9c0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -156,8 +156,6 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
     return ST->getVScaleForTuning();
  }
 
-  bool isVScaleKnownToBeAPowerOfTwo() const override { return true; }
-
   bool shouldMaximizeVectorBandwidth(
       TargetTransformInfo::RegisterKind K) const override;
 
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 139fa7ba35625..d7c065bf9527f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -23434,18 +23434,6 @@ const MCExpr *RISCVTargetLowering::LowerCustomJumpTableEntry(
   return MCSymbolRefExpr::create(MBB->getSymbol(), Ctx);
 }
 
-bool RISCVTargetLowering::isVScaleKnownToBeAPowerOfTwo() const {
-  // We define vscale to be VLEN/RVVBitsPerBlock. VLEN is always a power
-  // of two >= 64, and RVVBitsPerBlock is 64. Thus, vscale must be
-  // a power of two as well.
-  // FIXME: This doesn't work for zve32, but that's already broken
-  // elsewhere for the same reason.
-  assert(Subtarget.getRealMinVLen() >= 64 && "zve32* unsupported");
-  static_assert(RISCV::RVVBitsPerBlock == 64,
-                "RVVBitsPerBlock changed, audit needed");
-  return true;
-}
-
 bool RISCVTargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
                                                  SDValue &Offset,
                                                  ISD::MemIndexedMode &AM,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index f67d7f155c9d0..a4f9fe088f38d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -394,8 +394,6 @@ class RISCVTargetLowering : public TargetLowering {
                                                unsigned uid,
                                                MCContext &Ctx) const override;
 
-  bool isVScaleKnownToBeAPowerOfTwo() const override;
-
   bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
                               ISD::MemIndexedMode &AM, SelectionDAG &DAG) const;
   bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 83ac71ed9da69..1afc9c659276b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -335,10 +335,6 @@ class RISCVTTIImpl final : public BasicTTIImplBase<RISCVTTIImpl> {
   bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment) const override;
 
-  bool isVScaleKnownToBeAPowerOfTwo() const override {
-    return TLI->isVScaleKnownToBeAPowerOfTwo();
-  }
-
   /// \returns How the target needs this vector-predicated operation to be
   /// transformed.
   TargetTransformInfo::VPLegalization
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f28c2ce0acc98..dcb18cffbeada 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2429,20 +2429,6 @@ Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
     // check is known to be true, or known to be false.
     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
   } // else step known to be < trip count, use CheckMinIters preset to false.
-  } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
-             !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
-             Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
-    // vscale is not necessarily a power-of-2, which means we cannot guarantee
-    // an overflow to zero when updating induction variables and so an
-    // additional overflow check is required before entering the vector loop.
-
-    // Get the maximum unsigned value for the type.
-    Value *MaxUIntTripCount =
-        ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
-    Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
-
-    // Don't execute the vector loop if (UMax - n) < (VF * UF).
-    CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
   }
   return CheckMinIters;
 }
@@ -3830,7 +3816,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
         MaxFactors.FixedVF.getFixedValue();
   if (MaxFactors.ScalableVF) {
     std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
-    if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
+    if (MaxVScale) {
       MaxPowerOf2RuntimeVF = std::max<uint64_t>(
          *MaxPowerOf2RuntimeVF,
          *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
diff --git a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll
index 920497c07e380..ea86750e762db 100644
--- a/llvm/test/Transforms/InstCombine/rem-mul-shl.ll
+++ b/llvm/test/Transforms/InstCombine/rem-mul-shl.ll
@@ -859,7 +859,8 @@ define i64 @urem_shl_vscale() {
 ; CHECK-LABEL: @urem_shl_vscale(
 ; CHECK-NEXT:    [[VSCALE:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[SHIFT:%.*]] = shl nuw nsw i64 [[VSCALE]], 2
-; CHECK-NEXT:    [[REM:%.*]] = urem i64 1024, [[SHIFT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i64 [[SHIFT]], 2047
+; CHECK-NEXT:    [[REM:%.*]] = and i64 [[TMP1]], 1024
 ; CHECK-NEXT:    ret i64 [[REM]]
 ;
   %vscale = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/Transforms/InstSimplify/po2-shift-add-and-to-zero.ll b/llvm/test/Transforms/InstSimplify/po2-shift-add-and-to-zero.ll
index 54dd1688ad916..8a5adb3396c2c 100644
--- a/llvm/test/Transforms/InstSimplify/po2-shift-add-and-to-zero.ll
+++ b/llvm/test/Transforms/InstSimplify/po2-shift-add-and-to-zero.ll
@@ -61,27 +61,6 @@ define i64 @test_pow2_or_zero(i64 %arg) {
   ret i64 %rem
 }
 
-;; Make sure it doesn't work if the value isn't known to be a power of 2.
-;; In this case a vscale without a `vscale_range` attribute on the function.
-define i64 @no_pow2() {
-; CHECK-LABEL: define i64 @no_pow2() {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[TMP2]], -1
-; CHECK-NEXT:    [[REM:%.*]] = and i64 [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    ret i64 [[REM]]
-;
-entry:
-  %0 = call i64 @llvm.vscale.i64()
-  %1 = shl i64 %0, 4
-  %2 = shl i64 %0, 3
-  %3 = add i64 %2, -1
-  %rem = and i64 %1, %3
-  ret i64 %rem
-}
-
 ;; Make sure it doesn't work if the shift on the -1 side is greater
 define i64 @minus_shift_greater(i64 %arg) {
 ; CHECK-LABEL: define i64 @minus_shift_greater
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
index 4f0637fd8db2f..ce8674bb02630 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
@@ -20,9 +20,8 @@ define void @cond_ind64(ptr noalias nocapture %a, ptr noalias nocapture readonly
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTNEG:%.*]] = mul i64 [[TMP2]], -4
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
@@ -42,7 +41,7 @@ define void @cond_ind64(ptr noalias nocapture %a, ptr noalias nocapture readonly
 ; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index 8c2958769a615..a72e7729a4160 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -1373,9 +1373,8 @@ define void @interleave_deinterleave_factor3(ptr writeonly noalias %dst, ptr rea
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTNEG:%.*]] = mul i64 [[TMP2]], 2044
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[DOTNEG]], 1024
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
@@ -1411,8 +1410,8 @@ define void @interleave_deinterleave_factor3(ptr writeonly noalias %dst, ptr rea
 ; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N_NOT:%.*]] = icmp eq i64 [[N_VEC]], 0
+; CHECK-NEXT:    br i1 [[CMP_N_NOT]], label [[SCALAR_PH]], label [[FOR_END:%.*]]
 ; CHECK:       scalar.ph:
 ;
 entry:
@@ -1467,9 +1466,8 @@ define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[DOTNEG:%.*]] = mul i64 [[TMP2]], 2044
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[DOTNEG]], 1024
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -1500,8 +1498,8 @@ define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a
 ; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT:    [[CMP_N_NOT:%.*]] = icmp eq i64 [[N_VEC]], 0
+; CHECK-NEXT:    br i1 [[CMP_N_NOT]], label [[SCALAR_PH]], label [[FOR_END:%.*]]
 ; CHECK:       scalar.ph:
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index f152dd308cb69..5d09549f33267 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -30,44 +30,43 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.ph:
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[DOTNEG:%.*]] = mul i32 [[TMP1]], 2032
+; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = and i32 [[DOTNEG]], 1024
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 4
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.body:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = shl i32 [[INDEX]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP10]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = shl i32 [[INDEX]], 1
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP8]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
 ; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP8]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP16]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP6]] to i64
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP12]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP11]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP14]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
+; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP13]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; SCALAR_TAIL_FOLDING:       middle.block:
-; SCALAR_TAIL_FOLDING-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[CMP_N_NOT:%.*]] = icmp eq i32 [[N_VEC]], 0
+; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[CMP_N_NOT]], label [[SCALAR_PH]], label [[FOR_END:%.*]]
 ; SCALAR_TAIL_FOLDING:       scalar.ph:
 ;
 ; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided1
@@ -80,42 +79,42 @@
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMPA:%.*]] = sub i32 1024, [[TMP3]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMPB:%.*]] = icmp ult i32 [[TMP2]], 64
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = select i1 [[TMPB]], i32 [[TMPA]], i32 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = sub i32 1024, [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP2]], 64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = shl i32 [[INDEX]], 1
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP10]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP8]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = shl i32 [[INDEX]], 1
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP11]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8.p0(ptr [[TMP12]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK]], <vscale x 32 x i8> poison)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP8]] to i64
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP16]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP10]] to i64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP15]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP18]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> [[TMP9]])
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv32i8.p0(<vscale x 32 x i8> [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, <vscale x 32 x i1> [[INTERLEAVED_MASK3]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP17]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP19]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_TAIL_FOLDING:       scalar.ph:
@@ -176,37 +175,36 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.ph:
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[DOTNEG:%.*]] = mul i32 [[TMP1]], 2032
+; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = and i32 [[DOTNEG]], 1024
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 4
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.body:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP8]]
-; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> [[TMP9]], i32 1, <vscale x 16 x i1> splat (i1 true))
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP12]]
-; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP10]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = zext nneg <vscale x 16 x i32> [[TMP5]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP6]]
+; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> [[TMP7]], i32 1, <vscale x 16 x i1> splat (i1 true))
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = or disjoint <vscale x 16 x i32> [[TMP5]], splat (i32 1)
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = zext nneg <vscale x 16 x i32> [[TMP9]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP10]]
+; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP11]], i32 1, <vscale x 16 x i1> [[TMP8]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALAR_TAIL_FOLDING:       middle.block:
-; SCALAR_TAIL_FOLDING-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[CMP_N_NOT:%.*]] = icmp eq i32 [[N_VEC]], 0
+; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[CMP_N_NOT]], label [[SCALAR_PH]], label [[FOR_END:%.*]]
 ; SCALAR_TAIL_FOLDING:       scalar.ph:
 ;
 ; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided2
@@ -219,35 +217,35 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMPA:%.*]] = sub i32 1024, [[TMP3]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMPB:%.*]] = icmp ult i32 [[TMP2]], 64
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = select i1 [[TMPB]], i32 [[TMPA]], i32 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = sub i32 1024, [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP2]], 64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = zext nneg <vscale x 16 x i32> [[TMP6]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP7]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> [[TMP8]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = or disjoint <vscale x 16 x i32> [[TMP6]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = zext nneg <vscale x 16 x i32> [[TMP11]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP12]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP13]], i32 1, <vscale x 16 x i1> [[TMP10]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP9]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> [[TMP10]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = or disjoint <vscale x 16 x i32> [[TMP8]], splat (i32 1)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP13]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP12]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_TAIL_FOLDING:       scalar.ph:
@@ -304,40 +302,39 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.ph:
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[DOTNEG:%.*]] = mul i32 [[TMP1]], 2032
+; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = and i32 [[DOTNEG]], 1024
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 4
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV3]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.body:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> [[TMP10]], i32 1, <vscale x 16 x i1> [[TMP8]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = zext nneg <vscale x 16 x i32> [[TMP12]] to <vscale x 16 x i64>
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP13]]
-; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP14]], i32 1, <vscale x 16 x i1> [[TMP11]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = zext nneg <vscale x 16 x i32> [[TMP5]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP7]]
+; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> [[TMP8]], i32 1, <vscale x 16 x i1> [[TMP6]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP5]], splat (i32 1)
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]]
+; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP9]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT4]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; SCALAR_TAIL_FOLDING:       middle.block:
-; SCALAR_TAIL_FOLDING-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[CMP_N_NOT:%.*]] = icmp eq i32 [[N_VEC]], 0
+; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[CMP_N_NOT]], label [[SCALAR_PH]], label [[FOR_END:%.*]]
 ; SCALAR_TAIL_FOLDING:       scalar.ph:
 ;
 ; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided3
@@ -351,39 +348,39 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMPA:%.*]] = sub i32 1024, [[TMP3]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMPB:%.*]] = icmp ult i32 [[TMP2]], 64
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = select i1 [[TMPB]], i32 [[TMPA]], i32 0
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = sub i32 1024, [[TMP3]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP2]], 64
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024)
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV3]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; PREDICATED_TAIL_FOLDING:       vector.body:
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = zext nneg <vscale x 16 x i32> [[TMP6]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP9]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> [[TMP10]], i32 1, <vscale x 16 x i1> [[TMP8]])
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP11]], <vscale x 16 x i1> zeroinitializer
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = or disjoint <vscale x 16 x i32> [[TMP6]], splat (i32 1)
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP13]] to <vscale x 16 x i64>
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
-; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP15]], i32 1, <vscale x 16 x i1> [[TMP12]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP9]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP8]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP11]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 1), <vscale x 16 x ptr> [[TMP12]], i32 1, <vscale x 16 x i1> [[TMP10]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i1> zeroinitializer
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = or disjoint <vscale x 16 x i32> [[TMP8]], splat (i32 1)
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = zext nneg <vscale x 16 x i32> [[TMP15]] to <vscale x 16 x i64>
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP16]]
+; PREDICATED_TAIL_FOLDING-NEXT:    call void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> splat (i8 2), <vscale x 16 x ptr> [[TMP17]], i32 1, <vscale x 16 x i1> [[TMP14]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]])
+; PREDICATED_TAIL_FOLDING-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]])
 ; PREDICATED_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT4]]
-; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
-; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
+; PREDICATED_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; PREDICATED_TAIL_FOLDING-NEXT:    br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
 ; PREDICATED_TAIL_FOLDING:       middle.block:
 ; PREDICATED_TAIL_FOLDING-NEXT:    br label [[FOR_END:%.*]]
 ; PREDICATED_TAIL_FOLDING:       scalar.ph:
@@ -452,48 +449,47 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p,
 ; SCALAR_TAIL_FOLDING-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64
 ; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.ph:
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[DOTNEG:%.*]] = mul i32 [[TMP1]], 2032
+; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = and i32 [[DOTNEG]], 1024
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
-; SCALAR_TAIL_FOLDING-NEXT:    [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 4
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP3]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; SCALAR_TAIL_FOLDING:       vector.body:
 ; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = shl i32 [[INDEX]], 2
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
+; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP5:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP6:%.*]] = shl i32 [[INDEX]], 2
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP7:%.*]] = sext i32 [[TMP6]] to i64
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP7]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8.p0(ptr [[TMP8]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK]], <vscale x 64 x i8> poison)
 ; SCALAR_TAIL_FOLDING-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> [[WIDE_MASKED_VEC]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 2
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 3
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP9]], <vscale x 16 x i8> [[TMP10]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP14:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP15:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP11]], <vscale x 16 x i8> [[TMP12]])
 ; SCALAR_TAIL_FOLDING-NEXT:    [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP15]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP17]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP8]] to i64
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP19]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]], <vscale x 16 x i8> [[TMP17]], <vscale x 16 x i8> [[TMP18]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]], <vscale x 16 x i1> [[TMP7]])
-; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
-; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP6]] to i64
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP18:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP17]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x i8> [[TMP14]], <vscale x 16 x i8> [[TMP15]], <vscale x 16 x i8> [[TMP16]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[INTERLEAVED_MASK3:%.*]] = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1> [[TMP5]])
+; SCALAR_TAIL_FOLDING-NEXT:    call void @llvm.masked.store.nxv64i8.p0(<vscale x 64 x i8> [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, <vscale x 64 x i1> [[INTERLEAVED_MASK3]])
+; SCALAR_TAIL_FOLDING-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
 ; SCALAR_TAIL_FOLDING-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]]
-; SCALAR_TAIL_FOLDING-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; SCALAR_TAIL_FOLDING:       middle.block:
-; SCALAR_TAIL_FOLDING-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0
-; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; SCALAR_TAIL_FOLDING-NEXT:    [[CMP_N_NOT:%.*]] = icmp eq i32 [[N_VEC]], 0
+; SCALAR_TAIL_FOLDING-NEXT:    br i1 [[CMP_N_NOT]], label [[SCALAR_PH]], label [[FOR_END:%.*]]
 ; SCALAR_TAIL_FOLDING:       scalar.ph:
 ;
PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 @@ -506,46 +502,46 @@ define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMPA:%.*]] = sub i32 1024, [[TMP3]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMPB:%.*]] = icmp ult i32 [[TMP2]], 64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = select i1 [[TMPB]], i32 [[TMPA]], i32 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = sub i32 1024, [[TMP3]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP2]], 64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP4]], i32 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv16i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP7]], [[TMP7]], [[TMP7]], [[TMP7]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP8]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = shl i32 [[INDEX]], 2 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP11]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP9]], [[TMP9]], [[TMP9]], [[TMP9]]) +; 
PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP12]], i32 1, [[INTERLEAVED_MASK]], poison) ; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv64i8( [[WIDE_MASKED_VEC]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sub zeroinitializer, [[TMP15]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = sext i32 [[TMP8]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP19]] -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64i8( [[TMP15]], [[TMP16]], [[TMP17]], [[TMP18]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP7]], [[TMP7]], [[TMP7]], [[TMP7]]) -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP20]], i32 1, [[INTERLEAVED_MASK3]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP10]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64i8( [[TMP17]], [[TMP18]], [[TMP19]], [[TMP20]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP9]], [[TMP9]], [[TMP9]], [[TMP9]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label 
[[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_TAIL_FOLDING: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll index 33fa3607730da..c3cc01c7b3da3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll @@ -20,12 +20,12 @@ define void @vector_reverse_f64(i64 %N, ptr noalias %a, ptr noalias %b) #0{ ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[DOTNEG:%.*]] = mul i64 [[TMP2]], -16 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3 ; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP26:%.*]] = sub nsw i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -55,7 +55,7 @@ define void @vector_reverse_f64(i64 %N, ptr noalias %a, ptr noalias %b) #0{ ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; @@ -99,12 +99,12 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[DOTNEG:%.*]] = mul i64 [[TMP5]], -16 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 3 ; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[TMP7]], 4 +; CHECK-NEXT: [[TMP29:%.*]] = sub nsw i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -134,7 +134,7 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll index 5848d317ff145..f2c5b5e82b10c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll @@ -19,9 +19,8 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 % ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[DOTNEG:%.*]] = mul i64 [[TMP2]], -4 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -36,9 +35,10 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 % ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = sub i64 [[N]], [[N_VEC]] ; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP23:%.*]] = shl nuw i64 [[TMP22]], 1 ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP23]] @@ -46,9 +46,8 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 % ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_PH]] ] ; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP25:%.*]] = shl nuw i64 [[TMP24]], 1 -; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], [[TMP25]] -; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] +; CHECK-NEXT: [[DOTNEG8:%.*]] = mul i64 [[TMP24]], -2 +; CHECK-NEXT: [[N_VEC3:%.*]] = and i64 [[N]], [[DOTNEG8]] ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -63,7 +62,7 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 % ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[N_MOD_VF2]], 0 +; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_EXIT]], label [[SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll index abee8b9340caa..bb9166b8143da 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-too-many-deps.ll @@ -67,9 +67,8 @@ define void @many_deps(ptr noalias %buckets, ptr %array, ptr %indices, ptr %othe ; NORMAL_DEP_LIMIT-NEXT: br i1 [[CONFLICT_RDX9]], label [[SCALAR_PH]], label [[ENTRY:%.*]] ; NORMAL_DEP_LIMIT: vector.ph: ; NORMAL_DEP_LIMIT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; NORMAL_DEP_LIMIT-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP4]], 2 -; NORMAL_DEP_LIMIT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP8]] -; NORMAL_DEP_LIMIT-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NORMAL_DEP_LIMIT-NEXT: [[DOTNEG:%.*]] = mul i64 [[TMP4]], -4 +; NORMAL_DEP_LIMIT-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; NORMAL_DEP_LIMIT-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; NORMAL_DEP_LIMIT-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2 ; NORMAL_DEP_LIMIT-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv4i32() @@ -96,7 +95,7 @@ define void @many_deps(ptr noalias %buckets, ptr %array, ptr %indices, ptr %othe ; NORMAL_DEP_LIMIT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; NORMAL_DEP_LIMIT-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; NORMAL_DEP_LIMIT: middle.block: -; NORMAL_DEP_LIMIT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; NORMAL_DEP_LIMIT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; NORMAL_DEP_LIMIT-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; NORMAL_DEP_LIMIT: scalar.ph: ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll index c0548dd8ca545..a6ce65418e7dd 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll @@ -14,45 +14,44 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64 ; SCALAR_EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALAR_EPILOGUE: vector.ph: +; SCALAR_EPILOGUE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_EPILOGUE-NEXT: [[DOTNEG:%.*]] = mul i32 [[TMP1]], 2032 +; SCALAR_EPILOGUE-NEXT: [[N_VEC:%.*]] = and i32 [[DOTNEG]], 1024 ; SCALAR_EPILOGUE-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() ; SCALAR_EPILOGUE-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 -; SCALAR_EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] -; SCALAR_EPILOGUE-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] -; SCALAR_EPILOGUE-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_EPILOGUE-NEXT: [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 4 ; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALAR_EPILOGUE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() -; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_EPILOGUE-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP3]], i64 0 ; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALAR_EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_EPILOGUE: vector.body: ; SCALAR_EPILOGUE-NEXT: [[INDEX:%.*]] = 
phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_EPILOGUE-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_EPILOGUE-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; SCALAR_EPILOGUE-NEXT: [[TMP8:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 1) -; SCALAR_EPILOGUE-NEXT: [[TMP9:%.*]] = zext nneg [[TMP8]] to -; SCALAR_EPILOGUE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP9]] -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP10]], i32 1, [[TMP7]], poison) -; SCALAR_EPILOGUE-NEXT: [[TMP11:%.*]] = or disjoint [[TMP8]], splat (i32 1) -; SCALAR_EPILOGUE-NEXT: [[TMP12:%.*]] = zext nneg [[TMP11]] to -; SCALAR_EPILOGUE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP12]] -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP13]], i32 1, [[TMP7]], poison) -; SCALAR_EPILOGUE-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]) -; SCALAR_EPILOGUE-NEXT: [[TMP15:%.*]] = zext nneg [[TMP8]] to -; SCALAR_EPILOGUE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP15]] -; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP14]], [[TMP16]], i32 1, [[TMP7]]) -; SCALAR_EPILOGUE-NEXT: [[TMP17:%.*]] = sub zeroinitializer, [[TMP14]] -; SCALAR_EPILOGUE-NEXT: [[TMP18:%.*]] = zext nneg [[TMP11]] to -; SCALAR_EPILOGUE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP18]] -; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP17]], [[TMP19]], i32 1, [[TMP7]]) -; SCALAR_EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] +; SCALAR_EPILOGUE-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_EPILOGUE-NEXT: [[TMP5:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_EPILOGUE-NEXT: [[TMP6:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 1) +; SCALAR_EPILOGUE-NEXT: [[TMP7:%.*]] = zext nneg [[TMP6]] to +; SCALAR_EPILOGUE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP7]] +; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP8]], i32 1, [[TMP5]], poison) +; SCALAR_EPILOGUE-NEXT: [[TMP9:%.*]] = or disjoint [[TMP6]], splat (i32 1) +; SCALAR_EPILOGUE-NEXT: [[TMP10:%.*]] = zext nneg [[TMP9]] to +; SCALAR_EPILOGUE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP10]] +; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP11]], i32 1, [[TMP5]], poison) +; SCALAR_EPILOGUE-NEXT: [[TMP12:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]) +; SCALAR_EPILOGUE-NEXT: [[TMP13:%.*]] = zext nneg [[TMP6]] to +; SCALAR_EPILOGUE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP13]] +; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP12]], [[TMP14]], i32 1, [[TMP5]]) +; SCALAR_EPILOGUE-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP12]] +; SCALAR_EPILOGUE-NEXT: [[TMP16:%.*]] = zext nneg [[TMP9]] to +; SCALAR_EPILOGUE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP16]] +; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP15]], [[TMP17]], i32 1, [[TMP5]]) +; SCALAR_EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] ; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], 
[[BROADCAST_SPLAT2]] -; SCALAR_EPILOGUE-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SCALAR_EPILOGUE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALAR_EPILOGUE-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_EPILOGUE-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALAR_EPILOGUE: middle.block: -; SCALAR_EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 -; SCALAR_EPILOGUE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALAR_EPILOGUE-NEXT: [[CMP_N_NOT:%.*]] = icmp eq i32 [[N_VEC]], 0 +; SCALAR_EPILOGUE-NEXT: br i1 [[CMP_N_NOT]], label [[SCALAR_PH]], label [[FOR_END:%.*]] ; SCALAR_EPILOGUE: scalar.ph: ; ; PREDICATED_TAIL_FOLDING-LABEL: define void @masked_strided_factor2 @@ -64,8 +63,8 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023 -; PREDICATED_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] -; PREDICATED_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTNOT:%.*]] = sub i32 0, [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], [[DOTNOT]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 @@ -182,61 +181,60 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; SCALAR_EPILOGUE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64 ; SCALAR_EPILOGUE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALAR_EPILOGUE: vector.ph: +; SCALAR_EPILOGUE-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_EPILOGUE-NEXT: [[DOTNEG:%.*]] = mul i32 [[TMP1]], 2032 +; SCALAR_EPILOGUE-NEXT: [[N_VEC:%.*]] = and i32 [[DOTNEG]], 1024 ; SCALAR_EPILOGUE-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() ; SCALAR_EPILOGUE-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 -; SCALAR_EPILOGUE-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] -; SCALAR_EPILOGUE-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] -; SCALAR_EPILOGUE-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_EPILOGUE-NEXT: [[TMP5:%.*]] = shl nuw i32 [[TMP4]], 4 ; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALAR_EPILOGUE-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() -; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_EPILOGUE-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP3]], i64 0 ; SCALAR_EPILOGUE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALAR_EPILOGUE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_EPILOGUE: vector.body: ; SCALAR_EPILOGUE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; 
SCALAR_EPILOGUE-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_EPILOGUE-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; SCALAR_EPILOGUE-NEXT: [[TMP8:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 2) -; SCALAR_EPILOGUE-NEXT: [[TMP9:%.*]] = or disjoint [[TMP8]], splat (i32 1) -; SCALAR_EPILOGUE-NEXT: [[TMP10:%.*]] = or disjoint [[TMP8]], splat (i32 2) -; SCALAR_EPILOGUE-NEXT: [[TMP11:%.*]] = or disjoint [[TMP8]], splat (i32 3) -; SCALAR_EPILOGUE-NEXT: [[TMP12:%.*]] = zext nneg [[TMP8]] to +; SCALAR_EPILOGUE-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_EPILOGUE-NEXT: [[TMP5:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_EPILOGUE-NEXT: [[TMP6:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 2) +; SCALAR_EPILOGUE-NEXT: [[TMP7:%.*]] = or disjoint [[TMP6]], splat (i32 1) +; SCALAR_EPILOGUE-NEXT: [[TMP8:%.*]] = or disjoint [[TMP6]], splat (i32 2) +; SCALAR_EPILOGUE-NEXT: [[TMP9:%.*]] = or disjoint [[TMP6]], splat (i32 3) +; SCALAR_EPILOGUE-NEXT: [[TMP10:%.*]] = zext nneg [[TMP6]] to +; SCALAR_EPILOGUE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP10]] +; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP11]], i32 1, [[TMP5]], poison) +; SCALAR_EPILOGUE-NEXT: [[TMP12:%.*]] = zext nneg [[TMP7]] to ; SCALAR_EPILOGUE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP12]] -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP13]], i32 1, [[TMP7]], poison) -; SCALAR_EPILOGUE-NEXT: [[TMP14:%.*]] = zext nneg [[TMP9]] to +; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP13]], i32 1, [[TMP5]], poison) +; SCALAR_EPILOGUE-NEXT: [[TMP14:%.*]] = zext nneg [[TMP8]] to ; SCALAR_EPILOGUE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP14]] -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP15]], i32 1, [[TMP7]], poison) -; SCALAR_EPILOGUE-NEXT: [[TMP16:%.*]] = zext nneg [[TMP10]] to +; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP15]], i32 1, [[TMP5]], poison) +; SCALAR_EPILOGUE-NEXT: [[TMP16:%.*]] = zext nneg [[TMP9]] to ; SCALAR_EPILOGUE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP16]] -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP17]], i32 1, [[TMP7]], poison) -; SCALAR_EPILOGUE-NEXT: [[TMP18:%.*]] = zext nneg [[TMP11]] to -; SCALAR_EPILOGUE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[P]], [[TMP18]] -; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP19]], i32 1, [[TMP7]], poison) -; SCALAR_EPILOGUE-NEXT: [[TMP20:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]) +; SCALAR_EPILOGUE-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv16i8.nxv16p0( [[TMP17]], i32 1, [[TMP5]], poison) +; SCALAR_EPILOGUE-NEXT: [[TMP18:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER3]]) +; SCALAR_EPILOGUE-NEXT: [[TMP19:%.*]] = sub zeroinitializer, [[TMP18]] +; SCALAR_EPILOGUE-NEXT: [[TMP20:%.*]] = call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER5]]) ; SCALAR_EPILOGUE-NEXT: [[TMP21:%.*]] = sub zeroinitializer, [[TMP20]] -; SCALAR_EPILOGUE-NEXT: [[TMP22:%.*]] = 
call @llvm.smax.nxv16i8( [[WIDE_MASKED_GATHER4]], [[WIDE_MASKED_GATHER5]]) -; SCALAR_EPILOGUE-NEXT: [[TMP23:%.*]] = sub zeroinitializer, [[TMP22]] -; SCALAR_EPILOGUE-NEXT: [[TMP24:%.*]] = zext nneg [[TMP8]] to +; SCALAR_EPILOGUE-NEXT: [[TMP22:%.*]] = zext nneg [[TMP6]] to +; SCALAR_EPILOGUE-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP22]] +; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP18]], [[TMP23]], i32 1, [[TMP5]]) +; SCALAR_EPILOGUE-NEXT: [[TMP24:%.*]] = zext nneg [[TMP7]] to ; SCALAR_EPILOGUE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP24]] -; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP20]], [[TMP25]], i32 1, [[TMP7]]) -; SCALAR_EPILOGUE-NEXT: [[TMP26:%.*]] = zext nneg [[TMP9]] to +; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP19]], [[TMP25]], i32 1, [[TMP5]]) +; SCALAR_EPILOGUE-NEXT: [[TMP26:%.*]] = zext nneg [[TMP8]] to ; SCALAR_EPILOGUE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP26]] -; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP21]], [[TMP27]], i32 1, [[TMP7]]) -; SCALAR_EPILOGUE-NEXT: [[TMP28:%.*]] = zext nneg [[TMP10]] to +; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP20]], [[TMP27]], i32 1, [[TMP5]]) +; SCALAR_EPILOGUE-NEXT: [[TMP28:%.*]] = zext nneg [[TMP9]] to ; SCALAR_EPILOGUE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP28]] -; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP22]], [[TMP29]], i32 1, [[TMP7]]) -; SCALAR_EPILOGUE-NEXT: [[TMP30:%.*]] = zext nneg [[TMP11]] to -; SCALAR_EPILOGUE-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP30]] -; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP23]], [[TMP31]], i32 1, [[TMP7]]) -; SCALAR_EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] +; SCALAR_EPILOGUE-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( [[TMP21]], [[TMP29]], i32 1, [[TMP5]]) +; SCALAR_EPILOGUE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] ; SCALAR_EPILOGUE-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] -; SCALAR_EPILOGUE-NEXT: [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SCALAR_EPILOGUE-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALAR_EPILOGUE-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_EPILOGUE-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALAR_EPILOGUE: middle.block: -; SCALAR_EPILOGUE-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 -; SCALAR_EPILOGUE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALAR_EPILOGUE-NEXT: [[CMP_N_NOT:%.*]] = icmp eq i32 [[N_VEC]], 0 +; SCALAR_EPILOGUE-NEXT: br i1 [[CMP_N_NOT]], label [[SCALAR_PH]], label [[FOR_END:%.*]] ; SCALAR_EPILOGUE: scalar.ph: ; ; PREDICATED_TAIL_FOLDING-LABEL: define void @masked_strided_factor4 @@ -248,8 +246,8 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023 -; PREDICATED_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]] -; PREDICATED_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], 
[[N_MOD_VF]] +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTNOT:%.*]] = sub i32 0, [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], [[DOTNOT]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll index 62649412deb89..b4174e347c2dd 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll @@ -16,9 +16,8 @@ define void @add_ind64_unrolled(ptr noalias nocapture %a, ptr noalias nocapture ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[DOTNEG:%.*]] = mul i64 [[TMP2]], -4 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP4]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 @@ -49,7 +48,7 @@ define void @add_ind64_unrolled(ptr noalias nocapture %a, ptr noalias nocapture ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -97,9 +96,8 @@ define void @add_ind64_unrolled_nxv1i64(ptr noalias nocapture %a, ptr noalias no ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 1 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[DOTNEG:%.*]] = mul i64 [[TMP2]], -2 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv1i64() @@ -127,7 +125,7 @@ define void @add_ind64_unrolled_nxv1i64(ptr noalias nocapture %a, ptr noalias no ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -182,9 +180,8 @@ define void @add_unique_ind32(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: 
vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[DOTNEG:%.*]] = mul i64 [[TMP2]], -4 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2 ; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32 @@ -206,7 +203,7 @@ define void @add_unique_ind32(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -259,9 +256,8 @@ define void @add_unique_indf32(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[DOTNEG:%.*]] = mul i64 [[TMP2]], -4 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2 ; CHECK-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float @@ -286,7 +282,7 @@ define void @add_unique_indf32(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll index f384d3c15ca6e..f145759570c00 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll @@ -9,9 +9,8 @@ ; CHECKUF1: vector.ph: ; CHECKUF1-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() -; CHECKUF1-DAG: %[[VSCALEX4:.*]] = shl nuw i64 %[[VSCALE]], 2 -; CHECKUF1-DAG: %n.mod.vf = urem i64 %wide.trip.count, %[[VSCALEX4]] -; CHECKUF1: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf +; CHECKUF1-DAG: %.neg = mul i64 %[[VSCALE]], 2147483644 +; CHECKUF1-DAG: %n.vec = and i64 %.neg, %wide.trip.count ; CHECKUF1: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() ; CHECKUF1: %[[VSCALEX4:.*]] = shl nuw i64 %[[VSCALE]], 2 @@ -38,9 +37,8 @@ ; CHECKUF2: vector.ph: ; CHECKUF2-DAG: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() -; CHECKUF2-DAG: %[[VSCALEX8:.*]] = shl nuw i64 %[[VSCALE]], 3 -; CHECKUF2-DAG: %n.mod.vf = urem i64 
%wide.trip.count, %[[VSCALEX8]] -; CHECKUF2: %n.vec = sub nsw i64 %wide.trip.count, %n.mod.vf +; CHECKUF2-DAG: %.neg = mul i64 %[[VSCALE]], 2147483640 +; CHECKUF2: %n.vec = and i64 %.neg, %wide.trip.count ; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() ; CHECKUF2: %[[VSCALEX8:.*]] = shl nuw i64 %[[VSCALE]], 3 diff --git a/llvm/test/Transforms/LoopVectorize/scalable-predication.ll b/llvm/test/Transforms/LoopVectorize/scalable-predication.ll deleted file mode 100644 index 8e272debb299d..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/scalable-predication.ll +++ /dev/null @@ -1,65 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=loop-vectorize -force-tail-folding-style=data -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-target-supports-scalable-vectors -S < %s | FileCheck %s - -; vscale is not guaranteed to be a power of two, so this test (which -; deliberately doesn't correspond to an in-tree backend since those -; *do* have vscale as power-of-two) exercises the code required for the -; minimum iteration check in the non-power-of-two case. -define void @foo(i32 %val, ptr dereferenceable(1024) %ptr) { -; CHECK-LABEL: @foo( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 4 -; CHECK-NEXT: [[TMP8:%.*]] = icmp ult i64 -257, [[TMP7]] -; CHECK-NEXT: br i1 [[TMP8]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 256, [[TMP2]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 256) -; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: br label [[WHILE_END_LOOPEXIT:%.*]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[WHILE_BODY:%.*]] -; CHECK: while.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[GEP]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], 256 -; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: while.end.loopexit: -; CHECK-NEXT: ret void -; -entry: - br label %while.body - -while.body: ; preds = %while.body, %entry - %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ] - %gep = getelementptr i32, ptr %ptr, i64 %index - %ld1 = load i32, ptr %gep, align 4 - %index.next = add nsw i64 %index, 1 - %cmp10 = 
icmp ult i64 %index.next, 256 - br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0 - -while.end.loopexit: ; preds = %while.body - ret void -} - -!0 = distinct !{!0, !1, !2, !3, !4} -!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} -!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} -!3 = !{!"llvm.loop.interleave.count", i32 1} -!4 = !{!"llvm.loop.vectorize.width", i32 4}
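
The recurring change in the test updates above is a single strength reduction that becomes legal once vscale is a power of two by definition: any VF of the form vscale * (constant power of two) is then itself a power of two, so rounding the trip count down to a multiple of VF no longer needs a urem/sub pair, because N - (N urem VF) equals N & -VF. A minimal before/after sketch of that fold, with hypothetical value names and an assumed VF of vscale * 4 (the tests above differ only in constants and bit widths):

; Before: generic round-down, correct for any VF.
%vscale = call i64 @llvm.vscale.i64()
%vf = shl nuw i64 %vscale, 2            ; VF = vscale * 4
%n.mod.vf = urem i64 %n, %vf
%n.vec = sub i64 %n, %n.mod.vf          ; N - (N urem VF)

; After: for a power-of-two VF, N urem VF is N & (VF - 1),
; so N - (N urem VF) folds to a single mask with -VF.
%.neg = mul i64 %vscale, -4             ; -VF = vscale * -4
%n.vec = and i64 %n, %.neg

The tail-folded variants apply the same mask to the trip count rounded up (N_RND_UP) rather than down, which is why those tests now show a sub i32 0, VF followed by an and in their vector preheaders.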