diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 3e06f74fa5c65..47ebe2ca24340 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -143,6 +143,7 @@ class VectorCombine {
   bool foldShufflesOfLengthChangingShuffles(Instruction &I);
   bool foldShuffleOfIntrinsics(Instruction &I);
   bool foldShuffleToIdentity(Instruction &I);
+  bool compactShuffleOperands(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
   bool foldShuffleChainsToReduce(Instruction &I);
   bool foldCastFromReductions(Instruction &I);
@@ -2762,6 +2763,239 @@ bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
   return true;
 }
 
+/// Describes whether and how a shuffle operand can be compacted.
+struct ShuffleOperandCompaction {
+  /// The cost difference between the compacted and the original operand. Used
+  /// to avoid compactions that increase cost. Zero if compaction cannot be
+  /// applied, but note that valid compactions may also have zero cost.
+  InstructionCost Cost;
+  /// The minimal width required for the compacted vector.
+  unsigned CompactedWidth;
+  /// Function to create the compacted operand, or nullptr if no compaction can
+  /// be applied.
+  std::function<Value *(unsigned, IRBuilderBase &)> Apply;
+};
+
+/// Attempt to narrow/compact a constant vector used in a shuffle by removing
+/// elements that are not referenced by the shuffle mask.
+static ShuffleOperandCompaction
+compactShuffleOperand(Constant *ShuffleInput,
+                      MutableArrayRef<int> UserShuffleMask, int IndexStart) {
+  auto *VecTy = cast<FixedVectorType>(ShuffleInput->getType());
+  unsigned Width = VecTy->getNumElements();
+
+  // Collect only the constant elements that are actually used.
+  SmallVector<Constant *> CompactedElts;
+  // Map from original element index to compacted index.
+  SmallVector<int> IndexRemap(Width, -1);
+
+  // Track whether used elements are already compacted at the front. Even if
+  // true, we may still shrink this operand by not re-adding trailing poison.
+  bool AlreadyCompacted = true;
+
+  // This modifies UserShuffleMask in place, so we cannot back out of
+  // transforming this operand once compactShuffleOperands proceeds on the
+  // instruction.
+  for (int &MaskElt : UserShuffleMask) {
+    if (MaskElt >= IndexStart && MaskElt < IndexStart + (int)Width) {
+      int RelMaskElt = MaskElt - IndexStart;
+      if (IndexRemap[RelMaskElt] < 0) {
+        IndexRemap[RelMaskElt] = CompactedElts.size() + IndexStart;
+        CompactedElts.push_back(ShuffleInput->getAggregateElement(RelMaskElt));
+      }
+      if (IndexRemap[RelMaskElt] != MaskElt) {
+        AlreadyCompacted = false;
+        MaskElt = IndexRemap[RelMaskElt];
+      }
+    }
+  }
+
+  unsigned CompactedWidth = CompactedElts.size();
+
+  // To determine the eventual width (between CompactedWidth and Width), we
+  // have to consider the other operand. Hence, we return a functor here to
+  // delay constructing the new operand.
+  return {0, CompactedWidth,
+          [ShuffleInput, AlreadyCompacted, Width, VecTy,
+           CompactedElts = std::move(CompactedElts)](
+              unsigned PaddedWidth,
+              IRBuilderBase &Builder) -> Value * {
+            // Return original if unchanged to guarantee fixpoint termination.
+            if (AlreadyCompacted && Width == PaddedWidth)
+              return ShuffleInput;
+
+            // Pad with poison to reach the requested width.
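+            // Both operands of the user shuffle must share one vector type,
+            // so the operand is padded to the common PaddedWidth chosen by
+            // the caller rather than to its own minimal CompactedWidth.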
+            SmallVector<Constant *> PaddedElts(CompactedElts);
+            while (PaddedElts.size() < PaddedWidth)
+              PaddedElts.push_back(PoisonValue::get(VecTy->getElementType()));
+
+            return ConstantVector::get(PaddedElts);
+          }};
+}
+
+/// Attempt to narrow/compact a shuffle instruction used in a shuffle by
+/// removing elements that are not referenced by the shuffle mask.
+static ShuffleOperandCompaction
+compactShuffleOperand(ShuffleVectorInst *ShuffleInput,
+                      MutableArrayRef<int> UserShuffleMask, int IndexStart,
+                      const TargetTransformInfo &TTI,
+                      TTI::TargetCostKind CostKind) {
+  auto *VecTy = cast<FixedVectorType>(ShuffleInput->getType());
+  unsigned Width = VecTy->getNumElements();
+
+  // Collect only the shuffle mask elements that are actually used.
+  SmallVector<int> CompactedMask;
+  // Map from original element index to compacted index.
+  SmallVector<int> IndexRemap(Width, -1);
+
+  // Track whether used elements are already compacted at the front. Even if
+  // true, we may still shrink this operand by not re-adding trailing poison.
+  bool AlreadyCompacted = true;
+
+  // This modifies UserShuffleMask in place, so we cannot back out of
+  // transforming this operand once compactShuffleOperands proceeds on the
+  // instruction.
+  for (int &MaskElt : UserShuffleMask) {
+    if (MaskElt >= IndexStart && MaskElt < IndexStart + (int)Width) {
+      int RelMaskElt = MaskElt - IndexStart;
+      if (IndexRemap[RelMaskElt] < 0) {
+        IndexRemap[RelMaskElt] = CompactedMask.size() + IndexStart;
+        CompactedMask.push_back(ShuffleInput->getMaskValue(RelMaskElt));
+      }
+      if (IndexRemap[RelMaskElt] != MaskElt) {
+        AlreadyCompacted = false;
+        MaskElt = IndexRemap[RelMaskElt];
+      }
+    }
+  }
+
+  unsigned CompactedWidth = CompactedMask.size();
+
+  // Check if the compacted shuffle would be more expensive than the original.
+  InstructionCost CompactionCost(0);
+  if (!AlreadyCompacted) {
+    ArrayRef<int> OriginalMask = ShuffleInput->getShuffleMask();
+    auto *OriginalSrcTy =
+        cast<FixedVectorType>(ShuffleInput->getOperand(0)->getType());
+
+    InstructionCost OriginalCost =
+        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy,
+                           OriginalSrcTy, OriginalMask, CostKind);
+
+    // Create a type for the compacted shuffle result.
+    auto *CompactedDstTy =
+        FixedVectorType::get(VecTy->getElementType(), CompactedWidth);
+
+    InstructionCost CompactedCost = TTI.getShuffleCost(
+        TargetTransformInfo::SK_PermuteTwoSrc, CompactedDstTy, OriginalSrcTy,
+        CompactedMask, CostKind);
+
+    CompactionCost = CompactedCost - OriginalCost;
+  }
+
+  // To determine the eventual width (between CompactedWidth and Width), we
+  // have to consider the other operand. Hence, we return a functor here to
+  // delay constructing the new operand.
+  return {CompactionCost, CompactedWidth,
+          [ShuffleInput, AlreadyCompacted, Width,
+           CompactedMask = std::move(CompactedMask)](
+              unsigned PaddedWidth,
+              IRBuilderBase &Builder) -> Value * {
+            // Return original if unchanged to guarantee fixpoint termination.
+            if (AlreadyCompacted && Width == PaddedWidth)
+              return ShuffleInput;
+
+            // Pad with poison mask elements to reach the requested width.
+            SmallVector<int> PaddedMask(CompactedMask);
+            while (PaddedMask.size() < PaddedWidth)
+              PaddedMask.push_back(PoisonMaskElem);
+
+            return Builder.CreateShuffleVector(ShuffleInput->getOperand(0),
+                                               ShuffleInput->getOperand(1),
+                                               PaddedMask);
+          }};
+}
+
+/// Try to narrow/compact a shuffle operand by eliminating elements that are
+/// not used by the shuffle mask. This updates the shuffle mask in-place to
+/// reflect the compaction. Returns information about whether compaction is
+/// possible and a lambda to apply the compaction if beneficial.
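+///
+/// Illustrative example (masks invented for exposition, not taken from the
+/// tests): if only lane 1 of a constant RHS is referenced,
+///   shufflevector <4 x i8> %x, <4 x i8> <i8 0, i8 1, i8 2, i8 3>,
+///                 <2 x i32> <i32 1, i32 5>
+/// the used element is moved to the front of the constant and the user mask
+/// is remapped to the compacted lane:
+///   shufflevector <4 x i8> %x,
+///                 <4 x i8> <i8 1, i8 poison, i8 poison, i8 poison>,
+///                 <2 x i32> <i32 1, i32 4>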
+static ShuffleOperandCompaction
+compactShuffleOperand(Value *ShuffleInput, MutableArrayRef<int> ShuffleMask,
+                      int IndexStart, const TargetTransformInfo &TTI,
+                      TTI::TargetCostKind CostKind) {
+  auto *VecTy = cast<FixedVectorType>(ShuffleInput->getType());
+  unsigned Width = VecTy->getNumElements();
+  if (ShuffleInput->getNumUses() > 1)
+    return {0, Width, nullptr};
+
+  if (auto *C = dyn_cast<Constant>(ShuffleInput))
+    return compactShuffleOperand(C, ShuffleMask, IndexStart);
+  if (auto *Shuf = dyn_cast<ShuffleVectorInst>(ShuffleInput))
+    return compactShuffleOperand(Shuf, ShuffleMask, IndexStart, TTI, CostKind);
+
+  return {0, Width, nullptr};
+}
+
+/// Try to narrow the shuffle by eliminating unused elements from the operands.
+bool VectorCombine::compactShuffleOperands(Instruction &I) {
+  Value *LHS, *RHS;
+  ArrayRef<int> Mask;
+  if (!match(&I, m_Shuffle(m_Value(LHS), m_Value(RHS), m_Mask(Mask))))
+    return false;
+
+  // Require at least one constant operand to ensure profitability.
+  if (!isa<Constant>(LHS) && !isa<Constant>(RHS))
+    return false;
+
+  auto *LHSTy = dyn_cast<FixedVectorType>(LHS->getType());
+  if (!LHSTy)
+    return false;
+
+  // Analyze both operands. This updates NewMask in-place to reflect the
+  // compaction.
+  unsigned LHSWidth = LHSTy->getNumElements();
+  SmallVector<int> NewMask(Mask.begin(), Mask.end());
+  ShuffleOperandCompaction LHSCompact =
+      compactShuffleOperand(LHS, NewMask, 0, TTI, CostKind);
+  ShuffleOperandCompaction RHSCompact =
+      compactShuffleOperand(RHS, NewMask, LHSWidth, TTI, CostKind);
+
+  unsigned CompactedWidth =
+      std::max(LHSCompact.CompactedWidth, RHSCompact.CompactedWidth);
+
+  // Check the total cost: compacting the operands plus the change to the
+  // outer shuffle.
+  if (LHSCompact.Apply || RHSCompact.Apply) {
+    auto *ShuffleDstTy = cast<FixedVectorType>(I.getType());
+    InstructionCost CostBefore =
+        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
+                           LHSTy, Mask, CostKind, 0, nullptr, {LHS, RHS}, &I);
+
+    InstructionCost CostAfter =
+        TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
+                           LHSTy, NewMask, CostKind);
+
+    InstructionCost OuterCost = CostAfter - CostBefore;
+
+    if (OuterCost + LHSCompact.Cost + RHSCompact.Cost > 0)
+      return false;
+  } else if (CompactedWidth == LHSWidth)
+    return false;
+
+  Value *NewLHS =
+      LHSCompact.Apply ? LHSCompact.Apply(CompactedWidth, Builder) : LHS;
+  Value *NewRHS =
+      RHSCompact.Apply ? RHSCompact.Apply(CompactedWidth, Builder) : RHS;
+
+  // Ensure we eventually terminate the optimization fixpoint loop.
+  if (LHS == NewLHS && RHS == NewRHS)
+    return false;
+
+  // Adjust RHS indices in the mask to account for the new LHS width.
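+  // E.g. if the LHS shrank from 8 lanes to a CompactedWidth of 5, a mask
+  // element of 8 (the first RHS lane) is rewritten to 5.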
+  for (int &MaskElt : NewMask)
+    if (MaskElt >= (int)LHSWidth)
+      MaskElt = MaskElt - LHSWidth + CompactedWidth;
+
+  Value *NewShuf = Builder.CreateShuffleVector(NewLHS, NewRHS, NewMask);
+  replaceValue(I, *NewShuf);
+  return true;
+}
+
 /// Try to convert any of:
 /// "shuffle (shuffle x, y), (shuffle y, x)"
 /// "shuffle (shuffle x, undef), (shuffle y, undef)"
@@ -5034,6 +5268,8 @@ bool VectorCombine::run() {
         return true;
       if (foldShuffleToIdentity(I))
         return true;
+      if (compactShuffleOperands(I))
+        return true;
       break;
     case Instruction::Load:
       if (shrinkLoadForShuffles(I))
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
index de64bf2657f72..e3c1318278d38 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
@@ -334,8 +334,7 @@ define <4 x float> @test_addsub_v4f32_partial_23(<4 x float> %A, <4 x float> %B)
 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <4 x i32>
-; CHECK-NEXT: [[VECINSERT21:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> , <4 x i32>
+; CHECK-NEXT: [[VECINSERT21:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <4 x i32>
 ; CHECK-NEXT: ret <4 x float> [[VECINSERT21]]
 ;
 %1 = extractelement <4 x float> %A, i32 2
@@ -344,7 +343,7 @@ define <4 x float> @test_addsub_v4f32_partial_23(<4 x float> %A, <4 x float> %B)
 %3 = extractelement <4 x float> %A, i32 3
 %4 = extractelement <4 x float> %B, i32 3
 %add2 = fadd float %3, %4
- %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 2
+ %vecinsert1 = insertelement <4 x float> poison, float %sub2, i32 2
 %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
 ret <4 x float> %vecinsert2
 }
@@ -353,8 +352,7 @@ define <4 x float> @test_addsub_v4f32_partial_03(<4 x float> %A, <4 x float> %B)
 ; CHECK-LABEL: @test_addsub_v4f32_partial_03(
 ; CHECK-NEXT: [[FOLDEXTEXTBINOP:%.*]] = fsub <4 x float> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[FOLDEXTEXTBINOP2:%.*]] = fadd <4 x float> [[A]], [[B]]
-; CHECK-NEXT: [[VECINSERT1:%.*]] = shufflevector <4 x float> [[FOLDEXTEXTBINOP]], <4 x float> , <4 x i32>
-; CHECK-NEXT: [[VECINSERT2:%.*]] = shufflevector <4 x float> [[VECINSERT1]], <4 x float> [[FOLDEXTEXTBINOP2]], <4 x i32>
+; CHECK-NEXT: [[VECINSERT2:%.*]] = shufflevector <4 x float> [[FOLDEXTEXTBINOP]], <4 x float> [[FOLDEXTEXTBINOP2]], <4 x i32>
 ; CHECK-NEXT: ret <4 x float> [[VECINSERT2]]
 ;
 %1 = extractelement <4 x float> %A, i32 0
@@ -363,7 +361,7 @@ define <4 x float> @test_addsub_v4f32_partial_03(<4 x float> %A, <4 x float> %B)
 %3 = extractelement <4 x float> %A, i32 3
 %4 = extractelement <4 x float> %B, i32 3
 %add = fadd float %4, %3
- %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
+ %vecinsert1 = insertelement <4 x float> poison, float %sub, i32 0
 %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 3
 ret <4 x float> %vecinsert2
 }
@@ -374,8 +372,7 @@ define <4 x float> @test_addsub_v4f32_partial_12(<4 x float> %A, <4 x float> %B)
 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <2 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <4 x i32>
-; CHECK-NEXT: [[VECINSERT21:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> , <4 x i32>
+; CHECK-NEXT: [[VECINSERT21:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <4 x i32>
 ; CHECK-NEXT: ret <4 x float> [[VECINSERT21]]
 ;
 %1 = extractelement <4 x float> %A, i32 2
@@ -384,7 +381,7 @@ define <4 x float> @test_addsub_v4f32_partial_12(<4 x float> %A, <4 x float> %B)
 %3 = extractelement <4 x float> %A, i32 1
 %4 = extractelement <4 x float> %B, i32 1
 %add = fadd float %3, %4
- %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
+ %vecinsert1 = insertelement <4 x float> poison, float %sub, i32 2
 %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 1
 ret <4 x float> %vecinsert2
 }
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
index c5f56d3644c5f..6370e9ccb50db 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
@@ -419,11 +419,11 @@ define <8 x double> @buildvector_mul_addsub_pd512_partial(<8 x double> %C, <8 x
 ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <2 x i32>
 ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <6 x i32>
 ; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <6 x i32>
-; SSE-NEXT: [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32>
 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7
 ; SSE-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7
 ; SSE-NEXT: [[ADD7:%.*]] = fadd double [[A7]], [[B7]]
-; SSE-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> , <8 x i32>
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32>
+; SSE-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> , <8 x i32>
 ; SSE-NEXT: [[VECINSERT8:%.*]] = insertelement <8 x double> [[TMP8]], double [[ADD7]], i64 7
 ; SSE-NEXT: ret <8 x double> [[VECINSERT8]]
 ;
@@ -934,11 +934,11 @@ define <8 x double> @buildvector_mul_subadd_pd512_partial(<8 x double> %C, <8 x
 ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <2 x i32>
 ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <6 x i32>
 ; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <6 x i32>
-; SSE-NEXT: [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32>
 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i64 7
 ; SSE-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i64 7
 ; SSE-NEXT: [[ADD7:%.*]] = fsub double [[A7]], [[B7]]
-; SSE-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> , <8 x i32>
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32>
+; SSE-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> , <8 x i32>
 ; SSE-NEXT: [[VECINSERT8:%.*]] = insertelement <8 x double> [[TMP8]], double [[ADD7]], i64 7
 ; SSE-NEXT: ret <8 x double> [[VECINSERT8]]
 ;
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index 7ffd0d29b4f05..5de2bb6515e15 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -1026,9 +1026,8 @@ define <4 x i64> @bitcast_smax_v8i32_v4i32(<4 x i64> %a, <4 x i64> %b) {
 define void @bitcast_srcty_mismatch() {
 ; CHECK-LABEL: @bitcast_srcty_mismatch(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> zeroinitializer, <2 x i64> zeroinitializer, <2 x i32>
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> zeroinitializer to <4 x float>
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[SHUFFLE_I_I]] to <4 x float>
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> zeroinitializer to <4 x float>
 ; CHECK-NEXT: [[SHUFP_I196:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32>
 ; CHECK-NEXT: store <4 x float> [[SHUFP_I196]], ptr null, align 16
 ; CHECK-NEXT: ret void
@@ -1064,8 +1063,8 @@ entry:
 define <16 x i64> @operandbundles(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c) {
 ; CHECK-LABEL: @operandbundles(
 ; CHECK-NEXT: [[CALL:%.*]] = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <4 x i64> [[C:%.*]]) [ "jl_roots"(ptr addrspace(10) null, ptr addrspace(10) null) ]
-; CHECK-NEXT: [[SHUFFLEVECTOR:%.*]] = shufflevector <4 x i64> [[CALL]], <4 x i64> poison, <16 x i32>
-; CHECK-NEXT: [[SHUFFLEVECTOR1:%.*]] = shufflevector <16 x i64> [[SHUFFLEVECTOR]], <16 x i64> undef, <16 x i32>
+; CHECK-NEXT: [[SHUFFLEVECTOR:%.*]] = shufflevector <4 x i64> [[CALL]], <4 x i64> poison, <12 x i32>
+; CHECK-NEXT: [[SHUFFLEVECTOR1:%.*]] = shufflevector <12 x i64> [[SHUFFLEVECTOR]], <12 x i64> undef, <16 x i32>
 ; CHECK-NEXT: ret <16 x i64> [[SHUFFLEVECTOR1]]
 ;
 %call = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c) [ "jl_roots"(ptr addrspace(10) null, ptr addrspace(10) null) ]
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shufflevec-compact-operands.ll b/llvm/test/Transforms/VectorCombine/AArch64/shufflevec-compact-operands.ll
new file mode 100644
index 0000000000000..7141808658ad1
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shufflevec-compact-operands.ll
@@ -0,0 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=vector-combine %s -S -o - | FileCheck %s
+
+target triple = "aarch64"
+
+; Interleaving splat shuffle with constant operand - SHOULD compact
+define <8 x i8> @interleave_splat_constant(i8 %x) {
+; CHECK-LABEL: @interleave_splat_constant(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> , <4 x i8> [[TMP2]], <8 x i32>
+; CHECK-NEXT: ret <8 x i8> [[TMP3]]
+;
+ %1 = insertelement <4 x i8> poison, i8 %x, i32 0
+ %2 = shufflevector <4 x i8> %1, <4 x i8> poison, <4 x i32> zeroinitializer
+ %3 = shufflevector <4 x i8> %2, <4 x i8> poison, <8 x i32>
+ %4 = shufflevector <8 x i8> , <8 x i8> %3, <8 x i32>
+ ret <8 x i8> %4
+}
+
+; Interleaving constant with splat shuffle operand - SHOULD compact
+define <8 x i8> @interleave_constant_splat(i8 %x) {
+; CHECK-LABEL: @interleave_constant_splat(
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[X:%.*]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> , <4 x i8> [[TMP2]], <8 x i32>
+; CHECK-NEXT: ret <8 x i8> [[TMP3]]
+;
+ %1 = insertelement <4 x i8> poison, i8 %x, i32 0
+ %2 = shufflevector <4 x i8> %1, <4 x i8> poison, <4 x i32> zeroinitializer
+ %3 = shufflevector <4 x i8> %2, <4 x i8> poison, <8 x i32>
+ %4 = shufflevector <8 x i8> , <8 x i8> %3, <8 x i32>
+ ret <8 x i8> %4
+}
+
+; Interleaving random shuffle with constant operand - SHOULD compact
+define <8 x i8> @interleave_shuffle_constant(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @interleave_shuffle_constant(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> , <8 x i32>
+; CHECK-NEXT: ret <8 x i8> [[TMP3]]
+;
+ %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32>
+ %2 = shufflevector <8 x i8> %1, <8 x i8> , <8 x i32>
+ ret <8 x i8> %2
+}
+
+; Interleaving constant operand with random shuffle - SHOULD compact
+define <8 x i8> @interleave_constant_shuffle(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @interleave_constant_shuffle(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <4 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> , <4 x i8> [[TMP1]], <8 x i32>
+; CHECK-NEXT: ret <8 x i8> [[TMP3]]
+;
+ %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32>
+ %2 = shufflevector <8 x i8> , <8 x i8> %1, <8 x i32>
+ ret <8 x i8> %2
+}
+
+; Randomly shuffle random shuffle with constant operand - SHOULD compact
+define <8 x i8> @shuffle_shuffle_constant(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @shuffle_shuffle_constant(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <5 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <5 x i8> [[TMP1]], <5 x i8> , <8 x i32>
+; CHECK-NEXT: ret <8 x i8> [[TMP3]]
+;
+ %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32>
+ %2 = shufflevector <8 x i8> %1, <8 x i8> , <8 x i32>
+ ret <8 x i8> %2
+}
+
+; Randomly shuffle constant operand with random shuffle - SHOULD compact
+define <8 x i8> @shuffle_constant_shuffle(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @shuffle_constant_shuffle(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <5 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x i8> , <5 x i8> [[TMP1]], <8 x i32>
+; CHECK-NEXT: ret <8 x i8> [[TMP2]]
+;
+ %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32>
+ %2 = shufflevector <8 x i8> , <8 x i8> %1, <8 x i32>
+ ret <8 x i8> %2
+}
+
+; Randomly shuffle interleave shuffle with constant operand - does NOT compact
+define <8 x i8> @shuffle_interleave_constant(<8 x i8> %x, <8 x i8> %y) {
+; CHECK-LABEL: @shuffle_interleave_constant(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[X:%.*]], <8 x i8> [[Y:%.*]], <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> , <8 x i32>
+; CHECK-NEXT: ret <8 x i8> [[TMP3]]
+;
+ %1 = shufflevector <8 x i8> %x, <8 x i8> %y, <8 x i32>
+ %2 = shufflevector <8 x i8> %1, <8 x i8> , <8 x i32>
+ ret <8 x i8> %2
+}
+
+; Randomly shuffle constant operand with interleave shuffle - does NOT compact
+define <8 x i8> @shuffle_constant_interleave(<8 x i8> %x, <8 x i8> %y) {
+; CHECK-LABEL: @shuffle_constant_interleave(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[X:%.*]], <8 x i8> [[Y:%.*]], <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> , <8 x i8> [[TMP1]], <8 x i32>
+; CHECK-NEXT: ret <8 x i8> [[TMP2]]
+;
+ %1 = shufflevector <8 x i8> %x, <8 x i8> %y, <8 x i32>
+ %2 = shufflevector <8 x i8> , <8 x i8> %1, <8 x i32>
+ ret <8 x i8> %2
+}
+
+; Both operands are shuffles - does NOT compact
+define <8 x i32> @interleave_shuffle_shuffle(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+; CHECK-LABEL: @interleave_shuffle_shuffle(
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[C:%.*]], <8 x i32>
+; CHECK-NEXT: [[S2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[C]], <8 x i32>
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[S1]], <8 x i32> [[S2]], <8 x i32>
+; CHECK-NEXT: ret <8 x i32> [[RESULT]]
+;
+ %s1 = shufflevector <4 x i32> %a, <4 x i32> %c, <8 x i32>
+ %s2 = shufflevector <4 x i32> %b, <4 x i32> %c, <8 x i32>
+ %result = shufflevector <8 x i32> %s1, <8 x i32> %s2, <8 x i32>
+ ret <8 x i32> %result
+}
+
+; Multiple uses of LHS (shufflevector) operand - does NOT compact
+define <8 x i8> @shuffle_multiple_users_shuffle_constant(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @shuffle_multiple_users_shuffle_constant(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <8 x i32>
+; CHECK-NEXT: call void @use_vec(<8 x i8> [[TMP1]])
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> , <8 x i32>
+; CHECK-NEXT: ret <8 x i8> [[TMP3]]
+;
+ %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32>
+ call void @use_vec(<8 x i8> %1)
+ %2 = shufflevector <8 x i8> %1, <8 x i8> , <8 x i32>
+ ret <8 x i8> %2
+}
+
+; Interleaving non-compactible operand with constant operand - does NOT compact
+define <8 x i8> @interleave_argument_constant(<8 x i8> %x) {
+; CHECK-LABEL: @interleave_argument_constant(
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[X:%.*]], <8 x i8> , <8 x i32>
+; CHECK-NEXT: ret <8 x i8> [[TMP3]]
+;
+ %1 = shufflevector <8 x i8> %x, <8 x i8> , <8 x i32>
+ ret <8 x i8> %1
+}
+
+; Interleaving constant operand with non-compactible operand - does NOT compact
+define <8 x i8> @interleave_constant_argument(<8 x i8> %x) {
+; CHECK-LABEL: @interleave_constant_argument(
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> , <8 x i8> [[X:%.*]], <8 x i32>
+; CHECK-NEXT: ret <8 x i8> [[TMP3]]
+;
+ %1 = shufflevector <8 x i8> , <8 x i8> %x, <8 x i32>
+ ret <8 x i8> %1
+}
+
+; Randomly shuffle non-compactible operand with constant operand - SHOULD compact
+define <8 x i8> @shuffle_argument_constant(<8 x i8> %x) {
+; CHECK-LABEL: @shuffle_argument_constant(
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[X:%.*]], <8 x i8> , <8 x i32>
+; CHECK-NEXT: ret <8 x i8> [[TMP3]]
+;
+ %1 = shufflevector <8 x i8> %x, <8 x i8> , <8 x i32>
+ ret <8 x i8> %1
+}
+
+; Randomly shuffle constant operand with non-compactible operand - SHOULD compact
+define <8 x i8> @shuffle_constant_argument(<8 x i8> %x) {
+; CHECK-LABEL: @shuffle_constant_argument(
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> , <8 x i8> [[X:%.*]], <8 x i32>
+; CHECK-NEXT: ret <8 x i8> [[TMP3]]
+;
+ %1 = shufflevector <8 x i8> , <8 x i8> %x, <8 x i32>
+ ret <8 x i8> %1
+}
+
+; Different element type (f32) - SHOULD compact
+define <8 x float> @shuffle_shuffle_constant_float(<4 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @shuffle_shuffle_constant_float(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <5 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> , <8 x i32>
+; CHECK-NEXT: ret <8 x float> [[TMP2]]
+;
+ %1 = shufflevector <4 x float> %x, <4 x float> %y, <8 x i32>
+ %2 = shufflevector <8 x float> %1, <8 x float> , <8 x i32>
+ ret <8 x float> %2
+}
+
+; Values from the operands are duplicated by the shuffle - SHOULD compact
+define <16 x i8> @shuffle_shuffle_constant_repeated(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @shuffle_shuffle_constant_repeated(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <7 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <7 x i8> [[TMP1]], <7 x i8> , <16 x i32>
+; CHECK-NEXT: ret <16 x i8> [[TMP2]]
+;
+ %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32>
+ %2 = shufflevector <8 x i8> %1, <8 x i8> , <16 x i32>
+ ret <16 x i8> %2
+}
+
+; Values from the operands are duplicated by the shuffle - SHOULD compact
+define <16 x i8> @shuffle_constant_shuffle_repeated(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @shuffle_constant_shuffle_repeated(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[X:%.*]], <4 x i8> [[Y:%.*]], <7 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <7 x i8> , <7 x i8> [[TMP1]], <16 x i32>
+; CHECK-NEXT: ret <16 x i8> [[TMP2]]
+;
+ %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32>
+ %2 = shufflevector <8 x i8> , <8 x i8> %1, <16 x i32>
+ ret <16 x i8> %2
+}
+
+declare void @use_vec(<8 x i8>)
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index 4c1ca82b2bd06..c3513c478f065 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -450,16 +450,10 @@ define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @PR34724(
 ; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
 ; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32>
-; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; CHECK-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; CHECK-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B1:%.*]], <4 x i32>
+; CHECK-NEXT: [[B:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B1]], <4 x i32>
 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; CHECK-NEXT: [[V1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32>
-; CHECK-NEXT: [[V2:%.*]] = shufflevector <4 x float> [[V1]], <4 x float> [[TMP2]], <4 x i32>
-; CHECK-NEXT: [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32>
-; CHECK-NEXT: ret <4 x float> [[V3]]
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
 ;
 %a0 = extractelement <4 x float> %a, i32 0
 %a1 = extractelement <4 x float> %a, i32 1
@@ -475,7 +469,7 @@ define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
 %b01 = fadd float %b0, %b1
 %b23 = fadd float %b2, %b3
- %v1 = insertelement <4 x float> undef, float %a23, i32 1
+ %v1 = insertelement <4 x float> poison, float %a23, i32 1
 %v2 = insertelement <4 x float> %v1, float %b01, i32 2
 %v3 = insertelement <4 x float> %v2, float %b23, i32 3
 ret <4 x float> %v3
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
index 228f161698bb2..51d608096398e 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-insert-poison.ll
@@ -67,10 +67,14 @@ define <4 x double> @src_ins0_v4f64_ext1_v2f64(<4 x double> %a, <2 x double> %b)
 }
 
 define <4 x double> @src_ins1_v4f64_ext1_v2f64(<4 x double> %a, <2 x double> %b) #0 {
-; CHECK-LABEL: @src_ins1_v4f64_ext1_v2f64(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <4 x i32>
-; CHECK-NEXT: [[INS:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32>
-; CHECK-NEXT: ret <4 x double> [[INS]]
+; SSE-LABEL: @src_ins1_v4f64_ext1_v2f64(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <4 x i32>
+; SSE-NEXT: [[INS:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32>
+; SSE-NEXT: ret <4 x double> [[INS]]
+;
+; AVX-LABEL: @src_ins1_v4f64_ext1_v2f64(
+; AVX-NEXT: [[INS:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <4 x i32>
+; AVX-NEXT: ret <4 x double> [[INS]]
 ;
 %ext = extractelement <2 x double> %b, i32 1
 %ins = insertelement <4 x double> poison, double %ext, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
index 5373f6c07be31..fd099957791e3 100644
--- a/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/permute-of-binops.ll
@@ -54,7 +54,7 @@ define <4 x double> @fadd_v4f64_mixed_types(<4 x double> %a, <2 x double> %b) {
 define <4 x float> @fadd_v4f32_mixed_types(<4 x float> %a0) {
 ; CHECK-LABEL: define <4 x float> @fadd_v4f32_mixed_types(
 ; CHECK-SAME: <4 x float> [[A0:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> zeroinitializer, <4 x i32>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> , <4 x i32>
 ; CHECK-NEXT: [[POST:%.*]] = fmul <4 x float> [[TMP1]],
 ; CHECK-NEXT: ret <4 x float> [[POST]]
 ;
diff --git a/llvm/test/Transforms/VectorCombine/X86/reduction-two-vecs-combine.ll b/llvm/test/Transforms/VectorCombine/X86/reduction-two-vecs-combine.ll
index a0945ab81b0f7..102fc898a8b3d 100644
--- a/llvm/test/Transforms/VectorCombine/X86/reduction-two-vecs-combine.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/reduction-two-vecs-combine.ll
@@ -5,8 +5,7 @@ define i16 @test_spill_mixed() {
 ; CHECK-LABEL: define i16 @test_spill_mixed() {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <4 x i32>
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP0]])
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> zeroinitializer)
 ; CHECK-NEXT: ret i16 0
 ;
 entry: