-
Notifications
You must be signed in to change notification settings - Fork 17.7k
[SLP] Initial vectorization of non-power-of-2 ops. #77790
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 38 commits
252567a
0bb957b
84cf9b9
552b8aa
0ee85a3
4bb53dd
627c30b
cabbe05
f30c753
f15ddd9
82efe8a
35fc0f9
5cd569b
e189eec
e0b403a
b6dac7b
13db21f
8e7339a
3eacfa6
0d62c2c
8b6b0e8
454acf8
1576b0a
d733a61
4d8c47d
0103a25
de3a7e8
fb1c7be
4c1197a
6757ddf
210210f
47df498
981a3d4
a0155f1
6e4996a
cded768
c52b68c
8d1b5d4
db8bb3f
b7ccdd4
8c9627d
3919ee6
ad67f18
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -190,6 +190,10 @@ static cl::opt<bool> | |||||||||||
| ViewSLPTree("view-slp-tree", cl::Hidden, | ||||||||||||
| cl::desc("Display the SLP trees with Graphviz")); | ||||||||||||
|
|
||||||||||||
| static cl::opt<bool> VectorizeNonPowerOf2( | ||||||||||||
| "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, | ||||||||||||
| cl::desc("Try to vectorize with non-power-of-2 number of elements.")); | ||||||||||||
|
|
||||||||||||
| // Limit the number of alias checks. The limit is chosen so that | ||||||||||||
| // it has no negative effect on the llvm benchmarks. | ||||||||||||
| static const unsigned AliasedCheckLimit = 10; | ||||||||||||
|
|
@@ -2806,6 +2810,9 @@ class BoUpSLP { | |||||||||||
| SmallVectorImpl<Value *> *OpScalars = nullptr, | ||||||||||||
| SmallVectorImpl<Value *> *AltScalars = nullptr) const; | ||||||||||||
|
|
||||||||||||
| /// Return true if this is a non-power-of-2 node. | ||||||||||||
| bool isNonPowOf2Vec() const { return !isPowerOf2_32(Scalars.size()); } | ||||||||||||
|
|
||||||||||||
| #ifndef NDEBUG | ||||||||||||
| /// Debug printer. | ||||||||||||
| LLVM_DUMP_METHOD void dump() const { | ||||||||||||
|
|
@@ -2971,9 +2978,11 @@ class BoUpSLP { | |||||||||||
| MustGather.insert(VL.begin(), VL.end()); | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| if (UserTreeIdx.UserTE) | ||||||||||||
| if (UserTreeIdx.UserTE) { | ||||||||||||
| Last->UserTreeIndices.push_back(UserTreeIdx); | ||||||||||||
|
|
||||||||||||
| assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) && | ||||||||||||
| "Reordering isn't implemented for non-power-of-2 nodes yet"); | ||||||||||||
| } | ||||||||||||
| return Last; | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
|
|
@@ -4224,6 +4233,13 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( | |||||||||||
| auto *VecTy = FixedVectorType::get(ScalarTy, Sz); | ||||||||||||
| // Check the order of pointer operands or that all pointers are the same. | ||||||||||||
| bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order); | ||||||||||||
| // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. | ||||||||||||
| if (!Order.empty() && !isPowerOf2_32(VL.size())) { | ||||||||||||
| assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only " | ||||||||||||
| "supported with VectorizeNonPowerOf2"); | ||||||||||||
| return LoadsState::Gather; | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| Align CommonAlignment = computeCommonAlignment<LoadInst>(VL); | ||||||||||||
| if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) && | ||||||||||||
| TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) && | ||||||||||||
|
|
@@ -4543,6 +4559,10 @@ static bool areTwoInsertFromSameBuildVector( | |||||||||||
|
|
||||||||||||
| std::optional<BoUpSLP::OrdersType> | ||||||||||||
| BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { | ||||||||||||
| // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. | ||||||||||||
| if (TE.isNonPowOf2Vec()) | ||||||||||||
| return std::nullopt; | ||||||||||||
|
|
||||||||||||
| // No need to reorder if need to shuffle reuses, still need to shuffle the | ||||||||||||
| // node. | ||||||||||||
| if (!TE.ReuseShuffleIndices.empty()) { | ||||||||||||
|
|
@@ -5117,6 +5137,10 @@ bool BoUpSLP::canReorderOperands( | |||||||||||
| TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges, | ||||||||||||
| ArrayRef<TreeEntry *> ReorderableGathers, | ||||||||||||
| SmallVectorImpl<TreeEntry *> &GatherOps) { | ||||||||||||
| // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. | ||||||||||||
| if (UserTE->isNonPowOf2Vec()) | ||||||||||||
| return false; | ||||||||||||
|
|
||||||||||||
| for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) { | ||||||||||||
| if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) { | ||||||||||||
| return OpData.first == I && | ||||||||||||
|
|
@@ -5290,6 +5314,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { | |||||||||||
| } | ||||||||||||
| auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0)); | ||||||||||||
| const auto AllowsReordering = [&](const TreeEntry *TE) { | ||||||||||||
| // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet. | ||||||||||||
| if (TE->isNonPowOf2Vec()) | ||||||||||||
| return false; | ||||||||||||
| if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || | ||||||||||||
| (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || | ||||||||||||
| (IgnoreReorder && TE->Idx == 0)) | ||||||||||||
|
|
@@ -5805,7 +5832,10 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( | |||||||||||
| case Instruction::ExtractValue: | ||||||||||||
| case Instruction::ExtractElement: { | ||||||||||||
| bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); | ||||||||||||
| if (Reuse || !CurrentOrder.empty()) | ||||||||||||
| // FIXME: Vectorizing is not supported yet for non-power-of-2 ops. | ||||||||||||
| if (!isPowerOf2_32(VL.size())) | ||||||||||||
| return TreeEntry::NeedToGather; | ||||||||||||
| if ((Reuse || !CurrentOrder.empty())) | ||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove the extra parentheses and restore the original check here.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done, thanks! |
||||||||||||
| return TreeEntry::Vectorize; | ||||||||||||
| LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); | ||||||||||||
| return TreeEntry::NeedToGather; | ||||||||||||
|
|
@@ -6111,6 +6141,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, | |||||||||||
| if (NumUniqueScalarValues == VL.size()) { | ||||||||||||
| ReuseShuffleIndicies.clear(); | ||||||||||||
| } else { | ||||||||||||
| if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) { | ||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add a FIXME here for non-power-of-2 support.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added, thanks! |
||||||||||||
| LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " | ||||||||||||
| "for nodes with padding.\n"); | ||||||||||||
| newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); | ||||||||||||
| return false; | ||||||||||||
| } | ||||||||||||
| LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); | ||||||||||||
| if (NumUniqueScalarValues <= 1 || | ||||||||||||
| (UniquePositions.size() == 1 && all_of(UniqueValues, | ||||||||||||
|
|
@@ -7724,7 +7760,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { | |||||||||||
| for (unsigned I = 0, End = VL.size(); I < End; I += VF) { | ||||||||||||
| if (VectorizedLoads.contains(VL[I])) | ||||||||||||
| continue; | ||||||||||||
| GatherCost += getBuildVectorCost(VL.slice(I, VF), Root); | ||||||||||||
| GatherCost += | ||||||||||||
| getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root); | ||||||||||||
| } | ||||||||||||
| // Exclude potentially vectorized loads from list of gathered | ||||||||||||
| // scalars. | ||||||||||||
|
|
@@ -10503,6 +10540,9 @@ BoUpSLP::isGatherShuffledEntry( | |||||||||||
| // No need to check for the topmost gather node. | ||||||||||||
| if (TE == VectorizableTree.front().get()) | ||||||||||||
| return {}; | ||||||||||||
| // FIXME: Gathering for non-power-of-2 nodes not implemented yet. | ||||||||||||
| if (TE->isNonPowOf2Vec()) | ||||||||||||
| return {}; | ||||||||||||
| Mask.assign(VL.size(), PoisonMaskElem); | ||||||||||||
| assert(TE->UserTreeIndices.size() == 1 && | ||||||||||||
| "Expected only single user of the gather node."); | ||||||||||||
|
|
@@ -14698,8 +14738,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, | |||||||||||
| const unsigned Sz = R.getVectorElementSize(Chain[0]); | ||||||||||||
| unsigned VF = Chain.size(); | ||||||||||||
|
|
||||||||||||
| if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) | ||||||||||||
| return false; | ||||||||||||
| if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) { | ||||||||||||
| // Check if vectorizing with a non-power-of-2 VF should be considered. At | ||||||||||||
| // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost | ||||||||||||
| // all vector lanes are used. | ||||||||||||
| if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF)) | ||||||||||||
| return false; | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx | ||||||||||||
| << "\n"); | ||||||||||||
|
|
@@ -14798,14 +14843,23 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, | |||||||||||
| continue; | ||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| std::optional<unsigned> NonPowerOf2VF; | ||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done, thanks! |
||||||||||||
| if (VectorizeNonPowerOf2) { | ||||||||||||
| // First try vectorizing with a non-power-of-2 VF. At the moment, only | ||||||||||||
| // consider cases where VF + 1 is a power-of-2, i.e. almost all vector | ||||||||||||
| // lanes are used. | ||||||||||||
| unsigned CandVF = Operands.size(); | ||||||||||||
| if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxVF) { | ||||||||||||
| NonPowerOf2VF = CandVF; | ||||||||||||
| } | ||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done, thanks! |
||||||||||||
| } | ||||||||||||
|
|
||||||||||||
| unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF); | ||||||||||||
| SmallVector<unsigned> CandidateVFs(Sz); | ||||||||||||
| // FIXME: Is division-by-2 the correct step? Should we assert that the | ||||||||||||
| // register size is a power-of-2? | ||||||||||||
| unsigned Size = MaxVF; | ||||||||||||
| for_each(CandidateVFs, [&](unsigned &VF) { | ||||||||||||
| VF = Size; | ||||||||||||
| Size /= 2; | ||||||||||||
| SmallVector<unsigned> CandidateVFs(Sz + bool(NonPowerOf2VF)); | ||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Better to avoid adding
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done, thanks! |
||||||||||||
| unsigned Size = MinVF; | ||||||||||||
| for_each(reverse(CandidateVFs), [&](unsigned &VF) { | ||||||||||||
| VF = Size > MaxVF ? *NonPowerOf2VF : Size; | ||||||||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done, thanks! |
||||||||||||
| Size *= 2; | ||||||||||||
| }); | ||||||||||||
| unsigned StartIdx = 0; | ||||||||||||
| for (unsigned Size : CandidateVFs) { | ||||||||||||
|
|
||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What if ReuseShuffleIndices is not empty? Will it work? Can you add a test?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All code paths should guard against that, AFAICT. I added an assertion to make sure. I couldn't find any test case that triggers this across large code bases (SPEC2006, SPEC2017, llvm-test-suite, clang bootstrap, and large internal benchmarks).