diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index b79689c39ef84..83b463c630d71 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -85,6 +85,42 @@ static cl::opt<unsigned>
              "when sorting profitable allocas"),
     cl::init(4));
 
+// We support vector indices of the form (A * stride) + B
+// All parts are optional.
+struct GEPToVectorIndex {
+  Value *VarIndex = nullptr;         // defaults to 0
+  ConstantInt *VarMul = nullptr;     // defaults to 1
+  ConstantInt *ConstIndex = nullptr; // defaults to 0
+  Value *Full = nullptr;
+};
+
+struct MemTransferInfo {
+  ConstantInt *SrcIndex = nullptr;
+  ConstantInt *DestIndex = nullptr;
+};
+
+// Analysis for planning the different strategies of alloca promotion.
+struct AllocaAnalysis {
+  AllocaInst *Alloca = nullptr;
+  DenseSet<Value *> Pointers;
+  SmallVector<Use *> Uses;
+  unsigned Score = 0;
+  bool HaveSelectOrPHI = false;
+  struct {
+    FixedVectorType *Ty = nullptr;
+    SmallVector<Instruction *> Worklist;
+    SmallVector<Instruction *> UsersToRemove;
+    MapVector<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
+    MapVector<MemTransferInst *, MemTransferInfo> TransferInfo;
+  } Vector;
+  struct {
+    bool Enable = false;
+    SmallVector<User *> Worklist;
+  } LDS;
+
+  explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
+};
+
 // Shared implementation which can do both promotion to vector and to LDS.
 class AMDGPUPromoteAllocaImpl {
 private:
@@ -106,10 +142,7 @@ class AMDGPUPromoteAllocaImpl {
   std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
   Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
 
-  /// BaseAlloca is the alloca root the search started from.
-  /// Val may be that alloca or a recursive user of it.
-  bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
-                               std::vector<Value *> &WorkList) const;
+  bool collectAllocaUses(AllocaAnalysis &AA) const;
 
   /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
   /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
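As context for the new `GEPToVectorIndex` record above: an index into the promoted vector is kept in the decomposed form `(VarIndex * VarMul) + ConstIndex`, with absent parts acting as 0, 1 and 0, and `Full` caching the materialized result so all users of the same GEP share one computation. The snippet below is only a standalone arithmetic sketch of that contract; plain integers stand in for the pass's `Value*`/`ConstantInt*` operands, and the numbers are made up.

```cpp
#include <cstdint>
#include <iostream>
#include <optional>

// Decomposed index: (VarIndex * VarMul) + ConstIndex, every part optional.
// Scalars stand in for the pass's Value*/ConstantInt* members.
struct DecomposedIndex {
  std::optional<int64_t> VarIndex;   // defaults to 0 when absent
  std::optional<int64_t> VarMul;     // defaults to 1 when absent
  std::optional<int64_t> ConstIndex; // defaults to 0 when absent
  std::optional<int64_t> Full;       // lazily cached combined value

  int64_t materialize() {
    if (!Full) {
      int64_t Result = 0;
      if (VarIndex)
        Result = *VarIndex * VarMul.value_or(1);
      if (ConstIndex)
        Result += *ConstIndex;
      Full = Result; // cache so repeated users share one computation
    }
    return *Full;
  }
};

int main() {
  // e.g. a GEP byte offset "%i * 8 + 4" into i32 elements: stride 2, base 1;
  // with %i = 3 the element index is 7.
  DecomposedIndex Idx{/*VarIndex=*/3, /*VarMul=*/2, /*ConstIndex=*/1, {}};
  std::cout << Idx.materialize() << "\n"; // 7
  DecomposedIndex ConstOnly{{}, {}, /*ConstIndex=*/5, {}};
  std::cout << ConstOnly.materialize() << "\n"; // 5
  return 0;
}
```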
@@ -123,10 +156,12 @@ class AMDGPUPromoteAllocaImpl { bool hasSufficientLocalMem(const Function &F); FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const; - bool tryPromoteAllocaToVector(AllocaInst &I); - bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS); + void analyzePromoteToVector(AllocaAnalysis &AA) const; + void promoteAllocaToVector(AllocaAnalysis &AA); + void analyzePromoteToLDS(AllocaAnalysis &AA) const; + bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS); - void sortAllocasToPromote(SmallVectorImpl &Allocas); + void scoreAlloca(AllocaAnalysis &AA) const; void setFunctionLimits(const Function &F); @@ -237,53 +272,77 @@ FunctionPass *llvm::createAMDGPUPromoteAlloca() { return new AMDGPUPromoteAlloca(); } -static void collectAllocaUses(AllocaInst &Alloca, - SmallVectorImpl &Uses) { - SmallVector WorkList({&Alloca}); +bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const { + const auto RejectUser = [&](Instruction *Inst, Twine Msg) { + LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n" + << " " << *Inst << "\n"); + return false; + }; + + SmallVector WorkList({AA.Alloca}); while (!WorkList.empty()) { auto *Cur = WorkList.pop_back_val(); + if (find(AA.Pointers, Cur) != AA.Pointers.end()) + continue; + AA.Pointers.insert(Cur); for (auto &U : Cur->uses()) { - Uses.push_back(&U); + auto *Inst = cast(U.getUser()); + if (isa(Inst)) { + if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) { + return RejectUser(Inst, "pointer escapes via store"); + } + } + AA.Uses.push_back(&U); + + if (isa(U.getUser())) { + WorkList.push_back(Inst); + } else if (auto *SI = dyn_cast(Inst)) { + // Only promote a select if we know that the other select operand is + // from another pointer that will also be promoted. + if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2)) + return RejectUser(Inst, "select from mixed objects"); + WorkList.push_back(Inst); + AA.HaveSelectOrPHI = true; + } else if (auto *Phi = dyn_cast(Inst)) { + // Repeat for phis. + + // TODO: Handle more complex cases. We should be able to replace loops + // over arrays. + switch (Phi->getNumIncomingValues()) { + case 1: + break; + case 2: + if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1)) + return RejectUser(Inst, "phi from mixed objects"); + break; + default: + return RejectUser(Inst, "phi with too many operands"); + } - if (isa(U.getUser())) - WorkList.push_back(cast(U.getUser())); + WorkList.push_back(Inst); + AA.HaveSelectOrPHI = true; + } } } + return true; } -void AMDGPUPromoteAllocaImpl::sortAllocasToPromote( - SmallVectorImpl &Allocas) { - DenseMap Scores; - - for (auto *Alloca : Allocas) { - LLVM_DEBUG(dbgs() << "Scoring: " << *Alloca << "\n"); - unsigned &Score = Scores[Alloca]; - // Increment score by one for each user + a bonus for users within loops. - SmallVector Uses; - collectAllocaUses(*Alloca, Uses); - for (auto *U : Uses) { - Instruction *Inst = cast(U->getUser()); - if (isa(Inst)) - continue; - unsigned UserScore = - 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent())); - LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n"); - Score += UserScore; - } - LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n"); +void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const { + LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n"); + unsigned Score = 0; + // Increment score by one for each user + a bonus for users within loops. 
+ for (auto *U : AA.Uses) { + Instruction *Inst = cast(U->getUser()); + if (isa(Inst) || isa(Inst) || + isa(Inst)) + continue; + unsigned UserScore = + 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent())); + LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n"); + Score += UserScore; } - - stable_sort(Allocas, [&](AllocaInst *A, AllocaInst *B) { - return Scores.at(A) > Scores.at(B); - }); - - // clang-format off - LLVM_DEBUG( - dbgs() << "Sorted Worklist:\n"; - for (auto *A: Allocas) - dbgs() << " " << *A << "\n"; - ); - // clang-format on + LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n"); + AA.Score = Score; } void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) { @@ -320,27 +379,48 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { : (MaxVGPRs * 32)) / VGPRBudgetRatio; - SmallVector Allocas; + std::vector Allocas; for (Instruction &I : F.getEntryBlock()) { if (AllocaInst *AI = dyn_cast(&I)) { // Array allocations are probably not worth handling, since an allocation // of the array type is the canonical form. if (!AI->isStaticAlloca() || AI->isArrayAllocation()) continue; - Allocas.push_back(AI); + + LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n'); + + AllocaAnalysis AA{AI}; + if (collectAllocaUses(AA)) { + analyzePromoteToVector(AA); + if (PromoteToLDS) + analyzePromoteToLDS(AA); + if (AA.Vector.Ty || AA.LDS.Enable) { + scoreAlloca(AA); + Allocas.push_back(std::move(AA)); + } + } } } - sortAllocasToPromote(Allocas); + stable_sort(Allocas, + [](const auto &A, const auto &B) { return A.Score > B.Score; }); + + // clang-format off + LLVM_DEBUG( + dbgs() << "Sorted Worklist:\n"; + for (const auto &AA : Allocas) + dbgs() << " " << *AA.Alloca << "\n"; + ); + // clang-format on bool Changed = false; - for (AllocaInst *AI : Allocas) { - const unsigned AllocaCost = DL->getTypeSizeInBits(AI->getAllocatedType()); - // First, check if we have enough budget to vectorize this alloca. - if (AllocaCost <= VectorizationBudget) { - // If we do, attempt vectorization, otherwise, fall through and try - // promoting to LDS instead. - if (tryPromoteAllocaToVector(*AI)) { + for (AllocaAnalysis &AA : Allocas) { + if (AA.Vector.Ty) { + const unsigned AllocaCost = + DL->getTypeSizeInBits(AA.Alloca->getAllocatedType()); + // First, check if we have enough budget to vectorize this alloca. + if (AllocaCost <= VectorizationBudget) { + promoteAllocaToVector(AA); Changed = true; assert((VectorizationBudget - AllocaCost) < VectorizationBudget && "Underflow!"); @@ -348,14 +428,14 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { LLVM_DEBUG(dbgs() << " Remaining vectorization budget:" << VectorizationBudget << "\n"); continue; + } else { + LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" + << AllocaCost << ", budget:" << VectorizationBudget + << "): " << *AA.Alloca << "\n"); } - } else { - LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:" - << AllocaCost << ", budget:" << VectorizationBudget - << "): " << *AI << "\n"); } - if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS)) + if (AA.LDS.Enable && tryPromoteAllocaToLDS(AA, SufficientLDS)) Changed = true; } @@ -366,11 +446,6 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) { return Changed; } -struct MemTransferInfo { - ConstantInt *SrcIndex = nullptr; - ConstantInt *DestIndex = nullptr; -}; - // Checks if the instruction I is a memset user of the alloca AI that we can // deal with. 
Currently, only non-volatile memsets that affect the whole alloca
 // are handled.
@@ -388,23 +463,48 @@ static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
          match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
 }
 
-static Value *calculateVectorIndex(
-    Value *Ptr, const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
-  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts());
-  if (!GEP)
-    return ConstantInt::getNullValue(Type::getInt32Ty(Ptr->getContext()));
+static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
+  IRBuilder<> B(Ptr->getContext());
+
+  Ptr = Ptr->stripPointerCasts();
+  if (Ptr == AA.Alloca)
+    return B.getInt32(0);
+
+  auto *GEP = cast<GetElementPtrInst>(Ptr);
+  auto I = AA.Vector.GEPVectorIdx.find(GEP);
+  assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");
 
-  auto I = GEPIdx.find(GEP);
-  assert(I != GEPIdx.end() && "Must have entry for GEP!");
+  if (!I->second.Full) {
+    Value *Result = nullptr;
+    B.SetInsertPoint(GEP);
 
-  Value *IndexValue = I->second;
-  assert(IndexValue && "index value missing from GEP index map");
-  return IndexValue;
+    if (I->second.VarIndex) {
+      Result = I->second.VarIndex;
+      Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty());
+
+      if (I->second.VarMul)
+        Result = B.CreateMul(Result, I->second.VarMul);
+    }
+
+    if (I->second.ConstIndex) {
+      if (Result)
+        Result = B.CreateAdd(Result, I->second.ConstIndex);
+      else
+        Result = I->second.ConstIndex;
+    }
+
+    if (!Result)
+      Result = B.getInt32(0);
+
+    I->second.Full = Result;
+  }
+
+  return I->second.Full;
 }
 
-static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
-                               Type *VecElemTy, const DataLayout &DL,
-                               SmallVector<Instruction *> &NewInsts) {
+static std::optional<GEPToVectorIndex>
+computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
+                        Type *VecElemTy, const DataLayout &DL) {
   // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
   // helper.
   LLVMContext &Ctx = GEP->getContext();
@@ -432,7 +532,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   Value *CurPtr = GEP;
   while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
     if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
-      return nullptr;
+      return {};
 
     // Move to the next outer pointer.
CurPtr = CurGEP->getPointerOperand(); @@ -442,69 +542,57 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy); if (VarOffsets.size() > 1) - return nullptr; + return {}; APInt IndexQuot; int64_t Rem; APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem); if (Rem != 0) - return nullptr; - if (VarOffsets.size() == 0) - return ConstantInt::get(Ctx, IndexQuot); + return {}; + + GEPToVectorIndex Result; + + if (!ConstOffset.isZero()) + Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW)); - IRBuilder<> Builder(GEP); + if (VarOffsets.empty()) + return Result; const auto &VarOffset = VarOffsets.front(); APInt OffsetQuot; APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem); if (Rem != 0 || OffsetQuot.isZero()) - return nullptr; + return {}; - Value *Offset = VarOffset.first; - if (!isa(Offset->getType())) - return nullptr; + Result.VarIndex = VarOffset.first; + auto *OffsetType = dyn_cast(Result.VarIndex->getType()); + if (!OffsetType) + return {}; - Offset = Builder.CreateSExtOrTrunc(Offset, Builder.getIntNTy(BW)); - if (Offset != VarOffset.first) - NewInsts.push_back(cast(Offset)); + if (!OffsetQuot.isOne()) + Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW)); - if (!OffsetQuot.isOne()) { - ConstantInt *ConstMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW)); - Offset = Builder.CreateMul(Offset, ConstMul); - if (Instruction *NewInst = dyn_cast(Offset)) - NewInsts.push_back(NewInst); - } - if (ConstOffset.isZero()) - return Offset; - - ConstantInt *ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW)); - Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex); - if (Instruction *NewInst = dyn_cast(IndexAdd)) - NewInsts.push_back(NewInst); - return IndexAdd; + return Result; } /// Promotes a single user of the alloca to a vector form. /// /// \param Inst Instruction to be promoted. /// \param DL Module Data Layout. -/// \param VectorTy Vectorized Type. +/// \param AA Alloca Analysis. /// \param VecStoreSize Size of \p VectorTy in bytes. /// \param ElementSize Size of \p VectorTy element type in bytes. -/// \param TransferInfo MemTransferInst info map. -/// \param GEPVectorIdx GEP -> VectorIdx cache. /// \param CurVal Current value of the vector (e.g. last stored value) /// \param[out] DeferredLoads \p Inst is added to this vector if it can't /// be promoted now. This happens when promoting requires \p /// CurVal, but \p CurVal is nullptr. /// \return the stored value if \p Inst would have written to the alloca, or /// nullptr otherwise. -static Value *promoteAllocaUserToVector( - Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy, - unsigned VecStoreSize, unsigned ElementSize, - DenseMap &TransferInfo, - std::map &GEPVectorIdx, - function_ref GetCurVal) { +static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL, + AllocaAnalysis &AA, + unsigned VecStoreSize, + unsigned ElementSize, + function_ref GetCurVal) { // Note: we use InstSimplifyFolder because it can leverage the DataLayout // to do more folding, especially in the case of vector splats. 
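Relating this back to `computeGEPToVectorIndex` in the hunk above: the accumulated GEP offset is only usable if both the constant part and the single variable coefficient divide evenly by the vector element size, in which case the quotients become `ConstIndex` and `VarMul`. Below is a minimal standalone model of that check, assuming plain 64-bit integers in place of `APInt`; the names and values are illustrative only.

```cpp
#include <cstdint>
#include <iostream>
#include <optional>

struct ByteOffset {
  int64_t ConstBytes = 0;                // accumulated constant offset
  std::optional<int64_t> VarCoefficient; // bytes per unit of the one variable
                                         // index, if there is one
};

struct VectorIndex {
  int64_t ConstIndex = 0; // element index added unconditionally
  int64_t VarMul = 1;     // multiplier applied to the variable index
  bool HasVar = false;
};

// Mirrors the rejection logic: both the constant offset and the variable
// coefficient must be exact (non-zero, for the coefficient) multiples of the
// element size, otherwise the GEP cannot be turned into a lane index.
std::optional<VectorIndex> toVectorIndex(const ByteOffset &Off,
                                         int64_t ElemSizeBytes) {
  if (Off.ConstBytes % ElemSizeBytes != 0)
    return std::nullopt;
  VectorIndex Idx;
  Idx.ConstIndex = Off.ConstBytes / ElemSizeBytes;
  if (Off.VarCoefficient) {
    if (*Off.VarCoefficient % ElemSizeBytes != 0 ||
        *Off.VarCoefficient / ElemSizeBytes == 0)
      return std::nullopt;
    Idx.HasVar = true;
    Idx.VarMul = *Off.VarCoefficient / ElemSizeBytes;
  }
  return Idx;
}

int main() {
  // Offset 8*%i + 4 into 4-byte elements: index = 2*%i + 1.
  auto Ok = toVectorIndex({4, 8}, 4);
  std::cout << Ok.has_value() << " " << Ok->VarMul << " " << Ok->ConstIndex
            << "\n"; // 1 2 1
  // Offset 8*%i + 2 is not element-aligned, so promotion is rejected.
  std::cout << toVectorIndex({2, 8}, 4).has_value() << "\n"; // 0
  return 0;
}
```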
IRBuilder Builder(Inst->getContext(), @@ -526,13 +614,13 @@ static Value *promoteAllocaUserToVector( Val, FixedVectorType::get(EltTy, NumPtrElts)); }; - Type *VecEltTy = VectorTy->getElementType(); + Type *VecEltTy = AA.Vector.Ty->getElementType(); switch (Inst->getOpcode()) { case Instruction::Load: { Value *CurVal = GetCurVal(); - Value *Index = calculateVectorIndex( - cast(Inst)->getPointerOperand(), GEPVectorIdx); + Value *Index = + calculateVectorIndex(cast(Inst)->getPointerOperand(), AA); // We're loading the full vector. Type *AccessTy = Inst->getType(); @@ -588,7 +676,7 @@ static Value *promoteAllocaUserToVector( // to know the current value. If this is a store of a single element, we // need to know the value. StoreInst *SI = cast(Inst); - Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx); + Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA); Value *Val = SI->getValueOperand(); // We're storing the full vector, we can handle this without knowing CurVal. @@ -598,9 +686,9 @@ static Value *promoteAllocaUserToVector( if (CI->isZeroValue() && AccessSize == VecStoreSize) { if (AccessTy->isPtrOrPtrVectorTy()) Val = CreateTempPtrIntCast(Val, AccessTy); - else if (VectorTy->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, VectorTy); - return Builder.CreateBitOrPointerCast(Val, VectorTy); + else if (AA.Vector.Ty->isPtrOrPtrVectorTy()) + Val = CreateTempPtrIntCast(Val, AA.Vector.Ty); + return Builder.CreateBitOrPointerCast(Val, AA.Vector.Ty); } } @@ -609,7 +697,7 @@ static Value *promoteAllocaUserToVector( assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy))); const unsigned NumWrittenElts = AccessSize / DL.getTypeStoreSize(VecEltTy); - const unsigned NumVecElts = VectorTy->getNumElements(); + const unsigned NumVecElts = AA.Vector.Ty->getNumElements(); auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts); assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); @@ -640,14 +728,14 @@ static Value *promoteAllocaUserToVector( // For memcpy, we need to know curval. ConstantInt *Length = cast(MTI->getLength()); unsigned NumCopied = Length->getZExtValue() / ElementSize; - MemTransferInfo *TI = &TransferInfo[MTI]; + MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI]; unsigned SrcBegin = TI->SrcIndex->getZExtValue(); unsigned DestBegin = TI->DestIndex->getZExtValue(); SmallVector Mask; - for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) { + for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) { if (Idx >= DestBegin && Idx < DestBegin + NumCopied) { - Mask.push_back(SrcBegin < VectorTy->getNumElements() + Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements() ? SrcBegin++ : PoisonMaskElem); } else { @@ -676,14 +764,14 @@ static Value *promoteAllocaUserToVector( Elt = Builder.CreateBitCast(EltBytes, VecEltTy); } - return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt); + return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt); } if (auto *Intr = dyn_cast(Inst)) { if (Intr->getIntrinsicID() == Intrinsic::objectsize) { Intr->replaceAllUsesWith( Builder.getIntN(Intr->getType()->getIntegerBitWidth(), - DL.getTypeAllocSize(VectorTy))); + DL.getTypeAllocSize(AA.Vector.Ty))); return nullptr; } } @@ -838,46 +926,32 @@ AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const { return VectorTy; } -// FIXME: Should try to pick the most likely to be profitable allocas first. 
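One step in the hunk above that is easy to misread is the memcpy lowering: a copy inside the promoted alloca becomes a single vector shuffle whose mask sends each destination lane in the copied range to the next source lane and leaves every other lane pointing at itself. Here is a standalone sketch of just the mask construction; it is plain C++, the sizes and indices are invented for the example, and `kPoison` stands in for `PoisonMaskElem`.

```cpp
#include <cstdio>
#include <vector>

constexpr int kPoison = -1; // stands in for PoisonMaskElem

// Build the lane-permutation mask for copying NumCopied elements from
// SrcBegin to DestBegin within a NumElts-wide vector.
std::vector<int> buildCopyMask(unsigned NumElts, unsigned SrcBegin,
                               unsigned DestBegin, unsigned NumCopied) {
  std::vector<int> Mask;
  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
    if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
      // Copied lanes read successive source lanes (poison once they run out).
      Mask.push_back(SrcBegin < NumElts ? static_cast<int>(SrcBegin++)
                                        : kPoison);
    } else {
      // Untouched lanes keep their previous contents.
      Mask.push_back(static_cast<int>(Idx));
    }
  }
  return Mask;
}

int main() {
  // memcpy of 2 elements from index 1 to index 4 in an 8-element vector:
  // prints 0 1 2 3 1 2 6 7.
  for (int M : buildCopyMask(8, 1, 4, 2))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}
```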
-bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { - LLVM_DEBUG(dbgs() << "Trying to promote to vectors: " << Alloca << '\n'); - - Type *AllocaTy = Alloca.getAllocatedType(); - FixedVectorType *VectorTy = getVectorTypeForAlloca(AllocaTy); - if (!VectorTy) - return false; +void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const { + if (AA.HaveSelectOrPHI) { + LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n"); + return; + } - std::map GEPVectorIdx; - SmallVector WorkList; - SmallVector UsersToRemove; - SmallVector DeferredInsts; - SmallVector NewGEPInsts; - DenseMap TransferInfo; + Type *AllocaTy = AA.Alloca->getAllocatedType(); + AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy); + if (!AA.Vector.Ty) + return; const auto RejectUser = [&](Instruction *Inst, Twine Msg) { LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n" << " " << *Inst << "\n"); - for (auto *Inst : reverse(NewGEPInsts)) - Inst->eraseFromParent(); - return false; + AA.Vector.Ty = nullptr; }; - SmallVector Uses; - collectAllocaUses(Alloca, Uses); - - LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n"); - - Type *VecEltTy = VectorTy->getElementType(); + Type *VecEltTy = AA.Vector.Ty->getElementType(); unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8; assert(ElementSize > 0); - for (auto *U : Uses) { + for (auto *U : AA.Uses) { Instruction *Inst = cast(U->getUser()); if (Value *Ptr = getLoadStorePointerOperand(Inst)) { - // This is a store of the pointer, not to the pointer. - if (isa(Inst) && - U->getOperandNo() != StoreInst::getPointerOperandIndex()) - return RejectUser(Inst, "pointer is being stored"); + assert(!isa(Inst) || + U->getOperandNo() == StoreInst::getPointerOperandIndex()); Type *AccessTy = getLoadStoreType(Inst); if (AccessTy->isAggregateType()) @@ -893,34 +967,35 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { Ptr = Ptr->stripPointerCasts(); // Alloca already accessed as vector. - if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) == - DL->getTypeStoreSize(AccessTy)) { - WorkList.push_back(Inst); + if (Ptr == AA.Alloca && + DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) == + DL->getTypeStoreSize(AccessTy)) { + AA.Vector.Worklist.push_back(Inst); continue; } - if (!isSupportedAccessType(VectorTy, AccessTy, *DL)) + if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, *DL)) return RejectUser(Inst, "not a supported access type"); - WorkList.push_back(Inst); + AA.Vector.Worklist.push_back(Inst); continue; } if (auto *GEP = dyn_cast(Inst)) { // If we can't compute a vector index from this GEP, then we can't // promote this alloca to vector. 
- Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts); + auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, *DL); if (!Index) return RejectUser(Inst, "cannot compute vector index for GEP"); - GEPVectorIdx[GEP] = Index; - UsersToRemove.push_back(Inst); + AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value()); + AA.Vector.UsersToRemove.push_back(Inst); continue; } if (MemSetInst *MSI = dyn_cast(Inst); - MSI && isSupportedMemset(MSI, &Alloca, *DL)) { - WorkList.push_back(Inst); + MSI && isSupportedMemset(MSI, AA.Alloca, *DL)) { + AA.Vector.Worklist.push_back(Inst); continue; } @@ -933,31 +1008,32 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { return RejectUser(Inst, "mem transfer inst length is non-constant or " "not a multiple of the vector element size"); - if (TransferInfo.try_emplace(TransferInst).second) { - DeferredInsts.push_back(Inst); - WorkList.push_back(Inst); - } + auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * { + if (Ptr == AA.Alloca) + return ConstantInt::get(Ptr->getContext(), APInt(32, 0)); - auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * { - GetElementPtrInst *GEP = dyn_cast(Ptr); - if (Ptr != &Alloca && !GEPVectorIdx.count(GEP)) + GetElementPtrInst *GEP = cast(Ptr); + const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second; + if (GEPI.VarIndex) return nullptr; - - return dyn_cast(calculateVectorIndex(Ptr, GEPVectorIdx)); + if (GEPI.ConstIndex) + return GEPI.ConstIndex; + return ConstantInt::get(Ptr->getContext(), APInt(32, 0)); }; + MemTransferInfo *TI = + &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second; unsigned OpNum = U->getOperandNo(); - MemTransferInfo *TI = &TransferInfo[TransferInst]; if (OpNum == 0) { Value *Dest = TransferInst->getDest(); - ConstantInt *Index = getPointerIndexOfAlloca(Dest); + ConstantInt *Index = getConstIndexIntoAlloca(Dest); if (!Index) return RejectUser(Inst, "could not calculate constant dest index"); TI->DestIndex = Index; } else { assert(OpNum == 1); Value *Src = TransferInst->getSource(); - ConstantInt *Index = getPointerIndexOfAlloca(Src); + ConstantInt *Index = getConstIndexIntoAlloca(Src); if (!Index) return RejectUser(Inst, "could not calculate constant src index"); TI->SrcIndex = Index; @@ -967,7 +1043,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (auto *Intr = dyn_cast(Inst)) { if (Intr->getIntrinsicID() == Intrinsic::objectsize) { - WorkList.push_back(Inst); + AA.Vector.Worklist.push_back(Inst); continue; } } @@ -976,56 +1052,59 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (isAssumeLikeIntrinsic(Inst)) { if (!Inst->use_empty()) return RejectUser(Inst, "assume-like intrinsic cannot have any users"); - UsersToRemove.push_back(Inst); + AA.Vector.UsersToRemove.push_back(Inst); continue; } if (isa(Inst) && all_of(Inst->users(), [](User *U) { return isAssumeLikeIntrinsic(cast(U)); })) { - UsersToRemove.push_back(Inst); + AA.Vector.UsersToRemove.push_back(Inst); continue; } return RejectUser(Inst, "unhandled alloca user"); } - while (!DeferredInsts.empty()) { - Instruction *Inst = DeferredInsts.pop_back_val(); - MemTransferInst *TransferInst = cast(Inst); - // TODO: Support the case if the pointers are from different alloca or - // from different address spaces. 
- MemTransferInfo &Info = TransferInfo[TransferInst]; - if (!Info.SrcIndex || !Info.DestIndex) - return RejectUser( - Inst, "mem transfer inst is missing constant src and/or dst index"); + // Follow-up check to ensure we've seen both sides of all transfer insts. + for (const auto &Entry : AA.Vector.TransferInfo) { + const MemTransferInfo &TI = Entry.second; + if (!TI.SrcIndex || !TI.DestIndex) + return RejectUser(Entry.first, + "mem transfer inst between different objects"); + AA.Vector.Worklist.push_back(Entry.first); } +} - LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> " - << *VectorTy << '\n'); - const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy); +void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) { + LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n'); + LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType() + << " -> " << *AA.Vector.Ty << '\n'); + const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty); + + Type *VecEltTy = AA.Vector.Ty->getElementType(); + const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8; // Alloca is uninitialized memory. Imitate that by making the first value // undef. SSAUpdater Updater; - Updater.Initialize(VectorTy, "promotealloca"); + Updater.Initialize(AA.Vector.Ty, "promotealloca"); - BasicBlock *EntryBB = Alloca.getParent(); + BasicBlock *EntryBB = AA.Alloca->getParent(); BasicBlock::iterator InitInsertPos = - skipToNonAllocaInsertPt(*EntryBB, Alloca.getIterator()); - // Alloca memory is undefined to begin, not poison. - Value *AllocaInitValue = - new FreezeInst(PoisonValue::get(VectorTy), "", InitInsertPos); - AllocaInitValue->takeName(&Alloca); + skipToNonAllocaInsertPt(*EntryBB, AA.Alloca->getIterator()); + IRBuilder<> Builder(&*InitInsertPos); + Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(AA.Vector.Ty)); + AllocaInitValue->takeName(AA.Alloca); - Updater.AddAvailableValue(EntryBB, AllocaInitValue); + Updater.AddAvailableValue(AA.Alloca->getParent(), AllocaInitValue); // First handle the initial worklist, in basic block order. // // Insert a placeholder whenever we need the vector value at the top of a // basic block. SmallVector Placeholders; - forEachWorkListItem(WorkList, [&](Instruction *I) { + forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) { BasicBlock *BB = I->getParent(); auto GetCurVal = [&]() -> Value * { if (Value *CurVal = Updater.FindValueForBlock(BB)) @@ -1038,14 +1117,13 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { // placeholder that we will replace later. IRBuilder<> Builder(I); auto *Placeholder = cast(Builder.CreateFreeze( - PoisonValue::get(VectorTy), "promotealloca.placeholder")); + PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder")); Placeholders.push_back(Placeholder); return Placeholders.back(); }; - Value *Result = - promoteAllocaUserToVector(I, *DL, VectorTy, VecStoreSize, ElementSize, - TransferInfo, GEPVectorIdx, GetCurVal); + Value *Result = promoteAllocaUserToVector(I, *DL, AA, VecStoreSize, + ElementSize, GetCurVal); if (Result) Updater.AddAvailableValue(BB, Result); }); @@ -1057,25 +1135,22 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { Placeholder->eraseFromParent(); } - // Delete all instructions. On the first pass, new dummy loads may have been - // added so we need to collect them too. 
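`promoteAllocaToVector` above does not build PHI nodes by hand: it records, per basic block, the vector value the alloca holds after the last handled user and lets `SSAUpdater` materialize whatever PHIs a read needs (with freeze-poison placeholders for reads it cannot resolve on the first pass). The following is a self-contained illustration of that SSAUpdater pattern only; the toy function is invented for the demo and is not code from this patch.

```cpp
// Minimal SSAUpdater demo: two definitions on different paths, one use at the
// join. Compile and link against LLVM; assumes the usual C++ IR API.
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("ssaupdater-demo", Ctx);
  IRBuilder<> B(Ctx);

  // i32 @f(i1 %c) with an if/else diamond.
  auto *FTy = FunctionType::get(B.getInt32Ty(), {B.getInt1Ty()}, false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
  BasicBlock *Then = BasicBlock::Create(Ctx, "then", F);
  BasicBlock *Else = BasicBlock::Create(Ctx, "else", F);
  BasicBlock *Merge = BasicBlock::Create(Ctx, "merge", F);

  B.SetInsertPoint(Entry);
  B.CreateCondBr(F->getArg(0), Then, Else);
  B.SetInsertPoint(Then);
  B.CreateBr(Merge);
  B.SetInsertPoint(Else);
  B.CreateBr(Merge);

  // Record the value "the alloca" holds at the end of each block, as
  // promoteAllocaToVector does after handling each store.
  SSAUpdater Updater;
  Updater.Initialize(B.getInt32Ty(), "promoted");
  Updater.AddAvailableValue(Then, B.getInt32(1));
  Updater.AddAvailableValue(Else, B.getInt32(2));

  // Then ask for the value at a use point; the updater inserts the needed PHI
  // in %merge itself.
  B.SetInsertPoint(Merge);
  Value *AtMerge = Updater.GetValueInMiddleOfBlock(Merge);
  B.CreateRet(AtMerge);

  verifyFunction(*F, &errs());
  M.print(outs(), nullptr);
  return 0;
}
```

The printed module should contain a phi of 1 and 2 in `%merge`; placing that PHI is exactly the job the pass delegates instead of tracking the promoted vector across control flow by hand.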
- DenseSet InstsToDelete(WorkList.begin(), WorkList.end()); - for (Instruction *I : InstsToDelete) { + // Delete all instructions. + for (Instruction *I : AA.Vector.Worklist) { assert(I->use_empty()); I->eraseFromParent(); } // Delete all the users that are known to be removeable. - for (Instruction *I : reverse(UsersToRemove)) { + for (Instruction *I : reverse(AA.Vector.UsersToRemove)) { I->dropDroppableUses(); assert(I->use_empty()); I->eraseFromParent(); } // Alloca should now be dead too. - assert(Alloca.use_empty()); - Alloca.eraseFromParent(); - return true; + assert(AA.Alloca->use_empty()); + AA.Alloca->eraseFromParent(); } std::pair @@ -1249,61 +1324,78 @@ bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca( return true; } -bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( - Value *BaseAlloca, Value *Val, std::vector &WorkList) const { +void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const { + if (DisablePromoteAllocaToLDS) { + LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n"); + return; + } - for (User *User : Val->users()) { - if (is_contained(WorkList, User)) - continue; + // Don't promote the alloca to LDS for shader calling conventions as the work + // item ID intrinsics are not supported for these calling conventions. + // Furthermore not all LDS is available for some of the stages. + const Function &ContainingFunction = *AA.Alloca->getFunction(); + CallingConv::ID CC = ContainingFunction.getCallingConv(); + + switch (CC) { + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + break; + default: + LLVM_DEBUG( + dbgs() + << " promote alloca to LDS not supported with calling convention.\n"); + return; + } + + for (Use *Use : AA.Uses) { + auto *User = Use->getUser(); if (CallInst *CI = dyn_cast(User)) { if (!isCallPromotable(CI)) - return false; + return; - WorkList.push_back(User); + if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) + AA.LDS.Worklist.push_back(User); continue; } Instruction *UseInst = cast(User); if (UseInst->getOpcode() == Instruction::PtrToInt) - return false; + return; if (LoadInst *LI = dyn_cast(UseInst)) { if (LI->isVolatile()) - return false; + return; continue; } if (StoreInst *SI = dyn_cast(UseInst)) { if (SI->isVolatile()) - return false; - - // Reject if the stored value is not the pointer operand. - if (SI->getPointerOperand() != Val) - return false; + return; continue; } if (AtomicRMWInst *RMW = dyn_cast(UseInst)) { if (RMW->isVolatile()) - return false; + return; continue; } if (AtomicCmpXchgInst *CAS = dyn_cast(UseInst)) { if (CAS->isVolatile()) - return false; + return; continue; } // Only promote a select if we know that the other select operand // is from another pointer that will also be promoted. if (ICmpInst *ICmp = dyn_cast(UseInst)) { - if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1)) - return false; + if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1)) + return; // May need to rewrite constant operands. - WorkList.push_back(ICmp); + if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) + AA.LDS.Worklist.push_back(ICmp); continue; } @@ -1311,28 +1403,8 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( // Be conservative if an address could be computed outside the bounds of // the alloca. if (!GEP->isInBounds()) - return false; - } else if (SelectInst *SI = dyn_cast(UseInst)) { - // Only promote a select if we know that the other select operand is from - // another pointer that will also be promoted. 
- if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2)) - return false; - } else if (PHINode *Phi = dyn_cast(UseInst)) { - // Repeat for phis. - - // TODO: Handle more complex cases. We should be able to replace loops - // over arrays. - switch (Phi->getNumIncomingValues()) { - case 1: - break; - case 2: - if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1)) - return false; - break; - default: - return false; - } - } else if (!isa(User)) { + return; + } else if (!isa(User)) { // Do not promote vector/aggregate type instructions. It is hard to track // their users. @@ -1340,15 +1412,14 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( // // TODO: If we know the address is only observed through flat pointers, we // could still promote. - return false; + return; } - WorkList.push_back(User); - if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList)) - return false; + if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end()) + AA.LDS.Worklist.push_back(User); } - return true; + AA.LDS.Enable = true; } bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { @@ -1479,44 +1550,23 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { } // FIXME: Should try to pick the most likely to be profitable allocas first. -bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, +bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS) { - LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n'); - - if (DisablePromoteAllocaToLDS) { - LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n"); - return false; - } - - const DataLayout &DL = Mod->getDataLayout(); - IRBuilder<> Builder(&I); - - const Function &ContainingFunction = *I.getFunction(); - CallingConv::ID CC = ContainingFunction.getCallingConv(); - - // Don't promote the alloca to LDS for shader calling conventions as the work - // item ID intrinsics are not supported for these calling conventions. - // Furthermore not all LDS is available for some of the stages. - switch (CC) { - case CallingConv::AMDGPU_KERNEL: - case CallingConv::SPIR_KERNEL: - break; - default: - LLVM_DEBUG( - dbgs() - << " promote alloca to LDS not supported with calling convention.\n"); - return false; - } + LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n'); // Not likely to have sufficient local memory for promotion. if (!SufficientLDS) return false; + const DataLayout &DL = Mod->getDataLayout(); + IRBuilder<> Builder(AA.Alloca); + + const Function &ContainingFunction = *AA.Alloca->getParent()->getParent(); const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; - Align Alignment = - DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType()); + Align Alignment = DL.getValueOrABITypeAlignment( + AA.Alloca->getAlign(), AA.Alloca->getAllocatedType()); // FIXME: This computed padding is likely wrong since it depends on inverse // usage order. 
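For the sizing logic that starts here and continues in the next hunk: every work item gets its own copy of the alloca in the LDS array, so the running LDS usage is rounded up to the alloca's alignment and then grows by `WorkGroupSize * allocSize`, and promotion is skipped when that exceeds the limit. A back-of-the-envelope version of the check follows; it is plain C++, and the limit, group size and sizes are illustrative numbers, not queried from a real subtarget.

```cpp
#include <cstdint>
#include <iostream>

// Round Value up to the next multiple of Align (Align is a power of two).
uint32_t alignUp(uint32_t Value, uint32_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

// Returns true (and updates Usage) if promoting an alloca of AllocSize bytes,
// replicated once per work item, still fits in the LDS budget.
bool fitsInLDS(uint32_t &Usage, uint32_t Alignment, uint32_t AllocSize,
               uint32_t WorkGroupSize, uint32_t LocalMemLimit) {
  uint32_t NewSize = alignUp(Usage, Alignment);
  NewSize += WorkGroupSize * AllocSize;
  if (NewSize > LocalMemLimit)
    return false;
  Usage = NewSize;
  return true;
}

int main() {
  // Illustrative numbers: 64 KiB of LDS, 256 work items, a 32-byte alloca.
  uint32_t Usage = 1000; // bytes already claimed by other LDS users
  bool OK = fitsInLDS(Usage, /*Alignment=*/8, /*AllocSize=*/32,
                      /*WorkGroupSize=*/256, /*LocalMemLimit=*/65536);
  std::cout << OK << " " << Usage << "\n"; // 1 9192
  return 0;
}
```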
@@ -1526,7 +1576,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment); uint32_t AllocSize = - WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType()); + WorkGroupSize * DL.getTypeAllocSize(AA.Alloca->getAllocatedType()); NewSize += AllocSize; if (NewSize > LocalMemLimit) { @@ -1537,24 +1587,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, CurrentLocalMemUsage = NewSize; - std::vector WorkList; - - if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { - LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n"); - return false; - } - LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n"); - Function *F = I.getFunction(); + Function *F = AA.Alloca->getFunction(); - Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize); + Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize); GlobalVariable *GV = new GlobalVariable( *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy), - Twine(F->getName()) + Twine('.') + I.getName(), nullptr, + Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); - GV->setAlignment(I.getAlign()); + GV->setAlignment(AA.Alloca->getAlign()); Value *TCntY, *TCntZ; @@ -1573,15 +1616,15 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID}; Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices); - I.mutateType(Offset->getType()); - I.replaceAllUsesWith(Offset); - I.eraseFromParent(); + AA.Alloca->mutateType(Offset->getType()); + AA.Alloca->replaceAllUsesWith(Offset); + AA.Alloca->eraseFromParent(); SmallVector DeferredIntrs; PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS); - for (Value *V : WorkList) { + for (Value *V : AA.LDS.Worklist) { CallInst *Call = dyn_cast(V); if (!Call) { if (ICmpInst *CI = dyn_cast(V)) { diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll index 7da441f2e79d2..7ebb4ca262614 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll @@ -12,8 +12,7 @@ define amdgpu_kernel void @negative_index_byte(ptr %out, i64 %offset) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 2, i32 2 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 3, i32 3 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[OFFSET:%.*]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[OFFSET]] to i32 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP8]], -1 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP5]], -1 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 [[TMP7]] ; CHECK-NEXT: store i8 [[TMP6]], ptr [[OUT:%.*]], align 1 ; CHECK-NEXT: ret void @@ -42,8 +41,7 @@ define amdgpu_kernel void @negative_index_word(ptr %out, i64 %offset) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 2, i32 2 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 3, i32 3 ; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[OFFSET:%.*]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[OFFSET]] to i32 -; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP8]], -1 +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP5]], -1 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 [[TMP7]] ; CHECK-NEXT: store i32 [[TMP6]], ptr 
[[OUT:%.*]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll index ab03177d1edc5..ae6157af2cf4c 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll @@ -1,14 +1,16 @@ ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s ; REQUIRES: asserts -; CHECK: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5) -; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %simpleuser, align 4 +; CHECK-LABEL: Analyzing: %simpleuser = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: [+1]: store i64 42, ptr addrspace(5) %simpleuser, align 8 ; CHECK-NEXT: => Final Score:1 +; CHECK-LABEL: Analyzing: %manyusers = alloca [4 x i64], align 4, addrspace(5) ; CHECK-NEXT: Scoring: %manyusers = alloca [4 x i64], align 4, addrspace(5) -; CHECK-NEXT: [+1]: store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4 -; CHECK-NEXT: [+1]: %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1 -; CHECK-NEXT: [+1]: store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4 -; CHECK-NEXT: [+1]: %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1 +; CHECK-NEXT: [+1]: store i64 %v0.add, ptr addrspace(5) %manyusers.1, align 8 +; CHECK-NEXT: [+1]: %v0 = load i64, ptr addrspace(5) %manyusers.1, align 8 +; CHECK-NEXT: [+1]: store i64 %v1.add, ptr addrspace(5) %manyusers.2, align 8 +; CHECK-NEXT: [+1]: %v1 = load i64, ptr addrspace(5) %manyusers.2, align 8 ; CHECK-NEXT: => Final Score:4 ; CHECK-NEXT: Sorted Worklist: ; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5) @@ -20,50 +22,52 @@ entry: ; should get a score of 4 %manyusers = alloca [4 x i64], align 4, addrspace(5) - store i32 42, ptr addrspace(5) %simpleuser + store i64 42, ptr addrspace(5) %simpleuser - %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2 - %v0 = load i8, ptr addrspace(5) %manyusers.1 - %v0.ext = zext i8 %v0 to i32 - store i32 %v0.ext, ptr addrspace(5) %manyusers.1 + %manyusers.1 = getelementptr i64, ptr addrspace(5) %manyusers, i64 2 + %v0 = load i64, ptr addrspace(5) %manyusers.1 + %v0.add = add i64 %v0, 1 + store i64 %v0.add, ptr addrspace(5) %manyusers.1 - %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1 - %v1 = load i8, ptr addrspace(5) %manyusers.2 - %v1.ext = zext i8 %v0 to i32 - store i32 %v1.ext, ptr addrspace(5) %manyusers.2 + %manyusers.2 = getelementptr i64, ptr addrspace(5) %manyusers, i64 1 + %v1 = load i64, ptr addrspace(5) %manyusers.2 + %v1.add = add i64 %v0, 1 + store i64 %v1.add, ptr addrspace(5) %manyusers.2 ret void } -; CHECK: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5) -; CHECK-NEXT: [+5]: store i32 32, ptr addrspace(5) %stack, align 4 -; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %stack, align 4 -; CHECK-NEXT: [+9]: store i32 32, ptr addrspace(5) %stack.1, align 4 -; CHECK-NEXT: [+5]: %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1 -; CHECK-NEXT: [+1]: store i32 64, ptr addrspace(5) %stack.2, align 4 -; CHECK-NEXT: [+9]: %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1 +; CHECK-LABEL: Analyzing: %stack = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: [+5]: store i64 32, ptr 
addrspace(5) %stack, align 8 +; CHECK-NEXT: [+1]: store i64 42, ptr addrspace(5) %stack, align 8 +; CHECK-NEXT: [+9]: store i64 32, ptr addrspace(5) %stack.1, align 8 +; CHECK-NEXT: [+5]: %outer = load i64, ptr addrspace(5) %stack.1, align 8 +; CHECK-NEXT: [+1]: store i64 64, ptr addrspace(5) %stack.2, align 8 +; CHECK-NEXT: [+9]: %inner = load i64, ptr addrspace(5) %stack.2, align 8 ; CHECK-NEXT: => Final Score:30 define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 { entry: ; should get a score of 1 %stack = alloca [4 x i64], align 4, addrspace(5) - %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4 - %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8 + %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 8 + %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 16 - store i32 42, ptr addrspace(5) %stack + store i64 42, ptr addrspace(5) %stack br label %loop.outer loop.outer: - store i32 32, ptr addrspace(5) %stack - %outer.cmp = load i1, ptr addrspace(5) %stack.1 + store i64 32, ptr addrspace(5) %stack + %outer = load i64, ptr addrspace(5) %stack.1 br label %loop.inner loop.inner: - store i32 32, ptr addrspace(5) %stack.1 - %inner.cmp = load i1, ptr addrspace(5) %stack.2 + store i64 32, ptr addrspace(5) %stack.1 + %inner = load i64, ptr addrspace(5) %stack.2 + %inner.cmp = icmp sge i64 %inner, 0 br i1 %inner.cmp, label %loop.inner, label %loop.outer exit: - store i32 64, ptr addrspace(5) %stack.2 + store i64 64, ptr addrspace(5) %stack.2 ret void }
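As a sanity check of the scores this updated test expects: with the loop-user weight option left at its default of 4 (the `cl::init(4)` near the top of the pass), each counted user contributes `1 + 4 * loopDepth`, while users that merely re-derive the pointer are skipped. The arithmetic below reproduces the CHECK'd totals; it is plain C++, and the per-user loop depths are read off the test by hand rather than computed.

```cpp
#include <iostream>
#include <vector>

// Score of one alloca: every counted user adds 1 plus a bonus of
// LoopUserWeight per level of loop nesting around it.
unsigned scoreAlloca(const std::vector<unsigned> &UserLoopDepths,
                     unsigned LoopUserWeight = 4) {
  unsigned Score = 0;
  for (unsigned Depth : UserLoopDepths)
    Score += 1 + LoopUserWeight * Depth;
  return Score;
}

int main() {
  // First kernel: %simpleuser has one store at depth 0 -> 1;
  // %manyusers has four loads/stores at depth 0 -> 4.
  std::cout << scoreAlloca({0}) << "\n";          // 1
  std::cout << scoreAlloca({0, 0, 0, 0}) << "\n"; // 4
  // @loop_users_alloca: users at loop depths {1, 0, 2, 1, 0, 2}
  // -> 5 + 1 + 9 + 5 + 1 + 9 = 30.
  std::cout << scoreAlloca({1, 0, 2, 1, 0, 2}) << "\n"; // 30
  return 0;
}
```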