diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index bb95265a794a0..efd3664266dee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -122,6 +122,7 @@ class AMDGPUPromoteAllocaImpl {
   /// Check whether we have enough local memory for promotion.
   bool hasSufficientLocalMem(const Function &F);
 
+  FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
   bool tryPromoteAllocaToVector(AllocaInst &I);
   bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
 
@@ -460,13 +461,15 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
     return nullptr;
 
   Value *Offset = VarOffset.first;
-  auto *OffsetType = dyn_cast<IntegerType>(Offset->getType());
-  if (!OffsetType)
+  if (!isa<IntegerType>(Offset->getType()))
     return nullptr;
 
+  Offset = Builder.CreateSExtOrTrunc(Offset, Builder.getIntNTy(BW));
+  if (Offset != VarOffset.first)
+    NewInsts.push_back(cast<Instruction>(Offset));
+
   if (!OffsetQuot.isOne()) {
-    ConstantInt *ConstMul =
-        ConstantInt::get(Ctx, OffsetQuot.sext(OffsetType->getBitWidth()));
+    ConstantInt *ConstMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));
     Offset = Builder.CreateMul(Offset, ConstMul);
     if (Instruction *NewInst = dyn_cast<Instruction>(Offset))
       NewInsts.push_back(NewInst);
@@ -474,8 +477,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
   if (ConstOffset.isZero())
     return Offset;
 
-  ConstantInt *ConstIndex =
-      ConstantInt::get(Ctx, IndexQuot.sext(OffsetType->getBitWidth()));
+  ConstantInt *ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));
   Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex);
   if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
     NewInsts.push_back(NewInst);
@@ -501,27 +503,14 @@ static Value *promoteAllocaUserToVector(
     Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
     unsigned VecStoreSize, unsigned ElementSize,
     DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
-    std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal,
-    SmallVectorImpl<LoadInst *> &DeferredLoads) {
+    std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx,
+    function_ref<Value *()> GetCurVal) {
   // Note: we use InstSimplifyFolder because it can leverage the DataLayout
   // to do more folding, especially in the case of vector splats.
   IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
                                         InstSimplifyFolder(DL));
   Builder.SetInsertPoint(Inst);
 
-  const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
-    if (CurVal)
-      return CurVal;
-
-    // If the current value is not known, insert a dummy load and lower it on
-    // the second pass.
-    LoadInst *Dummy =
-        Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
-                           "promotealloca.dummyload");
-    DeferredLoads.push_back(Dummy);
-    return Dummy;
-  };
-
   const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
                                                    Type *PtrTy) -> Value * {
     assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
@@ -541,12 +530,7 @@ static Value *promoteAllocaUserToVector(
 
   switch (Inst->getOpcode()) {
   case Instruction::Load: {
-    // Loads can only be lowered if the value is known.
-    if (!CurVal) {
-      DeferredLoads.push_back(cast<LoadInst>(Inst));
-      return nullptr;
-    }
-
+    Value *CurVal = GetCurVal();
     Value *Index = calculateVectorIndex(
         cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
 
@@ -636,7 +620,7 @@ static Value *promoteAllocaUserToVector(
       Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
 
-      Value *CurVec = GetOrLoadCurrentVectorValue();
+      Value *CurVec = GetCurVal();
       for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
            K < NumElts; ++K) {
         Value *CurIdx =
@@ -649,8 +633,7 @@ static Value *promoteAllocaUserToVector(
     if (Val->getType() != VecEltTy)
       Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
 
-    return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
-                                       Index);
+    return Builder.CreateInsertElement(GetCurVal(), Val, Index);
   }
   case Instruction::Call: {
     if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
@@ -672,7 +655,7 @@ static Value *promoteAllocaUserToVector(
         }
       }
 
-      return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
+      return Builder.CreateShuffleVector(GetCurVal(), Mask);
     }
 
     if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
@@ -791,16 +774,13 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
   return I;
 }
 
-// FIXME: Should try to pick the most likely to be profitable allocas first.
-bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
-  LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
-
+FixedVectorType *
+AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
   if (DisablePromoteAllocaToVector) {
-    LLVM_DEBUG(dbgs() << "  Promote alloca to vector is disabled\n");
-    return false;
+    LLVM_DEBUG(dbgs() << "  Promote alloca to vectors is disabled\n");
+    return nullptr;
   }
 
-  Type *AllocaTy = Alloca.getAllocatedType();
   auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
   if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
     uint64_t NumElems = 1;
@@ -832,10 +812,9 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       }
     }
   }
-
   if (!VectorTy) {
     LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
-    return false;
+    return nullptr;
   }
 
   const unsigned MaxElements =
@@ -845,9 +824,29 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       VectorTy->getNumElements() < 2) {
     LLVM_DEBUG(dbgs() << "  " << *VectorTy
                       << " has an unsupported number of elements\n");
-    return false;
+    return nullptr;
   }
 
+  Type *VecEltTy = VectorTy->getElementType();
+  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
+  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
+    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
+                         "does not match the type's size\n");
+    return nullptr;
+  }
+
+  return VectorTy;
+}
+
+// FIXME: Should try to pick the most likely to be profitable allocas first.
+bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
+  LLVM_DEBUG(dbgs() << "Trying to promote to vectors: " << Alloca << '\n');
+
+  Type *AllocaTy = Alloca.getAllocatedType();
+  FixedVectorType *VectorTy = getVectorTypeForAlloca(AllocaTy);
+  if (!VectorTy)
+    return false;
+
   std::map<GetElementPtrInst *, WeakTrackingVH> GEPVectorIdx;
   SmallVector<Instruction *> WorkList;
   SmallVector<Instruction *> UsersToRemove;
@@ -869,13 +868,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   LLVM_DEBUG(dbgs() << "  Attempting promotion to: " << *VectorTy << "\n");
 
   Type *VecEltTy = VectorTy->getElementType();
-  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
-  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
-    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
-                         "does not match the type's size\n");
-    return false;
-  }
-  unsigned ElementSize = ElementSizeInBits / 8;
+  unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
   assert(ElementSize > 0);
   for (auto *U : Uses) {
     Instruction *Inst = cast<Instruction>(U->getUser());
@@ -1027,37 +1020,44 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
 
   Updater.AddAvailableValue(EntryBB, AllocaInitValue);
 
-  // First handle the initial worklist.
-  SmallVector<LoadInst *, 4> DeferredLoads;
+  // First handle the initial worklist, in basic block order.
+  //
+  // Insert a placeholder whenever we need the vector value at the top of a
+  // basic block.
+  SmallVector<Instruction *> Placeholders;
   forEachWorkListItem(WorkList, [&](Instruction *I) {
     BasicBlock *BB = I->getParent();
-    // On the first pass, we only take values that are trivially known, i.e.
-    // where AddAvailableValue was already called in this block.
-    Value *Result = promoteAllocaUserToVector(
-        I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
-        Updater.FindValueForBlock(BB), DeferredLoads);
+    auto GetCurVal = [&]() -> Value * {
+      if (Value *CurVal = Updater.FindValueForBlock(BB))
+        return CurVal;
+
+      // If the current value in the basic block is not yet known, insert a
+      // placeholder that we will replace later.
+      IRBuilder<> Builder(I);
+      auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
+          PoisonValue::get(VectorTy), "promotealloca.placeholder"));
+      Placeholders.push_back(Placeholder);
+      Updater.AddAvailableValue(BB, Placeholder);
+      return Placeholder;
+    };
+
+    Value *Result =
+        promoteAllocaUserToVector(I, *DL, VectorTy, VecStoreSize, ElementSize,
+                                  TransferInfo, GEPVectorIdx, GetCurVal);
     if (Result)
      Updater.AddAvailableValue(BB, Result);
   });
 
-  // Then handle deferred loads.
-  forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
-    SmallVector<LoadInst *> NewDLs;
-    BasicBlock *BB = I->getParent();
-    // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
-    // get a value, inserting PHIs as needed.
-    Value *Result = promoteAllocaUserToVector(
-        I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
-        Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs);
-    if (Result)
-      Updater.AddAvailableValue(BB, Result);
-    assert(NewDLs.empty() && "No more deferred loads should be queued!");
-  });
+  // Now fixup the placeholders.
+  for (Instruction *Placeholder : Placeholders) {
+    Placeholder->replaceAllUsesWith(
+        Updater.GetValueInMiddleOfBlock(Placeholder->getParent()));
+    Placeholder->eraseFromParent();
+  }
 
   // Delete all instructions. On the first pass, new dummy loads may have been
   // added so we need to collect them too.
   DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
-  InstsToDelete.insert_range(DeferredLoads);
   for (Instruction *I : InstsToDelete) {
     assert(I->use_empty());
     I->eraseFromParent();
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
index 63622e67e7d0b..7b64d8728cc24 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
@@ -262,14 +262,15 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset(ptr %out) {
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <6 x i64> [[TMP12]], i64 3, i32 3
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <6 x i64> [[TMP13]], i64 4, i32 4
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 5, i32 5
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[SEL3]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP7:%.*]] = trunc i64 [[SEL3]] to i32
+; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP7]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP16]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i64 0
-; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP16]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP17]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <3 x i64> [[TMP3]], i64 [[TMP5]], i64 1
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x i64> [[TMP15]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP16]], 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <6 x i64> [[TMP15]], i32 [[TMP18]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <3 x i64> [[TMP6]], i64 [[TMP8]], i64 2
 ; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP9]], i32 2
 ; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
@@ -311,15 +312,16 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(ptr %out)
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <6 x i64> [[TMP13]], i64 3, i32 3
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 4, i32 4
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <6 x i64> [[TMP15]], i64 5, i32 5
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[SEL3]], 3
-; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 6
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[SEL3]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP17]], 3
+; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP8]], 6
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i32 [[TMP18]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <3 x i64> poison, i64 [[TMP3]], i64 0
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP2]], 1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP18]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <6 x i64> [[TMP16]], i32 [[TMP19]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <3 x i64> [[TMP4]], i64 [[TMP6]], i64 1
-; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP2]], 2
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP18]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <6 x i64> [[TMP16]], i32 [[TMP20]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <3 x i64> [[TMP7]], i64 [[TMP9]], i64 2
 ; CHECK-NEXT:    [[ELEM:%.*]] = extractelement <3 x i64> [[TMP10]], i32 2
 ; CHECK-NEXT:    store i64 [[ELEM]], ptr [[OUT]], align 8
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
index a865bf5058d6a..7da441f2e79d2 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
@@ -11,8 +11,10 @@ define amdgpu_kernel void @negative_index_byte(ptr %out, i64 %offset) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 1, i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 2, i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 3, i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET:%.*]], -1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[OFFSET:%.*]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[OFFSET]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP8]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i32 [[TMP7]]
 ; CHECK-NEXT:    store i8 [[TMP6]], ptr [[OUT:%.*]], align 1
 ; CHECK-NEXT:    ret void
 ;
@@ -39,8 +41,10 @@ define amdgpu_kernel void @negative_index_word(ptr %out, i64 %offset) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 1, i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 2, i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 3, i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET:%.*]], -1
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[OFFSET:%.*]] to i32
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[OFFSET]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP8]], -1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 [[TMP7]]
 ; CHECK-NEXT:    store i32 [[TMP6]], ptr [[OUT:%.*]], align 4
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
index 9fb73963153a2..aaec725f85890 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=fiji -passes=sroa,amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s
 
 ; GCN-LABEL: {{^}}float4_alloca_store4:
 ; OPT-LABEL: define amdgpu_kernel void @float4_alloca_store4
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll
index 8e4cc2b0236c0..a7090960518af 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll
@@ -1,11 +1,11 @@
-; RUN: opt -S -mtriple=amdgcn-- -data-layout=A5 -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s
 
 ; Show that what the alloca promotion pass will do for non-atomic load/store.
 
 ; OPT-LABEL: @vector_alloca_not_atomic(
 ;
-; OPT: extractelement <3 x i32> <i32 0, i32 1, i32 2>, i64 %index
-define amdgpu_kernel void @vector_alloca_not_atomic(ptr addrspace(1) %out, i64 %index) {
+; OPT: extractelement <3 x i32> <i32 0, i32 1, i32 2>, i32 %index
+define amdgpu_kernel void @vector_alloca_not_atomic(ptr addrspace(1) %out, i32 %index) {
 entry:
   %alloca = alloca [3 x i32], addrspace(5)
   %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
@@ -13,7 +13,7 @@ entry:
   store i32 0, ptr addrspace(5) %alloca
   store i32 1, ptr addrspace(5) %a1
   store i32 2, ptr addrspace(5) %a2
-  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 %index
   %data = load i32, ptr addrspace(5) %tmp
   store i32 %data, ptr addrspace(1) %out
   ret void
@@ -26,7 +26,7 @@ entry:
 ; OPT: store i32 1, ptr addrspace(5)
 ; OPT: store i32 2, ptr addrspace(5)
 ; OPT: load atomic i32, ptr addrspace(5)
-define amdgpu_kernel void @vector_alloca_atomic_read(ptr addrspace(1) %out, i64 %index) {
+define amdgpu_kernel void @vector_alloca_atomic_read(ptr addrspace(1) %out, i32 %index) {
 entry:
   %alloca = alloca [3 x i32], addrspace(5)
   %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
@@ -34,7 +34,7 @@ entry:
   store i32 0, ptr addrspace(5) %alloca
   store i32 1, ptr addrspace(5) %a1
   store i32 2, ptr addrspace(5) %a2
-  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 %index
   %data = load atomic i32, ptr addrspace(5) %tmp acquire, align 4
   store i32 %data, ptr addrspace(1) %out
   ret void
@@ -47,7 +47,7 @@ entry:
 ; OPT: store atomic i32 1, ptr addrspace(5)
 ; OPT: store atomic i32 2, ptr addrspace(5)
 ; OPT: load i32, ptr addrspace(5)
-define amdgpu_kernel void @vector_alloca_atomic_write(ptr addrspace(1) %out, i64 %index) {
+define amdgpu_kernel void @vector_alloca_atomic_write(ptr addrspace(1) %out, i32 %index) {
 entry:
   %alloca = alloca [3 x i32], addrspace(5)
   %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
@@ -55,7 +55,7 @@ entry:
   store atomic i32 0, ptr addrspace(5) %alloca release, align 4
   store atomic i32 1, ptr addrspace(5) %a1 release, align 4
   store atomic i32 2, ptr addrspace(5) %a2 release, align 4
-  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 %index
   %data = load i32, ptr addrspace(5) %tmp
   store i32 %data, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
index 9c05f4d16cb4e..4a29f7e53e93a 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@@ -72,7 +72,8 @@ entry:
 ; OPT-NOT: alloca
 ; OPT: bb2:
 ; OPT: %promotealloca = phi <6 x float> [ zeroinitializer, %bb ], [ %0, %bb2 ]
-; OPT: %0 = insertelement <6 x float> %promotealloca, float %tmp71, i32 %tmp10
+; OPT: [[TMP:%tmp7.*]] = load float, ptr addrspace(1) %tmp5, align 4
+; OPT: %0 = insertelement <6 x float> %promotealloca, float [[TMP]], i32 %tmp10
 ; OPT: .preheader:
 ; OPT: %bc = bitcast <6 x float> %0 to <6 x i32>
 ; OPT: %1 = extractelement <6 x i32> %bc, i32 %tmp20
@@ -132,7 +133,8 @@ bb15:                                             ; preds = %.preheader
 ; OPT-NOT: alloca
 ; OPT: bb2:
 ; OPT: %promotealloca = phi <6 x double> [ zeroinitializer, %bb ], [ %0, %bb2 ]
-; OPT: %0 = insertelement <6 x double> %promotealloca, double %tmp71, i32 %tmp10
+; OPT: [[TMP:%tmp7.*]] = load double, ptr addrspace(1) %tmp5, align 8
+; OPT: %0 = insertelement <6 x double> %promotealloca, double [[TMP]], i32 %tmp10
 ; OPT: .preheader:
 ; OPT: %bc = bitcast <6 x double> %0 to <6 x i64>
 ; OPT: %1 = extractelement <6 x i64> %bc, i32 %tmp20
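
The core of the patch is the switch from deferred dummy loads to SSAUpdater-backed placeholders. As a stand-alone illustration (not part of the patch; the helper names below are invented for the sketch), the same pattern in isolation is: when a block's current vector value is not yet known, materialize a freeze(poison) placeholder, register it with the SSAUpdater, and resolve every placeholder afterwards with GetValueInMiddleOfBlock, which inserts PHIs where control flow merges.

// Illustrative sketch only -- helper names are hypothetical; the real logic
// lives in AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector above.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"

using namespace llvm;

// Return the vector value live at instruction I. If SSAUpdater has no value
// for I's block yet, insert freeze(poison) as a placeholder, remember it, and
// register it as the block's value so later users in the block reuse it.
static Value *getOrCreatePlaceholder(Instruction *I, FixedVectorType *VecTy,
                                     SSAUpdater &Updater,
                                     SmallVectorImpl<Instruction *> &Placeholders) {
  BasicBlock *BB = I->getParent();
  if (Value *Known = Updater.FindValueForBlock(BB))
    return Known;

  IRBuilder<> B(I);
  auto *Placeholder =
      cast<Instruction>(B.CreateFreeze(PoisonValue::get(VecTy)));
  Placeholders.push_back(Placeholder);
  Updater.AddAvailableValue(BB, Placeholder);
  return Placeholder;
}

// After every user has been rewritten, let SSAUpdater compute the real value
// for each placeholder's block (inserting PHIs as needed) and drop the
// placeholders.
static void resolvePlaceholders(SSAUpdater &Updater,
                                ArrayRef<Instruction *> Placeholders) {
  for (Instruction *P : Placeholders) {
    P->replaceAllUsesWith(Updater.GetValueInMiddleOfBlock(P->getParent()));
    P->eraseFromParent();
  }
}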