-
Notifications
You must be signed in to change notification settings - Fork 15.5k
[AMDGPU] Enable i8 GEP promotion for vector allocas #166132
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
478545f
b2d58fa
5721cd6
94f1dc6
0e8c3fe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -457,10 +457,25 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca, | |
| const auto &VarOffset = VarOffsets.front(); | ||
| APInt OffsetQuot; | ||
| APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem); | ||
| if (Rem != 0 || OffsetQuot.isZero()) | ||
| return nullptr; | ||
|
|
||
| Value *Offset = VarOffset.first; | ||
| if (Rem != 0) { | ||
| unsigned ElemSizeShift = Log2_64(VecElemSize); | ||
| SimplifyQuery SQ(DL); | ||
| SQ.CxtI = GEP; | ||
| KnownBits KB = computeKnownBits(VarOffset.first, SQ); | ||
| // Bail out if the index may point into the middle of an element. | ||
| if (KB.countMinTrailingZeros() < ElemSizeShift) | ||
| return nullptr; | ||
|
|
||
| Value *Scaled = Builder.CreateLShr(VarOffset.first, ElemSizeShift); | ||
| if (Instruction *NewInst = dyn_cast<Instruction>(Scaled)) | ||
| NewInsts.push_back(NewInst); | ||
|
|
||
| Offset = Scaled; | ||
| OffsetQuot = APInt(BW, 1); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry for looking into this so late. This is wrong. The case we want to optimize is when |
||
| Rem = 0; | ||
| } | ||
|
|
||
| if (!isa<IntegerType>(Offset->getType())) | ||
| return nullptr; | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -250,6 +250,150 @@ bb2: | |
| store i32 0, ptr addrspace(5) %extractelement | ||
| ret void | ||
| } | ||
|
|
||
| define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4(ptr %buffer, float %data, i1 %idx_sel) { | ||
| ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_0_or_4( | ||
| ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) { | ||
| ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison | ||
| ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 0, i32 4 | ||
| ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2 | ||
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]] | ||
| ; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: ret void | ||
| ; | ||
| %alloca = alloca <3 x float>, align 16, addrspace(5) | ||
| %vec = load <3 x float>, ptr %buffer | ||
| store <3 x float> %vec, ptr addrspace(5) %alloca | ||
| %index = select i1 %idx_sel, i32 0, i32 4 | ||
| %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index | ||
| store float %data, ptr addrspace(5) %elt, align 4 | ||
| %updated = load <3 x float>, ptr addrspace(5) %alloca, align 16 | ||
| store <3 x float> %updated, ptr %buffer, align 16 | ||
| ret void | ||
| } | ||
|
|
||
| define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8(ptr %buffer, float %data, i1 %idx_sel) { | ||
| ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_8( | ||
| ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) { | ||
| ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <3 x float> poison | ||
| ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8 | ||
| ; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[INDEX]], 2 | ||
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[VEC]], float [[DATA]], i32 [[TMP1]] | ||
| ; CHECK-NEXT: store <3 x float> [[TMP2]], ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: ret void | ||
| ; | ||
| %alloca = alloca <3 x float>, align 16, addrspace(5) | ||
| %vec = load <3 x float>, ptr %buffer | ||
| store <3 x float> %vec, ptr addrspace(5) %alloca | ||
| %index = select i1 %idx_sel, i32 4, i32 8 | ||
| %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be better if you changed this to |
||
| store float %data, ptr addrspace(5) %elt, align 4 | ||
| %updated = load <3 x float>, ptr addrspace(5) %alloca, align 16 | ||
| store <3 x float> %updated, ptr %buffer, align 16 | ||
| ret void | ||
| } | ||
|
|
||
| define amdgpu_kernel void @scalar_alloca_nested_vector_gep_i8_4_or_8(ptr %buffer, float %data, i1 %idx_sel) { | ||
| ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_nested_vector_gep_i8_4_or_8( | ||
| ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) { | ||
| ; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <8 x float> poison | ||
| ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x float> [[VEC]], i64 0 | ||
| ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x float> [[ALLOCA]], float [[TMP1]], i32 0 | ||
| ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x float> [[VEC]], i64 1 | ||
| ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP2]], float [[TMP3]], i32 1 | ||
| ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <3 x float> [[VEC]], i64 2 | ||
| ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP4]], float [[TMP5]], i32 2 | ||
| ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 8 | ||
| ; CHECK-NEXT: [[TMP7:%.*]] = lshr i32 [[INDEX]], 2 | ||
| ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP6]], float [[DATA]], i32 [[TMP7]] | ||
| ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x float> [[TMP8]], i32 0 | ||
| ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <3 x float> poison, float [[TMP9]], i64 0 | ||
| ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x float> [[TMP8]], i32 1 | ||
| ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <3 x float> [[TMP10]], float [[TMP11]], i64 1 | ||
| ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x float> [[TMP8]], i32 2 | ||
| ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <3 x float> [[TMP12]], float [[TMP13]], i64 2 | ||
| ; CHECK-NEXT: store <3 x float> [[TMP14]], ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: ret void | ||
| ; | ||
| %alloca = alloca [2 x <3 x float>], align 16, addrspace(5) | ||
| %row = getelementptr inbounds [2 x <3 x float>], ptr addrspace(5) %alloca, i32 0, i32 0 | ||
| %vec = load <3 x float>, ptr %buffer | ||
| store <3 x float> %vec, ptr addrspace(5) %row, align 16 | ||
| %index = select i1 %idx_sel, i32 4, i32 8 | ||
| %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %row, i32 %index | ||
| store float %data, ptr addrspace(5) %elt, align 4 | ||
| %updated = load <3 x float>, ptr addrspace(5) %row, align 16 | ||
| store <3 x float> %updated, ptr %buffer, align 16 | ||
| ret void | ||
| } | ||
|
|
||
| define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote(ptr %buffer, float %data, i1 %idx_sel) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe switch to another calling convention so that the |
||
| ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_vector_gep_i8_4_or_5_no_promote( | ||
| ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) { | ||
| ; CHECK-NEXT: [[TMP1:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() | ||
| ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 1 | ||
| ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP2]], align 4, !invariant.load [[META0]] | ||
| ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP1]], i64 2 | ||
| ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[TMP4]], align 4, !range [[RNG1]], !invariant.load [[META0]] | ||
| ; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16 | ||
| ; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() | ||
| ; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y() | ||
| ; CHECK-NEXT: [[TMP9:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z() | ||
| ; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]] | ||
| ; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]] | ||
| ; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]] | ||
| ; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]] | ||
| ; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]] | ||
| ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x <3 x float>], ptr addrspace(3) @scalar_alloca_vector_gep_i8_4_or_5_no_promote.alloca, i32 0, i32 [[TMP14]] | ||
| ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: store <3 x float> [[VEC]], ptr addrspace(3) [[TMP15]], align 16 | ||
| ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 5 | ||
| ; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(3) [[TMP15]], i32 [[INDEX]] | ||
| ; CHECK-NEXT: store float [[DATA]], ptr addrspace(3) [[ELT]], align 4 | ||
| ; CHECK-NEXT: [[UPDATED:%.*]] = load <3 x float>, ptr addrspace(3) [[TMP15]], align 16 | ||
| ; CHECK-NEXT: store <3 x float> [[UPDATED]], ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: ret void | ||
| ; | ||
| %alloca = alloca <3 x float>, align 16, addrspace(5) | ||
| %vec = load <3 x float>, ptr %buffer | ||
| store <3 x float> %vec, ptr addrspace(5) %alloca | ||
| %index = select i1 %idx_sel, i32 4, i32 5 | ||
| %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %alloca, i32 %index | ||
| store float %data, ptr addrspace(5) %elt, align 4 | ||
| %updated = load <3 x float>, ptr addrspace(5) %alloca, align 16 | ||
| store <3 x float> %updated, ptr %buffer, align 16 | ||
| ret void | ||
| } | ||
|
|
||
| define amdgpu_kernel void @scalar_alloca_nested_vector_gep_i8_4_or_5_no_promote(ptr %buffer, float %data, i1 %idx_sel) { | ||
| ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_nested_vector_gep_i8_4_or_5_no_promote( | ||
| ; CHECK-SAME: ptr [[BUFFER:%.*]], float [[DATA:%.*]], i1 [[IDX_SEL:%.*]]) { | ||
| ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [2 x <3 x float>], align 16, addrspace(5) | ||
| ; CHECK-NEXT: [[ROW:%.*]] = getelementptr inbounds [2 x <3 x float>], ptr addrspace(5) [[ALLOCA]], i32 0, i32 0 | ||
| ; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: store <3 x float> [[VEC]], ptr addrspace(5) [[ROW]], align 16 | ||
| ; CHECK-NEXT: [[INDEX:%.*]] = select i1 [[IDX_SEL]], i32 4, i32 5 | ||
| ; CHECK-NEXT: [[ELT:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(5) [[ROW]], i32 [[INDEX]] | ||
| ; CHECK-NEXT: store float [[DATA]], ptr addrspace(5) [[ELT]], align 4 | ||
| ; CHECK-NEXT: [[TMP14:%.*]] = load <3 x float>, ptr addrspace(5) [[ROW]], align 16 | ||
| ; CHECK-NEXT: store <3 x float> [[TMP14]], ptr [[BUFFER]], align 16 | ||
| ; CHECK-NEXT: ret void | ||
| ; | ||
| %alloca = alloca [2 x <3 x float>], align 16, addrspace(5) | ||
| %row = getelementptr inbounds [2 x <3 x float>], ptr addrspace(5) %alloca, i32 0, i32 0 | ||
| %vec = load <3 x float>, ptr %buffer | ||
| store <3 x float> %vec, ptr addrspace(5) %row, align 16 | ||
| %index = select i1 %idx_sel, i32 4, i32 5 | ||
| %elt = getelementptr inbounds nuw i8, ptr addrspace(5) %row, i32 %index | ||
| store float %data, ptr addrspace(5) %elt, align 4 | ||
| %updated = load <3 x float>, ptr addrspace(5) %row, align 16 | ||
| store <3 x float> %updated, ptr %buffer, align 16 | ||
| ret void | ||
| } | ||
|
|
||
| ;. | ||
| ; CHECK: [[META0]] = !{} | ||
| ; CHECK: [[RNG1]] = !{i32 0, i32 1025} | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to validate that VecElemSize is a power of 2?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, but I don't think it is necessary to explicitly check whether the element size is a power of two, because that case is already covered by the existing check here:
llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
Lines 871 to 877 in 52fdcf9
If the element type is not naturally aligned, it will return false, which also rejects non-power-of-2 element sizes such as i24.