[AMDGPU] Generate canonical additions in AMDGPUPromoteAlloca #157810
Conversation
This stack of pull requests is managed by Graphite.
@llvm/pr-subscribers-backend-amdgpu

Author: Fabian Ritter (ritter-x2a)

Changes

When we know that one operand of an addition is a constant, we might as well put it on the right-hand side and avoid the work to canonicalize it in a later pass.

Full diff: https://github.com/llvm/llvm-project/pull/157810.diff

4 Files Affected:
- llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp (+1/-1)
- llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll (+4/-4)
- llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll (+2/-2)
- llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep-of-gep.ll (+3/-3)
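For illustration (a minimal sketch of my own, not taken from the PR or its tests): InstCombine canonicalizes commutative operations so that a constant operand ends up on the right-hand side. Emitting the add in that form directly leaves nothing for the later pass to commute. The two hypothetical functions below show the non-canonical form the pass used to emit and the canonical form it emits after this change:

define i64 @noncanonical(i64 %offset) {
  ; constant on the LHS; InstCombine would later commute this
  %idx = add i64 6, %offset
  ret i64 %idx
}

define i64 @canonical(i64 %offset) {
  ; constant on the RHS; already canonical, no later rewrite needed
  %idx = add i64 %offset, 6
  ret i64 %idx
}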
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index bb77cdff778c0..7dbe1235a98b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -478,7 +478,7 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
ConstantInt *ConstIndex =
ConstantInt::get(OffsetType, IndexQuot.getSExtValue());
- Value *IndexAdd = Builder.CreateAdd(ConstIndex, Offset);
+ Value *IndexAdd = Builder.CreateAdd(Offset, ConstIndex);
if (Instruction *NewInst = dyn_cast<Instruction>(IndexAdd))
NewInsts.push_back(NewInst);
return IndexAdd;
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
index d72f158763c61..63622e67e7d0b 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-multidim.ll
@@ -312,7 +312,7 @@ define amdgpu_kernel void @i64_2d_load_store_subvec_3_i64_offset_index(ptr %out)
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <6 x i64> [[TMP14]], i64 4, i32 4
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <6 x i64> [[TMP15]], i64 5, i32 5
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[SEL3]], 3
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 6, [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 6
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <6 x i64> [[TMP16]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <3 x i64> poison, i64 [[TMP3]], i64 0
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP2]], 1
@@ -464,7 +464,7 @@ define amdgpu_kernel void @i16_2d_load_store(ptr %out, i32 %sel) {
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <6 x i16> [[TMP3]], i16 3, i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <6 x i16> [[TMP4]], i16 4, i32 4
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <6 x i16> [[TMP5]], i16 5, i32 5
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 3, [[SEL]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SEL]], 3
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x i16> [[TMP6]], i32 [[TMP1]]
; CHECK-NEXT: store i16 [[TMP2]], ptr [[OUT]], align 2
; CHECK-NEXT: ret void
@@ -498,7 +498,7 @@ define amdgpu_kernel void @float_2d_load_store(ptr %out, i32 %sel) {
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <6 x float> [[TMP3]], float 3.000000e+00, i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <6 x float> [[TMP4]], float 4.000000e+00, i32 4
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <6 x float> [[TMP5]], float 5.000000e+00, i32 5
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 3, [[SEL]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SEL]], 3
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x float> [[TMP6]], i32 [[TMP1]]
; CHECK-NEXT: store float [[TMP2]], ptr [[OUT]], align 4
; CHECK-NEXT: ret void
@@ -538,7 +538,7 @@ define amdgpu_kernel void @ptr_2d_load_store(ptr %out, i32 %sel) {
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <6 x ptr> [[TMP3]], ptr [[PTR_3]], i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <6 x ptr> [[TMP4]], ptr [[PTR_4]], i32 4
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <6 x ptr> [[TMP5]], ptr [[PTR_5]], i32 5
-; CHECK-NEXT: [[TMP7:%.*]] = add i32 3, [[SEL]]
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[SEL]], 3
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <6 x ptr> [[TMP6]], i32 [[TMP7]]
; CHECK-NEXT: store ptr [[TMP8]], ptr [[OUT]], align 8
; CHECK-NEXT: ret void
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
index 1b6ac0bd93c19..a865bf5058d6a 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-negative-index.ll
@@ -11,7 +11,7 @@ define amdgpu_kernel void @negative_index_byte(ptr %out, i64 %offset) {
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 1, i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 2, i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 3, i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 -1, [[OFFSET:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET:%.*]], -1
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i8> [[TMP4]], i64 [[TMP5]]
; CHECK-NEXT: store i8 [[TMP6]], ptr [[OUT:%.*]], align 1
; CHECK-NEXT: ret void
@@ -39,7 +39,7 @@ define amdgpu_kernel void @negative_index_word(ptr %out, i64 %offset) {
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 1, i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 2, i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 3, i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 -1, [[OFFSET:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET:%.*]], -1
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i64 [[TMP5]]
; CHECK-NEXT: store i32 [[TMP6]], ptr [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep-of-gep.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep-of-gep.ll
index a24f041a17857..f95a6a8ec9b45 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep-of-gep.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep-of-gep.ll
@@ -10,7 +10,7 @@ define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep(i32 %idx, ptr ad
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <20 x i32> [[TMP1]], i32 2, i32 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = mul i32 [[IDX]], 2
-; CHECK-NEXT: [[TMP5:%.*]] = add i32 1, [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], 1
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <20 x i32> [[TMP3]], i32 [[TMP5]]
; CHECK-NEXT: store i32 [[TMP6]], ptr addrspace(1) [[OUTPUT]], align 4
; CHECK-NEXT: ret void
@@ -31,12 +31,12 @@ define amdgpu_ps void @scalar_alloca_ptr_with_vector_gep_of_gep3(i32 %idx, ptr a
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <16 x i32> poison
; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[IDX]], 2
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 8, [[TMP0]]
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 8
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[ALLOCA]], i32 10, i32 [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], 1
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP2]], i32 20, i32 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[IDX]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 9, [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 9
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i32> [[TMP4]], i32 [[TMP6]]
; CHECK-NEXT: store i32 [[TMP7]], ptr addrspace(1) [[OUTPUT]], align 4
; CHECK-NEXT: ret void
Force-pushed from f6a8f01 to 43fc13b.
Manual rebase, since Graphite seems to have messed the automatic rebase up.
When we know that one operand of an addition is a constant, we might as
well put it on the right-hand side and avoid the work to canonicalize it
in a later pass.