[AMDGPU] Fix image intrinsic optimizer on loads from different resources #69355
Conversation
The image intrinsic optimizer pass was neglecting to check any arguments of the load intrinsic after the VAddr arguments. For example, multiple loads from different resources should not have been combined, but they were, because the pass was not checking the resource argument.
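As a rough illustration of the intended merge rule (a standalone toy sketch, not the pass's code; MsaaLoad and mergeable are made-up names mirroring the operand list of llvm.amdgcn.image.load.2dmsaa, i.e. dmask, s, t, fragid, rsrc, texfailctrl, cachepolicy), two loads may only be combined when every operand matches except FragId, which merely has to land in the same group of four samples:

#include <array>
#include <cstdint>

struct MsaaLoad {
  uint32_t DMask;
  std::array<uint32_t, 2> VAddr; // coordinates (s, t)
  uint32_t FragId;
  const void *RSrc;              // stand-in for the <8 x i32> resource descriptor
  uint32_t TexFailCtrl;
  uint32_t CachePolicy;
};

// Two loads may be merged only if everything matches, except FragId,
// which only needs to fall in the same group of four samples.
bool mergeable(const MsaaLoad &A, const MsaaLoad &B) {
  return A.DMask == B.DMask && A.VAddr == B.VAddr &&
         A.FragId / 4 == B.FragId / 4 &&  // same FragId group
         A.RSrc == B.RSrc &&              // the check the pass was missing
         A.TexFailCtrl == B.TexFailCtrl && A.CachePolicy == B.CachePolicy;
}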
@llvm/pr-subscribers-backend-amdgpu

Author: Jay Foad (jayfoad)

Changes

The image intrinsic optimizer pass was neglecting to check any arguments of the load intrinsic after the VAddr arguments.

Full diff: https://github.com/llvm/llvm-project/pull/69355.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
index acfd3407681a7fd..d1c0a1703502517 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
@@ -108,28 +108,29 @@ void addInstToMergeableList(
if (IIList.front()->getType() != II->getType())
continue;
- // Check DMask.
- Value *DMaskList = IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex);
- Value *DMask = II->getArgOperand(ImageDimIntr->DMaskIndex);
- if (DMaskList != DMask)
- continue;
-
- // Check VAddr (except FragId).
- int I = ImageDimIntr->VAddrStart;
- for (; I < ImageDimIntr->VAddrEnd - 1; ++I) {
- if (IIList.front()->getArgOperand(I) != II->getArgOperand(I))
- break;
+ // Check all arguments (DMask, VAddr, RSrc etc).
+ bool AllEqual = true;
+ assert(IIList.front()->arg_size() == II->arg_size());
+ for (int I = 1, E = II->arg_size(); I != E; ++I) {
+ Value *ArgList = IIList.front()->getArgOperand(I);
+ Value *Arg = II->getArgOperand(I);
+ if (I == ImageDimIntr->VAddrEnd - 1) {
+ // Check FragId group.
+ auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
+ auto FragId = cast<ConstantInt>(II->getArgOperand(I));
+ if (FragIdList->getValue().udiv(4) != FragId->getValue().udiv(4)) {
+ AllEqual = false;
+ break;
+ }
+ } else {
+ // Check all arguments except FragId.
+ if (ArgList != Arg) {
+ AllEqual = false;
+ break;
+ }
+ }
}
-
- if (I != ImageDimIntr->VAddrEnd - 1)
- continue;
-
- // Check FragId group.
- const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
- Value *FragIdList = IIList.front()->getArgOperand(FragIdIndex);
- auto IIListFragId = cast<ConstantInt>(FragIdList);
- auto IIFragId = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
- if (IIListFragId->getValue().udiv(4) != IIFragId->getValue().udiv(4))
+ if (!AllEqual)
continue;
// Add to the list.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll
index 853ca53767be8cc..5ffdbb0f8c5b073 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.2dmsaa.ll
@@ -1184,6 +1184,47 @@ merge:
ret [4 x float] %i25
}
+define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1_different_rsrc(<8 x i32> inreg %rsrc1, <8 x i32> inreg %rsrc2, i32 %s, i32 %t) {
+; NO-MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1_different_rsrc(
+; NO-MSAA-SAME: <8 x i32> inreg [[RSRC1:%.*]], <8 x i32> inreg [[RSRC2:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; NO-MSAA-NEXT: main_body:
+; NO-MSAA-NEXT: [[I:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC1]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I1:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC1]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I2:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC2]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I3:%.*]] = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 1, <8 x i32> [[RSRC2]], i32 0, i32 0)
+; NO-MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; NO-MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; NO-MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; NO-MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; NO-MSAA-NEXT: ret [4 x float] [[I7]]
+;
+; MSAA-LABEL: define amdgpu_ps [4 x float] @load_2dmsaa_v4f32_dmask1_different_rsrc(
+; MSAA-SAME: <8 x i32> inreg [[RSRC1:%.*]], <8 x i32> inreg [[RSRC2:%.*]], i32 [[S:%.*]], i32 [[T:%.*]]) #[[ATTR0]] {
+; MSAA-NEXT: main_body:
+; MSAA-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC1]], i32 0, i32 0)
+; MSAA-NEXT: [[I:%.*]] = extractelement <4 x float> [[TMP0]], i64 0
+; MSAA-NEXT: [[I1:%.*]] = extractelement <4 x float> [[TMP0]], i64 1
+; MSAA-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 [[S]], i32 [[T]], i32 0, <8 x i32> [[RSRC2]], i32 0, i32 0)
+; MSAA-NEXT: [[I2:%.*]] = extractelement <4 x float> [[TMP1]], i64 0
+; MSAA-NEXT: [[I3:%.*]] = extractelement <4 x float> [[TMP1]], i64 1
+; MSAA-NEXT: [[I4:%.*]] = insertvalue [4 x float] undef, float [[I]], 0
+; MSAA-NEXT: [[I5:%.*]] = insertvalue [4 x float] [[I4]], float [[I1]], 1
+; MSAA-NEXT: [[I6:%.*]] = insertvalue [4 x float] [[I5]], float [[I2]], 2
+; MSAA-NEXT: [[I7:%.*]] = insertvalue [4 x float] [[I6]], float [[I3]], 3
+; MSAA-NEXT: ret [4 x float] [[I7]]
+;
+main_body:
+ %i = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc1, i32 0, i32 0)
+ %i1 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc1, i32 0, i32 0)
+ %i2 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc2, i32 0, i32 0)
+ %i3 = call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc2, i32 0, i32 0)
+ %i4 = insertvalue [4 x float] undef, float %i, 0
+ %i5 = insertvalue [4 x float] %i4, float %i1, 1
+ %i6 = insertvalue [4 x float] %i5, float %i2, 2
+ %i7 = insertvalue [4 x float] %i6, float %i3, 3
+ ret [4 x float] %i7
+}
+
declare float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
declare <2 x float> @llvm.amdgcn.image.load.2dmsaa.v2f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
declare <3 x float> @llvm.amdgcn.image.load.2dmsaa.v3f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #0
// Check all arguments (DMask, VAddr, RSrc etc).
bool AllEqual = true;
assert(IIList.front()->arg_size() == II->arg_size());
for (int I = 1, E = II->arg_size(); I != E; ++I) { |
This is just stylistic, so feel free to ignore me, but if AllEqual were part of the for-loop condition, you could simply assign it the comparison result inside the loop and drop the if (...) { AllEqual = false; break; } patterns.
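For illustration, a rough sketch of that suggestion (not the committed code, just the same loop from the patch with the flag folded into the loop condition):

  // Check all arguments (DMask, VAddr, RSrc etc).
  bool AllEqual = true;
  assert(IIList.front()->arg_size() == II->arg_size());
  for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
    Value *ArgList = IIList.front()->getArgOperand(I);
    Value *Arg = II->getArgOperand(I);
    if (I == ImageDimIntr->VAddrEnd - 1) {
      // FragId only needs to be in the same group of four.
      auto *FragIdList = cast<ConstantInt>(ArgList);
      auto *FragId = cast<ConstantInt>(Arg);
      AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
    } else {
      // Every other argument must match exactly.
      AllEqual = ArgList == Arg;
    }
  }
  if (!AllEqual)
    continue;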
LGTM
I'm attempting to test the fix now.
Confirmed this fixes the issue we were seeing.