AMDGPU: Do not infer implicit inputs for !nocallback intrinsics by arsenm · Pull Request #131759 · llvm/llvm-project

arsenm · 2025-03-18T08:46:46Z

This isn't really the right check; we want to know that the intrinsic
does not perform a true function call to any code (in the module or not). nocallback
appears to be the closest thing to this property we have now, though. Fixes theoretical
miscompiles with intrinsics like statepoint, which hide a call to a real function.

Also do the same for inferring no-agpr usage.

arsenm · 2025-03-18T08:47:00Z

This stack of pull requests is managed by Graphite. Learn more about stacking.

llvmbot · 2025-03-18T08:48:06Z

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

This isn't really the right check; we want to know that the intrinsic
does not perform a true function call to any code (in the module or not). nocallback
appears to be the closest thing to this property we have now, though.

Also do the same for inferring no-agpr usage.

Full diff: https://github.com/llvm/llvm-project/pull/131759.diff

3 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp (+24-5)
(added) llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll (+31)
(added) llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll (+75)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 0cee3c3cb5e92..9d410ae5a55e6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -48,9 +48,10 @@ enum ImplicitArgumentPositions {
 #define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
 
 enum ImplicitArgumentMask {
-  NOT_IMPLICIT_INPUT = 0,
+  UNKNOWN_INTRINSIC = 0,
 #include "AMDGPUAttributes.def"
-  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
+  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1,
+  NOT_IMPLICIT_INPUT
 };
 
 #define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
@@ -118,7 +119,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
     NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
     return QUEUE_PTR;
   default:
-    return NOT_IMPLICIT_INPUT;
+    return UNKNOWN_INTRINSIC;
   }
 }
 
@@ -522,6 +523,21 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       ImplicitArgumentMask AttrMask =
           intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                               HasApertureRegs, SupportsGetDoorbellID, COV);
+
+      if (AttrMask == UNKNOWN_INTRINSIC) {
+        // Assume not-nocallback intrinsics may invoke a function which accesses
+        // implicit arguments.
+        //
+        // FIXME: This isn't really the correct check. We want to ensure it
+        // isn't calling any function that may use implicit arguments regardless
+        // of whether it's internal to the module or not.
+        //
+        // TODO: Ignoring callsite attributes.
+        if (!Callee->hasFnAttribute(Attribute::NoCallback))
+          return indicatePessimisticFixpoint();
+        continue;
+      }
+
       if (AttrMask != NOT_IMPLICIT_INPUT) {
         if ((IsNonEntryFunc || !NonKernelOnly))
           removeAssumedBits(AttrMask);
@@ -1282,8 +1298,11 @@ struct AAAMDGPUNoAGPR
 
       // Some intrinsics may use AGPRs, but if we have a choice, we are not
       // required to use AGPRs.
-      if (Callee->isIntrinsic())
-        return true;
+      if (Callee->isIntrinsic()) {
+        // Assume !nocallback intrinsics may call a function which requires
+        // AGPRs.
+        return CB.hasFnAttr(Attribute::NoCallback);
+      }
 
       // TODO: Handle callsite attributes
       const auto *CalleeInfo = A.getAAFor<AAAMDGPUNoAGPR>(
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll
new file mode 100644
index 0000000000000..892bfa12140d4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s
+
+; Make sure we do not infer anything about implicit inputs through an
+; intrinsic call which is not nocallback.
+
+declare zeroext i32 @return_i32()
+
+define i32 @test_i32_return() gc "statepoint-example" {
+; CHECK-LABEL: define i32 @test_i32_return(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] gc "statepoint-example" {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0)
+; CHECK-NEXT:    [[CALL1:%.*]] = call zeroext i32 @llvm.experimental.gc.result.i32(token [[SAFEPOINT_TOKEN]])
+; CHECK-NEXT:    ret i32 [[CALL1]]
+;
+entry:
+  %safepoint_token = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0)
+  %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(token %safepoint_token)
+  ret i32 %call1
+}
+
+declare token @llvm.experimental.gc.statepoint.p0(i64 immarg, i32 immarg, ptr, i32 immarg, i32 immarg, ...)
+declare i32 @llvm.experimental.gc.result.i32(token) #0
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
+;.
+; CHECK: attributes #[[ATTR0]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll
new file mode 100644
index 0000000000000..b607c6cd8e720
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-attributor -mcpu=gfx90a %s | FileCheck %s
+
+; Make sure we infer no inputs are used through some intrinsics
+
+define void @use_fake_use(i32 %arg) {
+; CHECK-LABEL: define void @use_fake_use(
+; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    call void (...) @llvm.fake.use(i32 [[ARG]])
+; CHECK-NEXT:    ret void
+;
+  call void (...) @llvm.fake.use(i32 %arg)
+  ret void
+}
+
+define void @use_donothing() {
+; CHECK-LABEL: define void @use_donothing(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    call void @llvm.donothing()
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.donothing()
+  ret void
+}
+
+define void @use_assume(i1 %arg) {
+; CHECK-LABEL: define void @use_assume(
+; CHECK-SAME: i1 [[ARG:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    call void @llvm.assume(i1 [[ARG]])
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.assume(i1 %arg)
+  ret void
+}
+
+define void @use_trap() {
+; CHECK-LABEL: define void @use_trap(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT:    call void @llvm.trap()
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.trap()
+  ret void
+}
+
+define void @use_debugtrap() {
+; CHECK-LABEL: define void @use_debugtrap(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT:    call void @llvm.debugtrap()
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.debugtrap()
+  ret void
+}
+
+define void @use_ubsantrap() {
+; CHECK-LABEL: define void @use_ubsantrap(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT:    call void @llvm.ubsantrap(i8 0)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.ubsantrap(i8 0)
+  ret void
+}
+
+;.
+; CHECK: attributes #[[ATTR0]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR7:[0-9]+]] = { cold noreturn nounwind "target-cpu"="gfx90a" }
+;.

shiltian · 2025-03-19T13:13:51Z

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

+        // Assume not-nocallback intrinsics may invoke a function which accesses
+        // implicit arguments.
+        //
+        // FIXME: This isn't really the correct check. We want to ensure it


This needs an iterative approach, similar to other AAs, and then propagate the "attribute".

This isn't really the right check; we want to know that the intrinsic does not perform a true function call to any code (in the module or not). nocallback appears to be the closest thing to this property we have now, though. Also do the same for inferring no-agpr usage.

…cs" (#174224) Reverts #131759 after seeing regressions in the PyTorch unit tests: 8 test cases failed in the "test_ops" test suite.

…ck intrinsics" (#174224) Reverts llvm/llvm-project#131759 seeing regressions in : Pytorch UT- 8 test cases failed in "test_ops" test suite

…cs" (llvm#174224) Reverts llvm#131759 seeing regressions in : Pytorch UT- 8 test cases failed in "test_ops" test suite

#1016) …cs" (llvm#174224) Reverts llvm#131759 seeing regressions in : Pytorch UT- 8 test cases failed in "test_ops" test suite

…cs" (#174224) Reverts llvm/llvm-project#131759 seeing regressions in : Pytorch UT- 8 test cases failed in "test_ops" test suite

…cs" (#174224) Reverts llvm/llvm-project#131759 seeing regressions in : Pytorch UT- 8 test cases failed in "test_ops" test suite (cherry picked from commit dff081c)

…ics" (#176081) This reverts #174224 and re-applies #131759. Note: If #117544 is reverted, this should also be reverted.

…ics" (llvm#176081) This reverts llvm#174224 and re-applies llvm#131759 . Note: If llvm#117544 is reverted, this should also be reverted.

#1016) …cs" (llvm#174224) Reverts llvm#131759 seeing regressions in : Pytorch UT- 8 test cases failed in "test_ops" test suite

…allback (#175230) This adds support for whitelisting trap intrinsics in the handling of !nocallback intrinsics. This fixes the cause of the previous revert of #131759. The attributor was exiting early whenever it saw an intrinsic without the nocallback attribute, so trap-only kernels lost all the inferred “no implicit arg” attributes and their amdgpu-agpr-alloc=0 guarantees. That conservative fallback broke certain workloads by forcing unnecessary implicit arguments and AGPR reservations. This patch allows the pass to recognize leaf-like trap intrinsics, so they no longer poison the analysis. --------- Co-authored-by: Matt Arsenault <arsenm2@gmail.com>

…allback (llvm#175230) This adds support for whitelisting trap intrinsics in the handling of !nocallback intrinsics. This fixes the cause of the previous revert of llvm#131759. The attributor was exiting early whenever it saw an intrinsic without the nocallback attribute, so trap-only kernels lost all the inferred “no implicit arg” attributes and their amdgpu-agpr-alloc=0 guarantees. That conservative fallback broke certain workloads by forcing unnecessary implicit arguments and AGPR reservations. This patch allows the pass to recognize leaf-like trap intrinsics, so they no longer poison the analysis. --------- Co-authored-by: Matt Arsenault <arsenm2@gmail.com>

arsenm mentioned this pull request Mar 18, 2025

AMDGPU: Fix broken check prefix and degraded cov4 test coverage #131757

Merged

arsenm mentioned this pull request Mar 18, 2025

AMDGPU: Fix attributor not handling all trap intrinsics #131758

Merged

arsenm added the backend:AMDGPU label Mar 18, 2025 — with Graphite App

arsenm requested review from changpeng, jdoerfert, saiislam and shiltian March 18, 2025 08:47

arsenm changed the title ~~AMDGPU: Add baseline test for attributor with calling intrinsic~~ AMDGPU: Do not infer implicit inputs for !nocallback intrinsics Mar 18, 2025

arsenm requested a review from JonChesterfield March 18, 2025 08:48

arsenm marked this pull request as ready for review March 18, 2025 08:48

arsenm force-pushed the users/arsenm/amdgpu/attributor-fix-skipping-debugtrap-ubsantrap branch from a33a619 to 75e6bf7 Compare March 19, 2025 01:31

arsenm force-pushed the users/arsenm/amdgpu/attributor-no-infer-inputs-intrinsic-missing-nocallback branch from 4c171fb to 4e4d1a1 Compare March 19, 2025 01:32

Base automatically changed from users/arsenm/amdgpu/attributor-fix-skipping-debugtrap-ubsantrap to main March 19, 2025 03:17

arsenm force-pushed the users/arsenm/amdgpu/attributor-no-infer-inputs-intrinsic-missing-nocallback branch from 4e4d1a1 to 6aef6b0 Compare March 19, 2025 03:18

shiltian reviewed Mar 19, 2025

View reviewed changes

shiltian approved these changes Mar 19, 2025

View reviewed changes

arsenm added 2 commits November 4, 2025 20:10

AMDGPU: Add baseline test for attributor with calling intrinsic

1646797

arsenm force-pushed the users/arsenm/amdgpu/attributor-no-infer-inputs-intrinsic-missing-nocallback branch from 6aef6b0 to 85f5383 Compare November 5, 2025 04:12

arsenm enabled auto-merge (squash) November 5, 2025 04:13

arsenm merged commit 849038c into main Nov 5, 2025
7 of 8 checks passed

arsenm deleted the users/arsenm/amdgpu/attributor-no-infer-inputs-intrinsic-missing-nocallback branch November 5, 2025 04:53

ronlieb mentioned this pull request Jan 2, 2026

Revert "AMDGPU: Do not infer implicit inputs for !nocallback intrinsics" #174224

Merged

ronlieb added a commit that referenced this pull request Jan 6, 2026

Revert "AMDGPU: Do not infer implicit inputs for !nocallback intrinsi…

dff081c

…cs" (#174224) Reverts #131759 seeing regressions in : Pytorch UT- 8 test cases failed in "test_ops" test suite

ronlieb added a commit to ROCm/llvm-project that referenced this pull request Jan 6, 2026

Revert "AMDGPU: Do not infer implicit inputs for !nocallback intrinsi…

34bf9c4

…cs" (llvm#174224) Reverts llvm#131759 seeing regressions in : Pytorch UT- 8 test cases failed in "test_ops" test suite

ronlieb mentioned this pull request Jan 6, 2026

Revert "AMDGPU: Do not infer implicit inputs for !nocallback intrinsi… ROCm/llvm-project#1016

Merged

akadutta mentioned this pull request Jan 9, 2026

AMDGPU: Add support for llvm.trap to handling of intrinsics with !nocallback #175230

Merged

akadutta mentioned this pull request Jan 15, 2026

Reapply "AMDGPU: Do not infer implicit inputs for !nocallback intrinsics" #176081

Merged

akadutta added a commit that referenced this pull request Jan 15, 2026

Reapply "AMDGPU: Do not infer implicit inputs for !nocallback intrins…

fc10fbb

…ics" (#176081) This reverts #174224 and re-applies #131759 . Note: If #117544 is reverted, this should also be reverted.

ronlieb mentioned this pull request Jan 28, 2026

Revert "AMDGPU: Do not infer implicit inputs for !nocallback intrinsi… ROCm/llvm-project#1253

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

AMDGPU: Do not infer implicit inputs for !nocallback intrinsics #131759

AMDGPU: Do not infer implicit inputs for !nocallback intrinsics #131759
arsenm merged 2 commits into main from
users/arsenm/amdgpu/attributor-no-infer-inputs-intrinsic-missing-nocallback

arsenm commented Mar 18, 2025 •

edited

Loading

Uh oh!

arsenm commented Mar 18, 2025 •

edited

Loading

Uh oh!

llvmbot commented Mar 18, 2025

Uh oh!

shiltian Mar 19, 2025

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

Conversation

arsenm commented Mar 18, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

arsenm commented Mar 18, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Mar 18, 2025

Uh oh!

shiltian Mar 19, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

arsenm commented Mar 18, 2025 •

edited

Loading

arsenm commented Mar 18, 2025 •

edited

Loading