[NVPTX] Fix internal indirect call prototypes not obeying the ABI #100131

jhuber6 · 2024-07-23T14:38:54Z

Summary:
The NVPTX backend optimizes the ABI for functions that are internal,
however, this is not legal for indirect call prototypes. Previously, we
would modify the ABI on an aggregate byval type passed to an indirect
call prototype, which would make PTXAS error. This patch just passes the
function as a nullptr to force strict ABI compliance without
modification in the helper function.

Fixes #100055

llvmbot · 2024-07-23T14:39:26Z

@llvm/pr-subscribers-libc

@llvm/pr-subscribers-backend-nvptx

Author: Joseph Huber (jhuber6)

Changes

Summary:
The NVPTX backend optimizes the ABI for functions that are internal,
however, this is not legal for indirect call prototypes. Previously, we
would modify the ABI on an aggregate byval type passed to an indirect
call prototype, which would make PTXAS error. This patch just passes the
function as a nullptr to force strict ABI compliance without
modification in the helper function.

Fixes #100055

Full diff: https://github.com/llvm/llvm-project/pull/100131.diff

3 Files Affected:

(modified) libc/config/gpu/entrypoints.txt (+4-11)
(modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+2-2)
(added) llvm/test/CodeGen/NVPTX/indirect_byval.ll (+94)

diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index 42909cec55890..fa878d8999227 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -1,13 +1,3 @@
-if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
-  set(extra_entrypoints
-      # stdio.h entrypoints
-      libc.src.stdio.snprintf
-      libc.src.stdio.sprintf
-      libc.src.stdio.vsnprintf
-      libc.src.stdio.vsprintf
-  )
-endif()
-
 set(TARGET_LIBC_ENTRYPOINTS
     # assert.h entrypoints
     libc.src.assert.__assert_fail
@@ -186,13 +176,16 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.errno.errno
 
     # stdio.h entrypoints
-    ${extra_entrypoints}
     libc.src.stdio.clearerr
     libc.src.stdio.fclose
     libc.src.stdio.printf
     libc.src.stdio.vprintf
     libc.src.stdio.fprintf
     libc.src.stdio.vfprintf
+    libc.src.stdio.snprintf
+    libc.src.stdio.sprintf
+    libc.src.stdio.vsnprintf
+    libc.src.stdio.vsprintf
     libc.src.stdio.feof
     libc.src.stdio.ferror
     libc.src.stdio.fflush
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 44c1a2e50486c..bcd2ee2f4fc1e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1429,7 +1429,6 @@ std::string NVPTXTargetLowering::getPrototype(
 
   bool first = true;
 
-  const Function *F = CB.getFunction();
   unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
   for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
     Type *Ty = Args[i].Ty;
@@ -1471,10 +1470,11 @@ std::string NVPTXTargetLowering::getPrototype(
       continue;
     }
 
+    // Indirect calls need strict ABI alignment so we disable optimizations.
     Type *ETy = Args[i].IndirectType;
     Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
     Align ParamByValAlign =
-        getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
+        getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);
 
     O << ".param .align " << ParamByValAlign.value() << " .b8 ";
     O << "_";
diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
new file mode 100644
index 0000000000000..ac6c4e262fd60
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.S = type { i8 }
+%struct.U = type { i64 }
+
+@ptr = external global ptr, align 8
+
+define internal i32 @foo() {
+; CHECK-LABEL: foo(
+; CHECK:       {
+; CHECK-NEXT:    .local .align 1 .b8 __local_depot0[2];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    mov.u64 %SPL, __local_depot0;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.global.u64 %rd1, [ptr];
+; CHECK-NEXT:    ld.u8 %rs1, [%SP+1];
+; CHECK-NEXT:    add.u64 %rd2, %SP, 0;
+; CHECK-NEXT:    { // callseq 0, 0
+; CHECK-NEXT:    .param .align 1 .b8 param0[1];
+; CHECK-NEXT:    st.param.b8 [param0+0], %rs1;
+; CHECK-NEXT:    .param .b64 param1;
+; CHECK-NEXT:    st.param.b64 [param1+0], %rd2;
+; CHECK-NEXT:    .param .b32 retval0;
+; CHECK-NEXT:    prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _);
+; CHECK-NEXT:    call (retval0),
+; CHECK-NEXT:    %rd1,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0,
+; CHECK-NEXT:    param1
+; CHECK-NEXT:    )
+; CHECK-NEXT:    , prototype_0;
+; CHECK-NEXT:    ld.param.b32 %r1, [retval0+0];
+; CHECK-NEXT:    } // callseq 0
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
+entry:
+  %s = alloca %struct.S, align 1
+  %agg.tmp = alloca %struct.S, align 1
+  %0 = load ptr, ptr @ptr, align 8
+  %call = call i32 %0(ptr byval(%struct.S) align 1 %agg.tmp, ptr noundef %s)
+  ret i32 %call
+}
+
+define internal i32 @bar() {
+; CHECK-LABEL: bar(
+; CHECK:         // @bar
+; CHECK-NEXT:  {
+; CHECK-NEXT:    .local .align 8 .b8 __local_depot1[16];
+; CHECK-NEXT:    .reg .b64 %SP;
+; CHECK-NEXT:    .reg .b64 %SPL;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    mov.u64 %SPL, __local_depot1;
+; CHECK-NEXT:    cvta.local.u64 %SP, %SPL;
+; CHECK-NEXT:    ld.global.u64 %rd1, [ptr];
+; CHECK-NEXT:    ld.u64 %rd2, [%SP+8];
+; CHECK-NEXT:    add.u64 %rd3, %SP, 0;
+; CHECK-NEXT:    { // callseq 1, 0
+; CHECK-NEXT:    .param .align 8 .b8 param0[8];
+; CHECK-NEXT:    st.param.b64 [param0+0], %rd2;
+; CHECK-NEXT:    .param .b64 param1;
+; CHECK-NEXT:    st.param.b64 [param1+0], %rd3;
+; CHECK-NEXT:    .param .b32 retval0;
+; CHECK-NEXT:    prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _);
+; CHECK-NEXT:    call (retval0),
+; CHECK-NEXT:    %rd1,
+; CHECK-NEXT:    (
+; CHECK-NEXT:    param0,
+; CHECK-NEXT:    param1
+; CHECK-NEXT:    )
+; CHECK-NEXT:    , prototype_1;
+; CHECK-NEXT:    ld.param.b32 %r1, [retval0+0];
+; CHECK-NEXT:    } // callseq 1
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
+entry:
+  %s = alloca %struct.U, align 8
+  %agg.tmp = alloca %struct.U, align 8
+  %0 = load ptr, ptr @ptr, align 8
+  %call = call noundef i32 %0(ptr byval(%struct.U) align 8 %agg.tmp, ptr %s)
+  ret i32 %call
+}

jhuber6 · 2024-07-23T14:40:34Z

The libc changes undo a hack I had to do to make the tests pass. I put them in this patch so they can be backported.

Artem-B

LGTM in principle, with a nit.

Artem-B · 2024-07-23T17:26:38Z

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

    Type *ETy = Args[i].IndirectType;
    Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
    Align ParamByValAlign =
-        getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
+        getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);


The comment above needs to mantion that it's the nullptr here that forces strict ABI-compliant alignment. Otherwise it's not clear which part of the code controls this.

Ideally, it would be even nicer to make "ABI-compliant alignment" a parameter.

This doesn't really make sense to me as a fix. Indirect calls shouldn't require special handling vs. direct

The NVPTX backend thinks that internal functions can have their ABI modified, so they up-scale alignment on things. This doesn't work for these indirect calls, which seem to strictly require the ABI.

Summary: The NVPTX backend optimizes the ABI for functions that are internal, however, this is not legal for indirect call prototypes. Previously, we would modify the ABI on an aggregate byval type passed to an indirect call prototype, which would make PTXAS error. This patch just passes the function as a nullptr to force strict ABI compliance without modification in the helper function. Fixes llvm#100055

…vm#100131) Summary: The NVPTX backend optimizes the ABI for functions that are internal, however, this is not legal for indirect call prototypes. Previously, we would modify the ABI on an aggregate byval type passed to an indirect call prototype, which would make PTXAS error. This patch just passes the function as a nullptr to force strict ABI compliance without modification in the helper function. Fixes llvm#100055 (cherry picked from commit e0649a5)

…00131) Summary: The NVPTX backend optimizes the ABI for functions that are internal, however, this is not legal for indirect call prototypes. Previously, we would modify the ABI on an aggregate byval type passed to an indirect call prototype, which would make PTXAS error. This patch just passes the function as a nullptr to force strict ABI compliance without modification in the helper function. Fixes #100055 Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251110

jhuber6 requested review from arsenm, Artem-B, jlebar and JonChesterfield July 23, 2024 14:38

llvmbot added libc backend:NVPTX labels Jul 23, 2024

Artem-B approved these changes Jul 23, 2024

View reviewed changes

jhuber6 force-pushed the FixNVPTXCallproto branch from cbfdca3 to 1a779ff Compare July 23, 2024 17:40

jhuber6 merged commit e0649a5 into llvm:main Jul 23, 2024
4 of 5 checks passed

philnik777 mentioned this pull request Aug 2, 2024

[Clang] Add a release note deprecating __is_nullptr #101638

Merged

mgabka mentioned this pull request Aug 22, 2024

Add release note about ABI mgabka/llvm-project#6

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[NVPTX] Fix internal indirect call prototypes not obeying the ABI #100131

[NVPTX] Fix internal indirect call prototypes not obeying the ABI #100131

jhuber6 commented Jul 23, 2024

llvmbot commented Jul 23, 2024 •

edited

Loading

jhuber6 commented Jul 23, 2024

Artem-B left a comment

Artem-B Jul 23, 2024

arsenm Jul 23, 2024

jhuber6 Jul 23, 2024

[NVPTX] Fix internal indirect call prototypes not obeying the ABI #100131

[NVPTX] Fix internal indirect call prototypes not obeying the ABI #100131

Conversation

jhuber6 commented Jul 23, 2024

llvmbot commented Jul 23, 2024 • edited Loading

jhuber6 commented Jul 23, 2024

Artem-B left a comment

Choose a reason for hiding this comment

Artem-B Jul 23, 2024

Choose a reason for hiding this comment

arsenm Jul 23, 2024

Choose a reason for hiding this comment

jhuber6 Jul 23, 2024

Choose a reason for hiding this comment

llvmbot commented Jul 23, 2024 •

edited

Loading