Skip to content

Commit

Permalink
[NVPTX] Fix internal indirect call prototypes not obeying the ABI (ll…
Browse files Browse the repository at this point in the history
…vm#100131)

Summary:
The NVPTX backend optimizes the ABI for functions that are internal;
however, this is not legal for indirect call prototypes. Previously, we
would modify the ABI on an aggregate byval type passed to an indirect
call prototype, which would cause PTXAS to report an error. This patch
simply passes nullptr as the function argument to the helper, which forces
strict ABI compliance without requiring any change to the helper function itself.

Fixes llvm#100055

(cherry picked from commit e0649a5)
  • Loading branch information
jhuber6 authored and tru committed Jul 24, 2024
1 parent aa425eb commit dcc22f9
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 13 deletions.
15 changes: 4 additions & 11 deletions libc/config/gpu/entrypoints.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,3 @@
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
set(extra_entrypoints
# stdio.h entrypoints
libc.src.stdio.snprintf
libc.src.stdio.sprintf
libc.src.stdio.vsnprintf
libc.src.stdio.vsprintf
)
endif()

set(TARGET_LIBC_ENTRYPOINTS
# assert.h entrypoints
libc.src.assert.__assert_fail
Expand Down Expand Up @@ -186,13 +176,16 @@ set(TARGET_LIBC_ENTRYPOINTS
libc.src.errno.errno

# stdio.h entrypoints
${extra_entrypoints}
libc.src.stdio.clearerr
libc.src.stdio.fclose
libc.src.stdio.printf
libc.src.stdio.vprintf
libc.src.stdio.fprintf
libc.src.stdio.vfprintf
libc.src.stdio.snprintf
libc.src.stdio.sprintf
libc.src.stdio.vsnprintf
libc.src.stdio.vsprintf
libc.src.stdio.feof
libc.src.stdio.ferror
libc.src.stdio.fflush
Expand Down
5 changes: 3 additions & 2 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1429,7 +1429,6 @@ std::string NVPTXTargetLowering::getPrototype(

bool first = true;

const Function *F = CB.getFunction();
unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
Type *Ty = Args[i].Ty;
Expand Down Expand Up @@ -1471,10 +1470,12 @@ std::string NVPTXTargetLowering::getPrototype(
continue;
}

// Indirect calls need strict ABI alignment so we disable optimizations by
// not providing a function to optimize.
Type *ETy = Args[i].IndirectType;
Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
Align ParamByValAlign =
getFunctionByValParamAlign(F, ETy, InitialAlign, DL);
getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);

O << ".param .align " << ParamByValAlign.value() << " .b8 ";
O << "_";
Expand Down
94 changes: 94 additions & 0 deletions llvm/test/CodeGen/NVPTX/indirect_byval.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %}

target triple = "nvptx64-nvidia-cuda"

%struct.S = type { i8 }
%struct.U = type { i64 }

@ptr = external global ptr, align 8

; Internal function that performs an indirect call through the function
; pointer loaded from @ptr, passing a %struct.S aggregate byval with align 1
; plus a plain pointer. The CHECK lines expect the emitted .callprototype to
; declare the byval parameter with that same alignment
; (.param .align 1 .b8 _[1]), matching the param0 declaration at the call site.
define internal i32 @foo() {
; CHECK-LABEL: foo(
; CHECK: {
; CHECK-NEXT: .local .align 1 .b8 __local_depot0[2];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: mov.u64 %SPL, __local_depot0;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.global.u64 %rd1, [ptr];
; CHECK-NEXT: ld.u8 %rs1, [%SP+1];
; CHECK-NEXT: add.u64 %rd2, %SP, 0;
; CHECK-NEXT: { // callseq 0, 0
; CHECK-NEXT: .param .align 1 .b8 param0[1];
; CHECK-NEXT: st.param.b8 [param0+0], %rs1;
; CHECK-NEXT: .param .b64 param1;
; CHECK-NEXT: st.param.b64 [param1+0], %rd2;
; CHECK-NEXT: .param .b32 retval0;
; CHECK-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .align 1 .b8 _[1], .param .b64 _);
; CHECK-NEXT: call (retval0),
; CHECK-NEXT: %rd1,
; CHECK-NEXT: (
; CHECK-NEXT: param0,
; CHECK-NEXT: param1
; CHECK-NEXT: )
; CHECK-NEXT: , prototype_0;
; CHECK-NEXT: ld.param.b32 %r1, [retval0+0];
; CHECK-NEXT: } // callseq 0
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
; CHECK-NEXT: ret;
entry:
%s = alloca %struct.S, align 1
%agg.tmp = alloca %struct.S, align 1
; The callee is the pointer value loaded from @ptr, so the call below is
; indirect and needs a .callprototype in the generated PTX.
%0 = load ptr, ptr @ptr, align 8
%call = call i32 %0(ptr byval(%struct.S) align 1 %agg.tmp, ptr noundef %s)
ret i32 %call
}

; Internal function that performs an indirect call through the function
; pointer loaded from @ptr, passing a %struct.U aggregate (a single i64) byval
; with align 8 plus a plain pointer. The CHECK lines expect the emitted
; .callprototype to declare the byval parameter as .param .align 8 .b8 _[8],
; matching the param0 declaration at the call site.
define internal i32 @bar() {
; CHECK-LABEL: bar(
; CHECK: // @bar
; CHECK-NEXT: {
; CHECK-NEXT: .local .align 8 .b8 __local_depot1[16];
; CHECK-NEXT: .reg .b64 %SP;
; CHECK-NEXT: .reg .b64 %SPL;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: mov.u64 %SPL, __local_depot1;
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-NEXT: ld.global.u64 %rd1, [ptr];
; CHECK-NEXT: ld.u64 %rd2, [%SP+8];
; CHECK-NEXT: add.u64 %rd3, %SP, 0;
; CHECK-NEXT: { // callseq 1, 0
; CHECK-NEXT: .param .align 8 .b8 param0[8];
; CHECK-NEXT: st.param.b64 [param0+0], %rd2;
; CHECK-NEXT: .param .b64 param1;
; CHECK-NEXT: st.param.b64 [param1+0], %rd3;
; CHECK-NEXT: .param .b32 retval0;
; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .align 8 .b8 _[8], .param .b64 _);
; CHECK-NEXT: call (retval0),
; CHECK-NEXT: %rd1,
; CHECK-NEXT: (
; CHECK-NEXT: param0,
; CHECK-NEXT: param1
; CHECK-NEXT: )
; CHECK-NEXT: , prototype_1;
; CHECK-NEXT: ld.param.b32 %r1, [retval0+0];
; CHECK-NEXT: } // callseq 1
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
; CHECK-NEXT: ret;
entry:
%s = alloca %struct.U, align 8
%agg.tmp = alloca %struct.U, align 8
; The callee is the pointer value loaded from @ptr, so the call below is
; indirect and needs a .callprototype in the generated PTX.
%0 = load ptr, ptr @ptr, align 8
%call = call noundef i32 %0(ptr byval(%struct.U) align 8 %agg.tmp, ptr %s)
ret i32 %call
}

0 comments on commit dcc22f9

Please sign in to comment.