Skip to content
Closed
4 changes: 4 additions & 0 deletions clang/include/clang/Basic/BuiltinsNVPTX.def
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,10 @@ BUILTIN(__nvvm_sin_approx_f, "ff", "")
BUILTIN(__nvvm_cos_approx_ftz_f, "ff", "")
BUILTIN(__nvvm_cos_approx_f, "ff", "")

// Tanh

TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_80,PTX70))

// Fma

BUILTIN(__nvvm_fma_rn_ftz_f, "ffff", "")
Expand Down
3 changes: 3 additions & 0 deletions clang/include/clang/Basic/TargetOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ class TargetOptions {
/// \brief If enabled, use precise square root
bool NVVMCudaPrecSqrt = false;

/// \brief If enabled, use approximate tanh
bool NVVMCudaApproxTanhf = false;

/// \brief If enabled, allow AMDGPU unsafe floating point atomics.
bool AllowAMDGPUUnsafeFPAtomics = false;

Expand Down
4 changes: 4 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -985,6 +985,10 @@ defm cuda_prec_sqrt : BoolFOption<"cuda-prec-sqrt",
PosFlag<SetTrue, [CC1Option], "Specify">,
NegFlag<SetFalse, [], "Don't specify">,
BothFlags<[], " that sqrt is correctly rounded (for CUDA devices)">>;
defm nvvm_cuda_approx_tanhf : BoolFOption<"cuda-approx-tanhf",
TargetOpts<"NVVMCudaApproxTanhf">, DefaultFalse,
PosFlag<SetTrue, [CC1Option], "Use the built-in fast approximation of tanhf function. Device needs to have a compute capability >= 8.0">,
NegFlag<SetFalse>>;
def rocm_path_EQ : Joined<["--"], "rocm-path=">, Group<i_Group>,
HelpText<"ROCm installation path, used for finding and automatically linking required bitcode libraries.">;
def hip_path_EQ : Joined<["--"], "hip-path=">, Group<i_Group>,
Expand Down
3 changes: 3 additions & 0 deletions clang/lib/CodeGen/CodeGenModule.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -780,6 +780,9 @@ void CodeGenModule::Release() {
llvm::DenormalMode::IEEE);
getModule().addModuleFlag(llvm::Module::Override, "nvvm-reflect-prec-sqrt",
getTarget().getTargetOpts().NVVMCudaPrecSqrt);
getModule().addModuleFlag(llvm::Module::Override,
"nvvm-reflect-approx-tanhf",
getTarget().getTargetOpts().NVVMCudaApproxTanhf);
}

if (LangOpts.EHAsynch)
Expand Down
4 changes: 2 additions & 2 deletions clang/test/CodeGenCUDA/flush-denormals.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ extern "C" __device__ void foo() {}
// FTZ: attributes #0 = {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// NOFTZ-NOT: "denormal-fp-math-f32"

// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
// PTXFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}}
// PTXFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}

// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}}
// PTXNOFTZ:!llvm.module.flags = !{{{.*}}, [[MODFLAG:![0-9]+]], {{.*}}, {{.*}}}
// PTXNOFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 0}
11 changes: 11 additions & 0 deletions clang/test/CodeGenCUDA/nvvm-reflect-approx-tanh.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm -fcuda-approx-tanhf %s -o -| FileCheck --check-prefix=CHECK-ON %s
// RUN: %clang_cc1 -fcuda-is-device -triple nvptx64-nvidia-cuda -emit-llvm %s -o -| FileCheck --check-prefix=CHECK-OFF %s

#include "Inputs/cuda.h"

// Check that the -fcuda-approx-tanhf flag correctly sets the nvvm-reflect module flags.

extern "C" __device__ void foo() {}

// CHECK-ON: !{i32 4, !"nvvm-reflect-approx-tanhf", i32 1}
// CHECK-OFF: !{i32 4, !"nvvm-reflect-approx-tanhf", i32 0}
12 changes: 11 additions & 1 deletion libclc/ptx-nvidiacl/libspirv/math/tanh.cl
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,17 @@
#include "../../include/libdevice.h"
#include <clcmacro.h>

int __clc_nvvm_reflect_arch();
int __clc_nvvm_reflect_approx_tanh();

float __select_tanhf (float x) {
if (__clc_nvvm_reflect_approx_tanh() && __clc_nvvm_reflect_arch() >= 800) {
return __nvvm_tanh_approx_f(x);
}
return __nv_tanhf(x);
}

#define __CLC_FUNCTION __spirv_ocl_tanh
#define __CLC_BUILTIN __nv_tanh
#define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f)
#define __CLC_BUILTIN_F __select_tanhf
#include <math/unary_builtin.inc>
7 changes: 7 additions & 0 deletions libclc/ptx-nvidiacl/libspirv/reflect.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,10 @@ define i32 @__clc_nvvm_reflect_arch() alwaysinline {
%reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([12 x i8], [12 x i8] addrspace(1)* @str, i32 0, i32 0) to i8*))
ret i32 %reflect
}

@str_approx_tanh = private addrspace(1) constant [20 x i8] c"__CUDA_APPROX_TANHF\00"

define i32 @__clc_nvvm_reflect_approx_tanh() alwaysinline {
%reflect = call i32 @__nvvm_reflect(i8* addrspacecast (i8 addrspace(1)* getelementptr inbounds ([20 x i8], [20 x i8] addrspace(1)* @str_approx_tanh, i32 0, i32 0) to i8*))
ret i32 %reflect
}
7 changes: 7 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
Original file line number Diff line number Diff line change
Expand Up @@ -808,6 +808,13 @@ let TargetPrefix = "nvvm" in {
def int_nvvm_cos_approx_f : GCCBuiltin<"__nvvm_cos_approx_f">,
DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;

//
// Tanh
//

def int_nvvm_tanh_approx_f : GCCBuiltin<"__nvvm_tanh_approx_f">,
DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;

//
// Fma
//
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -784,6 +784,13 @@ def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;",
def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;",
Float32Regs, Float32Regs, int_nvvm_cos_approx_f>;

//
// Tanh
//

def INT_NVVM_TANH_APPROX_F : F_MATH_1<"tanh.approx.f32 \t$dst, $src0;",
Float32Regs, Float32Regs, int_nvvm_tanh_approx_f>;

//
// Fma
//
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/NVPTX/NVVMReflect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,12 @@ static bool runNVVMReflect(Function &F, unsigned SmVersion) {
if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
F.getParent()->getModuleFlag("nvvm-reflect-prec-sqrt")))
ReflectVal = Flag->getSExtValue();
} else if (ReflectArg == "__CUDA_APPROX_TANHF") {
// Try to pull __CUDA_APPROX_TANHF from the nvvm-reflect-approx-tanhf
// module flag.
if (auto *Flag = mdconst::extract_or_null<ConstantInt>(
F.getParent()->getModuleFlag("nvvm-reflect-approx-tanhf")))
ReflectVal = Flag->getSExtValue();
}
Call->replaceAllUsesWith(ConstantInt::get(Call->getType(), ReflectVal));
ToRemove.push_back(Call);
Expand Down