From 52238e1d473c09d7fcff5e972a199910dc2f15ed Mon Sep 17 00:00:00 2001
From: Dmitry Sidorov
Date: Thu, 30 Jan 2025 19:15:30 +0100
Subject: [PATCH] [SYCL][NVPTX] Set default fdiv and sqrt for llvm.fpbuiltin (#16714)

AltMathLibrary lacks an implementation of the llvm.fpbuiltin intrinsics for
the NVPTX target. This patch adds a type-dependent mapping of
llvm.fpbuiltin.fdiv with max-error >= 2.0 and llvm.fpbuiltin.sqrt with
max-error >= 1.0 onto nvvm intrinsics:

fp32 scalar @llvm.fpbuiltin.fdiv -> @llvm.nvvm.div.approx.f
fp32 scalar @llvm.fpbuiltin.sqrt -> @llvm.nvvm.sqrt.approx.f
vector or non-fp32 scalar llvm.fpbuiltin.fdiv -> fdiv
vector or non-fp32 scalar llvm.fpbuiltin.sqrt -> llvm.sqrt

Additionally, it maps the max-error=0.5 fpbuiltin.fadd, fpbuiltin.fsub,
fpbuiltin.fmul, fpbuiltin.fdiv, fpbuiltin.frem, fpbuiltin.sqrt and
fpbuiltin.ldexp intrinsic functions onto LLVM math operations or standard
C/C++ library intrinsics
(https://llvm.org/docs/LangRef.html#standard-c-c-library-intrinsics).

TODO in future patches:
- add preservation of debug info in FPBuiltinFnSelection;
- move tests from CodeGen to Transforms;
- move the pass to the new pass manager.

Signed-off-by: Sidorov, Dmitry

---------

Signed-off-by: Sidorov, Dmitry
---
 .../Scalar/FPBuiltinFnSelection.cpp           |  63 ++++-
 .../fp-builtin-intrinsics-nvvm-approx.ll      |  94 ++++++++
 ...p-builtin-intrinsics-nvvm-max-error-0.5.ll | 219 ++++++++++++++++++
 3 files changed, 372 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll

diff --git a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
index dd423b42ab0ec..b64241d2fd809 100644
--- a/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
+++ b/llvm/lib/Transforms/Scalar/FPBuiltinFnSelection.cpp
@@ -18,6 +18,7 @@
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/FormatVariadic.h"

@@ -106,6 +107,51 @@ static bool replaceWithLLVMIR(FPBuiltinIntrinsic &BuiltinCall) {
   return true;
 }

+// This function lowers llvm.fpbuiltin.* intrinsic functions with a max-error
+// attribute to the appropriate nvvm approximate intrinsics where possible.
+// Where that is not possible, it falls back to an IR instruction or a
+// standard C/C++ library LLVM intrinsic.
+static bool
+replaceWithApproxNVPTXCallsOrFallback(FPBuiltinIntrinsic &BuiltinCall,
+                                      std::optional<float> Accuracy) {
+  IRBuilder<> IRBuilder(&BuiltinCall);
+  SmallVector<Value *> Args(BuiltinCall.args());
+  Value *Replacement = nullptr;
+  auto *Type = BuiltinCall.getType();
+  // For now only add lowering for fdiv and sqrt, though nvvm intrinsics also
+  // have approximate variants for sin, cos, exp2 and log2.
+  // For vector fpbuiltins on the NVPTX target we don't have nvvm intrinsics,
+  // so fall back to an instruction or a standard C/C++ library LLVM
+  // intrinsic. The nvvm fdiv and sqrt intrinsics also support only the float
+  // type, so fall back in that case as well.
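+  // For example (mirroring fp-builtin-intrinsics-nvvm-approx.ll below), a
+  // call carrying "fpbuiltin-max-error"="2.5" such as
+  //   %t0 = call float @llvm.fpbuiltin.fdiv.f32(float %d1, float %d2)
+  // becomes
+  //   %t0 = call float @llvm.nvvm.div.approx.f(float %d1, float %d2)
+  // while its <2 x float> counterpart is rewritten to a plain fdiv
+  // instruction by the fallback.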
+  switch (BuiltinCall.getIntrinsicID()) {
+  case Intrinsic::fpbuiltin_fdiv:
+    if (Accuracy.value() < 2.0)
+      return false;
+    if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy())
+      return replaceWithLLVMIR(BuiltinCall);
+    Replacement =
+        IRBuilder.CreateIntrinsic(Type, Intrinsic::nvvm_div_approx_f, Args);
+    break;
+  case Intrinsic::fpbuiltin_sqrt:
+    if (Accuracy.value() < 1.0)
+      return false;
+    if (Type->isVectorTy() || !Type->getScalarType()->isFloatTy())
+      return replaceWithLLVMIR(BuiltinCall);
+    Replacement =
+        IRBuilder.CreateIntrinsic(Type, Intrinsic::nvvm_sqrt_approx_f, Args);
+    break;
+  default:
+    return false;
+  }
+  BuiltinCall.replaceAllUsesWith(Replacement);
+  cast<Instruction>(Replacement)->copyFastMathFlags(&BuiltinCall);
+  LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Replaced call to `"
+                    << BuiltinCall.getCalledFunction()->getName()
+                    << "` with equivalent IR.\n");
+  return true;
+}
+
 static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
                                       const TargetTransformInfo &TTI,
                                       FPBuiltinIntrinsic &BuiltinCall) {
@@ -136,10 +182,11 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
     return replaceWithLLVMIR(BuiltinCall);

   // Several functions for "sycl" and "cuda" requires "0.5" accuracy levels,
-  // which means correctly rounded results. For now x86 host AltMathLibrary
-  // doesn't have such ability. For such accuracy level, the fpbuiltins
-  // should be replaced by equivalent IR operation or llvmbuiltins.
-  if (T.isX86() && BuiltinCall.getRequiredAccuracy().value() == 0.5) {
+  // which means correctly rounded results. For now, neither the x86 host nor
+  // the NVPTX AltMathLibrary has such ability. For such an accuracy level,
+  // the fpbuiltins should be replaced by equivalent IR operations or builtins.
+  if ((T.isX86() || T.isNVPTX()) &&
+      BuiltinCall.getRequiredAccuracy().value() == 0.5) {
     switch (BuiltinCall.getIntrinsicID()) {
     case Intrinsic::fpbuiltin_fadd:
     case Intrinsic::fpbuiltin_fsub:
@@ -154,6 +201,14 @@ static bool selectFnForFPBuiltinCalls(const TargetLibraryInfo &TLI,
     }
   }

+  // AltMathLibrary doesn't have implementations for the CUDA approximate
+  // precision builtins, so map them onto NVPTX intrinsics. If no appropriate
+  // intrinsic is known, skip them here so that an error is emitted later.
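+  // For example (mirroring fp-builtin-intrinsics-nvvm-approx.ll below), an
+  // fp64 call carrying "fpbuiltin-max-error"="3.0" such as
+  //   %t0 = call double @llvm.fpbuiltin.sqrt.f64(double %d)
+  // has no nvvm approximate variant and is rewritten to
+  //   %t0 = call double @llvm.sqrt.f64(double %d)
+  // by the fallback inside replaceWithApproxNVPTXCallsOrFallback.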
+  if (T.isNVPTX() && BuiltinCall.getRequiredAccuracy().value() > 0.5)
+    if (replaceWithApproxNVPTXCallsOrFallback(
+            BuiltinCall, BuiltinCall.getRequiredAccuracy()))
+      return true;
+
   /// Call TLI to select a function implementation to call
   StringRef ImplName = TLI.selectFPBuiltinImplementation(&BuiltinCall);
   if (ImplName.empty()) {
diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll
new file mode 100644
index 0000000000000..6c7ce8af804d9
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-approx.ll
@@ -0,0 +1,94 @@
+; RUN: opt -fpbuiltin-fn-selection -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: @test_fdiv
+; CHECK: %{{.*}} = call float @llvm.nvvm.div.approx.f(float %{{.*}}, float %{{.*}})
+; CHECK: %{{.*}} = fdiv <2 x float> %{{.*}}, %{{.*}}
+define void @test_fdiv(float %d1, <2 x float> %v2d1,
+                       float %d2, <2 x float> %v2d2) {
+entry:
+  %t0 = call float @llvm.fpbuiltin.fdiv.f32(float %d1, float %d2) #0
+  %t1 = call <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float> %v2d1, <2 x float> %v2d2) #0
+  ret void
+}
+
+; CHECK-LABEL: @test_fdiv_fast
+; CHECK: %{{.*}} = call fast float @llvm.nvvm.div.approx.f(float %{{.*}}, float %{{.*}})
+; CHECK: %{{.*}} = fdiv fast <2 x float> %{{.*}}, %{{.*}}
+define void @test_fdiv_fast(float %d1, <2 x float> %v2d1,
+                            float %d2, <2 x float> %v2d2) {
+entry:
+  %t0 = call fast float @llvm.fpbuiltin.fdiv.f32(float %d1, float %d2) #0
+  %t1 = call fast <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float> %v2d1, <2 x float> %v2d2) #0
+  ret void
+}
+
+; CHECK-LABEL: @test_fdiv_max_error
+; CHECK: %{{.*}} = call float @llvm.nvvm.div.approx.f(float %{{.*}}, float %{{.*}})
+; CHECK: %{{.*}} = fdiv <2 x float> %{{.*}}, %{{.*}}
+define void @test_fdiv_max_error(float %d1, <2 x float> %v2d1,
+                                 float %d2, <2 x float> %v2d2) {
+entry:
+  %t0 = call float @llvm.fpbuiltin.fdiv.f32(float %d1, float %d2) #2
+  %t1 = call <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float> %v2d1, <2 x float> %v2d2) #2
+  ret void
+}
+
+declare float @llvm.fpbuiltin.fdiv.f32(float, float)
+declare <2 x float> @llvm.fpbuiltin.fdiv.v2f32(<2 x float>, <2 x float>)
+
+; CHECK-LABEL: @test_fdiv_double
+; CHECK: %{{.*}} = fdiv double %{{.*}}, %{{.*}}
+; CHECK: %{{.*}} = fdiv <2 x double> %{{.*}}, %{{.*}}
+define void @test_fdiv_double(double %d1, <2 x double> %v2d1,
+                              double %d2, <2 x double> %v2d2) {
+entry:
+  %t0 = call double @llvm.fpbuiltin.fdiv.f64(double %d1, double %d2) #0
+  %t1 = call <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
+  ret void
+}
+
+declare double @llvm.fpbuiltin.fdiv.f64(double, double)
+declare <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double>, <2 x double>)
+
+; CHECK-LABEL: @test_sqrt
+; CHECK: %{{.*}} = call float @llvm.nvvm.sqrt.approx.f(float %{{.*}})
+; CHECK: %{{.*}} = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}})
+define void @test_sqrt(float %d, <2 x float> %v2d, <4 x float> %v4d) {
+entry:
+  %t0 = call float @llvm.fpbuiltin.sqrt.f32(float %d) #1
+  %t1 = call <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #1
+  ret void
+}
+
+; CHECK-LABEL: @test_sqrt_max_error
+; CHECK: %{{.*}} = call float @llvm.nvvm.sqrt.approx.f(float %{{.*}})
+; CHECK: %{{.*}} = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}})
+define void @test_sqrt_max_error(float %d, <2 x float> %v2d, <4 x float> %v4d) {
+entry:
+  %t0 = call float @llvm.fpbuiltin.sqrt.f32(float %d) #2
+  %t1 = call <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float> %v2d) #2
+  ret void
+}
+
+declare float @llvm.fpbuiltin.sqrt.f32(float)
+declare <2 x float> @llvm.fpbuiltin.sqrt.v2f32(<2 x float>)
+
+; CHECK-LABEL: @test_sqrt_double
+; CHECK: %{{.*}} = call double @llvm.sqrt.f64(double %{{.*}})
+; CHECK: %{{.*}} = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{.*}})
+define void @test_sqrt_double(double %d, <2 x double> %v2d) {
+entry:
+  %t0 = call double @llvm.fpbuiltin.sqrt.f64(double %d) #1
+  %t1 = call <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double> %v2d) #1
+  ret void
+}
+
+declare double @llvm.fpbuiltin.sqrt.f64(double)
+declare <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double>)
+
+attributes #0 = { "fpbuiltin-max-error"="2.5" }
+attributes #1 = { "fpbuiltin-max-error"="3.0" }
+attributes #2 = { "fpbuiltin-max-error"="10.0" }
diff --git a/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll
new file mode 100644
index 0000000000000..3777ad8c52b6d
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fp-builtin-intrinsics-nvvm-max-error-0.5.ll
@@ -0,0 +1,219 @@
+; RUN: opt -fpbuiltin-fn-selection -S < %s | FileCheck %s
+
+; Several functions for SYCL and CUDA require "0.5" accuracy levels.
+; Test that these fpbuiltins can be replaced by equivalent IR operations
+; or LLVM builtins.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; CHECK-LABEL: @svml_fadd
+; CHECK: %0 = fadd fast float %f1, %f2
+; CHECK: %1 = fadd fast <4 x float> %v4f1, %v4f2
+; CHECK: %2 = fadd fast <8 x float> %v8f1, %v8f2
+; CHECK: %3 = fadd fast <16 x float> %v16f1, %v16f2
+; CHECK: %4 = fadd fast double %d1, %d2
+; CHECK: %5 = fadd fast <2 x double> %v2d1, %v2d2
+; CHECK: %6 = fadd fast <4 x double> %v4d1, %v4d2
+; CHECK: %7 = fadd fast <8 x double> %v8d1, %v8d2
+define void @svml_fadd(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1,
+                       float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2,
+                       double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
+                       double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) {
+entry:
+  %t0_0 = call fast float @llvm.fpbuiltin.fadd.f32(float %f1, float %f2) #0
+  %t1_0 = call fast <4 x float> @llvm.fpbuiltin.fadd.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0
+  %t2_0 = call fast <8 x float> @llvm.fpbuiltin.fadd.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0
+  %t3_0 = call fast <16 x float> @llvm.fpbuiltin.fadd.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0
+  %t4_0 = call fast double @llvm.fpbuiltin.fadd.f64(double %d1, double %d2) #0
+  %t5_0 = call fast <2 x double> @llvm.fpbuiltin.fadd.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
+  %t6_0 = call fast <4 x double> @llvm.fpbuiltin.fadd.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0
+  %t7_0 = call fast <8 x double> @llvm.fpbuiltin.fadd.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0
+  ret void
+}
+
+declare float @llvm.fpbuiltin.fadd.f32(float, float)
+declare <4 x float> @llvm.fpbuiltin.fadd.v4f32(<4 x float>, <4 x float>)
+declare <8 x float> @llvm.fpbuiltin.fadd.v8f32(<8 x float>, <8 x float>)
+declare <16 x float> @llvm.fpbuiltin.fadd.v16f32(<16 x float>, <16 x float>)
+declare double @llvm.fpbuiltin.fadd.f64(double, double)
+declare <2 x double> @llvm.fpbuiltin.fadd.v2f64(<2 x double>, <2 x double>)
+declare <4 x double> @llvm.fpbuiltin.fadd.v4f64(<4 x double>, <4 x double>)
+declare <8 x double> @llvm.fpbuiltin.fadd.v8f64(<8 x double>, <8 x double>)
+
+; CHECK-LABEL: @svml_fsub
+; CHECK: %0 = fsub fast float %f1, %f2
+; CHECK: %1 = fsub fast <4 x float> %v4f1, %v4f2
+; CHECK: %2 = fsub fast <8 x float> %v8f1, %v8f2
+; CHECK: %3 = fsub fast <16 x float> %v16f1, %v16f2
+; CHECK: %4 = fsub fast double %d1, %d2
+; CHECK: %5 = fsub fast <2 x double> %v2d1, %v2d2
+; CHECK: %6 = fsub fast <4 x double> %v4d1, %v4d2
+; CHECK: %7 = fsub fast <8 x double> %v8d1, %v8d2
+define void @svml_fsub(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1,
+                       float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2,
+                       double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
+                       double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) {
+entry:
+  %t0_0 = call fast float @llvm.fpbuiltin.fsub.f32(float %f1, float %f2) #0
+  %t1_0 = call fast <4 x float> @llvm.fpbuiltin.fsub.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0
+  %t2_0 = call fast <8 x float> @llvm.fpbuiltin.fsub.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0
+  %t3_0 = call fast <16 x float> @llvm.fpbuiltin.fsub.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0
+  %t4_0 = call fast double @llvm.fpbuiltin.fsub.f64(double %d1, double %d2) #0
+  %t5_0 = call fast <2 x double> @llvm.fpbuiltin.fsub.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
+  %t6_0 = call fast <4 x double> @llvm.fpbuiltin.fsub.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0
+  %t7_0 = call fast <8 x double> @llvm.fpbuiltin.fsub.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0
+  ret void
+}
+
+declare float @llvm.fpbuiltin.fsub.f32(float, float)
+declare <4 x float> @llvm.fpbuiltin.fsub.v4f32(<4 x float>, <4 x float>)
+declare <8 x float> @llvm.fpbuiltin.fsub.v8f32(<8 x float>, <8 x float>)
+declare <16 x float> @llvm.fpbuiltin.fsub.v16f32(<16 x float>, <16 x float>)
+declare double @llvm.fpbuiltin.fsub.f64(double, double)
+declare <2 x double> @llvm.fpbuiltin.fsub.v2f64(<2 x double>, <2 x double>)
+declare <4 x double> @llvm.fpbuiltin.fsub.v4f64(<4 x double>, <4 x double>)
+declare <8 x double> @llvm.fpbuiltin.fsub.v8f64(<8 x double>, <8 x double>)
+
+; CHECK-LABEL: @svml_fmul
+; CHECK: %0 = fmul fast float %f1, %f2
+; CHECK: %1 = fmul fast <4 x float> %v4f1, %v4f2
+; CHECK: %2 = fmul fast <8 x float> %v8f1, %v8f2
+; CHECK: %3 = fmul fast <16 x float> %v16f1, %v16f2
+; CHECK: %4 = fmul fast double %d1, %d2
+; CHECK: %5 = fmul fast <2 x double> %v2d1, %v2d2
+; CHECK: %6 = fmul fast <4 x double> %v4d1, %v4d2
+; CHECK: %7 = fmul fast <8 x double> %v8d1, %v8d2
+define void @svml_fmul(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1,
+                       float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2,
+                       double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
+                       double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) {
+entry:
+  %t0_0 = call fast float @llvm.fpbuiltin.fmul.f32(float %f1, float %f2) #0
+  %t1_0 = call fast <4 x float> @llvm.fpbuiltin.fmul.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0
+  %t2_0 = call fast <8 x float> @llvm.fpbuiltin.fmul.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0
+  %t3_0 = call fast <16 x float> @llvm.fpbuiltin.fmul.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0
+  %t4_0 = call fast double @llvm.fpbuiltin.fmul.f64(double %d1, double %d2) #0
+  %t5_0 = call fast <2 x double> @llvm.fpbuiltin.fmul.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
+  %t6_0 = call fast <4 x double> @llvm.fpbuiltin.fmul.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0
+  %t7_0 = call fast <8 x double> @llvm.fpbuiltin.fmul.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0
+  ret void
+}
+
+declare float @llvm.fpbuiltin.fmul.f32(float, float)
+declare <4 x float> @llvm.fpbuiltin.fmul.v4f32(<4 x float>, <4 x float>)
+declare <8 x float> @llvm.fpbuiltin.fmul.v8f32(<8 x float>, <8 x float>)
+declare <16 x float> @llvm.fpbuiltin.fmul.v16f32(<16 x float>, <16 x float>)
+declare double @llvm.fpbuiltin.fmul.f64(double, double)
+declare <2 x double> @llvm.fpbuiltin.fmul.v2f64(<2 x double>, <2 x double>)
+declare <4 x double> @llvm.fpbuiltin.fmul.v4f64(<4 x double>, <4 x double>)
+declare <8 x double> @llvm.fpbuiltin.fmul.v8f64(<8 x double>, <8 x double>)
+
+; CHECK-LABEL: @svml_fdiv
+; CHECK: %0 = fdiv fast double %d1, %d2
+; CHECK: %1 = fdiv fast <2 x double> %v2d1, %v2d2
+; CHECK: %2 = fdiv fast <4 x double> %v4d1, %v4d2
+; CHECK: %3 = fdiv fast <8 x double> %v8d1, %v8d2
+define void @svml_fdiv(double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
+                       double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) {
+entry:
+  %t0_0 = call fast double @llvm.fpbuiltin.fdiv.f64(double %d1, double %d2) #0
+  %t1_0 = call fast <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
+  %t2_0 = call fast <4 x double> @llvm.fpbuiltin.fdiv.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0
+  %t3_0 = call fast <8 x double> @llvm.fpbuiltin.fdiv.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0
+  ret void
+}
+
+declare double @llvm.fpbuiltin.fdiv.f64(double, double)
+declare <2 x double> @llvm.fpbuiltin.fdiv.v2f64(<2 x double>, <2 x double>)
+declare <4 x double> @llvm.fpbuiltin.fdiv.v4f64(<4 x double>, <4 x double>)
+declare <8 x double> @llvm.fpbuiltin.fdiv.v8f64(<8 x double>, <8 x double>)
+
+; CHECK-LABEL: @svml_frem
+; CHECK: %0 = frem fast float %f1, %f2
+; CHECK: %1 = frem fast <4 x float> %v4f1, %v4f2
+; CHECK: %2 = frem fast <8 x float> %v8f1, %v8f2
+; CHECK: %3 = frem fast <16 x float> %v16f1, %v16f2
+; CHECK: %4 = frem fast double %d1, %d2
+; CHECK: %5 = frem fast <2 x double> %v2d1, %v2d2
+; CHECK: %6 = frem fast <4 x double> %v4d1, %v4d2
+; CHECK: %7 = frem fast <8 x double> %v8d1, %v8d2
+define void @svml_frem(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1,
+                       float %f2, <4 x float> %v4f2, <8 x float> %v8f2, <16 x float> %v16f2,
+                       double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
+                       double %d2, <2 x double> %v2d2, <4 x double> %v4d2, <8 x double> %v8d2) {
+entry:
+  %t0_0 = call fast float @llvm.fpbuiltin.frem.f32(float %f1, float %f2) #0
+  %t1_0 = call fast <4 x float> @llvm.fpbuiltin.frem.v4f32(<4 x float> %v4f1, <4 x float> %v4f2) #0
+  %t2_0 = call fast <8 x float> @llvm.fpbuiltin.frem.v8f32(<8 x float> %v8f1, <8 x float> %v8f2) #0
+  %t3_0 = call fast <16 x float> @llvm.fpbuiltin.frem.v16f32(<16 x float> %v16f1, <16 x float> %v16f2) #0
+  %t4_0 = call fast double @llvm.fpbuiltin.frem.f64(double %d1, double %d2) #0
+  %t5_0 = call fast <2 x double> @llvm.fpbuiltin.frem.v2f64(<2 x double> %v2d1, <2 x double> %v2d2) #0
+  %t6_0 = call fast <4 x double> @llvm.fpbuiltin.frem.v4f64(<4 x double> %v4d1, <4 x double> %v4d2) #0
+  %t7_0 = call fast <8 x double> @llvm.fpbuiltin.frem.v8f64(<8 x double> %v8d1, <8 x double> %v8d2) #0
+  ret void
+}
+
+declare float @llvm.fpbuiltin.frem.f32(float, float)
+declare <4 x float> @llvm.fpbuiltin.frem.v4f32(<4 x float>, <4 x float>)
+declare <8 x float> @llvm.fpbuiltin.frem.v8f32(<8 x float>, <8 x float>)
+declare <16 x float> @llvm.fpbuiltin.frem.v16f32(<16 x float>, <16 x float>)
+declare double @llvm.fpbuiltin.frem.f64(double, double)
+declare <2 x double> @llvm.fpbuiltin.frem.v2f64(<2 x double>, <2 x double>)
+declare <4 x double> @llvm.fpbuiltin.frem.v4f64(<4 x double>, <4 x double>)
+declare <8 x double> @llvm.fpbuiltin.frem.v8f64(<8 x double>, <8 x double>)
+
+; CHECK-LABEL: @svml_sqrt
+; CHECK: %0 = call double @llvm.sqrt.f64(double %d)
+; CHECK: %1 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %v2d)
+; CHECK: %2 = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %v4d)
+; CHECK: %3 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %v8d)
+define void @svml_sqrt(double %d, <2 x double> %v2d, <4 x double> %v4d, <8 x double> %v8d) {
+entry:
+  %t4_0 = call double @llvm.fpbuiltin.sqrt.f64(double %d) #0
+  %t5_0 = call <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double> %v2d) #0
+  %t6_0 = call <4 x double> @llvm.fpbuiltin.sqrt.v4f64(<4 x double> %v4d) #0
+  %t7_0 = call <8 x double> @llvm.fpbuiltin.sqrt.v8f64(<8 x double> %v8d) #0
+  ret void
+}
+
+declare double @llvm.fpbuiltin.sqrt.f64(double)
+declare <2 x double> @llvm.fpbuiltin.sqrt.v2f64(<2 x double>)
+declare <4 x double> @llvm.fpbuiltin.sqrt.v4f64(<4 x double>)
+declare <8 x double> @llvm.fpbuiltin.sqrt.v8f64(<8 x double>)
+
+; CHECK-LABEL: @svml_ldexp
+; CHECK: %0 = call fast float @llvm.ldexp.f32.i32(float %f1, i32 %f2)
+; CHECK: %1 = call fast <4 x float> @llvm.ldexp.v4f32.v4i32(<4 x float> %v4f1, <4 x i32> %v4f2)
+; CHECK: %2 = call fast <8 x float> @llvm.ldexp.v8f32.v8i32(<8 x float> %v8f1, <8 x i32> %v8f2)
+; CHECK: %3 = call fast <16 x float> @llvm.ldexp.v16f32.v16i32(<16 x float> %v16f1, <16 x i32> %v16f2)
+; CHECK: %4 = call fast double @llvm.ldexp.f64.i32(double %d1, i32 %d2)
+; CHECK: %5 = call fast <2 x double> @llvm.ldexp.v2f64.v2i32(<2 x double> %v2d1, <2 x i32> %v2d2)
+; CHECK: %6 = call fast <4 x double> @llvm.ldexp.v4f64.v4i32(<4 x double> %v4d1, <4 x i32> %v4d2)
+; CHECK: %7 = call fast <8 x double> @llvm.ldexp.v8f64.v8i32(<8 x double> %v8d1, <8 x i32> %v8d2)
+define void @svml_ldexp(float %f1, <4 x float> %v4f1, <8 x float> %v8f1, <16 x float> %v16f1,
+                        i32 %f2, <4 x i32> %v4f2, <8 x i32> %v8f2, <16 x i32> %v16f2,
+                        double %d1, <2 x double> %v2d1, <4 x double> %v4d1, <8 x double> %v8d1,
+                        i32 %d2, <2 x i32> %v2d2, <4 x i32> %v4d2, <8 x i32> %v8d2) {
+entry:
+  %t0_0 = call fast float @llvm.fpbuiltin.ldexp.f32.i32(float %f1, i32 %f2) #0
+  %t1_0 = call fast <4 x float> @llvm.fpbuiltin.ldexp.v4f32.v4i32(<4 x float> %v4f1, <4 x i32> %v4f2) #0
+  %t2_0 = call fast <8 x float> @llvm.fpbuiltin.ldexp.v8f32.v8i32(<8 x float> %v8f1, <8 x i32> %v8f2) #0
+  %t3_0 = call fast <16 x float> @llvm.fpbuiltin.ldexp.v16f32.v16i32(<16 x float> %v16f1, <16 x i32> %v16f2) #0
+  %t4_0 = call fast double @llvm.fpbuiltin.ldexp.f64.i32(double %d1, i32 %d2) #0
+  %t5_0 = call fast <2 x double> @llvm.fpbuiltin.ldexp.v2f64.v2i32(<2 x double> %v2d1, <2 x i32> %v2d2) #0
+  %t6_0 = call fast <4 x double> @llvm.fpbuiltin.ldexp.v4f64.v4i32(<4 x double> %v4d1, <4 x i32> %v4d2) #0
+  %t7_0 = call fast <8 x double> @llvm.fpbuiltin.ldexp.v8f64.v8i32(<8 x double> %v8d1, <8 x i32> %v8d2) #0
+  ret void
+}
+
+declare float @llvm.fpbuiltin.ldexp.f32.i32(float, i32)
+declare <4 x float> @llvm.fpbuiltin.ldexp.v4f32.v4i32(<4 x float>, <4 x i32>)
+declare <8 x float> @llvm.fpbuiltin.ldexp.v8f32.v8i32(<8 x float>, <8 x i32>)
+declare <16 x float> @llvm.fpbuiltin.ldexp.v16f32.v16i32(<16 x float>, <16 x i32>)
+declare double @llvm.fpbuiltin.ldexp.f64.i32(double, i32)
+declare <2 x double> @llvm.fpbuiltin.ldexp.v2f64.v2i32(<2 x double>, <2 x i32>)
+declare <4 x double> @llvm.fpbuiltin.ldexp.v4f64.v4i32(<4 x double>, <4 x i32>)
+declare <8 x double> @llvm.fpbuiltin.ldexp.v8f64.v8i32(<8 x double>, <8 x i32>)
+
+attributes #0 = { "fpbuiltin-max-error"="0.5" }
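+; Note: "fpbuiltin-max-error"="0.5" requests correctly rounded results, which
+; is why every call above is expected to lower to a plain IR operation or an
+; LLVM builtin rather than an approximate nvvm intrinsic.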