-
Notifications
You must be signed in to change notification settings - Fork 802
[SYCL] Split device images based on accuracy level provided in option #10140
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
94ac8d5
f235c44
649fd15
404f82e
d107740
a537ca9
edf01dd
c4b0c56
756690c
4dd7de1
9408518
2fb28d3
9807f64
a13c1ff
c3afa41
8e025fd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,7 @@ | |
| #include "CGObjCRuntime.h" | ||
| #include "CGOpenCLRuntime.h" | ||
| #include "CGRecordLayout.h" | ||
| #include "CGSYCLRuntime.h" | ||
| #include "CodeGenFunction.h" | ||
| #include "CodeGenModule.h" | ||
| #include "ConstantEmitter.h" | ||
|
|
@@ -513,12 +514,17 @@ static CallInst *CreateBuiltinCallWithAttr(CodeGenFunction &CGF, StringRef Name, | |
| // TODO: Replace AttrList with a single attribute. The call can only have a | ||
| // single FPAccuracy attribute. | ||
| llvm::AttributeList AttrList; | ||
| // "sycl_used_aspects" metadata associated with the call. | ||
| SmallVector<llvm::Metadata *, 4> AspectsMD; | ||
|
||
| // sincos() doesn't return a value, but it still has a type associated with | ||
| // it that corresponds to the operand type. | ||
| CGF.CGM.getFPAccuracyFuncAttributes( | ||
| Name, AttrList, ID, | ||
| Name, AttrList, AspectsMD, ID, | ||
| Name == "sincos" ? Args[0]->getType() : FPBuiltinF->getReturnType()); | ||
| CI->setAttributes(AttrList); | ||
| if (!AspectsMD.empty()) | ||
| CI->setMetadata("sycl_used_aspects", | ||
| llvm::MDNode::get(CGF.CGM.getLLVMContext(), AspectsMD)); | ||
| return CI; | ||
| } | ||
|
|
||
|
|
@@ -22144,7 +22150,8 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall( | |
| // Even if the current function doesn't have a clang builtin, create | ||
| // an 'fpbuiltin-max-error' attribute for it; unless it's marked with | ||
| // an NoBuiltin attribute. | ||
| if (!FD->hasAttr<NoBuiltinAttr>()) { | ||
| if (!FD->hasAttr<NoBuiltinAttr>() && | ||
| FD->getNameInfo().getName().isIdentifier()) { | ||
|
||
| Name = FD->getName(); | ||
| FPAccuracyIntrinsicID = | ||
| llvm::StringSwitch<unsigned>(Name) | ||
|
|
@@ -22155,7 +22162,11 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall( | |
| .Case("frem", llvm::Intrinsic::fpbuiltin_frem) | ||
| .Case("sincos", llvm::Intrinsic::fpbuiltin_sincos) | ||
| .Case("exp10", llvm::Intrinsic::fpbuiltin_exp10) | ||
| .Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt); | ||
| .Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt) | ||
| .Default(0); | ||
| if (!FPAccuracyIntrinsicID) { | ||
|
||
| return nullptr; | ||
| } | ||
| } else { | ||
| return nullptr; | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ | |
| #include "CGCXXABI.h" | ||
| #include "CGCleanup.h" | ||
| #include "CGRecordLayout.h" | ||
| #include "CGSYCLRuntime.h" | ||
| #include "CodeGenFunction.h" | ||
| #include "CodeGenModule.h" | ||
| #include "TargetInfo.h" | ||
|
|
@@ -1846,8 +1847,18 @@ static llvm::fp::FPAccuracy convertFPAccuracy(StringRef FPAccuracyStr) { | |
| .Case("cuda", llvm::fp::FPAccuracy::CUDA); | ||
| } | ||
|
|
||
| static int32_t convertFPAccuracyToAspect(StringRef FPAccuracyStr) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need to add an assert here to ensure this function is called with appropriate FPAccuracyStr? Thanks
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added assert, thanks. |
||
| return llvm::StringSwitch<int32_t>(FPAccuracyStr) | ||
| .Case("high", SYCLInternalAspect::fp_intrinsic_accuracy_high) | ||
| .Case("medium", SYCLInternalAspect::fp_intrinsic_accuracy_medium) | ||
| .Case("low", SYCLInternalAspect::fp_intrinsic_accuracy_low) | ||
| .Case("sycl", SYCLInternalAspect::fp_intrinsic_accuracy_sycl) | ||
| .Case("cuda", SYCLInternalAspect::fp_intrinsic_accuracy_cuda); | ||
| } | ||
|
|
||
| void CodeGenModule::getDefaultFunctionFPAccuracyAttributes( | ||
| StringRef Name, llvm::AttrBuilder &FuncAttrs, unsigned ID, | ||
| StringRef Name, llvm::AttrBuilder &FuncAttrs, | ||
| SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID, | ||
| const llvm::Type *FuncType) { | ||
| // Priority is given to to the accuracy specific to the function. | ||
| // So, if the command line is something like this: | ||
|
|
@@ -1864,6 +1875,9 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes( | |
| ID, FuncType, convertFPAccuracy(FuncMapIt->second)); | ||
| assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected"); | ||
| FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal); | ||
| if (getLangOpts().SYCLIsDevice) | ||
|
||
| MDs.push_back(llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( | ||
| Int32Ty, convertFPAccuracyToAspect(FuncMapIt->second)))); | ||
| } | ||
| } | ||
| if (FuncAttrs.attrs().size() == 0) | ||
|
|
@@ -1872,6 +1886,9 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes( | |
| ID, FuncType, convertFPAccuracy(getLangOpts().FPAccuracyVal)); | ||
| assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected"); | ||
| FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal); | ||
| if (getLangOpts().SYCLIsDevice) | ||
| MDs.push_back(llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( | ||
| Int32Ty, convertFPAccuracyToAspect(getLangOpts().FPAccuracyVal)))); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -5620,8 +5637,9 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, | |
| // Emit the actual call/invoke instruction. | ||
| llvm::CallBase *CI; | ||
| if (!InvokeDest) { | ||
| if (!getLangOpts().FPAccuracyFuncMap.empty() || | ||
| !getLangOpts().FPAccuracyVal.empty()) { | ||
| if ((!getLangOpts().FPAccuracyFuncMap.empty() || | ||
| !getLangOpts().FPAccuracyVal.empty()) && | ||
| isa_and_nonnull<FunctionDecl>(TargetDecl)) { | ||
|
||
| const auto *FD = dyn_cast_if_present<FunctionDecl>(TargetDecl); | ||
| assert(FD && "expecting a function"); | ||
| CI = EmitFPBuiltinIndirectCall(IRFuncTy, IRCallArgs, CalleePtr, FD); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,138 @@ | ||
| // RUN: %clangxx %s -o %test.bc -ffp-accuracy=high:sin,sqrt -ffp-accuracy=medium:cos -ffp-accuracy=low:tan -ffp-accuracy=cuda:exp,acos -ffp-accuracy=sycl:log,asin -fno-math-errno -fsycl -fsycl-device-only | ||
|
||
| // RUN: sycl-post-link -split=auto -symbols %test.bc -o %test.table | ||
| // RUN: FileCheck %s -input-file=%test.table --check-prefixes CHECK-TABLE | ||
| // RUN: FileCheck %s -input-file=%test_0.sym --check-prefixes CHECK-M0-SYMS | ||
| // RUN: FileCheck %s -input-file=%test_1.sym --check-prefixes CHECK-M1-SYMS | ||
| // RUN: FileCheck %s -input-file=%test_2.sym --check-prefixes CHECK-M2-SYMS | ||
| // RUN: FileCheck %s -input-file=%test_3.sym --check-prefixes CHECK-M3-SYMS | ||
| // RUN: FileCheck %s -input-file=%test_4.sym --check-prefixes CHECK-M4-SYMS | ||
| // RUN: FileCheck %s -input-file=%test_5.sym --check-prefixes CHECK-M5-SYMS | ||
|
|
||
| // Tests that kernels which use different fp-accuracy level end up in different | ||
| // device images. | ||
|
|
||
| // CHECK-TABLE: Code | ||
| // CHECK-TABLE-NEXT: _0.sym | ||
| // CHECK-TABLE-NEXT: _1.sym | ||
| // CHECK-TABLE-NEXT: _2.sym | ||
| // CHECK-TABLE-NEXT: _3.sym | ||
| // CHECK-TABLE-NEXT: _4.sym | ||
| // CHECK-TABLE-NEXT: _5.sym | ||
| // CHECK-TABLE-NEXT: _6.sym | ||
| // CHECK-TABLE-EMPTY: | ||
|
|
||
| // CHECK-M0-SYMS: __pf_kernel_wrapper{{.*}}Kernel1 | ||
| // CHECK-M0-SYMS-NEXT: Kernel1 | ||
| // CHECK-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel7 | ||
| // CHECK-M0-SYMS-NEXT: Kernel7 | ||
| // CHECK-M0-SYMS-EMPTY: | ||
|
|
||
| // CHECK-M1-SYMS: __pf_kernel_wrapper{{.*}}Kernel2 | ||
| // CHECK-M1-SYMS-NEXT: Kernel2 | ||
| // CHECK-M1-SYMS-EMPTY: | ||
|
|
||
| // CHECK-M2-SYMS: __pf_kernel_wrapper{{.*}}Kernel3 | ||
| // CHECK-M2-SYMS-NEXT: Kernel3 | ||
| // CHECK-M2-SYMS-EMPTY: | ||
|
|
||
| // CHECK-M3-SYMS: __pf_kernel_wrapper{{.*}}Kernel6 | ||
| // CHECK-M3-SYMS-NEXT: Kernel6 | ||
| // CHECK-M3-SYMS-EMPTY: | ||
|
|
||
| // CHECK-M4-SYMS: __pf_kernel_wrapper{{.*}}Kernel4 | ||
| // CHECK-M4-SYMS-NEXT: Kernel4 | ||
| // CHECK-M4-SYMS-EMPTY: | ||
|
|
||
| // CHECK-M5-SYMS: __pf_kernel_wrapper{{.*}}Kernel5 | ||
| // CHECK-M5-SYMS-NEXT: Kernel5 | ||
| // CHECK-M5-SYMS-EMPTY: | ||
|
|
||
| // CHECK-M6-SYMS: __pf_kernel_wrapper{{.*}}Kernel0 | ||
| // CHECK-M6-SYMS-NEXT: Kernel0 | ||
| // CHECK-M6-SYMS-EMPTY: | ||
|
|
||
| #include <array> | ||
| #include <cmath> | ||
| #include <iostream> | ||
| #include <sycl/sycl.hpp> | ||
|
|
||
| using namespace sycl; | ||
|
|
||
| constexpr access::mode sycl_read = access::mode::read; | ||
| constexpr access::mode sycl_write = access::mode::write; | ||
|
|
||
| int main() { | ||
| const size_t array_size = 4; | ||
| std::array<double, array_size> D = {{1., 2., 3., 4.}}, E; | ||
| queue deviceQueue; | ||
| range<1> numOfItems{array_size}; | ||
| double Value = 5.; | ||
| buffer<double, 1> bufferOut(E.data(), numOfItems); | ||
|
|
||
| // Kernel0 doesn't use math functions. | ||
| deviceQueue.submit([&](handler &cgh) { | ||
| auto accessorOut = bufferOut.template get_access<sycl_write>(cgh); | ||
|
|
||
| cgh.parallel_for<class Kernel0>( | ||
| numOfItems, [=](id<1> wiID) { accessorOut[wiID] = Value; }); | ||
| }); | ||
|
|
||
| // Kernel1 uses high-accuracy sin. | ||
| deviceQueue.submit([&](handler &cgh) { | ||
| auto accessorOut = bufferOut.template get_access<sycl_write>(cgh); | ||
|
|
||
| cgh.parallel_for<class Kernel1>( | ||
| numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::sin(Value); }); | ||
| }); | ||
|
|
||
| // Kernel2 uses medium-accuracy cos. | ||
| deviceQueue.submit([&](handler &cgh) { | ||
| auto accessorOut = bufferOut.template get_access<sycl_write>(cgh); | ||
|
|
||
| cgh.parallel_for<class Kernel2>( | ||
| numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::cos(Value); }); | ||
| }); | ||
|
|
||
| // Kernel3 uses low-accuracy tan. | ||
| deviceQueue.submit([&](handler &cgh) { | ||
| auto accessorOut = bufferOut.template get_access<sycl_write>(cgh); | ||
|
|
||
| cgh.parallel_for<class Kernel3>( | ||
| numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::tan(Value); }); | ||
| }); | ||
|
|
||
| // Kernel4 uses cuda-accuracy exp and sycl-accuracy log. | ||
| deviceQueue.submit([&](handler &cgh) { | ||
| auto accessorOut = bufferOut.template get_access<sycl_write>(cgh); | ||
|
|
||
| cgh.parallel_for<class Kernel4>(numOfItems, [=](id<1> wiID) { | ||
| accessorOut[wiID] = std::log(std::exp(Value)); | ||
| }); | ||
| }); | ||
|
|
||
| // Kernel5 uses cuda-accuracy acos. | ||
| deviceQueue.submit([&](handler &cgh) { | ||
| auto accessorOut = bufferOut.template get_access<sycl_write>(cgh); | ||
|
|
||
| cgh.parallel_for<class Kernel5>( | ||
| numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::acos(Value); }); | ||
| }); | ||
|
|
||
| // Kernel6 uses sycl-accuracy asin. | ||
| deviceQueue.submit([&](handler &cgh) { | ||
| auto accessorOut = bufferOut.template get_access<sycl_write>(cgh); | ||
|
|
||
| cgh.parallel_for<class Kernel6>( | ||
| numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::asin(Value); }); | ||
| }); | ||
|
|
||
| // Kernel7 uses high-accuracy sqrt. | ||
| deviceQueue.submit([&](handler &cgh) { | ||
| auto accessorOut = bufferOut.template get_access<sycl_write>(cgh); | ||
|
|
||
| cgh.parallel_for<class Kernel7>( | ||
| numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::sqrt(Value); }); | ||
| }); | ||
|
|
||
| return 0; | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you need this include here? I don't see you using internal aspects in this file.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indeed, removed.