diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 57d3bc03bc264..d70896c30ba4b 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -513,12 +513,18 @@ static CallInst *CreateBuiltinCallWithAttr(CodeGenFunction &CGF, StringRef Name, // TODO: Replace AttrList with a single attribute. The call can only have a // single FPAccuracy attribute. llvm::AttributeList AttrList; + // "sycl_used_aspects" metadata associated with the call. + llvm::Metadata *AspectMD = nullptr; // sincos() doesn't return a value, but it still has a type associated with // it that corresponds to the operand type. CGF.CGM.getFPAccuracyFuncAttributes( - Name, AttrList, ID, + Name, AttrList, AspectMD, ID, Name == "sincos" ? Args[0]->getType() : FPBuiltinF->getReturnType()); CI->setAttributes(AttrList); + + if (CGF.getLangOpts().SYCLIsDevice && AspectMD) + CI->setMetadata("sycl_used_aspects", + llvm::MDNode::get(CGF.CGM.getLLVMContext(), AspectMD)); return CI; } @@ -22144,21 +22150,22 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall( // Even if the current function doesn't have a clang builtin, create // an 'fpbuiltin-max-error' attribute for it; unless it's marked with // an NoBuiltin attribute. 
- if (!FD->hasAttr()) { - Name = FD->getName(); - FPAccuracyIntrinsicID = - llvm::StringSwitch(Name) - .Case("fadd", llvm::Intrinsic::fpbuiltin_fadd) - .Case("fdiv", llvm::Intrinsic::fpbuiltin_fdiv) - .Case("fmul", llvm::Intrinsic::fpbuiltin_fmul) - .Case("fsub", llvm::Intrinsic::fpbuiltin_fsub) - .Case("frem", llvm::Intrinsic::fpbuiltin_frem) - .Case("sincos", llvm::Intrinsic::fpbuiltin_sincos) - .Case("exp10", llvm::Intrinsic::fpbuiltin_exp10) - .Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt); - } else { + if (FD->hasAttr() || + !FD->getNameInfo().getName().isIdentifier()) return nullptr; - } + + Name = FD->getName(); + FPAccuracyIntrinsicID = + llvm::StringSwitch(Name) + .Case("fadd", llvm::Intrinsic::fpbuiltin_fadd) + .Case("fdiv", llvm::Intrinsic::fpbuiltin_fdiv) + .Case("fmul", llvm::Intrinsic::fpbuiltin_fmul) + .Case("fsub", llvm::Intrinsic::fpbuiltin_fsub) + .Case("frem", llvm::Intrinsic::fpbuiltin_frem) + .Case("sincos", llvm::Intrinsic::fpbuiltin_sincos) + .Case("exp10", llvm::Intrinsic::fpbuiltin_exp10) + .Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt) + .Default(0); } else { // The function has a clang builtin. Create an attribute for it // only if it has an fpbuiltin intrinsic. 
@@ -22238,6 +22245,9 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall( break; } } + if (!FPAccuracyIntrinsicID) + return nullptr; + Func = CGM.getIntrinsic(FPAccuracyIntrinsicID, IRArgs[0]->getType()); return CreateBuiltinCallWithAttr(*this, Name, Func, ArrayRef(IRArgs), FPAccuracyIntrinsicID); diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 866c7d72ad6f1..15b84cb73875d 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -17,6 +17,7 @@ #include "CGCXXABI.h" #include "CGCleanup.h" #include "CGRecordLayout.h" +#include "CGSYCLRuntime.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" #include "TargetInfo.h" @@ -1846,9 +1847,21 @@ static llvm::fp::FPAccuracy convertFPAccuracy(StringRef FPAccuracyStr) { .Case("cuda", llvm::fp::FPAccuracy::CUDA); } +static int32_t convertFPAccuracyToAspect(StringRef FPAccuracyStr) { + assert(FPAccuracyStr.equals("high") || FPAccuracyStr.equals("medium") || + FPAccuracyStr.equals("low") || FPAccuracyStr.equals("sycl") || + FPAccuracyStr.equals("cuda")); + return llvm::StringSwitch(FPAccuracyStr) + .Case("high", SYCLInternalAspect::fp_intrinsic_accuracy_high) + .Case("medium", SYCLInternalAspect::fp_intrinsic_accuracy_medium) + .Case("low", SYCLInternalAspect::fp_intrinsic_accuracy_low) + .Case("sycl", SYCLInternalAspect::fp_intrinsic_accuracy_sycl) + .Case("cuda", SYCLInternalAspect::fp_intrinsic_accuracy_cuda); +} + void CodeGenModule::getDefaultFunctionFPAccuracyAttributes( - StringRef Name, llvm::AttrBuilder &FuncAttrs, unsigned ID, - const llvm::Type *FuncType) { + StringRef Name, llvm::AttrBuilder &FuncAttrs, llvm::Metadata *&MD, + unsigned ID, const llvm::Type *FuncType) { // Priority is given to to the accuracy specific to the function. // So, if the command line is something like this: // 'clang -fp-accuracy = high -fp-accuracy = low:[sin]'. 
@@ -1864,6 +1877,8 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes( ID, FuncType, convertFPAccuracy(FuncMapIt->second)); assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected"); FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal); + MD = llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( + Int32Ty, convertFPAccuracyToAspect(FuncMapIt->second))); } } if (FuncAttrs.attrs().size() == 0) @@ -1872,6 +1887,8 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes( ID, FuncType, convertFPAccuracy(getLangOpts().FPAccuracyVal)); assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected"); FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal); + MD = llvm::ConstantAsMetadata::get(llvm::ConstantInt::get( + Int32Ty, convertFPAccuracyToAspect(getLangOpts().FPAccuracyVal))); } } diff --git a/clang/lib/CodeGen/CGSYCLRuntime.h b/clang/lib/CodeGen/CGSYCLRuntime.h index 71bc45599516f..85a18e4bb590e 100644 --- a/clang/lib/CodeGen/CGSYCLRuntime.h +++ b/clang/lib/CodeGen/CGSYCLRuntime.h @@ -23,6 +23,19 @@ namespace CodeGen { class CodeGenModule; +// These aspects are internal and used for device image splitting purposes only. +// They are not exposed to the SYCL users through "aspect" enum. That's why +// they are intentionally assigned negative values to filter them out at the +// stage of embedding used aspects as device requirements to the executable. +// We don't pass these internal aspects to the SYCL RT. 
+enum SYCLInternalAspect : int32_t { + fp_intrinsic_accuracy_high = -1, + fp_intrinsic_accuracy_medium = -2, + fp_intrinsic_accuracy_low = -3, + fp_intrinsic_accuracy_sycl = -4, + fp_intrinsic_accuracy_cuda = -5, +}; + class CGSYCLRuntime { protected: CodeGenModule &CGM; diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 475f8cf794424..9f1dc07343067 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -7902,10 +7902,11 @@ void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) { void CodeGenModule::getFPAccuracyFuncAttributes(StringRef Name, llvm::AttributeList &AttrList, + llvm::Metadata *&MD, unsigned ID, const llvm::Type *FuncType) { llvm::AttrBuilder FuncAttrs(getLLVMContext()); - getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, ID, FuncType); + getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, MD, ID, FuncType); AttrList = llvm::AttributeList::get( getLLVMContext(), llvm::AttributeList::FunctionIndex, FuncAttrs); } diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index 5d1521da2da63..56e9a9358ba3e 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -1594,7 +1594,8 @@ class CodeGenModule : public CodeGenTypeCache { void moveLazyEmissionStates(CodeGenModule *NewBuilder); void getFPAccuracyFuncAttributes(StringRef Name, - llvm::AttributeList &AttrList, unsigned ID, + llvm::AttributeList &AttrList, + llvm::Metadata *&MDs, unsigned ID, const llvm::Type *FuncType); private: @@ -1793,7 +1794,7 @@ class CodeGenModule : public CodeGenTypeCache { void getDefaultFunctionFPAccuracyAttributes(StringRef Name, llvm::AttrBuilder &FuncAttrs, - unsigned ID, + llvm::Metadata *&MD, unsigned ID, const llvm::Type *FuncType); llvm::Metadata *CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map, diff --git a/clang/test/CodeGenSYCL/fp-accuracy.cpp b/clang/test/CodeGenSYCL/fp-accuracy.cpp new file 
mode 100644 index 0000000000000..322b7f8ac65a7 --- /dev/null +++ b/clang/test/CodeGenSYCL/fp-accuracy.cpp @@ -0,0 +1,127 @@ +// RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=low:tan -ffp-builtin-accuracy=cuda:exp,acos -ffp-builtin-accuracy=sycl:log,asin -emit-llvm -triple spir64-unknown-unknown %s -o - | FileCheck --check-prefix CHECK-FUNC %s +// RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ffp-builtin-accuracy=high -emit-llvm -triple spir64-unknown-unknown %s -o - | FileCheck --check-prefix CHECK-TU %s +// RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ffp-builtin-accuracy=medium -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=cuda:exp -ffp-builtin-accuracy=sycl:log -emit-llvm -triple spir64-unknown-unknown %s -o - | FileCheck --check-prefix CHECK-MIX %s + +// Tests that sycl_used_aspects metadata is attached to the fpbuiltin call based on -ffp-accuracy option. + +#include "sycl.hpp" + +extern "C" SYCL_EXTERNAL double sin(double); +extern "C" SYCL_EXTERNAL double cos(double); +extern "C" SYCL_EXTERNAL double tan(double); +extern "C" SYCL_EXTERNAL double log(double); +extern "C" SYCL_EXTERNAL double exp(double); +extern "C" SYCL_EXTERNAL double acos(double); +extern "C" SYCL_EXTERNAL double asin(double); +extern "C" SYCL_EXTERNAL double sqrt(double); + +using namespace sycl; + +int main() { + const unsigned array_size = 4; + double Value = .5; + queue deviceQueue; + range<1> numOfItems{array_size}; + + // Kernel0 doesn't use math functions. + deviceQueue.submit([&](handler& cgh) { + cgh.parallel_for(numOfItems, + [=](id<1> wiID) { + (void)Value; + }); + }); + + // Kernel1 uses high-accuracy sin. 
+ deviceQueue.submit([&](handler& cgh) { + cgh.parallel_for(numOfItems, + [=](id<1> wiID) { +// CHECK-FUNC: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC:[0-9]+]] +// CHECK-TU: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC:[0-9]+]] +// CHECK-MIX: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC:[0-9]+]] + (void)sin(Value); + }); + }); + + deviceQueue.submit([&](handler& cgh) { + cgh.parallel_for(numOfItems, + [=](id<1> wiID) { +// CHECK-FUNC: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC:[0-9]+]] +// CHECK-TU: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]] +// CHECK-MIX: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC:[0-9]+]] + (void)cos(Value); + }); + }); + + // Kernel3 uses low-accuracy tan. + deviceQueue.submit([&](handler& cgh) { + cgh.parallel_for(numOfItems, + [=](id<1> wiID) { +// CHECK-FUNC: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[LOW_ACC:[0-9]+]] +// CHECK-TU: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]] +// CHECK-MIX: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC]] + (void)tan(Value); + }); + }); + + // Kernel4 uses cuda-accuracy exp and sycl-accuracy log. 
+ deviceQueue.submit([&](handler& cgh) { + cgh.parallel_for(numOfItems, + [=](id<1> wiID) { +// CHECK-FUNC: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[CUDA_ACC:[0-9]+]] +// CHECK-FUNC: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[SYCL_ACC:[0-9]+]] +// CHECK-TU: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]] +// CHECK-TU: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]] +// CHECK-MIX: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[CUDA_ACC:[0-9]+]] +// CHECK-MIX: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[SYCL_ACC:[0-9]+]] + (void)log(exp(Value)); + }); + }); + deviceQueue.wait(); + + // Kernel5 uses cuda-accuracy acos. + deviceQueue.submit([&](handler& cgh) { + cgh.parallel_for(numOfItems, + [=](id<1> wiID) { +// CHECK-FUNC: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[CUDA_ACC]] +// CHECK-TU: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]] +// CHECK-MIX: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC]] + (void)acos(Value); + }); + }); + + // Kernel6 uses sycl-accuracy asin. + deviceQueue.submit([&](handler& cgh) { + cgh.parallel_for(numOfItems, + [=](id<1> wiID) { +// CHECK-FUNC: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[SYCL_ACC]] +// CHECK-TU: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]] +// CHECK-MIX: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC]] + (void)asin(Value); + }); + }); + + // Kernel7 uses high-accuracy sqrt. 
+ deviceQueue.submit([&](handler& cgh) { + cgh.parallel_for(numOfItems, + [=](id<1> wiID) { +// CHECK-FUNC: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]] +// CHECK-TU: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]] +// CHECK-MIX: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]] + (void)sqrt(Value); + }); + }); + return 0; +} + +// CHECK-FUNC: [[HIGH_ACC]] = !{i32 -1} +// CHECK-FUNC: [[MEDIUM_ACC]] = !{i32 -2} +// CHECK-FUNC: [[LOW_ACC]] = !{i32 -3} +// CHECK-FUNC: [[CUDA_ACC]] = !{i32 -5} +// CHECK-FUNC: [[SYCL_ACC]] = !{i32 -4} + +// CHECK-TU: [[HIGH_ACC]] = !{i32 -1} + +// CHECK-MIX: [[HIGH_ACC]] = !{i32 -1} +// CHECK-MIX: [[MEDIUM_ACC]] = !{i32 -2} +// CHECK-MIX: [[CUDA_ACC]] = !{i32 -5} +// CHECK-MIX: [[SYCL_ACC]] = !{i32 -4} diff --git a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp index e846b9141d63c..4b37c267f2353 100644 --- a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp +++ b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp @@ -255,6 +255,13 @@ AspectsSetTy getAspectsUsedByInstruction(const Instruction &I, Result.insert(Aspects.begin(), Aspects.end()); } + if (const MDNode *InstApsects = I.getMetadata("sycl_used_aspects")) { + for (const MDOperand &MDOp : InstApsects->operands()) { + const Constant *C = cast(MDOp)->getValue(); + Result.insert(cast(C)->getSExtValue()); + } + } + return Result; } diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll new file mode 100644 index 0000000000000..47df6a804eabb --- /dev/null +++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll @@ -0,0 +1,66 @@ +; RUN: opt -passes=sycl-propagate-aspects-usage < %s -S | FileCheck %s +; +; Test checks that the pass is able to propagate information about 
aspects +; used in the instruction through a call graph +; +; K1 K2 +; / \/ \ +; F1 F2 F3 +; +; F1 doesn't use optional type and doesn't have instruction with attached 'sycl_used_aspects' metadata. +; F2 uses optional A and has instruction with attached 'sycl_used_aspects' metadata. +; F3 uses optional B and has instruction with attached 'sycl_used_aspects' metadata. + +%Optional.A = type { i32 } +%Optional.B = type { i32 } + +; CHECK: spir_kernel void @kernel1() !sycl_used_aspects ![[#ID1:]] +define spir_kernel void @kernel1() { + call spir_func void @func1() + call spir_func void @func2() + ret void +} + +; CHECK: spir_kernel void @kernel2() !sycl_used_aspects ![[#ID2:]] +define spir_kernel void @kernel2() { + call spir_func void @func2() + call spir_func void @func3() + ret void +} + +; CHECK: spir_func void @func1() { +define spir_func void @func1() { + %tmp = alloca i32 + ret void +} + +declare void @llvm.fpbuiltin.f64() + +; CHECK: spir_func void @func2() !sycl_used_aspects ![[#ID1]] { +define spir_func void @func2() { + %tmp1 = alloca %Optional.A + call void @llvm.fpbuiltin.f64(), !sycl_used_aspects !3 + ret void +} + +; CHECK: spir_func void @func3() !sycl_used_aspects ![[#ID3:]] { +define spir_func void @func3() { + %tmp = alloca %Optional.B + call void @llvm.fpbuiltin.f64(), !sycl_used_aspects !4 + ret void +} + +!sycl_types_that_use_aspects = !{!0, !1} +!0 = !{!"Optional.A", i32 1} +!1 = !{!"Optional.B", i32 2} + +!sycl_aspects = !{!2} +!2 = !{!"fp64", i32 6} +!3 = !{i32 -1} +!4 = !{i32 -2} + +; CHECK: ![[#ID1]] = !{i32 1, i32 -1} +; CHECK: ![[#ID2]] = !{i32 1, i32 -1, i32 2, i32 -2} +; CHECK: ![[#ID3]] = !{i32 2, i32 -2} + + diff --git a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp index 4aa28dc4ff643..a9c791877a079 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp @@ -22,10 +22,10 @@ void 
llvm::getSYCLDeviceRequirements( const module_split::ModuleDesc &MD, std::map &Requirements) { auto ExtractIntegerFromMDNodeOperand = [=](const MDNode *N, - unsigned OpNo) -> unsigned { + unsigned OpNo) -> int32_t { Constant *C = cast(N->getOperand(OpNo).get())->getValue(); - return static_cast(C->getUniqueInteger().getZExtValue()); + return static_cast(C->getUniqueInteger().getSExtValue()); }; // { LLVM-IR metadata name , [SYCL/Device requirements] property name }, see: @@ -41,10 +41,16 @@ void llvm::getSYCLDeviceRequirements( std::set Values; for (const Function &F : MD.getModule()) { if (const MDNode *MDN = F.getMetadata(MDName)) { - for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) - Values.insert(ExtractIntegerFromMDNodeOperand(MDN, I)); + for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) { + // Don't put internal aspects (with negative integer value) into the + // requirements, they are used only for device image splitting. + auto Val = ExtractIntegerFromMDNodeOperand(MDN, I); + if (Val >= 0) + Values.insert(Val); + } } } + // We don't need the "fixed_target" property if it's empty if (std::string(MDName) == "sycl_fixed_targets" && Values.empty()) continue; @@ -64,10 +70,11 @@ void llvm::getSYCLDeviceRequirements( if (auto *MDN = F->getMetadata("intel_reqd_sub_group_size")) { assert(MDN->getNumOperands() == 1); auto MDValue = ExtractIntegerFromMDNodeOperand(MDN, 0); + assert(MDValue >= 0); if (!SubGroupSize) SubGroupSize = MDValue; else - assert(*SubGroupSize == MDValue); + assert(*SubGroupSize == static_cast(MDValue)); } } // Do not attach reqd_sub_group_size if there is no attached metadata diff --git a/sycl/doc/design/OptionalDeviceFeatures.md b/sycl/doc/design/OptionalDeviceFeatures.md index 3bb601abb12c6..c3a1202d4ec6c 100644 --- a/sycl/doc/design/OptionalDeviceFeatures.md +++ b/sycl/doc/design/OptionalDeviceFeatures.md @@ -553,7 +553,8 @@ type because the front-end does not include that type in the `!sycl_types_that_use_aspects` set. 
If a function references the `double` type, the implementation implicitly assumes that the function uses `aspect::fp64` and adds that aspect to the function's `!sycl_used_aspects` -set. +set. If `!sycl_used_aspects` is attached to an instruction, then it is also added +to the function's `!sycl_used_aspects` set. **NOTE**: This scan of the IR will require comparing the type referenced by each IR instruction with the names of the types in the @@ -1148,6 +1149,24 @@ Kernel has a required sub-group size of '32' but device does not support this sub-group size. ``` +### SYCL internal aspects for device image splitting + +There are scenarios where we would like to split device images based on +optional kernel features but we don't want to expose the corresponding +aspects to the user. Internal SYCL aspects are used for this purpose. + +To differentiate them from regular aspects, internal aspects are assigned +negative values. If an optional feature is used in the kernel, then the SYCL +device compiler adds the value of the internal aspect to the 'sycl_used_aspects' metadata. +That value gets propagated through the call graph and participates in device image +splitting together with regular aspects, but it is not passed to the SYCL runtime: +it is filtered out when generating the set of device requirements. + +A new value can be added to the 'SYCLInternalAspect' enum to introduce a new internal +aspect. + +An example of internal aspect usage is splitting device images based on the floating +point accuracy level for math functions, as specified by the user with the -ffp-accuracy option. ## Appendix: Adding an attribute to 8-byte `atomic_ref` diff --git a/sycl/test/optional_kernel_features/fp-accuracy.cpp b/sycl/test/optional_kernel_features/fp-accuracy.cpp new file mode 100644 index 0000000000000..80acc2baa893f --- /dev/null +++ b/sycl/test/optional_kernel_features/fp-accuracy.cpp @@ -0,0 +1,226 @@ +// Tests that kernels which use different fp-accuracy levels end up in different +// device images. + +// 1. 
Accuracy is specified for particular math functions. +// RUN: %clangxx %s -o %test_func.bc -ffp-accuracy=high:sin,sqrt -ffp-accuracy=medium:cos -ffp-accuracy=low:tan -ffp-accuracy=cuda:exp,acos -ffp-accuracy=sycl:log,asin -fno-math-errno -fsycl -fsycl-device-only +// RUN: sycl-post-link -split=auto -symbols %test_func.bc -o %test_func.table +// RUN: FileCheck %s -input-file=%test_func.table --check-prefixes CHECK-FUNC-TABLE +// RUN: FileCheck %s -input-file=%test_func_0.sym --check-prefixes CHECK-FUNC-M0-SYMS +// RUN: FileCheck %s -input-file=%test_func_1.sym --check-prefixes CHECK-FUNC-M1-SYMS +// RUN: FileCheck %s -input-file=%test_func_2.sym --check-prefixes CHECK-FUNC-M2-SYMS +// RUN: FileCheck %s -input-file=%test_func_3.sym --check-prefixes CHECK-FUNC-M3-SYMS +// RUN: FileCheck %s -input-file=%test_func_4.sym --check-prefixes CHECK-FUNC-M4-SYMS +// RUN: FileCheck %s -input-file=%test_func_5.sym --check-prefixes CHECK-FUNC-M5-SYMS + +// 2. Accuracy is specified for TU. +// RUN: %clangxx %s -o %test_tu.bc -ffp-accuracy=high -fno-math-errno -fsycl -fsycl-device-only +// RUN: sycl-post-link -split=auto -symbols %test_tu.bc -o %test_tu.table +// RUN: FileCheck %s -input-file=%test_tu.table --check-prefixes CHECK-TU-TABLE +// RUN: FileCheck %s -input-file=%test_tu_0.sym --check-prefixes CHECK-TU-M0-SYMS +// RUN: FileCheck %s -input-file=%test_tu_1.sym --check-prefixes CHECK-TU-M1-SYMS + +// 3. Mixed case. 
+// RUN: %clangxx %s -o %test_mix.bc -ffp-accuracy=medium -ffp-accuracy=high:sin,sqrt -ffp-accuracy=medium:cos -ffp-accuracy=cuda:exp -ffp-accuracy=sycl:log -fno-math-errno -fsycl -fsycl-device-only +// RUN: sycl-post-link -split=auto -symbols %test_mix.bc -o %test_mix.table +// RUN: FileCheck %s -input-file=%test_mix.table --check-prefixes CHECK-MIX-TABLE +// RUN: FileCheck %s -input-file=%test_mix_0.sym --check-prefixes CHECK-MIX-M0-SYMS +// RUN: FileCheck %s -input-file=%test_mix_1.sym --check-prefixes CHECK-MIX-M1-SYMS +// RUN: FileCheck %s -input-file=%test_mix_2.sym --check-prefixes CHECK-MIX-M2-SYMS +// RUN: FileCheck %s -input-file=%test_mix_3.sym --check-prefixes CHECK-MIX-M3-SYMS + +// CHECK-FUNC-TABLE: Code +// CHECK-FUNC-TABLE-NEXT: _0.sym +// CHECK-FUNC-TABLE-NEXT: _1.sym +// CHECK-FUNC-TABLE-NEXT: _2.sym +// CHECK-FUNC-TABLE-NEXT: _3.sym +// CHECK-FUNC-TABLE-NEXT: _4.sym +// CHECK-FUNC-TABLE-NEXT: _5.sym +// CHECK-FUNC-TABLE-NEXT: _6.sym +// CHECK-FUNC-TABLE-EMPTY: + +// CHECK-TU-TABLE: Code +// CHECK-TU-TABLE-NEXT: _0.sym +// CHECK-TU-TABLE-NEXT: _1.sym +// CHECK-TU-TABLE-EMPTY: + +// CHECK-MIX-TABLE: Code +// CHECK-MIX-TABLE-NEXT: _0.sym +// CHECK-MIX-TABLE-NEXT: _1.sym +// CHECK-MIX-TABLE-NEXT: _2.sym +// CHECK-MIX-TABLE-NEXT: _3.sym +// CHECK-MIX-TABLE-EMPTY: + +// CHECK-FUNC-M0-SYMS: __pf_kernel_wrapper{{.*}}Kernel1 +// CHECK-FUNC-M0-SYMS-NEXT: Kernel1 +// CHECK-FUNC-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel7 +// CHECK-FUNC-M0-SYMS-NEXT: Kernel7 +// CHECK-FUNC-M0-SYMS-EMPTY: + +// CHECK-FUNC-M1-SYMS: __pf_kernel_wrapper{{.*}}Kernel2 +// CHECK-FUNC-M1-SYMS-NEXT: Kernel2 +// CHECK-FUNC-M1-SYMS-EMPTY: + +// CHECK-FUNC-M2-SYMS: __pf_kernel_wrapper{{.*}}Kernel3 +// CHECK-FUNC-M2-SYMS-NEXT: Kernel3 +// CHECK-FUNC-M2-SYMS-EMPTY: + +// CHECK-FUNC-M3-SYMS: __pf_kernel_wrapper{{.*}}Kernel6 +// CHECK-FUNC-M3-SYMS-NEXT: Kernel6 +// CHECK-FUNC-M3-SYMS-EMPTY: + +// CHECK-FUNC-M4-SYMS: __pf_kernel_wrapper{{.*}}Kernel4 +// CHECK-FUNC-M4-SYMS-NEXT: Kernel4 
+// CHECK-FUNC-M4-SYMS-EMPTY: + +// CHECK-FUNC-M5-SYMS: __pf_kernel_wrapper{{.*}}Kernel5 +// CHECK-FUNC-M5-SYMS-NEXT: Kernel5 +// CHECK-FUNC-M5-SYMS-EMPTY: + +// CHECK-FUNC-M6-SYMS: __pf_kernel_wrapper{{.*}}Kernel0 +// CHECK-FUNC-M6-SYMS-NEXT: Kernel0 +// CHECK-FUNC-M6-SYMS-EMPTY: + +// CHECK-TU-M0-SYMS: __pf_kernel_wrapper{{.*}}Kernel1 +// CHECK-TU-M0-SYMS-NEXT: Kernel1 +// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel2 +// CHECK-TU-M0-SYMS-NEXT: Kernel2 +// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel3 +// CHECK-TU-M0-SYMS-NEXT: Kernel3 +// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel4 +// CHECK-TU-M0-SYMS-NEXT: Kernel4 +// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel5 +// CHECK-TU-M0-SYMS-NEXT: Kernel5 +// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel6 +// CHECK-TU-M0-SYMS-NEXT: Kernel6 +// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel7 +// CHECK-TU-M0-SYMS-NEXT: Kernel7 +// CHECK-TU-M0-SYMS-EMPTY: + +// CHECK-TU-M1-SYMS: __pf_kernel_wrapper{{.*}}Kernel0 +// CHECK-TU-M1-SYMS-NEXT: Kernel0 +// CHECK-TU-M1-SYMS-EMPTY: + +// CHECK-MIX-M0-SYMS: __pf_kernel_wrapper{{.*}}Kernel1 +// CHECK-MIX-M0-SYMS-NEXT: Kernel1 +// CHECK-MIX-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel7 +// CHECK-MIX-M0-SYMS-NEXT: Kernel7 +// CHECK-MIX-M0-SYMS-EMPTY: + +// CHECK-MIX-M1-SYMS: __pf_kernel_wrapper{{.*}}Kernel2 +// CHECK-MIX-M1-SYMS-NEXT: Kernel2 +// CHECK-MIX-M1-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel3 +// CHECK-MIX-M1-SYMS-NEXT: Kernel3 +// CHECK-MIX-M1-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel5 +// CHECK-MIX-M1-SYMS-NEXT: Kernel5 +// CHECK-MIX-M1-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel6 +// CHECK-MIX-M1-SYMS-NEXT: Kernel6 +// CHECK-MIX-M1-SYMS-EMPTY: + +// CHECK-MIX-M2-SYMS: __pf_kernel_wrapper{{.*}}Kernel4 +// CHECK-MIX-M2-SYMS-NEXT: Kernel4 +// CHECK-MIX-M2-SYMS-EMPTY: + +// CHECK-MIX-M3-SYMS: __pf_kernel_wrapper{{.*}}Kernel0 +// CHECK-MIX-M3-SYMS-NEXT: Kernel0 +// CHECK-MIX-M3-SYMS-EMPTY: + +#include +#include 
+#include +#include + +using namespace sycl; + +constexpr access::mode sycl_read = access::mode::read; +constexpr access::mode sycl_write = access::mode::write; + +int main() { + const size_t array_size = 4; + std::array D = {{1., 2., 3., 4.}}, E; + queue deviceQueue; + range<1> numOfItems{array_size}; + double Value = 5.; + buffer bufferOut(E.data(), numOfItems); + + // Kernel0 doesn't use math functions. + deviceQueue.submit([&](handler &cgh) { + auto accessorOut = bufferOut.template get_access(cgh); + + cgh.parallel_for( + numOfItems, [=](id<1> wiID) { accessorOut[wiID] = Value; }); + }); + + // Kernel1 uses high-accuracy sin. + deviceQueue.submit([&](handler &cgh) { + auto accessorOut = bufferOut.template get_access(cgh); + + cgh.parallel_for( + numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::sin(Value); }); + }); + + // Kernel2 uses: + // 1. medium-accuracy cos + // 2. high-accuracy cos + // 3. medium-accuracy cos + deviceQueue.submit([&](handler &cgh) { + auto accessorOut = bufferOut.template get_access(cgh); + + cgh.parallel_for( + numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::cos(Value); }); + }); + + // Kernel3 uses: + // 1. low-accuracy tan + // 2. high-accuracy tan + // 3. medium-accuracy tan. + deviceQueue.submit([&](handler &cgh) { + auto accessorOut = bufferOut.template get_access(cgh); + + cgh.parallel_for( + numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::tan(Value); }); + }); + + // Kernel4 uses: + // 1. cuda-accuracy exp and sycl-accuracy log. + // 2. high-accuracy exp and high-accuracy log. + // 3. cuda-accuracy exp and sycl-accuracy log. + deviceQueue.submit([&](handler &cgh) { + auto accessorOut = bufferOut.template get_access(cgh); + + cgh.parallel_for(numOfItems, [=](id<1> wiID) { + accessorOut[wiID] = std::log(std::exp(Value)); + }); + }); + + // Kernel5 uses: + // 1. cuda-accuracy acos. + // 2. high-accuracy acos. + // 3. medium-accuracy acos. 
+ deviceQueue.submit([&](handler &cgh) { + auto accessorOut = bufferOut.template get_access(cgh); + + cgh.parallel_for( + numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::acos(Value); }); + }); + + // Kernel6 uses: + // 1. sycl-accuracy asin. + // 2. high-accuracy asin. + // 3. medium-accuracy asin. + deviceQueue.submit([&](handler &cgh) { + auto accessorOut = bufferOut.template get_access(cgh); + + cgh.parallel_for( + numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::asin(Value); }); + }); + + // Kernel7 uses high-accuracy sqrt. + deviceQueue.submit([&](handler &cgh) { + auto accessorOut = bufferOut.template get_access(cgh); + + cgh.parallel_for( + numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::sqrt(Value); }); + }); + + return 0; +}