intel · againull · Jul 14, 2023 · Jun 20, 2023 · Jun 28, 2023 · Jul 7, 2023
@@ -16,6 +16,7 @@
 #include "CGObjCRuntime.h"
 #include "CGOpenCLRuntime.h"
 #include "CGRecordLayout.h"
+#include "CGSYCLRuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "ConstantEmitter.h"
@@ -513,12 +514,17 @@ static CallInst *CreateBuiltinCallWithAttr(CodeGenFunction &CGF, StringRef Name,
   // TODO: Replace AttrList with a single attribute. The call can only have a
   // single FPAccuracy attribute.
   llvm::AttributeList AttrList;
+  // "sycl_used_aspects" metadata associated with the call.
+  SmallVector<llvm::Metadata *, 4> AspectsMD;
   // sincos() doesn't return a value, but it still has a type associated with
   // it that corresponds to the operand type.
   CGF.CGM.getFPAccuracyFuncAttributes(
-      Name, AttrList, ID,
+      Name, AttrList, AspectsMD, ID,
       Name == "sincos" ? Args[0]->getType() : FPBuiltinF->getReturnType());
   CI->setAttributes(AttrList);
+  if (!AspectsMD.empty())
+    CI->setMetadata("sycl_used_aspects",
+                    llvm::MDNode::get(CGF.CGM.getLLVMContext(), AspectsMD));
   return CI;
 }
 
@@ -22144,7 +22150,8 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall(
     // Even if the current function doesn't have a clang builtin, create
     // an 'fpbuiltin-max-error' attribute for it; unless it's marked with
     // an NoBuiltin attribute.
-    if (!FD->hasAttr<NoBuiltinAttr>()) {
+    if (!FD->hasAttr<NoBuiltinAttr>() &&
+        FD->getNameInfo().getName().isIdentifier()) {
       Name = FD->getName();
       FPAccuracyIntrinsicID =
           llvm::StringSwitch<unsigned>(Name)
@@ -22155,7 +22162,11 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall(
               .Case("frem", llvm::Intrinsic::fpbuiltin_frem)
               .Case("sincos", llvm::Intrinsic::fpbuiltin_sincos)
               .Case("exp10", llvm::Intrinsic::fpbuiltin_exp10)
-              .Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt);
+              .Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt)
+              .Default(0);
+      if (!FPAccuracyIntrinsicID) {
+        return nullptr;
+      }
     } else {
       return nullptr;
     }

@@ -17,6 +17,7 @@
 #include "CGCXXABI.h"
 #include "CGCleanup.h"
 #include "CGRecordLayout.h"
+#include "CGSYCLRuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "TargetInfo.h"
@@ -1846,8 +1847,18 @@ static llvm::fp::FPAccuracy convertFPAccuracy(StringRef FPAccuracyStr) {
       .Case("cuda", llvm::fp::FPAccuracy::CUDA);
 }
 
+static int32_t convertFPAccuracyToAspect(StringRef FPAccuracyStr) {
+  return llvm::StringSwitch<int32_t>(FPAccuracyStr)
+      .Case("high", SYCLInternalAspect::fp_intrinsic_accuracy_high)
+      .Case("medium", SYCLInternalAspect::fp_intrinsic_accuracy_medium)
+      .Case("low", SYCLInternalAspect::fp_intrinsic_accuracy_low)
+      .Case("sycl", SYCLInternalAspect::fp_intrinsic_accuracy_sycl)
+      .Case("cuda", SYCLInternalAspect::fp_intrinsic_accuracy_cuda);
+}
+
 void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
-    StringRef Name, llvm::AttrBuilder &FuncAttrs, unsigned ID,
+    StringRef Name, llvm::AttrBuilder &FuncAttrs,
+    SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID,
     const llvm::Type *FuncType) {
   // Priority is given to to the accuracy specific to the function.
   // So, if the command line is something like this:
@@ -1864,6 +1875,9 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
           ID, FuncType, convertFPAccuracy(FuncMapIt->second));
       assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
       FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
+      if (getLangOpts().SYCLIsDevice)
+        MDs.push_back(llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+            Int32Ty, convertFPAccuracyToAspect(FuncMapIt->second))));
     }
   }
   if (FuncAttrs.attrs().size() == 0)
@@ -1872,6 +1886,9 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
           ID, FuncType, convertFPAccuracy(getLangOpts().FPAccuracyVal));
       assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
       FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
+      if (getLangOpts().SYCLIsDevice)
+        MDs.push_back(llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+            Int32Ty, convertFPAccuracyToAspect(getLangOpts().FPAccuracyVal))));
     }
 }
 
@@ -5620,8 +5637,9 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   // Emit the actual call/invoke instruction.
   llvm::CallBase *CI;
   if (!InvokeDest) {
-    if (!getLangOpts().FPAccuracyFuncMap.empty() ||
-        !getLangOpts().FPAccuracyVal.empty()) {
+    if ((!getLangOpts().FPAccuracyFuncMap.empty() ||
+         !getLangOpts().FPAccuracyVal.empty()) &&
+        isa_and_nonnull<FunctionDecl>(TargetDecl)) {
       const auto *FD = dyn_cast_if_present<FunctionDecl>(TargetDecl);
       assert(FD && "expecting a function");
       CI = EmitFPBuiltinIndirectCall(IRFuncTy, IRCallArgs, CalleePtr, FD);

@@ -23,6 +23,19 @@ namespace CodeGen {
 
 class CodeGenModule;
 
+// These aspects are internal and used for device image splitting purposes only.
+// They are not exposed to the DPCPP users through "aspect" enum. That's why
+// they are intentionally assigned negative values to filter them out at the
+// stage of embedding used aspects as device requirements to the executable.
+// We don't pass these internal aspects to the DPCPP RT.
+enum SYCLInternalAspect : int32_t {
+  fp_intrinsic_accuracy_high = -1,
+  fp_intrinsic_accuracy_medium = -2,
+  fp_intrinsic_accuracy_low = -3,
+  fp_intrinsic_accuracy_sycl = -4,
+  fp_intrinsic_accuracy_cuda = -5,
+};
+
 class CGSYCLRuntime {
 protected:
   CodeGenModule &CGM;

@@ -7882,12 +7882,12 @@ void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) {
   NewBuilder->ABI->MangleCtx = std::move(ABI->MangleCtx);
 }
 
-void CodeGenModule::getFPAccuracyFuncAttributes(StringRef Name,
-                                                llvm::AttributeList &AttrList,
-                                                unsigned ID,
-                                                const llvm::Type *FuncType) {
+void CodeGenModule::getFPAccuracyFuncAttributes(
+    StringRef Name, llvm::AttributeList &AttrList,
+    SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID,
+    const llvm::Type *FuncType) {
   llvm::AttrBuilder FuncAttrs(getLLVMContext());
-  getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, ID, FuncType);
+  getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, MDs, ID, FuncType);
   AttrList = llvm::AttributeList::get(
       getLLVMContext(), llvm::AttributeList::FunctionIndex, FuncAttrs);
 }
@@ -1594,8 +1594,9 @@ class CodeGenModule : public CodeGenTypeCache {
   void moveLazyEmissionStates(CodeGenModule *NewBuilder);
 
   void getFPAccuracyFuncAttributes(StringRef Name,
-                                   llvm::AttributeList &AttrList, unsigned ID,
-                                   const llvm::Type *FuncType);
+                                   llvm::AttributeList &AttrList,
+                                   SmallVector<llvm::Metadata *, 4> &MDs,
+                                   unsigned ID, const llvm::Type *FuncType);
 
 private:
   llvm::Constant *GetOrCreateLLVMFunction(
@@ -1791,10 +1792,10 @@ class CodeGenModule : public CodeGenTypeCache {
                                     bool AttrOnCallSite,
                                     llvm::AttrBuilder &FuncAttrs);
 
-  void getDefaultFunctionFPAccuracyAttributes(StringRef Name,
-                                              llvm::AttrBuilder &FuncAttrs,
-                                              unsigned ID,
-                                              const llvm::Type *FuncType);
+  void getDefaultFunctionFPAccuracyAttributes(
+      StringRef Name, llvm::AttrBuilder &FuncAttrs,
+      SmallVector<llvm::Metadata *, 4> &MDs, unsigned ID,
+      const llvm::Type *FuncType);
 
   llvm::Metadata *CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map,
                                                StringRef Suffix);

@@ -0,0 +1,102 @@
+// RUN: %clang_cc1  -fsycl-is-device -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=low:tan -ffp-builtin-accuracy=cuda:exp,acos -ffp-builtin-accuracy=sycl:log,asin -emit-llvm -triple spir64-unknown-unknown -disable-llvm-passes %s -o - | FileCheck %s
+
+// Tests that sycl_used_aspects metadata is attached to the fpbuiltin call based on -ffp-accuracy option.
+
+#include "Inputs/sycl.hpp"
+
+extern "C" SYCL_EXTERNAL double sin(double);
+extern "C" SYCL_EXTERNAL double cos(double);
+extern "C" SYCL_EXTERNAL double tan(double);
+extern "C" SYCL_EXTERNAL double log(double);
+extern "C" SYCL_EXTERNAL double exp(double);
+extern "C" SYCL_EXTERNAL double acos(double);
+extern "C" SYCL_EXTERNAL double asin(double);
+extern "C" SYCL_EXTERNAL double sqrt(double);
+
+using namespace sycl;
+
+int main() {
+  const unsigned array_size = 4;
+  double Value = .5;
+  queue deviceQueue;
+  range<1> numOfItems{array_size};
+
+  // Kernel0 doesn't use math functions.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel0>(numOfItems,
+    [=](id<1> wiID) {
+      (void)Value;
+    });
+  });
+
+  // Kernel1 uses high-accuracy sin.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel1>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT1:[0-9]+]]
+      (void)sin(Value);
+    });
+  });
+
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel2>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT2:[0-9]+]]
+      (void)cos(Value);
+    });
+  });
+
+  // Kernel3 uses low-accuracy tan.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel3>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT3:[0-9]+]]
+      (void)tan(Value);
+    });
+  });
+
+  // Kernel4 uses cuda-accuracy exp and sycl-accuracy log.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel4>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT4:[0-9]+]]
+// CHECK: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT5:[0-9]+]]
+      (void)log(exp(Value));
+    });
+  });
+  deviceQueue.wait();
+
+  // Kernel5 uses cuda-accuracy acos.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel5>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT4:[0-9]+]]
+      (void)acos(Value);
+    });
+  });
+
+  // Kernel6 uses sycl-accuracy asin.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel6>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT5:[0-9]+]]
+      (void)asin(Value);
+    });
+  });
+
+  // Kernel7 uses high-accuracy sqrt.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel7>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[ASPECT1:[0-9]+]]
+      (void)sqrt(Value);
+    });
+  });
+  return 0;
+}
+
+// CHECK: [[ASPECT1]] = !{i32 -1}
+// CHECK: [[ASPECT2]] = !{i32 -2}
+// CHECK: [[ASPECT3]] = !{i32 -3}
+// CHECK: [[ASPECT4]] = !{i32 -5}
+// CHECK: [[ASPECT5]] = !{i32 -4}
@@ -255,6 +255,13 @@ AspectsSetTy getAspectsUsedByInstruction(const Instruction &I,
     Result.insert(Aspects.begin(), Aspects.end());
   }
 
+  if (const MDNode *InstApsects = I.getMetadata("sycl_used_aspects")) {
+    for (const MDOperand &MDOp : InstApsects->operands()) {
+      const Constant *C = cast<ConstantAsMetadata>(MDOp)->getValue();
+      Result.insert(cast<ConstantInt>(C)->getSExtValue());
+    }
+  }
+
   return Result;
 }
 

@@ -22,10 +22,10 @@ void llvm::getSYCLDeviceRequirements(
     const module_split::ModuleDesc &MD,
     std::map<StringRef, util::PropertyValue> &Requirements) {
   auto ExtractIntegerFromMDNodeOperand = [=](const MDNode *N,
-                                             unsigned OpNo) -> unsigned {
+                                             unsigned OpNo) -> int32_t {
     Constant *C =
         cast<ConstantAsMetadata>(N->getOperand(OpNo).get())->getValue();
-    return static_cast<uint32_t>(C->getUniqueInteger().getZExtValue());
+    return static_cast<int32_t>(C->getUniqueInteger().getSExtValue());
   };
 
   // { LLVM-IR metadata name , [SYCL/Device requirements] property name }, see:
@@ -41,10 +41,16 @@ void llvm::getSYCLDeviceRequirements(
     std::set<uint32_t> Values;
     for (const Function &F : MD.getModule()) {
       if (const MDNode *MDN = F.getMetadata(MDName)) {
-        for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I)
-          Values.insert(ExtractIntegerFromMDNodeOperand(MDN, I));
+        for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) {
+          // Don't put internal aspects (with negative integer value) into the
+          // requirements, they are used only for device image splitting.
+          auto Val = ExtractIntegerFromMDNodeOperand(MDN, I);
+          if (Val >= 0)
+            Values.insert(Val);
+        }
       }
     }
+
     // We don't need the "fixed_target" property if it's empty
     if (std::string(MDName) == "sycl_fixed_targets" && Values.empty())
       continue;
@@ -64,10 +70,11 @@ void llvm::getSYCLDeviceRequirements(
     if (auto *MDN = F->getMetadata("intel_reqd_sub_group_size")) {
       assert(MDN->getNumOperands() == 1);
       auto MDValue = ExtractIntegerFromMDNodeOperand(MDN, 0);
+      assert(MDValue >= 0);
       if (!SubGroupSize)
         SubGroupSize = MDValue;
       else
-        assert(*SubGroupSize == MDValue);
+        assert(*SubGroupSize == static_cast<uint32_t>(MDValue));
     }
   }
   // Do not attach reqd_sub_group_size if there is no attached metadata