diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 57d3bc03bc264..d70896c30ba4b 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -513,12 +513,18 @@ static CallInst *CreateBuiltinCallWithAttr(CodeGenFunction &CGF, StringRef Name,
   // TODO: Replace AttrList with a single attribute. The call can only have a
   // single FPAccuracy attribute.
   llvm::AttributeList AttrList;
+  // "sycl_used_aspects" metadata associated with the call.
+  llvm::Metadata *AspectMD = nullptr;
   // sincos() doesn't return a value, but it still has a type associated with
   // it that corresponds to the operand type.
   CGF.CGM.getFPAccuracyFuncAttributes(
-      Name, AttrList, ID,
+      Name, AttrList, AspectMD, ID,
       Name == "sincos" ? Args[0]->getType() : FPBuiltinF->getReturnType());
   CI->setAttributes(AttrList);
+
+  if (CGF.getLangOpts().SYCLIsDevice && AspectMD)
+    CI->setMetadata("sycl_used_aspects",
+                    llvm::MDNode::get(CGF.CGM.getLLVMContext(), AspectMD));
   return CI;
 }
 
@@ -22144,21 +22150,22 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall(
     // Even if the current function doesn't have a clang builtin, create
     // an 'fpbuiltin-max-error' attribute for it; unless it's marked with
     // an NoBuiltin attribute.
-    if (!FD->hasAttr<NoBuiltinAttr>()) {
-      Name = FD->getName();
-      FPAccuracyIntrinsicID =
-          llvm::StringSwitch<unsigned>(Name)
-              .Case("fadd", llvm::Intrinsic::fpbuiltin_fadd)
-              .Case("fdiv", llvm::Intrinsic::fpbuiltin_fdiv)
-              .Case("fmul", llvm::Intrinsic::fpbuiltin_fmul)
-              .Case("fsub", llvm::Intrinsic::fpbuiltin_fsub)
-              .Case("frem", llvm::Intrinsic::fpbuiltin_frem)
-              .Case("sincos", llvm::Intrinsic::fpbuiltin_sincos)
-              .Case("exp10", llvm::Intrinsic::fpbuiltin_exp10)
-              .Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt);
-    } else {
+    if (FD->hasAttr<NoBuiltinAttr>() ||
+        !FD->getNameInfo().getName().isIdentifier())
       return nullptr;
-    }
+
+    Name = FD->getName();
+    FPAccuracyIntrinsicID =
+        llvm::StringSwitch<unsigned>(Name)
+            .Case("fadd", llvm::Intrinsic::fpbuiltin_fadd)
+            .Case("fdiv", llvm::Intrinsic::fpbuiltin_fdiv)
+            .Case("fmul", llvm::Intrinsic::fpbuiltin_fmul)
+            .Case("fsub", llvm::Intrinsic::fpbuiltin_fsub)
+            .Case("frem", llvm::Intrinsic::fpbuiltin_frem)
+            .Case("sincos", llvm::Intrinsic::fpbuiltin_sincos)
+            .Case("exp10", llvm::Intrinsic::fpbuiltin_exp10)
+            .Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt)
+            .Default(0);
   } else {
     // The function has a clang builtin. Create an attribute for it
     // only if it has an fpbuiltin intrinsic.
@@ -22238,6 +22245,9 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall(
       break;
     }
   }
+  if (!FPAccuracyIntrinsicID)
+    return nullptr;
+
   Func = CGM.getIntrinsic(FPAccuracyIntrinsicID, IRArgs[0]->getType());
   return CreateBuiltinCallWithAttr(*this, Name, Func, ArrayRef(IRArgs),
                                    FPAccuracyIntrinsicID);
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 866c7d72ad6f1..15b84cb73875d 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -17,6 +17,7 @@
 #include "CGCXXABI.h"
 #include "CGCleanup.h"
 #include "CGRecordLayout.h"
+#include "CGSYCLRuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "TargetInfo.h"
@@ -1846,9 +1847,21 @@ static llvm::fp::FPAccuracy convertFPAccuracy(StringRef FPAccuracyStr) {
       .Case("cuda", llvm::fp::FPAccuracy::CUDA);
 }
 
+static int32_t convertFPAccuracyToAspect(StringRef FPAccuracyStr) {
+  assert((FPAccuracyStr == "high" || FPAccuracyStr == "medium" ||
+          FPAccuracyStr == "low" || FPAccuracyStr == "sycl" ||
+          FPAccuracyStr == "cuda") && "Unexpected FP accuracy value");
+  return llvm::StringSwitch<int32_t>(FPAccuracyStr)
+      .Case("high", SYCLInternalAspect::fp_intrinsic_accuracy_high)
+      .Case("medium", SYCLInternalAspect::fp_intrinsic_accuracy_medium)
+      .Case("low", SYCLInternalAspect::fp_intrinsic_accuracy_low)
+      .Case("sycl", SYCLInternalAspect::fp_intrinsic_accuracy_sycl)
+      .Case("cuda", SYCLInternalAspect::fp_intrinsic_accuracy_cuda);
+}
+
 void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
-    StringRef Name, llvm::AttrBuilder &FuncAttrs, unsigned ID,
-    const llvm::Type *FuncType) {
+    StringRef Name, llvm::AttrBuilder &FuncAttrs, llvm::Metadata *&MD,
+    unsigned ID, const llvm::Type *FuncType) {
   // Priority is given to to the accuracy specific to the function.
   // So, if the command line is something like this:
   // 'clang -fp-accuracy = high -fp-accuracy = low:[sin]'.
@@ -1864,6 +1877,8 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
           ID, FuncType, convertFPAccuracy(FuncMapIt->second));
       assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
       FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
+      MD = llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+          Int32Ty, convertFPAccuracyToAspect(FuncMapIt->second)));
     }
   }
   if (FuncAttrs.attrs().size() == 0)
@@ -1872,6 +1887,8 @@ void CodeGenModule::getDefaultFunctionFPAccuracyAttributes(
           ID, FuncType, convertFPAccuracy(getLangOpts().FPAccuracyVal));
       assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected");
       FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal);
+      MD = llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+          Int32Ty, convertFPAccuracyToAspect(getLangOpts().FPAccuracyVal)));
     }
 }
 
diff --git a/clang/lib/CodeGen/CGSYCLRuntime.h b/clang/lib/CodeGen/CGSYCLRuntime.h
index 71bc45599516f..85a18e4bb590e 100644
--- a/clang/lib/CodeGen/CGSYCLRuntime.h
+++ b/clang/lib/CodeGen/CGSYCLRuntime.h
@@ -23,6 +23,19 @@ namespace CodeGen {
 
 class CodeGenModule;
 
+// These aspects are internal and are used for device image splitting only.
+// They are not exposed to SYCL users through the "aspect" enum. That is why
+// they are intentionally assigned negative values: this lets them be filtered
+// out when used aspects are embedded into the executable as device
+// requirements. These internal aspects are never passed to the SYCL RT.
+enum SYCLInternalAspect : int32_t {
+  fp_intrinsic_accuracy_high = -1,
+  fp_intrinsic_accuracy_medium = -2,
+  fp_intrinsic_accuracy_low = -3,
+  fp_intrinsic_accuracy_sycl = -4,
+  fp_intrinsic_accuracy_cuda = -5,
+};
+
 class CGSYCLRuntime {
 protected:
   CodeGenModule &CGM;
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 475f8cf794424..9f1dc07343067 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -7902,10 +7902,11 @@ void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) {
 
 void CodeGenModule::getFPAccuracyFuncAttributes(StringRef Name,
                                                 llvm::AttributeList &AttrList,
+                                                llvm::Metadata *&MD,
                                                 unsigned ID,
                                                 const llvm::Type *FuncType) {
   llvm::AttrBuilder FuncAttrs(getLLVMContext());
-  getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, ID, FuncType);
+  getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, MD, ID, FuncType);
   AttrList = llvm::AttributeList::get(
       getLLVMContext(), llvm::AttributeList::FunctionIndex, FuncAttrs);
 }
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index 5d1521da2da63..56e9a9358ba3e 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -1594,7 +1594,8 @@ class CodeGenModule : public CodeGenTypeCache {
   void moveLazyEmissionStates(CodeGenModule *NewBuilder);
 
   void getFPAccuracyFuncAttributes(StringRef Name,
-                                   llvm::AttributeList &AttrList, unsigned ID,
+                                   llvm::AttributeList &AttrList,
+                                   llvm::Metadata *&MD, unsigned ID,
                                    const llvm::Type *FuncType);
 
 private:
@@ -1793,7 +1794,7 @@ class CodeGenModule : public CodeGenTypeCache {
 
   void getDefaultFunctionFPAccuracyAttributes(StringRef Name,
                                               llvm::AttrBuilder &FuncAttrs,
-                                              unsigned ID,
+                                              llvm::Metadata *&MD, unsigned ID,
                                               const llvm::Type *FuncType);
 
   llvm::Metadata *CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map,
diff --git a/clang/test/CodeGenSYCL/fp-accuracy.cpp b/clang/test/CodeGenSYCL/fp-accuracy.cpp
new file mode 100644
index 0000000000000..322b7f8ac65a7
--- /dev/null
+++ b/clang/test/CodeGenSYCL/fp-accuracy.cpp
@@ -0,0 +1,127 @@
+// RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=low:tan -ffp-builtin-accuracy=cuda:exp,acos -ffp-builtin-accuracy=sycl:log,asin -emit-llvm -triple spir64-unknown-unknown %s -o - | FileCheck --check-prefix CHECK-FUNC %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ffp-builtin-accuracy=high -emit-llvm -triple spir64-unknown-unknown %s -o - | FileCheck --check-prefix CHECK-TU %s
+// RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ffp-builtin-accuracy=medium -ffp-builtin-accuracy=high:sin,sqrt -ffp-builtin-accuracy=medium:cos -ffp-builtin-accuracy=cuda:exp -ffp-builtin-accuracy=sycl:log -emit-llvm -triple spir64-unknown-unknown %s -o - | FileCheck --check-prefix CHECK-MIX %s
+
+// Tests that sycl_used_aspects metadata is attached to the fpbuiltin call based on -ffp-accuracy option.
+
+#include "sycl.hpp"
+
+extern "C" SYCL_EXTERNAL double sin(double);
+extern "C" SYCL_EXTERNAL double cos(double);
+extern "C" SYCL_EXTERNAL double tan(double);
+extern "C" SYCL_EXTERNAL double log(double);
+extern "C" SYCL_EXTERNAL double exp(double);
+extern "C" SYCL_EXTERNAL double acos(double);
+extern "C" SYCL_EXTERNAL double asin(double);
+extern "C" SYCL_EXTERNAL double sqrt(double);
+
+using namespace sycl;
+
+int main() {
+  const unsigned array_size = 4;
+  double Value = .5;
+  queue deviceQueue;
+  range<1> numOfItems{array_size};
+
+  // Kernel0 doesn't use math functions.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel0>(numOfItems,
+    [=](id<1> wiID) {
+      (void)Value;
+    });
+  });
+
+  // Kernel1 uses high-accuracy sin.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel1>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK-FUNC: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC:[0-9]+]]
+// CHECK-TU: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC:[0-9]+]]
+// CHECK-MIX: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC:[0-9]+]]
+      (void)sin(Value);
+    });
+  });
+
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel2>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK-FUNC: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC:[0-9]+]]
+// CHECK-TU: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC:[0-9]+]]
+      (void)cos(Value);
+    });
+  });
+
+  // Kernel3 uses low-accuracy tan.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel3>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK-FUNC: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[LOW_ACC:[0-9]+]]
+// CHECK-TU: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC]]
+      (void)tan(Value);
+    });
+  });
+
+  // Kernel4 uses cuda-accuracy exp and sycl-accuracy log.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel4>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK-FUNC: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[CUDA_ACC:[0-9]+]]
+// CHECK-FUNC: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[SYCL_ACC:[0-9]+]]
+// CHECK-TU: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-TU: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[CUDA_ACC:[0-9]+]]
+// CHECK-MIX: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[SYCL_ACC:[0-9]+]]
+      (void)log(exp(Value));
+    });
+  });
+  deviceQueue.wait();
+
+  // Kernel5 uses cuda-accuracy acos.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel5>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK-FUNC: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[CUDA_ACC]]
+// CHECK-TU: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC]]
+      (void)acos(Value);
+    });
+  });
+
+  // Kernel6 uses sycl-accuracy asin.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel6>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK-FUNC: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[SYCL_ACC]]
+// CHECK-TU: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[MEDIUM_ACC]]
+      (void)asin(Value);
+    });
+  });
+
+  // Kernel7 uses high-accuracy sqrt.
+  deviceQueue.submit([&](handler& cgh) {
+    cgh.parallel_for<class Kernel7>(numOfItems,
+    [=](id<1> wiID) {
+// CHECK-FUNC: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-TU: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+// CHECK-MIX: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR:[0-9]+]], !sycl_used_aspects ![[HIGH_ACC]]
+      (void)sqrt(Value);
+    });
+  });
+  return 0;
+}
+
+// CHECK-FUNC: [[HIGH_ACC]] = !{i32 -1}
+// CHECK-FUNC: [[MEDIUM_ACC]] = !{i32 -2}
+// CHECK-FUNC: [[LOW_ACC]] = !{i32 -3}
+// CHECK-FUNC: [[CUDA_ACC]] = !{i32 -5}
+// CHECK-FUNC: [[SYCL_ACC]] = !{i32 -4}
+
+// CHECK-TU: [[HIGH_ACC]] = !{i32 -1}
+
+// CHECK-MIX: [[HIGH_ACC]] = !{i32 -1}
+// CHECK-MIX: [[MEDIUM_ACC]] = !{i32 -2}
+// CHECK-MIX: [[CUDA_ACC]] = !{i32 -5}
+// CHECK-MIX: [[SYCL_ACC]] = !{i32 -4}
diff --git a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp
index e846b9141d63c..4b37c267f2353 100644
--- a/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp
+++ b/llvm/lib/SYCLLowerIR/SYCLPropagateAspectsUsage.cpp
@@ -255,6 +255,13 @@ AspectsSetTy getAspectsUsedByInstruction(const Instruction &I,
     Result.insert(Aspects.begin(), Aspects.end());
   }
 
+  if (const MDNode *InstAspects = I.getMetadata("sycl_used_aspects")) {
+    for (const MDOperand &MDOp : InstAspects->operands()) {
+      const Constant *C = cast<ConstantAsMetadata>(MDOp)->getValue();
+      Result.insert(cast<ConstantInt>(C)->getSExtValue());
+    }
+  }
+
   return Result;
 }
 
diff --git a/llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll
new file mode 100644
index 0000000000000..47df6a804eabb
--- /dev/null
+++ b/llvm/test/SYCLLowerIR/PropagateAspectsUsage/call-graph-inst.ll
@@ -0,0 +1,66 @@
+; RUN: opt -passes=sycl-propagate-aspects-usage < %s -S | FileCheck %s
+;
+; Test checks that the pass is able to propagate information about aspects
+; used in the instruction through a call graph
+;
+;   K1  K2
+;  /  \/  \
+; F1  F2   F3
+;
+; F1 doesn't use optional type and doesn't have instruction with attached 'sycl_used_aspects' metadata.
+; F2 uses optional A and has instruction with attached 'sycl_used_aspects' metadata.
+; F3 uses optional B and has instruction with attached 'sycl_used_aspects' metadata.
+
+%Optional.A = type { i32 }
+%Optional.B = type { i32 }
+
+; CHECK: spir_kernel void @kernel1() !sycl_used_aspects ![[#ID1:]]
+define spir_kernel void @kernel1() {
+  call spir_func void @func1()
+  call spir_func void @func2()
+  ret void
+}
+
+; CHECK: spir_kernel void @kernel2() !sycl_used_aspects ![[#ID2:]]
+define spir_kernel void @kernel2() {
+  call spir_func void @func2()
+  call spir_func void @func3()
+  ret void
+}
+
+; CHECK: spir_func void @func1() {
+define spir_func void @func1() {
+  %tmp = alloca i32
+  ret void
+}
+
+declare void @llvm.fpbuiltin.f64()
+
+; CHECK: spir_func void @func2() !sycl_used_aspects ![[#ID1]] {
+define spir_func void @func2() {
+  %tmp1 = alloca %Optional.A
+  call void @llvm.fpbuiltin.f64(), !sycl_used_aspects !3
+  ret void
+}
+
+; CHECK: spir_func void @func3() !sycl_used_aspects ![[#ID3:]] {
+define spir_func void @func3() {
+  %tmp = alloca %Optional.B
+  call void @llvm.fpbuiltin.f64(), !sycl_used_aspects !4
+  ret void
+}
+
+!sycl_types_that_use_aspects = !{!0, !1}
+!0 = !{!"Optional.A", i32 1}
+!1 = !{!"Optional.B", i32 2}
+
+!sycl_aspects = !{!2}
+!2 = !{!"fp64", i32 6}
+!3 = !{i32 -1}
+!4 = !{i32 -2}
+
+; CHECK: ![[#ID1]] = !{i32 1, i32 -1}
+; CHECK: ![[#ID2]] = !{i32 1, i32 -1, i32 2, i32 -2}
+; CHECK: ![[#ID3]] = !{i32 2, i32 -2}
+
+
diff --git a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
index 4aa28dc4ff643..a9c791877a079 100644
--- a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
+++ b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp
@@ -22,10 +22,10 @@ void llvm::getSYCLDeviceRequirements(
     const module_split::ModuleDesc &MD,
     std::map<StringRef, util::PropertyValue> &Requirements) {
   auto ExtractIntegerFromMDNodeOperand = [=](const MDNode *N,
-                                             unsigned OpNo) -> unsigned {
+                                             unsigned OpNo) -> int32_t {
     Constant *C =
         cast<ConstantAsMetadata>(N->getOperand(OpNo).get())->getValue();
-    return static_cast<uint32_t>(C->getUniqueInteger().getZExtValue());
+    return static_cast<int32_t>(C->getUniqueInteger().getSExtValue());
   };
 
   // { LLVM-IR metadata name , [SYCL/Device requirements] property name }, see:
@@ -41,10 +41,16 @@ void llvm::getSYCLDeviceRequirements(
     std::set<uint32_t> Values;
     for (const Function &F : MD.getModule()) {
       if (const MDNode *MDN = F.getMetadata(MDName)) {
-        for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I)
-          Values.insert(ExtractIntegerFromMDNodeOperand(MDN, I));
+        for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) {
+          // Don't put internal aspects (with negative integer value) into the
+          // requirements, they are used only for device image splitting.
+          auto Val = ExtractIntegerFromMDNodeOperand(MDN, I);
+          if (Val >= 0)
+            Values.insert(Val);
+        }
       }
     }
+
     // We don't need the "fixed_target" property if it's empty
     if (std::string(MDName) == "sycl_fixed_targets" && Values.empty())
       continue;
@@ -64,10 +70,11 @@ void llvm::getSYCLDeviceRequirements(
     if (auto *MDN = F->getMetadata("intel_reqd_sub_group_size")) {
       assert(MDN->getNumOperands() == 1);
       auto MDValue = ExtractIntegerFromMDNodeOperand(MDN, 0);
+      assert(MDValue >= 0);
       if (!SubGroupSize)
         SubGroupSize = MDValue;
       else
-        assert(*SubGroupSize == MDValue);
+        assert(*SubGroupSize == static_cast<uint32_t>(MDValue));
     }
   }
   // Do not attach reqd_sub_group_size if there is no attached metadata
diff --git a/sycl/doc/design/OptionalDeviceFeatures.md b/sycl/doc/design/OptionalDeviceFeatures.md
index 3bb601abb12c6..c3a1202d4ec6c 100644
--- a/sycl/doc/design/OptionalDeviceFeatures.md
+++ b/sycl/doc/design/OptionalDeviceFeatures.md
@@ -553,7 +553,8 @@ type because the front-end does not include that type in the
 `!sycl_types_that_use_aspects` set.  If a function references the `double`
 type, the implementation implicitly assumes that the function uses
 `aspect::fp64` and adds that aspect to the function's `!sycl_used_aspects`
-set.
+set.  If `!sycl_used_aspects` metadata is attached to an instruction, its
+aspects are also added to the function's `!sycl_used_aspects` set.
 
 **NOTE**: This scan of the IR will require comparing the type referenced by
 each IR instruction with the names of the types in the
@@ -1148,6 +1149,24 @@ Kernel has a required sub-group size of '32' but device does not support this
 sub-group size.
 ```
 
+### SYCL internal aspects for device image splitting
+
+There are scenarios when we would like to split device images based on
+optional kernel features but we don't want to expose the corresponding
+aspects to the user. Internal SYCL aspects are used for this purpose.
+
+To differentiate them from regular aspects, internal aspects are assigned
+negative values. If an optional feature is used in a kernel, the SYCL device
+compiler adds the value of the internal aspect to the `sycl_used_aspects`
+metadata. The value is propagated through the call graph and participates in
+device image splitting together with regular aspects, but it is not passed to
+the SYCL runtime: it is filtered out when device requirements are generated.
+
+A new value can be added to the `SYCLInternalAspect` enum to introduce a new
+internal aspect.
+
+An example of internal aspect usage is splitting device images based on the
+floating-point accuracy level set for math functions by the `-ffp-accuracy` option.
 
 ## Appendix: Adding an attribute to 8-byte `atomic_ref`
 
diff --git a/sycl/test/optional_kernel_features/fp-accuracy.cpp b/sycl/test/optional_kernel_features/fp-accuracy.cpp
new file mode 100644
index 0000000000000..80acc2baa893f
--- /dev/null
+++ b/sycl/test/optional_kernel_features/fp-accuracy.cpp
@@ -0,0 +1,226 @@
+// Tests that kernels which use different fp-accuracy levels end up in
+// different device images.
+
+// 1. Accuracy is specified for particular math functions.
+// RUN: %clangxx %s -o %test_func.bc -ffp-accuracy=high:sin,sqrt -ffp-accuracy=medium:cos -ffp-accuracy=low:tan -ffp-accuracy=cuda:exp,acos -ffp-accuracy=sycl:log,asin  -fno-math-errno  -fsycl -fsycl-device-only
+// RUN: sycl-post-link -split=auto -symbols %test_func.bc -o %test_func.table
+// RUN: FileCheck %s -input-file=%test_func.table --check-prefixes CHECK-FUNC-TABLE
+// RUN: FileCheck %s -input-file=%test_func_0.sym --check-prefixes CHECK-FUNC-M0-SYMS
+// RUN: FileCheck %s -input-file=%test_func_1.sym --check-prefixes CHECK-FUNC-M1-SYMS
+// RUN: FileCheck %s -input-file=%test_func_2.sym --check-prefixes CHECK-FUNC-M2-SYMS
+// RUN: FileCheck %s -input-file=%test_func_3.sym --check-prefixes CHECK-FUNC-M3-SYMS
+// RUN: FileCheck %s -input-file=%test_func_4.sym --check-prefixes CHECK-FUNC-M4-SYMS
+// RUN: FileCheck %s -input-file=%test_func_5.sym --check-prefixes CHECK-FUNC-M5-SYMS
+
+// 2. Accuracy is specified for TU.
+// RUN: %clangxx %s -o %test_tu.bc -ffp-accuracy=high -fno-math-errno -fsycl -fsycl-device-only
+// RUN: sycl-post-link -split=auto -symbols %test_tu.bc -o %test_tu.table
+// RUN: FileCheck %s -input-file=%test_tu.table --check-prefixes CHECK-TU-TABLE
+// RUN: FileCheck %s -input-file=%test_tu_0.sym --check-prefixes CHECK-TU-M0-SYMS
+// RUN: FileCheck %s -input-file=%test_tu_1.sym --check-prefixes CHECK-TU-M1-SYMS
+
+// 3. Mixed case.
+// RUN: %clangxx %s -o %test_mix.bc -ffp-accuracy=medium -ffp-accuracy=high:sin,sqrt -ffp-accuracy=medium:cos -ffp-accuracy=cuda:exp -ffp-accuracy=sycl:log  -fno-math-errno  -fsycl -fsycl-device-only
+// RUN: sycl-post-link -split=auto -symbols %test_mix.bc -o %test_mix.table
+// RUN: FileCheck %s -input-file=%test_mix.table --check-prefixes CHECK-MIX-TABLE
+// RUN: FileCheck %s -input-file=%test_mix_0.sym --check-prefixes CHECK-MIX-M0-SYMS
+// RUN: FileCheck %s -input-file=%test_mix_1.sym --check-prefixes CHECK-MIX-M1-SYMS
+// RUN: FileCheck %s -input-file=%test_mix_2.sym --check-prefixes CHECK-MIX-M2-SYMS
+// RUN: FileCheck %s -input-file=%test_mix_3.sym --check-prefixes CHECK-MIX-M3-SYMS
+
+// CHECK-FUNC-TABLE: Code
+// CHECK-FUNC-TABLE-NEXT: _0.sym
+// CHECK-FUNC-TABLE-NEXT: _1.sym
+// CHECK-FUNC-TABLE-NEXT: _2.sym
+// CHECK-FUNC-TABLE-NEXT: _3.sym
+// CHECK-FUNC-TABLE-NEXT: _4.sym
+// CHECK-FUNC-TABLE-NEXT: _5.sym
+// CHECK-FUNC-TABLE-NEXT: _6.sym
+// CHECK-FUNC-TABLE-EMPTY:
+
+// CHECK-TU-TABLE: Code
+// CHECK-TU-TABLE-NEXT: _0.sym
+// CHECK-TU-TABLE-NEXT: _1.sym
+// CHECK-TU-TABLE-EMPTY:
+
+// CHECK-MIX-TABLE: Code
+// CHECK-MIX-TABLE-NEXT: _0.sym
+// CHECK-MIX-TABLE-NEXT: _1.sym
+// CHECK-MIX-TABLE-NEXT: _2.sym
+// CHECK-MIX-TABLE-NEXT: _3.sym
+// CHECK-MIX-TABLE-EMPTY:
+
+// CHECK-FUNC-M0-SYMS: __pf_kernel_wrapper{{.*}}Kernel1
+// CHECK-FUNC-M0-SYMS-NEXT: Kernel1
+// CHECK-FUNC-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel7
+// CHECK-FUNC-M0-SYMS-NEXT: Kernel7
+// CHECK-FUNC-M0-SYMS-EMPTY:
+
+// CHECK-FUNC-M1-SYMS: __pf_kernel_wrapper{{.*}}Kernel2
+// CHECK-FUNC-M1-SYMS-NEXT: Kernel2
+// CHECK-FUNC-M1-SYMS-EMPTY:
+
+// CHECK-FUNC-M2-SYMS: __pf_kernel_wrapper{{.*}}Kernel3
+// CHECK-FUNC-M2-SYMS-NEXT: Kernel3
+// CHECK-FUNC-M2-SYMS-EMPTY:
+
+// CHECK-FUNC-M3-SYMS: __pf_kernel_wrapper{{.*}}Kernel6
+// CHECK-FUNC-M3-SYMS-NEXT: Kernel6
+// CHECK-FUNC-M3-SYMS-EMPTY:
+
+// CHECK-FUNC-M4-SYMS: __pf_kernel_wrapper{{.*}}Kernel4
+// CHECK-FUNC-M4-SYMS-NEXT: Kernel4
+// CHECK-FUNC-M4-SYMS-EMPTY:
+
+// CHECK-FUNC-M5-SYMS: __pf_kernel_wrapper{{.*}}Kernel5
+// CHECK-FUNC-M5-SYMS-NEXT: Kernel5
+// CHECK-FUNC-M5-SYMS-EMPTY:
+
+// CHECK-FUNC-M6-SYMS: __pf_kernel_wrapper{{.*}}Kernel0
+// CHECK-FUNC-M6-SYMS-NEXT: Kernel0
+// CHECK-FUNC-M6-SYMS-EMPTY:
+
+// CHECK-TU-M0-SYMS: __pf_kernel_wrapper{{.*}}Kernel1
+// CHECK-TU-M0-SYMS-NEXT: Kernel1
+// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel2
+// CHECK-TU-M0-SYMS-NEXT: Kernel2
+// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel3
+// CHECK-TU-M0-SYMS-NEXT: Kernel3
+// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel4
+// CHECK-TU-M0-SYMS-NEXT: Kernel4
+// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel5
+// CHECK-TU-M0-SYMS-NEXT: Kernel5
+// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel6
+// CHECK-TU-M0-SYMS-NEXT: Kernel6
+// CHECK-TU-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel7
+// CHECK-TU-M0-SYMS-NEXT: Kernel7
+// CHECK-TU-M0-SYMS-EMPTY:
+
+// CHECK-TU-M1-SYMS: __pf_kernel_wrapper{{.*}}Kernel0
+// CHECK-TU-M1-SYMS-NEXT: Kernel0
+// CHECK-TU-M1-SYMS-EMPTY:
+
+// CHECK-MIX-M0-SYMS: __pf_kernel_wrapper{{.*}}Kernel1
+// CHECK-MIX-M0-SYMS-NEXT: Kernel1
+// CHECK-MIX-M0-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel7
+// CHECK-MIX-M0-SYMS-NEXT: Kernel7
+// CHECK-MIX-M0-SYMS-EMPTY:
+
+// CHECK-MIX-M1-SYMS: __pf_kernel_wrapper{{.*}}Kernel2
+// CHECK-MIX-M1-SYMS-NEXT: Kernel2
+// CHECK-MIX-M1-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel3
+// CHECK-MIX-M1-SYMS-NEXT: Kernel3
+// CHECK-MIX-M1-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel5
+// CHECK-MIX-M1-SYMS-NEXT: Kernel5
+// CHECK-MIX-M1-SYMS-NEXT: __pf_kernel_wrapper{{.*}}Kernel6
+// CHECK-MIX-M1-SYMS-NEXT: Kernel6
+// CHECK-MIX-M1-SYMS-EMPTY:
+
+// CHECK-MIX-M2-SYMS: __pf_kernel_wrapper{{.*}}Kernel4
+// CHECK-MIX-M2-SYMS-NEXT: Kernel4
+// CHECK-MIX-M2-SYMS-EMPTY:
+
+// CHECK-MIX-M3-SYMS: __pf_kernel_wrapper{{.*}}Kernel0
+// CHECK-MIX-M3-SYMS-NEXT: Kernel0
+// CHECK-MIX-M3-SYMS-EMPTY:
+
+#include <array>
+#include <cmath>
+#include <iostream>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+constexpr access::mode sycl_read = access::mode::read;
+constexpr access::mode sycl_write = access::mode::write;
+
+int main() {
+  const size_t array_size = 4;
+  std::array<double, array_size> D = {{1., 2., 3., 4.}}, E;
+  queue deviceQueue;
+  range<1> numOfItems{array_size};
+  double Value = 5.;
+  buffer<double, 1> bufferOut(E.data(), numOfItems);
+
+  // Kernel0 doesn't use math functions.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel0>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = Value; });
+  });
+
+  // Kernel1 uses high-accuracy sin.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel1>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::sin(Value); });
+  });
+
+  // Kernel2 uses:
+  // 1. medium-accuracy cos
+  // 2. high-accuracy cos
+  // 3. medium-accuracy cos
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel2>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::cos(Value); });
+  });
+
+  // Kernel3 uses:
+  // 1. low-accuracy tan
+  // 2. high-accuracy tan
+  // 3. medium-accuracy tan.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel3>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::tan(Value); });
+  });
+
+  // Kernel4 uses:
+  // 1. cuda-accuracy exp and sycl-accuracy log.
+  // 2. high-accuracy exp and high-accuracy log.
+  // 3. cuda-accuracy exp and sycl-accuracy log.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel4>(numOfItems, [=](id<1> wiID) {
+      accessorOut[wiID] = std::log(std::exp(Value));
+    });
+  });
+
+  // Kernel5 uses:
+  // 1. cuda-accuracy acos.
+  // 2. high-accuracy acos.
+  // 3. medium-accuracy acos.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel5>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::acos(Value); });
+  });
+
+  // Kernel6 uses:
+  // 1. sycl-accuracy asin.
+  // 2. high-accuracy asin.
+  // 3. medium-accuracy asin.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel6>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::asin(Value); });
+  });
+
+  // Kernel7 uses high-accuracy sqrt.
+  deviceQueue.submit([&](handler &cgh) {
+    auto accessorOut = bufferOut.template get_access<sycl_write>(cgh);
+
+    cgh.parallel_for<class Kernel7>(
+        numOfItems, [=](id<1> wiID) { accessorOut[wiID] = std::sqrt(Value); });
+  });
+
+  return 0;
+}