diff --git a/libclc/generic/include/spirv/spirv_builtins.h b/libclc/generic/include/spirv/spirv_builtins.h index ca15fdf3c6547..fdeafa79818b9 100644 --- a/libclc/generic/include/spirv/spirv_builtins.h +++ b/libclc/generic/include/spirv/spirv_builtins.h @@ -14146,6 +14146,22 @@ _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t __spirv_ocl_fma(__clc_vec16_fp16_t, __clc_vec16_fp16_t, __clc_vec16_fp16_t); #endif +#ifdef cl_khr_fp16 +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_fp16_t __clc_fma_relu(__clc_fp16_t, + __clc_fp16_t, + __clc_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp16_t + __clc_fma_relu(__clc_vec2_fp16_t, __clc_vec2_fp16_t, __clc_vec2_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec3_fp16_t + __clc_fma_relu(__clc_vec3_fp16_t, __clc_vec3_fp16_t, __clc_vec3_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec4_fp16_t + __clc_fma_relu(__clc_vec4_fp16_t, __clc_vec4_fp16_t, __clc_vec4_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec8_fp16_t + __clc_fma_relu(__clc_vec8_fp16_t, __clc_vec8_fp16_t, __clc_vec8_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t + __clc_fma_relu(__clc_vec16_fp16_t, __clc_vec16_fp16_t, __clc_vec16_fp16_t); +#endif + _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_fp32_t __spirv_ocl_fmax(__clc_fp32_t, __clc_fp32_t); _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp32_t diff --git a/libclc/generic/libspirv/float16.cl b/libclc/generic/libspirv/float16.cl index 28f5b65ac80e2..f8a45db707fac 100644 --- a/libclc/generic/libspirv/float16.cl +++ b/libclc/generic/libspirv/float16.cl @@ -4570,6 +4570,42 @@ __spirv_ocl_fma(__clc_vec16_float16_t args_0, __clc_vec16_float16_t args_1, as_half16(args_2)); } +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t __clc_fma_relu( + __clc_float16_t args_0, __clc_float16_t args_1, __clc_float16_t args_2) { + return __clc_fma_relu(as_half(args_0), as_half(args_1), as_half(args_2)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec2_fp16_t +__clc_fma_relu(__clc_vec2_float16_t args_0, __clc_vec2_float16_t args_1, + __clc_vec2_float16_t args_2) { + return __clc_fma_relu(as_half2(args_0), as_half2(args_1), as_half2(args_2)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec3_fp16_t +__clc_fma_relu(__clc_vec3_float16_t args_0, __clc_vec3_float16_t args_1, + __clc_vec3_float16_t args_2) { + return __clc_fma_relu(as_half3(args_0), as_half3(args_1), as_half3(args_2)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec4_fp16_t +__clc_fma_relu(__clc_vec4_float16_t args_0, __clc_vec4_float16_t args_1, + __clc_vec4_float16_t args_2) { + return __clc_fma_relu(as_half4(args_0), as_half4(args_1), as_half4(args_2)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec8_fp16_t +__clc_fma_relu(__clc_vec8_float16_t args_0, __clc_vec8_float16_t args_1, + __clc_vec8_float16_t args_2) { + return __clc_fma_relu(as_half8(args_0), as_half8(args_1), as_half8(args_2)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec16_fp16_t +__clc_fma_relu(__clc_vec16_float16_t args_0, __clc_vec16_float16_t args_1, + __clc_vec16_float16_t args_2) { + return __clc_fma_relu(as_half16(args_0), as_half16(args_1), + as_half16(args_2)); +} + _CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t __spirv_ocl_fmax(__clc_float16_t args_0, __clc_float16_t args_1) { return __spirv_ocl_fmax(as_half(args_0), as_half(args_1)); diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc new file mode 100644 index 0000000000000..175219e23c47f --- 
/dev/null
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc
@@ -0,0 +1,336 @@
+= sycl_ext_oneapi_bfloat16
+
+:source-highlighter: coderay
+:coderay-linenums-mode: table
+
+// This section needs to be after the document title.
+:doctype: book
+:toc2:
+:toc: left
+:encoding: utf-8
+:lang: en
+
+:blank: pass:[ +]
+
+// Set the default source code type in this document to C++,
+// for syntax highlighting purposes. This is needed because
+// docbook uses c++ and html5 uses cpp.
+:language: {basebackend@docbook:c++:cpp}
+
+// This is necessary for asciidoc, but not for asciidoctor
+:cpp: C++
+
+== Notice
+
+IMPORTANT: This specification is a draft.
+
+Copyright (c) 2021-2022 Intel Corporation. All rights reserved.
+
+NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are
+trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc.
+used by permission by Khronos.
+
+== Dependencies
+
+This extension is written against the SYCL 2020 specification, Revision 4.
+
+== Status
+
+Draft
+
+This is a preview extension specification, intended to provide early access to
+a feature for review and community feedback. When the feature matures, this
+specification may be released as a formal extension.
+
+Because the interfaces defined by this specification are not final and are
+subject to change, they are not intended to be used by shipping software
+products.
+
+== Version
+
+Revision: 4
+
+== Introduction
+
+This extension adds functionality to convert values of single-precision
+floating-point type (`float`) to `bfloat16` type and vice versa. The extension
+doesn't add support for the `bfloat16` type as such; instead, it uses a 16-bit
+integer type (`uint16_t`) as storage for `bfloat16` values.
+
+The purpose of the conversion from float to bfloat16 is to reduce the amount
+of memory required to store floating-point numbers. Computations are expected
+to be done with 32-bit floating-point values.
+
+This extension is an optional kernel feature as described in
+https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:optional-kernel-features[section 5.7]
+of the SYCL 2020 spec. Therefore, attempting to submit a kernel using this
+feature to a device that does not support it should cause a synchronous
+`errc::kernel_not_supported` exception to be thrown from the kernel invocation
+command (e.g. from `parallel_for`).
+
+== Feature test macro
+
+This extension provides a feature-test macro as described in the core SYCL
+specification section 6.3.3 "Feature test macros". Therefore, an implementation
+supporting this extension must predefine the macro
+`SYCL_EXT_ONEAPI_BFLOAT16` to one of the values defined in the table
+below. Applications can test for the existence of this macro to determine if
+the implementation supports this feature, or applications can test the macro's
+value to determine which of the extension's APIs the implementation supports.
+
+[%header,cols="1,5"]
+|===
+|Value |Description
+|1 |Initial extension version. Base features are supported.
+|===
+
+== Extension to `enum class aspect`
+
+[source]
+----
+namespace sycl {
+enum class aspect {
+  ...
+  ext_oneapi_bfloat16
+}
+}
+----
+
+If a SYCL device has the `ext_oneapi_bfloat16` aspect, then it natively
+supports conversion of values of `float` type to `bfloat16` and back.
+
+If the device doesn't have the aspect, objects of the `bfloat16` class must
+not be used in device code.
+
+**NOTE**: The `ext_oneapi_bfloat16` aspect is not yet supported.
The `bfloat16` class is currently supported only on Xe HP GPU and Nvidia A100 GPU.
+
+== New `bfloat16` class
+
+The `bfloat16` class below provides the conversion functionality. Conversion
+from `float` to `bfloat16` is done with the round-to-nearest-even (RTE)
+rounding mode. (A non-normative, bit-level sketch of this conversion is given
+near the end of this document, after the example.)
+
+[source]
+----
+namespace sycl {
+namespace ext {
+namespace oneapi {
+namespace experimental {
+
+class bfloat16 {
+  using storage_t = uint16_t;
+  storage_t value;
+
+public:
+  bfloat16() = default;
+  bfloat16(const bfloat16 &) = default;
+  ~bfloat16() = default;
+
+  // Explicit conversion functions
+  static storage_t from_float(const float &a);
+  static float to_float(const storage_t &a);
+
+  // Convert from float to bfloat16
+  bfloat16(const float &a);
+  bfloat16 &operator=(const float &a);
+
+  // Convert from bfloat16 to float
+  operator float() const;
+
+  // Get bfloat16 as uint16.
+  operator storage_t() const;
+
+  // Convert to bool type
+  explicit operator bool();
+
+  friend bfloat16 operator-(bfloat16 &bf) { /* ... */ }
+
+  // OP is: prefix ++, --
+  friend bfloat16 &operatorOP(bfloat16 &bf) { /* ... */ }
+
+  // OP is: postfix ++, --
+  friend bfloat16 operatorOP(bfloat16 &bf, int) { /* ... */ }
+
+  // OP is: +=, -=, *=, /=
+  friend bfloat16 &operatorOP(bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
+
+  // OP is: +, -, *, /
+  friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
+  template <typename T>
+  friend bfloat16 operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }
+  template <typename T>
+  friend bfloat16 operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }
+
+  // OP is: ==, !=, <, >, <=, >=
+  friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
+  template <typename T>
+  friend bool operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }
+  template <typename T>
+  friend bool operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }
+};
+
+} // namespace experimental
+} // namespace oneapi
+} // namespace ext
+} // namespace sycl
+----
+
+Table 1. Member functions of the `bfloat16` class.
+|===
+| Member Function | Description
+
+| `static storage_t from_float(const float &a);`
+| Explicitly convert from `float` to `bfloat16`.
+
+| `static float to_float(const storage_t &a);`
+| Interpret `a` as `bfloat16` and explicitly convert it to `float`.
+
+| `bfloat16(const float &a);`
+| Construct `bfloat16` from `float`. Converts `float` to `bfloat16`.
+
+| `bfloat16 &operator=(const float &a);`
+| Replace the value with `a` converted to `bfloat16`.
+
+| `operator float() const;`
+| Return the `bfloat16` value converted to `float`.
+
+| `operator storage_t() const;`
+| Return the `uint16_t` value whose bits represent the `bfloat16` value.
+
+| `explicit operator bool() { /* ... */ }`
+| Convert `bfloat16` to `bool` type. Return `false` if the value is equal to
+  zero, `true` otherwise.
+
+| `friend bfloat16 operator-(bfloat16 &bf) { /* ... */ }`
+| Construct a new instance of the `bfloat16` class with the negated value of
+  `bf`.
+
+| `friend bfloat16 &operatorOP(bfloat16 &bf) { /* ... */ }`
+| Perform an in-place prefix `OP` arithmetic operation on `bf`, assigning the
+  result to `bf`, and return `bf`.
+
+  OP is: `++, --`
+
+| `friend bfloat16 operatorOP(bfloat16 &bf, int) { /* ... */ }`
+| Perform an in-place postfix `OP` arithmetic operation on `bf`, assigning the
+  result to `bf`, and return a copy of `bf` taken before the operation.
+
+  OP is: `++, --`
+
+| `friend bfloat16 &operatorOP(bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Perform an in-place `OP` arithmetic operation between `lhs` and `rhs`,
+  assigning the result to `lhs`, and return `lhs`.
+
+  OP is: `+=, -=, *=, /=`
+
+| `friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Construct a new instance of the `bfloat16` class with the value of the new
+  `bfloat16` instance being the result of an OP arithmetic operation between
+  the `lhs` `bfloat16` and `rhs` `bfloat16` values.
+
+  OP is: `+, -, *, /`
+
+| `template <typename T>
+  friend bfloat16 operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }`
+| Construct a new instance of the `bfloat16` class with the value of the new
+  `bfloat16` instance being the result of an OP arithmetic operation between
+  the `lhs` `bfloat16` value and `rhs` of template type `T`. Type `T` must be
+  convertible to `float`.
+
+  OP is: `+, -, *, /`
+
+| `template <typename T>
+  friend bfloat16 operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Construct a new instance of the `bfloat16` class with the value of the new
+  `bfloat16` instance being the result of an OP arithmetic operation between
+  the `lhs` of template type `T` and the `rhs` `bfloat16` value. Type `T` must
+  be convertible to `float`.
+
+  OP is: `+, -, *, /`
+
+| `friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Perform comparison operation OP between `lhs` `bfloat16` and `rhs` `bfloat16`
+  values and return the result as a boolean value.
+
+  OP is: `==, !=, <, >, <=, >=`
+
+| `template <typename T>
+  friend bool operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }`
+| Perform comparison operation OP between `lhs` `bfloat16` and `rhs` of
+  template type `T` and return the result as a boolean value. Type `T` must be
+  convertible to `float`.
+
+  OP is: `==, !=, <, >, <=, >=`
+
+| `template <typename T>
+  friend bool operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Perform comparison operation OP between `lhs` of template type `T` and `rhs`
+  `bfloat16` value and return the result as a boolean value. Type `T` must be
+  convertible to `float`.
+
+  OP is: `==, !=, <, >, <=, >=`
+|===
+
+== Example
+
+[source]
+----
+#include <sycl/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+
+using sycl::ext::oneapi::experimental::bfloat16;
+
+bfloat16 operator+(const bfloat16 &lhs, const bfloat16 &rhs) {
+  return static_cast<float>(lhs) + static_cast<float>(rhs);
+}
+
+float foo(float a, float b) {
+  // Convert from float to bfloat16.
+  bfloat16 A {a};
+  bfloat16 B {b};
+
+  // Convert A and B from bfloat16 to float, do addition on floating-point
+  // numbers, then convert the result to bfloat16 and store it in C.
+  bfloat16 C = A + B;
+
+  // Return the result converted from bfloat16 to float.
+  return C;
+}
+
+int main (int argc, char *argv[]) {
+  float data[3] = {7.0, 8.1, 0.0};
+  sycl::device dev;
+  sycl::queue deviceQueue{dev};
+  sycl::buffer<float, 1> buf {data, sycl::range<1> {3}};
+
+  if (dev.has(sycl::aspect::ext_oneapi_bfloat16)) {
+    deviceQueue.submit ([&] (sycl::handler& cgh) {
+      auto numbers = buf.get_access<sycl::access::mode::read_write> (cgh);
+      cgh.single_task ([=] () {
+        numbers[2] = foo(numbers[0], numbers[1]);
+      });
+    });
+  }
+  return 0;
+}
+----
+
+== Issues
+
+None.
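For reference, the following non-normative sketch (an editorial illustration, not
part of this extension's API) shows one common way to implement the
round-to-nearest-even `float` to `bfloat16` truncation that `from_float` and
`to_float` describe. Real implementations typically map to device intrinsics
instead, and NaN handling is omitted here for brevity.

[source]
----
#include <cstdint>
#include <cstring>

// Round-to-nearest-even conversion from float to the bfloat16 bit pattern.
inline std::uint16_t float_to_bfloat16_rne(float f) {
  std::uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));          // reinterpret float as uint32
  std::uint32_t lsb = (bits >> 16) & 1u;         // last mantissa bit that is kept
  bits += 0x7FFFu + lsb;                         // bias so that ties round to even
  return static_cast<std::uint16_t>(bits >> 16); // keep sign, exponent, 7 mantissa bits
}

// The reverse direction: widen the 16 stored bits back to a float.
inline float bfloat16_to_float(std::uint16_t h) {
  std::uint32_t bits = static_cast<std::uint32_t>(h) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}
----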
+ +== Revision History + +[cols="5,15,15,70"] +[grid="rows"] +[options="header"] +|======================================== +|Rev|Date|Author|Changes +|1|2021-08-02|Alexey Sotkin |Initial public working draft +|2|2021-08-17|Alexey Sotkin |Add explicit conversion functions + + Add operator overloadings + + Apply code review suggestions +|3|2021-08-18|Alexey Sotkin |Remove `uint16_t` constructor +|4|2022-03-07|Aidan Belton and Jack Kirk |Switch from Intel vendor specific to oneapi +|======================================== diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc new file mode 100644 index 0000000000000..1e79434a51815 --- /dev/null +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc @@ -0,0 +1,122 @@ += sycl_ext_oneapi_fma_relu + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + + +== Notice + +[%hardbreaks] +Copyright (C) 2022-2022 Intel Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues + +or contact hugh 'dot' delaney 'at' codeplay 'dot' com. + +== Dependencies + +This extension is written against the SYCL 2020 revision 4 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. + +For the `bfloat16` cases this extension depends on the following other SYCL +extensions: + +* link:./sycl_ext_intel_bf16_conversion.asciidoc[ + sycl_ext_*_bf16_conversion] + +For the `half` cases this extension requires the runtime aspect +`sycl::aspect::fp16`. + +== Contributors + +* Hugh Delaney + +== Status + +This is a proposed extension specification, intended to gather community +feedback. Interfaces defined in this specification may not be implemented yet +or may be in a preliminary state. The specification itself may also change in +incompatible ways before it is finalized. *Shipping software products should +not rely on APIs defined in this specification.* + +[NOTE] +==== +This extension is currently implemented in {dpcpp} only for GPU devices and +only when using the CUDA backend. Attempting to use this extension in +kernels that run on other devices or backends may result in undefined behavior. +Be aware that the compiler is not able to issue a diagnostic to warn you if +this happens. +==== + + +== Overview + +This extension introduces the `fma_relu` function for datatypes `sycl::half`, +`bfloat16` and `bfloat16x2`. `bfloat16` refers to the bfloat16 class from +the `sycl_ext_oneapi_bfloat16` extension. `bfloat16x2` has not yet been +implemented as a class so this `fma_relu` extension uses `uint32_t` as a +storage type for `bfloat16x2`. This will be changed once the `bfloat16x2` +class has been implemented. 
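As an informal illustration only (not part of the specification), the sketch
below shows how the `sycl::half` overload of `fma_relu` might be called from a
kernel. The header paths, queue setup and buffer handling are assumptions made
for this example rather than requirements of the extension.

```c++
#include <sycl/sycl.hpp>
#include <sycl/ext/oneapi/experimental/builtins.hpp>

namespace exp = sycl::ext::oneapi::experimental;

int main() {
  sycl::half data[3] = {2.0f, -3.0f, 0.0f};
  sycl::queue q;
  {
    sycl::buffer<sycl::half, 1> buf{data, sycl::range<1>{3}};
    q.submit([&](sycl::handler &cgh) {
      auto acc = buf.get_access<sycl::access::mode::read_write>(cgh);
      cgh.single_task([=]() {
        sycl::half a = acc[0], b = acc[1], c = acc[2];
        // 2.0 * -3.0 + 0.0 = -6.0, which the ReLU clamps to 0.
        acc[2] = exp::fma_relu(a, b, c);
      });
    });
  } // buffer destructor waits for the kernel and copies the result back
  return 0;
}
```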
+ +== Specification + +```c++ +namespace sycl::ext::oneapi::experimental { + +// Available when T is sycl::half, uint16_t (bfloat16) or uint32_t (bfloat16x2) +template +T fma_relu(T a, T b, T c); +} +``` + +`fma_relu` returns `a * b + c > 0 ? a * b + c : 0`. + +=== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification. An implementation supporting this extension must predefine the +macro `SYCL_EXT_ONEAPI_FMA_RELU` to one of the values defined in the table +below. Applications can test for the existence of this macro to determine if +the implementation supports this feature, or applications can test the macro's +value to determine which of the extension's features the implementation +supports. + +If `fma_relu` is to be used with either the `bf16` or `bf16x2` datatypes, then +an implementation must additionally predefine the macro +`SYCL_EXT_ONEAPI_BFLOAT16`, as detailed in +link:./sycl_ext_intel_bf16_conversion.asciidoc[ + sycl_ext_oneapi_bfloat16]. + + +[%header,cols="1,5"] +|=== +|Value +|Description + +|1 +|The APIs of this experimental extension are not versioned, so the + feature-test macro always has this value. +|=== + diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 3c6ce1c639960..2d0b2f0036cb3 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -755,6 +755,57 @@ __spirv_ocl_printf(const __attribute__((opencl_constant)) char *Format, ...); extern SYCL_EXTERNAL int __spirv_ocl_printf(const char *Format, ...); #endif +extern SYCL_EXTERNAL _Float16 __clc_fma_relu(_Float16, _Float16, _Float16); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 2> + __clc_fma_relu(__ocl_vec_t<_Float16, 2>, __ocl_vec_t<_Float16, 2>, + __ocl_vec_t<_Float16, 2>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 3> + __clc_fma_relu(__ocl_vec_t<_Float16, 3>, __ocl_vec_t<_Float16, 3>, + __ocl_vec_t<_Float16, 3>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 4> + __clc_fma_relu(__ocl_vec_t<_Float16, 4>, __ocl_vec_t<_Float16, 4>, + __ocl_vec_t<_Float16, 4>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 8> + __clc_fma_relu(__ocl_vec_t<_Float16, 8>, __ocl_vec_t<_Float16, 8>, + __ocl_vec_t<_Float16, 8>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 16> + __clc_fma_relu(__ocl_vec_t<_Float16, 16>, __ocl_vec_t<_Float16, 16>, + __ocl_vec_t<_Float16, 16>); + +extern SYCL_EXTERNAL uint16_t __clc_fma_relu(uint16_t, uint16_t, uint16_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); + +extern SYCL_EXTERNAL uint32_t __clc_fma_relu(uint32_t, uint32_t, uint32_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); + // Native builtin extension extern SYCL_EXTERNAL float 
__clc_native_tanh(float); diff --git a/sycl/include/CL/sycl/detail/builtins.hpp b/sycl/include/CL/sycl/detail/builtins.hpp index f4f0475ea905b..a6ca592bacb42 100644 --- a/sycl/include/CL/sycl/detail/builtins.hpp +++ b/sycl/include/CL/sycl/detail/builtins.hpp @@ -20,6 +20,7 @@ #ifdef __SYCL_DEVICE_ONLY__ #define __FUNC_PREFIX_OCL __spirv_ocl_ #define __FUNC_PREFIX_CORE __spirv_ +#define __FUNC_PREFIX_GENERIC __clc_ #define __SYCL_EXTERN_IT1(Ret, prefix, call, Arg1) #define __SYCL_EXTERN_IT2(Ret, prefix, call, Arg1, Arg2) #define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg) @@ -27,6 +28,7 @@ #else #define __FUNC_PREFIX_OCL #define __FUNC_PREFIX_CORE +#define __FUNC_PREFIX_GENERIC #define __SYCL_EXTERN_IT1(Ret, prefix, call, Arg) \ extern Ret __SYCL_PPCAT(prefix, call)(Arg) #define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg) \ @@ -134,6 +136,7 @@ __SYCL_MAKE_CALL_ARG1(fabs, __FUNC_PREFIX_OCL) __SYCL_MAKE_CALL_ARG2(fdim, __FUNC_PREFIX_OCL) __SYCL_MAKE_CALL_ARG1(floor, __FUNC_PREFIX_OCL) __SYCL_MAKE_CALL_ARG3(fma, __FUNC_PREFIX_OCL) +__SYCL_MAKE_CALL_ARG3(fma_relu, __FUNC_PREFIX_GENERIC) __SYCL_MAKE_CALL_ARG2(fmax, __FUNC_PREFIX_OCL) __SYCL_MAKE_CALL_ARG2(fmin, __FUNC_PREFIX_OCL) __SYCL_MAKE_CALL_ARG2(fmod, __FUNC_PREFIX_OCL) diff --git a/sycl/include/CL/sycl/feature_test.hpp.in b/sycl/include/CL/sycl/feature_test.hpp.in index 6c58b98d29f1f..516464e1b9fe3 100644 --- a/sycl/include/CL/sycl/feature_test.hpp.in +++ b/sycl/include/CL/sycl/feature_test.hpp.in @@ -55,7 +55,7 @@ namespace sycl { #define SYCL_EXT_ONEAPI_SUB_GROUP 1 #define SYCL_EXT_ONEAPI_PROPERTIES 1 #define SYCL_EXT_ONEAPI_NATIVE_MATH 1 -#define SYCL_EXT_INTEL_BF16_CONVERSION 1 +#define SYCL_EXT_ONEAPI_BF16_CONVERSION 1 #define SYCL_EXT_INTEL_DATAFLOW_PIPES 1 #ifdef __clang__ #if __has_extension(sycl_extended_atomics) diff --git a/sycl/include/sycl/ext/intel/experimental/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp similarity index 90% rename from sycl/include/sycl/ext/intel/experimental/bfloat16.hpp rename to sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp index 5a51f3746e225..1190c80631928 100644 --- a/sycl/include/sycl/ext/intel/experimental/bfloat16.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp @@ -14,10 +14,10 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace ext { -namespace intel { +namespace oneapi { namespace experimental { -class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { +class bfloat16 { using storage_t = uint16_t; storage_t value; @@ -29,7 +29,11 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { // Explicit conversion functions static storage_t from_float(const float &a) { #if defined(__SYCL_DEVICE_ONLY__) +#if defined(__NVPTX__) + return __nvvm_f2bf16_rn(a); +#else return __spirv_ConvertFToBF16INTEL(a); +#endif #else throw exception{errc::feature_not_supported, "Bfloat16 conversion is not supported on host device"}; @@ -37,7 +41,14 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { } static float to_float(const storage_t &a) { #if defined(__SYCL_DEVICE_ONLY__) +#if defined(__NVPTX__) + uint32_t y = a; + y = y << 16; + float *res = reinterpret_cast(&y); + return *res; +#else return __spirv_ConvertBF16ToFINTEL(a); +#endif #else throw exception{errc::feature_not_supported, "Bfloat16 conversion is not supported on host device"}; @@ -70,7 +81,16 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { // Unary minus operator overloading friend bfloat16 
operator-(bfloat16 &lhs) { - return bfloat16{-to_float(lhs.value)}; +#if defined(__SYCL_DEVICE_ONLY__) +#if defined(__NVPTX__) + return from_bits(__nvvm_neg_bf16(lhs.value)); +#else + return bfloat16{-__spirv_ConvertBF16ToFINTEL(lhs.value)}; +#endif +#else + throw exception{errc::feature_not_supported, + "Bfloat16 unary minus is not supported on host device"}; +#endif } // Increment and decrement operators overloading @@ -143,7 +163,7 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { }; } // namespace experimental -} // namespace intel +} // namespace oneapi } // namespace ext } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp index c8fa033d8c79e..5c272e4f8132b 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp @@ -16,6 +16,8 @@ #include +#include + // TODO Decide whether to mark functions with this attribute. #define __NOEXC /*noexcept*/ @@ -25,12 +27,30 @@ #define __SYCL_CONSTANT_AS #endif +// TODO Decide whether to mark functions with this attribute. +#define __NOEXC /*noexcept*/ + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace ext { namespace oneapi { namespace experimental { +// fma_relu returns a * b + c > 0 ? a * b + c : 0 +template +sycl::detail::enable_if_t::value || + sycl::detail::is_ugenint::value || + std::is_same::value, + T> +fma_relu(T a, T b, T c) __NOEXC { + if constexpr (std::is_same::value) { + return bfloat16::from_bits( + __sycl_std::__invoke_fma_relu(a.raw(), b.raw(), c.raw())); + } else { + return __sycl_std::__invoke_fma_relu(a, b, c); + } +} + // Provides functionality to print data from kernels in a C way: // - On non-host devices this function is directly mapped to printf from // OpenCL C @@ -40,9 +60,9 @@ namespace experimental { // Please refer to corresponding section in OpenCL C specification to find // information about format string and its differences from standard C rules. // -// This function is placed under 'experimental' namespace on purpose, because it -// has too much caveats you need to be aware of before using it. Please find -// them below and read carefully before using it: +// This function is placed under 'experimental' namespace on purpose, because +// it has too much caveats you need to be aware of before using it. Please +// find them below and read carefully before using it: // // - According to the OpenCL spec, the format string must be // resolvable at compile time i.e. cannot be dynamically created by the @@ -52,19 +72,19 @@ namespace experimental { // address space. The constant address space declarations might get "tricky", // see test/built-ins/printf.cpp for examples. // In simple cases (compile-time known string contents, direct declaration of -// the format literal inside the printf call, etc.), the compiler should handle -// the automatic address space conversion. +// the format literal inside the printf call, etc.), the compiler should +// handle the automatic address space conversion. // FIXME: Once the extension to generic address space is fully supported, the // constant AS version may need to be deprecated. 
// -// - The format string is interpreted according to the OpenCL C spec, where all -// data types has fixed size, opposed to C++ types which doesn't guarantee +// - The format string is interpreted according to the OpenCL C spec, where +// all data types has fixed size, opposed to C++ types which doesn't guarantee // the exact width of particular data types (except, may be, char). This might // lead to unexpected result, for example: %ld in OpenCL C means that printed -// argument has 'long' type which is 64-bit wide by the OpenCL C spec. However, -// by C++ spec long is just at least 32-bit wide, so, you need to ensure (by -// performing a cast, for example) that if you use %ld specifier, you pass -// 64-bit argument to the cl::sycl::experimental::printf +// argument has 'long' type which is 64-bit wide by the OpenCL C spec. +// However, by C++ spec long is just at least 32-bit wide, so, you need to +// ensure (by performing a cast, for example) that if you use %ld specifier, +// you pass 64-bit argument to the cl::sycl::experimental::printf // // - OpenCL spec defines several additional features, like, for example, 'v' // modifier which allows to print OpenCL vectors: note that these features are diff --git a/sycl/source/detail/builtins_math.cpp b/sycl/source/detail/builtins_math.cpp index 5a78d6cb80a5b..9c7c914db0df1 100644 --- a/sycl/source/detail/builtins_math.cpp +++ b/sycl/source/detail/builtins_math.cpp @@ -359,6 +359,28 @@ MAKE_1V_2V_3V(fma, s::cl_float, s::cl_float, s::cl_float, s::cl_float) MAKE_1V_2V_3V(fma, s::cl_double, s::cl_double, s::cl_double, s::cl_double) MAKE_1V_2V_3V(fma, s::cl_half, s::cl_half, s::cl_half, s::cl_half) +// fma_relu +__SYCL_EXPORT s::cl_half fma_relu(s::cl_half a, s::cl_half b, + s::cl_half c) __NOEXC { + auto ans = std::fma(a, b, c); + return (ans > 0) ? 
ans : 0; +} +__SYCL_EXPORT s::cl_ushort fma_relu(s::cl_ushort a, s::cl_ushort b, + s::cl_ushort c) __NOEXC { + // TODO implement this once bfloat16 datatype is supported on host + throw std::runtime_error( + "fma_relu not supported on host for bfloat16 datatype."); +} +__SYCL_EXPORT s::cl_uint fma_relu(s::cl_uint a, s::cl_uint b, + s::cl_uint c) __NOEXC { + // TODO implement this once bfloat16x2 datatype is supported on host + throw std::runtime_error( + "fma_relu not supported on host for bfloat16x2 datatype."); +} +MAKE_1V_2V_3V(fma_relu, s::cl_ushort, s::cl_ushort, s::cl_ushort, s::cl_ushort) +MAKE_1V_2V_3V(fma_relu, s::cl_uint, s::cl_uint, s::cl_uint, s::cl_uint) +MAKE_1V_2V_3V(fma_relu, s::cl_half, s::cl_half, s::cl_half, s::cl_half) + // fmax __SYCL_EXPORT s::cl_float fmax(s::cl_float x, s::cl_float y) __NOEXC { return std::fmax(x, y); diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 79d58722b4f4f..c2071e1df17a1 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -2932,6 +2932,27 @@ _ZN2cl10__host_std7u_rhaddEhh _ZN2cl10__host_std7u_rhaddEjj _ZN2cl10__host_std7u_rhaddEmm _ZN2cl10__host_std7u_rhaddEtt +_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi3EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi8EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl6detail9half_impl4halfES4_S4_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi2EEES6_S6_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi1EEES6_S6_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi3EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi16EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi4EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi1EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi16EEES6_S6_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi4EEES6_S6_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi3EEES6_S6_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi16EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi8EEES6_S6_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi2EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi1EEES3_S3_ +_ZN2cl10__host_std8fma_reluEttt +_ZN2cl10__host_std8fma_reluEjjj +_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi4EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi8EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi2EEES3_S3_ _ZN2cl10__host_std8IsFiniteENS_4sycl3vecINS1_6detail9half_impl4halfELi16EEE _ZN2cl10__host_std8IsFiniteENS_4sycl3vecINS1_6detail9half_impl4halfELi1EEE _ZN2cl10__host_std8IsFiniteENS_4sycl3vecINS1_6detail9half_impl4halfELi2EEE diff --git a/sycl/test/extensions/bfloat16.cpp b/sycl/test/extensions/bfloat16.cpp index 80696140b2dc8..847be9508bdb5 100644 --- a/sycl/test/extensions/bfloat16.cpp +++ b/sycl/test/extensions/bfloat16.cpp @@ -2,10 +2,10 @@ // UNSUPPORTED: cuda || hip_amd -#include +#include #include -using sycl::ext::intel::experimental::bfloat16; +using sycl::ext::oneapi::experimental::bfloat16; SYCL_EXTERNAL uint16_t some_bf16_intrinsic(uint16_t x, uint16_t y); SYCL_EXTERNAL void foo(long x, sycl::half y);