Closed

Commits (24)
025cf7e
Added bfloat16 support for cuda backend.
JackAKirk Jan 25, 2022
66b4e33
deleted intel namespace bfloat16.
JackAKirk Jan 25, 2022
2d04406
Format.
JackAKirk Jan 25, 2022
9418f74
Changed extension macro name.
JackAKirk Jan 25, 2022
65fddfa
Merge branch 'sycl' into bf16-cvt-ext
JackAKirk Feb 17, 2022
4d99f3f
fixed test.
JackAKirk Feb 17, 2022
3982001
Used neg ptx7.0 builtin for unary minus
JackAKirk Mar 4, 2022
450e1b5
Adding fma_relu extension
Mar 7, 2022
8d2d11f
Replaced SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc with SYCL_EXT_ONEAPI…
JackAKirk Mar 7, 2022
a514505
Remove redundant include
Mar 7, 2022
d8bc53f
Merge branch 'sycl' into bf16-cvt-ext
JackAKirk Mar 8, 2022
37a18d7
Adding symbols to linux dump
Mar 11, 2022
a7b2fdc
Merge main into branch
Mar 11, 2022
7b40302
Responding to comments
hdelan Mar 14, 2022
2f9b7d7
Merge branch 'sycl' into bf16-cvt-ext
JackAKirk Mar 15, 2022
8a29c44
Renamed extension to cover all bfloat16 funct.
JackAKirk Mar 15, 2022
f53577f
Merge remote-tracking branch 'Jack/bf16-cvt-ext' into add_relu
Apr 4, 2022
49aca06
Making fma_relu accept the bfloat16 class
Apr 4, 2022
02cbc5b
Merge branch 'add_relu' of https://github.com/hdelan/llvm into add_relu
Apr 4, 2022
9fb55df
Update doc
Apr 4, 2022
358c943
Update sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asci…
hdelan Apr 4, 2022
7c6d728
Update sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
hdelan Apr 4, 2022
390ae97
Update sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
hdelan Apr 4, 2022
f08791a
Using bits instead of reinterpret cast
Apr 4, 2022
16 changes: 16 additions & 0 deletions libclc/generic/include/spirv/spirv_builtins.h
@@ -14146,6 +14146,22 @@ _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t
__spirv_ocl_fma(__clc_vec16_fp16_t, __clc_vec16_fp16_t, __clc_vec16_fp16_t);
#endif

#ifdef cl_khr_fp16
_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_fp16_t __clc_fma_relu(__clc_fp16_t,
__clc_fp16_t,
__clc_fp16_t);
_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp16_t
__clc_fma_relu(__clc_vec2_fp16_t, __clc_vec2_fp16_t, __clc_vec2_fp16_t);
_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec3_fp16_t
__clc_fma_relu(__clc_vec3_fp16_t, __clc_vec3_fp16_t, __clc_vec3_fp16_t);
_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec4_fp16_t
__clc_fma_relu(__clc_vec4_fp16_t, __clc_vec4_fp16_t, __clc_vec4_fp16_t);
_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec8_fp16_t
__clc_fma_relu(__clc_vec8_fp16_t, __clc_vec8_fp16_t, __clc_vec8_fp16_t);
_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t
__clc_fma_relu(__clc_vec16_fp16_t, __clc_vec16_fp16_t, __clc_vec16_fp16_t);
#endif

_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_fp32_t
__spirv_ocl_fmax(__clc_fp32_t, __clc_fp32_t);
_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp32_t
36 changes: 36 additions & 0 deletions libclc/generic/libspirv/float16.cl
@@ -4540,6 +4540,42 @@ __spirv_ocl_fma(__clc_vec16_float16_t args_0, __clc_vec16_float16_t args_1,
as_half16(args_2));
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t __clc_fma_relu(
__clc_float16_t args_0, __clc_float16_t args_1, __clc_float16_t args_2) {
return __clc_fma_relu(as_half(args_0), as_half(args_1), as_half(args_2));
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec2_fp16_t
__clc_fma_relu(__clc_vec2_float16_t args_0, __clc_vec2_float16_t args_1,
__clc_vec2_float16_t args_2) {
return __clc_fma_relu(as_half2(args_0), as_half2(args_1), as_half2(args_2));
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec3_fp16_t
__clc_fma_relu(__clc_vec3_float16_t args_0, __clc_vec3_float16_t args_1,
__clc_vec3_float16_t args_2) {
return __clc_fma_relu(as_half3(args_0), as_half3(args_1), as_half3(args_2));
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec4_fp16_t
__clc_fma_relu(__clc_vec4_float16_t args_0, __clc_vec4_float16_t args_1,
__clc_vec4_float16_t args_2) {
return __clc_fma_relu(as_half4(args_0), as_half4(args_1), as_half4(args_2));
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec8_fp16_t
__clc_fma_relu(__clc_vec8_float16_t args_0, __clc_vec8_float16_t args_1,
__clc_vec8_float16_t args_2) {
return __clc_fma_relu(as_half8(args_0), as_half8(args_1), as_half8(args_2));
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec16_fp16_t
__clc_fma_relu(__clc_vec16_float16_t args_0, __clc_vec16_float16_t args_1,
__clc_vec16_float16_t args_2) {
return __clc_fma_relu(as_half16(args_0), as_half16(args_1),
as_half16(args_2));
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t
__spirv_ocl_fmax(__clc_float16_t args_0, __clc_float16_t args_1) {
return __spirv_ocl_fmax(as_half(args_0), as_half(args_1));
120 changes: 120 additions & 0 deletions sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc
@@ -0,0 +1,120 @@
= sycl_ext_oneapi_fma_relu

:source-highlighter: coderay
:coderay-linenums-mode: table

// This section needs to be after the document title.
:doctype: book
:toc2:
:toc: left
:encoding: utf-8
:lang: en
:dpcpp: pass:[DPC++]

// Set the default source code type in this document to C++,
// for syntax highlighting purposes. This is needed because
// docbook uses c++ and html5 uses cpp.
:language: {basebackend@docbook:c++:cpp}


== Notice

[%hardbreaks]
Copyright (C) 2022-2022 Intel Corporation. All rights reserved.

Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by
permission by Khronos.

== Contact

To report problems with this extension, please open a new issue at:

https://github.com/intel/llvm/issues

or contact hugh 'dot' delaney 'at' codeplay 'dot' com.

== Dependencies

This extension is written against the SYCL 2020 revision 4 specification. All
references below to the "core SYCL specification" or to section numbers in the
SYCL specification refer to that revision.

For the `bfloat16` cases this extension depends on the following other SYCL
extensions:

* link:./sycl_ext_intel_bf16_conversion.asciidoc[
sycl_ext_*_bf16_conversion]

For the `half` cases this extension requires the runtime aspect
`sycl::aspect::fp16`.

== Contributors

* Hugh Delaney

== Status

This is a proposed extension specification, intended to gather community
feedback. Interfaces defined in this specification may not be implemented yet
or may be in a preliminary state. The specification itself may also change in
incompatible ways before it is finalized. *Shipping software products should
not rely on APIs defined in this specification.*

[NOTE]
====
This extension is currently implemented in {dpcpp} only for GPU devices and
only when using the CUDA backend. Attempting to use this extension in
kernels that run on other devices or backends may result in undefined behavior.
Be aware that the compiler is not able to issue a diagnostic to warn you if
this happens.
====
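
For illustration only, a minimal sketch of a runtime guard matching the note
above: it checks that the selected device is a GPU on the CUDA backend and
reports `fp16` support before the extension is used. The guard itself is an
assumption added for illustration, not something this extension mandates, and
the `ext_oneapi_cuda` enumerator name may vary between DPC++ versions.

```c++
#include <CL/sycl.hpp>

// Returns true if `dev` looks like a device on which this extension is
// expected to work: a CUDA-backend GPU that reports the fp16 aspect.
bool can_use_fma_relu(const sycl::device &dev) {
  return dev.is_gpu() &&
         dev.get_backend() == sycl::backend::ext_oneapi_cuda &&
         dev.has(sycl::aspect::fp16);
}
```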


== Overview

This extension introduces the `fma_relu` function for datatypes `sycl::half`,
`bfloat16` and `bfloat16x2`. `bfloat16` and `bfloat16x2` refer to the bfloat16
class from the `sycl_ext_*_bf16_conversion` extension, and currently use
`uint16_t` and `uint32_t`, respectively, as storage types.

Contributor:

I think this came up in another review also, but I forget which one. There is
currently no bfloat16x2 type defined in sycl_ext_*_bf16_conversion (soon to be
renamed sycl_ext_oneapi_bfloat16).

Contributor Author:

These changes have been made to the doc. fma_relu now accepts the bfloat16
class instead of uint16_t. The bfloat16x2 version still takes uint32_t as a
storage type, but the doc explains that this will change once the bfloat16x2
class has been implemented as an extension.


```c++
namespace sycl::ext::oneapi::experimental {

// Available when T is sycl::half, uint16_t (bfloat16) or uint32_t (bfloat16x2)
template <typename T>
T fma_relu(T a, T b, T c);

}
```

Contributor:

As part of extending math functions, you are already adding support for fma,
fmax, etc. to bfloat16/half variants. What would be the benefit of adding a
specific fma_relu over doing fma followed by fmax(res, 0), returning 0 when the
result is negative?

This fma_relu extension introduces two big "new" territories to DPC++:

1. Introducing ML activation functions to DPC++: the issue is that these
functions are numerous, both the ones we know of and the ones we don't know
about yet. Is the expectation to keep adding these as free functions in DPC++?
relu is an easy one that can be written using max. What about the others? Why
is relu so special here?

2. Introducing fusions to DPC++: fma_relu tells the compiler these two
functions can be fused together. While this can be important in libraries, is
this really necessary for DPC++? DPC++ has a compiler that can detect that this
kind of relu (or another function) follows an fma and can trigger the fusion
the user intended.

One other open question: if we end up deciding to have such ML-specific
functions in DPC++, what should be the objects that use them? Scalar, vector,
marray? Why is the only vector type here bfloat16x2? Should this be put under
the joint matrix umbrella as another potential tensor-hardware-accelerated
function?

Contributor Author (hdelan, Apr 7, 2022):

These are valid points.

The primary benefit of this sort of extension is that it allows users to
concisely target builtins specific to a particular backend. Since the fma_relu
function is in the CUDA math headers, we think it is appropriate to have it in
DPC++ as well, for ease of porting code etc. It is our feeling that since this
extension targets just the CUDA backend, it will always be an extension and
will not enter the core spec libraries. A DPC++ extension should (as much as
possible) give users access to all of the functionality of the backend API, but
not necessarily more. Therefore we do not need to be concerned about making
fma_relu work for other backends (unless they also have a similar builtin to
target).

The question of fusions is an interesting one, and something we will discuss a
bit internally. Perhaps in the long run this is the approach that will be used
in some instances.

The objects that use the function should be scalar and vector. The reason that
bfloat16 has not been vectorized is that the vector types for the bfloat16
class have not been implemented yet. Once they are implemented we will add the
bfloat16 vec versions of this function. bfloat16x2 is vectorized since we are
relying on an older implementation of bf16x2 which uses uint32_t as its storage
type.

However, we think that for the time being we are interested in representing
backend-specific features in DPC++, and since these features are exposed to the
user as a free function in the CUDA headers, we think this is reason enough to
bring this function into DPC++ as an extension.

Contributor:

Can you share a link to the CUDA math headers that contains the full list of
math/ML functions?

Contributor Author:

I can't find a link to the headers online, but you can find __hfma_relu in any
regular install of CUDA 11.6:

/usr/local/cuda-11.6 $ grep "fma_relu" * -HnrI
include/cuda_bf16.h:3216:__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
include/cuda_bf16.hpp:2142:__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
include/cuda_fp16.hpp:2453:__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c)
include/cuda_fp16.h:3251:__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c);
targets/x86_64-linux/include/cuda_bf16.h:3216:__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
targets/x86_64-linux/include/cuda_bf16.hpp:2142:__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
targets/x86_64-linux/include/cuda_fp16.hpp:2453:__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c)
targets/x86_64-linux/include/cuda_fp16.h:3251:__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c);

Contributor Author:

What do you think the approach should be with these functions?

Should we:

  1. Implement bfloat16 versions of the core SYCL math functions. In the
  meantime we could make sure that when, for instance, exp(my_bf16) is called,
  the argument is cast to a float to ensure correctness, before the bfloat16
  specialization of exp is fully implemented. The problem is that there are a
  lot of core math functions, and maybe there wouldn't be a clear distinction
  between those that have a native bf16 version and those that rely on casting.
  For functions that are not in the core SYCL math library, new ones could be
  added as extensions, as is the case for fma_relu.
  2. Do what CUDA does and make new free functions specifically catering to
  bf16, like hexp for instance (CUDA uses the same functions for bf16 and
  half). This involves introducing more and more functions, and the list is
  likely to get longer.
  3. Don't try to add support for these bf16 functions.

It is worth noting that not all of the functions listed above have their own
builtins, but it seems that all of them produce far less PTX than their float
implementation counterparts, so it would be worthwhile calling these special
bf16 functions in some way.

The reason we have added fma_relu is so that users can target the PTX builtin
relating to fma_relu. We did this relatively blindly because we thought it was
a good idea to have access to all PTX builtins, which we still consider
correct.
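
For reference, the non-fused formulation the reviewer suggests above can be
written with the existing SYCL math functions. This sketch is illustrative
only and is not part of the proposed extension:

```c++
#include <CL/sycl.hpp>

// relu(fma(a, b, c)) expressed with plain sycl::fma and sycl::fmax,
// without the PTX fma_relu builtin this extension targets.
template <typename T>
T fma_then_relu(T a, T b, T c) {
  T r = sycl::fma(a, b, c);        // a * b + c
  return sycl::fmax(r, T(0.0f));   // clamp negative results to zero
}
```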
Contributor:

Wouldn't it make more sense for this function to take the bfloat16 or
bfloat16x2 types themselves rather than uint16_t and uint32_t?

Also a nit about the organization of this spec: the "Specification" section
below is the formal specification of your extension. The description of the
fma_relu function should be there, not in the "Overview" section.

Contributor Author:

> Wouldn't it make more sense for this function to take the bfloat16 or
> bfloat16x2 types themselves rather than uint16_t and uint32_t?

I am following the convention used by all of these bfloat16 PRs: #5748 #5724,
which use uint16_t and uint32_t as storage types. Perhaps this mention of
storage types doesn't belong in this document. Should I remove it?

> The description of the fma_relu function should be there, not in the
> "Overview" section.

Thanks, have swapped that into the Specification section.

Contributor:

I talked with @dkhaldi about the Matrix API, and she says they will add APIs
that take the bfloat16 type soon, but they will keep the uint16_t versions also
for a transition period. Does it make sense to add bfloat16 versions of
fma_relu to this PR, or will you do that in a subsequent one?

Contributor:

> I talked with @dkhaldi about the Matrix API, and she says they will add APIs
> that take the bfloat16 type soon, but they will keep the uint16_t versions
> also for a transition period. Does it make sense to add bfloat16 versions of
> fma_relu to this PR, or will you do that in a subsequent one?

Good point, cc @hdelan, we should be able to add bfloat16 implementations of
the fma_relu functions in this PR provided that #5393 is merged. We do want the
bfloat16x2 cases too, but this will require the definition of a bfloat16x2
class / extension doc update first, analogous to bfloat16 in #5393, so the
corresponding bfloat16x2 implementations will probably be done in a separate
PR. For the joint_matrix API and other bfloat16 math builtins (fabs, fma, fmin,
fmax), the uint16_t implementations are already merged and we are already
working on follow-up PRs for the corresponding bfloat16 implementations.
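
As a hypothetical illustration of the direction discussed above (an overload
taking the `bfloat16` class rather than its `uint16_t` storage type), one
possible shape is a thin wrapper over the storage-type overload. The header
path, namespace, and the `from_bits`/`raw` members are assumptions based on
the bf16 conversion extension; the final API may differ.

```c++
// Hypothetical sketch only; not necessarily the API adopted by this PR.
#include <CL/sycl.hpp>
#include <sycl/ext/intel/experimental/bfloat16.hpp>
#include <sycl/ext/oneapi/experimental/builtins.hpp>

using bf16 = sycl::ext::intel::experimental::bfloat16;

bf16 fma_relu_bf16(bf16 a, bf16 b, bf16 c) {
  // Reuse the existing uint16_t storage-type overload on the raw bits.
  return bf16::from_bits(
      sycl::ext::oneapi::experimental::fma_relu(a.raw(), b.raw(), c.raw()));
}
```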


`fma_relu` returns `a * b + c > 0 ? a * b + c : 0`.
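
For concreteness, a minimal usage sketch of the `half` overload. The queue
setup and USM allocation are illustrative only and assume a device on which
this extension is supported:

```c++
#include <CL/sycl.hpp>
#include <sycl/ext/oneapi/experimental/builtins.hpp>

int main() {
  sycl::queue q;
  sycl::half *out = sycl::malloc_shared<sycl::half>(1, q);
  q.single_task([=] {
     sycl::half a = 2.0f, b = -3.0f, c = 1.0f;
     // a * b + c == -5, which is negative, so fma_relu returns 0.
     out[0] = sycl::ext::oneapi::experimental::fma_relu(a, b, c);
   }).wait();
  sycl::free(out, q);
  return 0;
}
```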

== Specification

=== Feature test macro

This extension provides a feature-test macro as described in the core SYCL
specification. An implementation supporting this extension must predefine the
macro `SYCL_EXT_ONEAPI_FMA_RELU` to one of the values defined in the table
below. Applications can test for the existence of this macro to determine if
the implementation supports this feature, or applications can test the macro's
value to determine which of the extension's features the implementation
supports.

If `fma_relu` is to be used with either the `bf16` or `bf16x2` datatypes, then
an implementation must additionally predefine the macro
`SYCL_EXT_INTEL_BF16_CONVERSION`, as detailed in
link:./sycl_ext_intel_bf16_conversion.asciidoc[
sycl_ext_*_bf16_conversion].


[%header,cols="1,5"]
|===
|Value
|Description

|1
|The APIs of this experimental extension are not versioned, so the
feature-test macro always has this value.
|===
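
A short sketch of how application code might guard its use of the extension
with these macros; the macro names are as specified above, while the
surrounding structure is purely illustrative:

```c++
// Compile-time guard for code that calls fma_relu.
#if defined(SYCL_EXT_ONEAPI_FMA_RELU)
  // The half overloads of fma_relu are available.
  #if defined(SYCL_EXT_INTEL_BF16_CONVERSION)
  // The bfloat16 / bfloat16x2 storage-type overloads may also be used.
  #endif
#else
  #error "This program requires the sycl_ext_oneapi_fma_relu extension."
#endif
```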

51 changes: 51 additions & 0 deletions sycl/include/CL/__spirv/spirv_ops.hpp
@@ -755,6 +755,57 @@ __spirv_ocl_printf(const __attribute__((opencl_constant)) char *Format, ...);
extern SYCL_EXTERNAL int __spirv_ocl_printf(const char *Format, ...);
#endif

extern SYCL_EXTERNAL _Float16 __clc_fma_relu(_Float16, _Float16, _Float16);
extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 2>
__clc_fma_relu(__ocl_vec_t<_Float16, 2>, __ocl_vec_t<_Float16, 2>,
__ocl_vec_t<_Float16, 2>);
extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 3>
__clc_fma_relu(__ocl_vec_t<_Float16, 3>, __ocl_vec_t<_Float16, 3>,
__ocl_vec_t<_Float16, 3>);
extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 4>
__clc_fma_relu(__ocl_vec_t<_Float16, 4>, __ocl_vec_t<_Float16, 4>,
__ocl_vec_t<_Float16, 4>);
extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 8>
__clc_fma_relu(__ocl_vec_t<_Float16, 8>, __ocl_vec_t<_Float16, 8>,
__ocl_vec_t<_Float16, 8>);
extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 16>
__clc_fma_relu(__ocl_vec_t<_Float16, 16>, __ocl_vec_t<_Float16, 16>,
__ocl_vec_t<_Float16, 16>);

extern SYCL_EXTERNAL uint16_t __clc_fma_relu(uint16_t, uint16_t, uint16_t);
extern SYCL_EXTERNAL __ocl_vec_t<uint16_t, 2>
__clc_fma_relu(__ocl_vec_t<uint16_t, 2>, __ocl_vec_t<uint16_t, 2>,
__ocl_vec_t<uint16_t, 2>);
extern SYCL_EXTERNAL __ocl_vec_t<uint16_t, 3>
__clc_fma_relu(__ocl_vec_t<uint16_t, 3>, __ocl_vec_t<uint16_t, 3>,
__ocl_vec_t<uint16_t, 3>);
extern SYCL_EXTERNAL __ocl_vec_t<uint16_t, 4>
__clc_fma_relu(__ocl_vec_t<uint16_t, 4>, __ocl_vec_t<uint16_t, 4>,
__ocl_vec_t<uint16_t, 4>);
extern SYCL_EXTERNAL __ocl_vec_t<uint16_t, 8>
__clc_fma_relu(__ocl_vec_t<uint16_t, 8>, __ocl_vec_t<uint16_t, 8>,
__ocl_vec_t<uint16_t, 8>);
extern SYCL_EXTERNAL __ocl_vec_t<uint16_t, 16>
__clc_fma_relu(__ocl_vec_t<uint16_t, 16>, __ocl_vec_t<uint16_t, 16>,
__ocl_vec_t<uint16_t, 16>);

extern SYCL_EXTERNAL uint32_t __clc_fma_relu(uint32_t, uint32_t, uint32_t);
extern SYCL_EXTERNAL __ocl_vec_t<uint32_t, 2>
__clc_fma_relu(__ocl_vec_t<uint32_t, 2>, __ocl_vec_t<uint32_t, 2>,
__ocl_vec_t<uint32_t, 2>);
extern SYCL_EXTERNAL __ocl_vec_t<uint32_t, 3>
__clc_fma_relu(__ocl_vec_t<uint32_t, 3>, __ocl_vec_t<uint32_t, 3>,
__ocl_vec_t<uint32_t, 3>);
extern SYCL_EXTERNAL __ocl_vec_t<uint32_t, 4>
__clc_fma_relu(__ocl_vec_t<uint32_t, 4>, __ocl_vec_t<uint32_t, 4>,
__ocl_vec_t<uint32_t, 4>);
extern SYCL_EXTERNAL __ocl_vec_t<uint32_t, 8>
__clc_fma_relu(__ocl_vec_t<uint32_t, 8>, __ocl_vec_t<uint32_t, 8>,
__ocl_vec_t<uint32_t, 8>);
extern SYCL_EXTERNAL __ocl_vec_t<uint32_t, 16>
__clc_fma_relu(__ocl_vec_t<uint32_t, 16>, __ocl_vec_t<uint32_t, 16>,
__ocl_vec_t<uint32_t, 16>);

#else // if !__SYCL_DEVICE_ONLY__

template <typename dataT>
3 changes: 3 additions & 0 deletions sycl/include/CL/sycl/detail/builtins.hpp
@@ -20,13 +20,15 @@
#ifdef __SYCL_DEVICE_ONLY__
#define __FUNC_PREFIX_OCL __spirv_ocl_
#define __FUNC_PREFIX_CORE __spirv_
#define __FUNC_PREFIX_GENERIC __clc_
#define __SYCL_EXTERN_IT1(Ret, prefix, call, Arg1)
#define __SYCL_EXTERN_IT2(Ret, prefix, call, Arg1, Arg2)
#define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg)
#define __SYCL_EXTERN_IT3(Ret, prefix, call, Arg1, Arg2, Arg3)
#else
#define __FUNC_PREFIX_OCL
#define __FUNC_PREFIX_CORE
#define __FUNC_PREFIX_GENERIC
#define __SYCL_EXTERN_IT1(Ret, prefix, call, Arg) \
extern Ret __SYCL_PPCAT(prefix, call)(Arg)
#define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg) \
@@ -134,6 +136,7 @@ __SYCL_MAKE_CALL_ARG1(fabs, __FUNC_PREFIX_OCL)
__SYCL_MAKE_CALL_ARG2(fdim, __FUNC_PREFIX_OCL)
__SYCL_MAKE_CALL_ARG1(floor, __FUNC_PREFIX_OCL)
__SYCL_MAKE_CALL_ARG3(fma, __FUNC_PREFIX_OCL)
__SYCL_MAKE_CALL_ARG3(fma_relu, __FUNC_PREFIX_GENERIC)
__SYCL_MAKE_CALL_ARG2(fmax, __FUNC_PREFIX_OCL)
__SYCL_MAKE_CALL_ARG2(fmin, __FUNC_PREFIX_OCL)
__SYCL_MAKE_CALL_ARG2(fmod, __FUNC_PREFIX_OCL)
13 changes: 13 additions & 0 deletions sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
@@ -16,12 +16,25 @@
#define __SYCL_CONSTANT_AS
#endif

// TODO Decide whether to mark functions with this attribute.
#define __NOEXC /*noexcept*/

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
namespace ext {
namespace oneapi {
namespace experimental {

// fma_relu returns a * b + c > 0 ? a * b + c : 0
template <typename T>
sycl::detail::enable_if_t<sycl::detail::is_genfloath<T>::value ||
sycl::detail::is_ugenshort<T>::value ||
sycl::detail::is_ugenint<T>::value,
T>
fma_relu(T a, T b, T c) __NOEXC {
return __sycl_std::__invoke_fma_relu<T>(a, b, c);
}

// Provides functionality to print data from kernels in a C way:
// - On non-host devices this function is directly mapped to printf from
// OpenCL C
22 changes: 22 additions & 0 deletions sycl/source/detail/builtins_math.cpp
@@ -359,6 +359,28 @@ MAKE_1V_2V_3V(fma, s::cl_float, s::cl_float, s::cl_float, s::cl_float)
MAKE_1V_2V_3V(fma, s::cl_double, s::cl_double, s::cl_double, s::cl_double)
MAKE_1V_2V_3V(fma, s::cl_half, s::cl_half, s::cl_half, s::cl_half)

// fma_relu
__SYCL_EXPORT s::cl_half fma_relu(s::cl_half a, s::cl_half b,
s::cl_half c) __NOEXC {
auto ans = std::fma(a, b, c);
return (ans > 0) ? ans : 0;
}
__SYCL_EXPORT s::cl_ushort fma_relu(s::cl_ushort a, s::cl_ushort b,
s::cl_ushort c) __NOEXC {
// TODO implement this once bfloat16 datatype is supported on host
throw std::runtime_error(
"fma_relu not supported on host for bfloat16 datatype.");
}
__SYCL_EXPORT s::cl_uint fma_relu(s::cl_uint a, s::cl_uint b,
s::cl_uint c) __NOEXC {
// TODO implement this once bfloat16x2 datatype is supported on host
throw std::runtime_error(
"fma_relu not supported on host for bfloat16x2 datatype.");
}
MAKE_1V_2V_3V(fma_relu, s::cl_ushort, s::cl_ushort, s::cl_ushort, s::cl_ushort)
MAKE_1V_2V_3V(fma_relu, s::cl_uint, s::cl_uint, s::cl_uint, s::cl_uint)
MAKE_1V_2V_3V(fma_relu, s::cl_half, s::cl_half, s::cl_half, s::cl_half)

// fmax
__SYCL_EXPORT s::cl_float fmax(s::cl_float x, s::cl_float y) __NOEXC {
return std::fmax(x, y);