Closed

Commits (24)
025cf7e
Added bfloat16 support for cuda backend.
JackAKirk Jan 25, 2022
66b4e33
deleted intel namespace bfloat16.
JackAKirk Jan 25, 2022
2d04406
Format.
JackAKirk Jan 25, 2022
9418f74
Changed extension macro name.
JackAKirk Jan 25, 2022
65fddfa
Merge branch 'sycl' into bf16-cvt-ext
JackAKirk Feb 17, 2022
4d99f3f
fixed test.
JackAKirk Feb 17, 2022
3982001
Used neg ptx7.0 builtin for unary minus
JackAKirk Mar 4, 2022
450e1b5
Adding fma_relu extension
Mar 7, 2022
8d2d11f
Replaced SYCL_EXT_INTEL_BF16_CONVERSION.asciidoc with SYCL_EXT_ONEAPI…
JackAKirk Mar 7, 2022
a514505
Remove redundant include
Mar 7, 2022
d8bc53f
Merge branch 'sycl' into bf16-cvt-ext
JackAKirk Mar 8, 2022
37a18d7
Adding symbols to linux dump
Mar 11, 2022
a7b2fdc
Merge main into branch
Mar 11, 2022
7b40302
Responding to comments
hdelan Mar 14, 2022
2f9b7d7
Merge branch 'sycl' into bf16-cvt-ext
JackAKirk Mar 15, 2022
8a29c44
Renamed extension to cover all bfloat16 funct.
JackAKirk Mar 15, 2022
f53577f
Merge remote-tracking branch 'Jack/bf16-cvt-ext' into add_relu
Apr 4, 2022
49aca06
Making fma_relu accept the bfloat16 class
Apr 4, 2022
02cbc5b
Merge branch 'add_relu' of https://github.com/hdelan/llvm into add_relu
Apr 4, 2022
9fb55df
Update doc
Apr 4, 2022
358c943
Update sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asci…
hdelan Apr 4, 2022
7c6d728
Update sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
hdelan Apr 4, 2022
390ae97
Update sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
hdelan Apr 4, 2022
f08791a
Using bits instead of reinterpret cast
Apr 4, 2022
16 changes: 16 additions & 0 deletions libclc/generic/include/spirv/spirv_builtins.h
@@ -14146,6 +14146,22 @@ _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t
__spirv_ocl_fma(__clc_vec16_fp16_t, __clc_vec16_fp16_t, __clc_vec16_fp16_t);
#endif

#ifdef cl_khr_fp16
_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_fp16_t __clc_fma_relu(__clc_fp16_t,
__clc_fp16_t,
__clc_fp16_t);
_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp16_t
__clc_fma_relu(__clc_vec2_fp16_t, __clc_vec2_fp16_t, __clc_vec2_fp16_t);
_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec3_fp16_t
__clc_fma_relu(__clc_vec3_fp16_t, __clc_vec3_fp16_t, __clc_vec3_fp16_t);
_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec4_fp16_t
__clc_fma_relu(__clc_vec4_fp16_t, __clc_vec4_fp16_t, __clc_vec4_fp16_t);
_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec8_fp16_t
__clc_fma_relu(__clc_vec8_fp16_t, __clc_vec8_fp16_t, __clc_vec8_fp16_t);
_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t
__clc_fma_relu(__clc_vec16_fp16_t, __clc_vec16_fp16_t, __clc_vec16_fp16_t);
#endif

_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_fp32_t
__spirv_ocl_fmax(__clc_fp32_t, __clc_fp32_t);
_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp32_t
36 changes: 36 additions & 0 deletions libclc/generic/libspirv/float16.cl
@@ -4540,6 +4540,42 @@ __spirv_ocl_fma(__clc_vec16_float16_t args_0, __clc_vec16_float16_t args_1,
as_half16(args_2));
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t __clc_fma_relu(
__clc_float16_t args_0, __clc_float16_t args_1, __clc_float16_t args_2) {
return __clc_fma_relu(as_half(args_0), as_half(args_1), as_half(args_2));
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec2_fp16_t
__clc_fma_relu(__clc_vec2_float16_t args_0, __clc_vec2_float16_t args_1,
__clc_vec2_float16_t args_2) {
return __clc_fma_relu(as_half2(args_0), as_half2(args_1), as_half2(args_2));
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec3_fp16_t
__clc_fma_relu(__clc_vec3_float16_t args_0, __clc_vec3_float16_t args_1,
__clc_vec3_float16_t args_2) {
return __clc_fma_relu(as_half3(args_0), as_half3(args_1), as_half3(args_2));
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec4_fp16_t
__clc_fma_relu(__clc_vec4_float16_t args_0, __clc_vec4_float16_t args_1,
__clc_vec4_float16_t args_2) {
return __clc_fma_relu(as_half4(args_0), as_half4(args_1), as_half4(args_2));
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec8_fp16_t
__clc_fma_relu(__clc_vec8_float16_t args_0, __clc_vec8_float16_t args_1,
__clc_vec8_float16_t args_2) {
return __clc_fma_relu(as_half8(args_0), as_half8(args_1), as_half8(args_2));
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec16_fp16_t
__clc_fma_relu(__clc_vec16_float16_t args_0, __clc_vec16_float16_t args_1,
__clc_vec16_float16_t args_2) {
return __clc_fma_relu(as_half16(args_0), as_half16(args_1),
as_half16(args_2));
}

_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t
__spirv_ocl_fmax(__clc_float16_t args_0, __clc_float16_t args_1) {
return __spirv_ocl_fmax(as_half(args_0), as_half(args_1));
120 changes: 120 additions & 0 deletions sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc
@@ -0,0 +1,120 @@
= sycl_ext_oneapi_fma_relu

:source-highlighter: coderay
:coderay-linenums-mode: table

// This section needs to be after the document title.
:doctype: book
:toc2:
:toc: left
:encoding: utf-8
:lang: en
:dpcpp: pass:[DPC++]

// Set the default source code type in this document to C++,
// for syntax highlighting purposes. This is needed because
// docbook uses c++ and html5 uses cpp.
:language: {basebackend@docbook:c++:cpp}


== Notice

[%hardbreaks]
Copyright (C) 2022-2022 Intel Corporation. All rights reserved.

Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by
permission by Khronos.

== Contact

To report problems with this extension, please open a new issue at:

https://github.com/intel/llvm/issues

or contact hugh 'dot' delaney 'at' codeplay 'dot' com.

== Dependencies

This extension is written against the SYCL 2020 revision 4 specification. All
references below to the "core SYCL specification" or to section numbers in the
SYCL specification refer to that revision.

For the `bfloat16` cases this extension depends on the following other SYCL
extensions:

* link:./sycl_ext_intel_bf16_conversion.asciidoc[
sycl_ext_*_bf16_conversion]

For the `half` cases this extension requires the runtime aspect
`sycl::aspect::fp16`.

== Contributors

* Hugh Delaney

== Status

This is a proposed extension specification, intended to gather community
feedback. Interfaces defined in this specification may not be implemented yet
or may be in a preliminary state. The specification itself may also change in
incompatible ways before it is finalized. *Shipping software products should
not rely on APIs defined in this specification.*

[NOTE]
====
This extension is currently implemented in {dpcpp} only for GPU devices and
only when using the CUDA backend. Attempting to use this extension in
kernels that run on other devices or backends may result in undefined behavior.
Be aware that the compiler is not able to issue a diagnostic to warn you if
this happens.
====
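
For illustration only, a minimal sketch of a runtime guard matching the note
above: it checks that the selected device is a GPU on the CUDA backend and
reports `fp16` support before the extension is used. The guard itself is an
assumption added for illustration, not something this extension mandates, and
the `ext_oneapi_cuda` enumerator name may vary between DPC++ versions.

```c++
#include <CL/sycl.hpp>

// Returns true if `dev` looks like a device on which this extension is
// expected to work: a CUDA-backend GPU that reports the fp16 aspect.
bool can_use_fma_relu(const sycl::device &dev) {
  return dev.is_gpu() &&
         dev.get_backend() == sycl::backend::ext_oneapi_cuda &&
         dev.has(sycl::aspect::fp16);
}
```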


== Overview

This extension introduces the `fma_relu` function for datatypes `sycl::half`,
`bfloat16` and `bfloat16x2`. `bfloat16` and `bfloat16x2` refer to the bfloat16
class from the `sycl_ext_*_bf16_conversion` extension, and currently use
`uint16_t` and `uint32_t`, respectively, as storage types.

Contributor:

I think this came up in another review also, but I forget which one. There is
currently no bfloat16x2 type defined in sycl_ext_*_bf16_conversion (soon to be
renamed sycl_ext_oneapi_bfloat16).

Contributor Author:

These changes have been made to the doc. fma_relu now accepts the bfloat16
class instead of uint16_t. The bfloat16x2 version still takes uint32_t as a
storage type, but the doc explains that this will change once the bfloat16x2
class has been implemented as an extension.


```c++
namespace sycl::ext::oneapi::experimental {

// Available when T is sycl::half, uint16_t (bfloat16) or uint32_t (bfloat16x2)
template <typename T>
T fma_relu(T a, T b, T c);

}
```

Contributor:

As part of extending math functions, you are already adding support for fma,
fmax, etc. to bfloat16/half variants. What would be the benefit of adding a
specific fma_relu over doing fma followed by fmax(res, 0), returning 0 when the
result is negative?

This fma_relu extension introduces two big "new" territories to DPC++:

1. Introducing ML activation functions to DPC++: the issue is that these
functions are numerous, both the ones we know of and the ones we don't know
about yet. Is the expectation to keep adding these as free functions in DPC++?
relu is an easy one that can be written using max. What about the others? Why
is relu so special here?

2. Introducing fusions to DPC++: fma_relu tells the compiler these two
functions can be fused together. While this can be important in libraries, is
this really necessary for DPC++? DPC++ has a compiler that can detect that this
kind of relu (or another function) follows an fma and can trigger the fusion
the user intended.

One other open question: if we end up deciding to have such ML-specific
functions in DPC++, what should be the objects that use them? Scalar, vector,
marray? Why is the only vector type here bfloat16x2? Should this be put under
the joint matrix umbrella as another potential tensor-hardware-accelerated
function?

Contributor Author (hdelan, Apr 7, 2022):

These are valid points.

The primary benefit of this sort of extension is that it allows users to
concisely target builtins specific to a particular backend. Since the fma_relu
function is in the CUDA math headers, we think it is appropriate to have it in
DPC++ as well, for ease of porting code etc. It is our feeling that since this
extension targets just the CUDA backend, it will always be an extension and
will not enter the core spec libraries. A DPC++ extension should (as much as
possible) give users access to all of the functionality of the backend API, but
not necessarily more. Therefore we do not need to be concerned about making
fma_relu work for other backends (unless they also have a similar builtin to
target).

The question of fusions is an interesting one, and something we will discuss a
bit internally. Perhaps in the long run this is the approach that will be used
in some instances.

The objects that use the function should be scalar and vector. The reason that
bfloat16 has not been vectorized is that the vector types for the bfloat16
class have not been implemented yet. Once they are implemented we will add the
bfloat16 vec versions of this function. bfloat16x2 is vectorized since we are
relying on an older implementation of bf16x2 which uses uint32_t as its storage
type.

However, we think that for the time being we are interested in representing
backend-specific features in DPC++, and since these features are exposed to the
user as a free function in the CUDA headers, we think this is reason enough to
bring this function into DPC++ as an extension.

Contributor:

Can you share a link to the CUDA math headers that contains the full list of
math/ML functions?

Contributor Author:

I can't find a link to the headers online, but you can find __hfma_relu in any
regular install of CUDA 11.6:

/usr/local/cuda-11.6 $ grep "fma_relu" * -HnrI
include/cuda_bf16.h:3216:__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
include/cuda_bf16.hpp:2142:__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
include/cuda_fp16.hpp:2453:__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c)
include/cuda_fp16.h:3251:__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c);
targets/x86_64-linux/include/cuda_bf16.h:3216:__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
targets/x86_64-linux/include/cuda_bf16.hpp:2142:__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
targets/x86_64-linux/include/cuda_fp16.hpp:2453:__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c)
targets/x86_64-linux/include/cuda_fp16.h:3251:__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c);

Contributor Author:

What do you think the approach should be with these functions?

Should we:

  1. Implement bfloat16 versions of the core SYCL math functions. In the
  meantime we could make sure that when, for instance, exp(my_bf16) is called,
  the argument is cast to a float to ensure correctness, before the bfloat16
  specialization of exp is fully implemented. The problem is that there are a
  lot of core math functions, and maybe there wouldn't be a clear distinction
  between those that have a native bf16 version and those that rely on casting.
  For functions that are not in the core SYCL math library, new ones could be
  added as extensions, as is the case for fma_relu.
  2. Do what CUDA does and make new free functions specifically catering to
  bf16, like hexp for instance (CUDA uses the same functions for bf16 and
  half). This involves introducing more and more functions, and the list is
  likely to get longer.
  3. Don't try to add support for these bf16 functions.

It is worth noting that not all of the functions listed above have their own
builtins, but it seems that all of them produce far less PTX than their float
implementation counterparts, so it would be worthwhile calling these special
bf16 functions in some way.

The reason we have added fma_relu is so that users can target the PTX builtin
relating to fma_relu. We did this relatively blindly because we thought it was
a good idea to have access to all PTX builtins, which we still consider
correct.
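
For reference, the non-fused formulation the reviewer suggests above can be
written with the existing SYCL math functions. This sketch is illustrative
only and is not part of the proposed extension:

```c++
#include <CL/sycl.hpp>

// relu(fma(a, b, c)) expressed with plain sycl::fma and sycl::fmax,
// without the PTX fma_relu builtin this extension targets.
template <typename T>
T fma_then_relu(T a, T b, T c) {
  T r = sycl::fma(a, b, c);        // a * b + c
  return sycl::fmax(r, T(0.0f));   // clamp negative results to zero
}
```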
Contributor:

Wouldn't it make more sense for this function to take the bfloat16 or
bfloat16x2 types themselves rather than uint16_t and uint32_t?

Also a nit about the organization of this spec: the "Specification" section
below is the formal specification of your extension. The description of the
fma_relu function should be there, not in the "Overview" section.

Contributor Author:

> Wouldn't it make more sense for this function to take the bfloat16 or
> bfloat16x2 types themselves rather than uint16_t and uint32_t?

I am following the convention used by all of these bfloat16 PRs: #5748 #5724,
which use uint16_t and uint32_t as storage types. Perhaps this mention of
storage types doesn't belong in this document. Should I remove it?

> The description of the fma_relu function should be there, not in the
> "Overview" section.

Thanks, have swapped that into the Specification section.

Contributor:

I talked with @dkhaldi about the Matrix API, and she says they will add APIs
that take the bfloat16 type soon, but they will keep the uint16_t versions also
for a transition period. Does it make sense to add bfloat16 versions of
fma_relu to this PR, or will you do that in a subsequent one?

Contributor:

> I talked with @dkhaldi about the Matrix API, and she says they will add APIs
> that take the bfloat16 type soon, but they will keep the uint16_t versions
> also for a transition period. Does it make sense to add bfloat16 versions of
> fma_relu to this PR, or will you do that in a subsequent one?

Good point, cc @hdelan, we should be able to add bfloat16 implementations of
the fma_relu functions in this PR provided that #5393 is merged. We do want the
bfloat16x2 cases too, but this will require the definition of a bfloat16x2
class / extension doc update first, analogous to bfloat16 in #5393, so the
corresponding bfloat16x2 implementations will probably be done in a separate
PR. For the joint_matrix API and other bfloat16 math builtins (fabs, fma, fmin,
fmax), the uint16_t implementations are already merged and we are already
working on follow-up PRs for the corresponding bfloat16 implementations.
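
As a hypothetical illustration of the direction discussed above (an overload
taking the `bfloat16` class rather than its `uint16_t` storage type), one
possible shape is a thin wrapper over the storage-type overload. The header
path, namespace, and the `from_bits`/`raw` members are assumptions based on
the bf16 conversion extension; the final API may differ.

```c++
// Hypothetical sketch only; not necessarily the API adopted by this PR.
#include <CL/sycl.hpp>
#include <sycl/ext/intel/experimental/bfloat16.hpp>
#include <sycl/ext/oneapi/experimental/builtins.hpp>

using bf16 = sycl::ext::intel::experimental::bfloat16;

bf16 fma_relu_bf16(bf16 a, bf16 b, bf16 c) {
  // Reuse the existing uint16_t storage-type overload on the raw bits.
  return bf16::from_bits(
      sycl::ext::oneapi::experimental::fma_relu(a.raw(), b.raw(), c.raw()));
}
```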


`fma_relu` returns `a * b + c > 0 ? a * b + c : 0`.
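
For concreteness, a minimal usage sketch of the `half` overload. The queue
setup and USM allocation are illustrative only and assume a device on which
this extension is supported:

```c++
#include <CL/sycl.hpp>
#include <sycl/ext/oneapi/experimental/builtins.hpp>

int main() {
  sycl::queue q;
  sycl::half *out = sycl::malloc_shared<sycl::half>(1, q);
  q.single_task([=] {
     sycl::half a = 2.0f, b = -3.0f, c = 1.0f;
     // a * b + c == -5, which is negative, so fma_relu returns 0.
     out[0] = sycl::ext::oneapi::experimental::fma_relu(a, b, c);
   }).wait();
  sycl::free(out, q);
  return 0;
}
```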

== Specification

=== Feature test macro

This extension provides a feature-test macro as described in the core SYCL
specification. An implementation supporting this extension must predefine the
macro `SYCL_EXT_ONEAPI_FMA_RELU` to one of the values defined in the table
below. Applications can test for the existence of this macro to determine if
the implementation supports this feature, or applications can test the macro's
value to determine which of the extension's features the implementation
supports.

If `fma_relu` is to be used with either the `bf16` or `bf16x2` datatypes, then
an implementation must additionally predefine the macro
`SYCL_EXT_INTEL_BF16_CONVERSION`, as detailed in
link:./sycl_ext_intel_bf16_conversion.asciidoc[
sycl_ext_*_bf16_conversion].


[%header,cols="1,5"]
|===
|Value
|Description

|1
|The APIs of this experimental extension are not versioned, so the
feature-test macro always has this value.
|===
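
A short sketch of how application code might guard its use of the extension
with these macros; the macro names are as specified above, while the
surrounding structure is purely illustrative:

```c++
// Compile-time guard for code that calls fma_relu.
#if defined(SYCL_EXT_ONEAPI_FMA_RELU)
  // The half overloads of fma_relu are available.
  #if defined(SYCL_EXT_INTEL_BF16_CONVERSION)
  // The bfloat16 / bfloat16x2 storage-type overloads may also be used.
  #endif
#else
  #error "This program requires the sycl_ext_oneapi_fma_relu extension."
#endif
```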

51 changes: 51 additions & 0 deletions sycl/include/CL/__spirv/spirv_ops.hpp
@@ -755,6 +755,57 @@ __spirv_ocl_printf(const __attribute__((opencl_constant)) char *Format, ...);
extern SYCL_EXTERNAL int __spirv_ocl_printf(const char *Format, ...);
#endif

extern SYCL_EXTERNAL _Float16 __clc_fma_relu(_Float16, _Float16, _Float16);
extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 2>
__clc_fma_relu(__ocl_vec_t<_Float16, 2>, __ocl_vec_t<_Float16, 2>,
__ocl_vec_t<_Float16, 2>);
extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 3>
__clc_fma_relu(__ocl_vec_t<_Float16, 3>, __ocl_vec_t<_Float16, 3>,
__ocl_vec_t<_Float16, 3>);
extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 4>
__clc_fma_relu(__ocl_vec_t<_Float16, 4>, __ocl_vec_t<_Float16, 4>,
__ocl_vec_t<_Float16, 4>);
extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 8>
__clc_fma_relu(__ocl_vec_t<_Float16, 8>, __ocl_vec_t<_Float16, 8>,
__ocl_vec_t<_Float16, 8>);
extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 16>
__clc_fma_relu(__ocl_vec_t<_Float16, 16>, __ocl_vec_t<_Float16, 16>,
__ocl_vec_t<_Float16, 16>);

extern SYCL_EXTERNAL uint16_t __clc_fma_relu(uint16_t, uint16_t, uint16_t);
extern SYCL_EXTERNAL __ocl_vec_t<uint16_t, 2>
__clc_fma_relu(__ocl_vec_t<uint16_t, 2>, __ocl_vec_t<uint16_t, 2>,
__ocl_vec_t<uint16_t, 2>);
extern SYCL_EXTERNAL __ocl_vec_t<uint16_t, 3>
__clc_fma_relu(__ocl_vec_t<uint16_t, 3>, __ocl_vec_t<uint16_t, 3>,
__ocl_vec_t<uint16_t, 3>);
extern SYCL_EXTERNAL __ocl_vec_t<uint16_t, 4>
__clc_fma_relu(__ocl_vec_t<uint16_t, 4>, __ocl_vec_t<uint16_t, 4>,
__ocl_vec_t<uint16_t, 4>);
extern SYCL_EXTERNAL __ocl_vec_t<uint16_t, 8>
__clc_fma_relu(__ocl_vec_t<uint16_t, 8>, __ocl_vec_t<uint16_t, 8>,
__ocl_vec_t<uint16_t, 8>);
extern SYCL_EXTERNAL __ocl_vec_t<uint16_t, 16>
__clc_fma_relu(__ocl_vec_t<uint16_t, 16>, __ocl_vec_t<uint16_t, 16>,
__ocl_vec_t<uint16_t, 16>);

extern SYCL_EXTERNAL uint32_t __clc_fma_relu(uint32_t, uint32_t, uint32_t);
extern SYCL_EXTERNAL __ocl_vec_t<uint32_t, 2>
__clc_fma_relu(__ocl_vec_t<uint32_t, 2>, __ocl_vec_t<uint32_t, 2>,
__ocl_vec_t<uint32_t, 2>);
extern SYCL_EXTERNAL __ocl_vec_t<uint32_t, 3>
__clc_fma_relu(__ocl_vec_t<uint32_t, 3>, __ocl_vec_t<uint32_t, 3>,
__ocl_vec_t<uint32_t, 3>);
extern SYCL_EXTERNAL __ocl_vec_t<uint32_t, 4>
__clc_fma_relu(__ocl_vec_t<uint32_t, 4>, __ocl_vec_t<uint32_t, 4>,
__ocl_vec_t<uint32_t, 4>);
extern SYCL_EXTERNAL __ocl_vec_t<uint32_t, 8>
__clc_fma_relu(__ocl_vec_t<uint32_t, 8>, __ocl_vec_t<uint32_t, 8>,
__ocl_vec_t<uint32_t, 8>);
extern SYCL_EXTERNAL __ocl_vec_t<uint32_t, 16>
__clc_fma_relu(__ocl_vec_t<uint32_t, 16>, __ocl_vec_t<uint32_t, 16>,
__ocl_vec_t<uint32_t, 16>);

#else // if !__SYCL_DEVICE_ONLY__

template <typename dataT>
3 changes: 3 additions & 0 deletions sycl/include/CL/sycl/detail/builtins.hpp
@@ -20,13 +20,15 @@
#ifdef __SYCL_DEVICE_ONLY__
#define __FUNC_PREFIX_OCL __spirv_ocl_
#define __FUNC_PREFIX_CORE __spirv_
#define __FUNC_PREFIX_GENERIC __clc_
#define __SYCL_EXTERN_IT1(Ret, prefix, call, Arg1)
#define __SYCL_EXTERN_IT2(Ret, prefix, call, Arg1, Arg2)
#define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg)
#define __SYCL_EXTERN_IT3(Ret, prefix, call, Arg1, Arg2, Arg3)
#else
#define __FUNC_PREFIX_OCL
#define __FUNC_PREFIX_CORE
#define __FUNC_PREFIX_GENERIC
#define __SYCL_EXTERN_IT1(Ret, prefix, call, Arg) \
extern Ret __SYCL_PPCAT(prefix, call)(Arg)
#define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg) \
@@ -134,6 +136,7 @@ __SYCL_MAKE_CALL_ARG1(fabs, __FUNC_PREFIX_OCL)
__SYCL_MAKE_CALL_ARG2(fdim, __FUNC_PREFIX_OCL)
__SYCL_MAKE_CALL_ARG1(floor, __FUNC_PREFIX_OCL)
__SYCL_MAKE_CALL_ARG3(fma, __FUNC_PREFIX_OCL)
__SYCL_MAKE_CALL_ARG3(fma_relu, __FUNC_PREFIX_GENERIC)
__SYCL_MAKE_CALL_ARG2(fmax, __FUNC_PREFIX_OCL)
__SYCL_MAKE_CALL_ARG2(fmin, __FUNC_PREFIX_OCL)
__SYCL_MAKE_CALL_ARG2(fmod, __FUNC_PREFIX_OCL)
13 changes: 13 additions & 0 deletions sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
@@ -16,12 +16,25 @@
#define __SYCL_CONSTANT_AS
#endif

// TODO Decide whether to mark functions with this attribute.
#define __NOEXC /*noexcept*/

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
namespace ext {
namespace oneapi {
namespace experimental {

// fma_relu returns a * b + c > 0 ? a * b + c : 0
template <typename T>
sycl::detail::enable_if_t<sycl::detail::is_genfloath<T>::value ||
sycl::detail::is_ugenshort<T>::value ||
sycl::detail::is_ugenint<T>::value,
T>
fma_relu(T a, T b, T c) __NOEXC {
return __sycl_std::__invoke_fma_relu<T>(a, b, c);
}

// Provides functionality to print data from kernels in a C way:
// - On non-host devices this function is directly mapped to printf from
// OpenCL C
22 changes: 22 additions & 0 deletions sycl/source/detail/builtins_math.cpp
@@ -359,6 +359,28 @@ MAKE_1V_2V_3V(fma, s::cl_float, s::cl_float, s::cl_float, s::cl_float)
MAKE_1V_2V_3V(fma, s::cl_double, s::cl_double, s::cl_double, s::cl_double)
MAKE_1V_2V_3V(fma, s::cl_half, s::cl_half, s::cl_half, s::cl_half)

// fma_relu
__SYCL_EXPORT s::cl_half fma_relu(s::cl_half a, s::cl_half b,
s::cl_half c) __NOEXC {
auto ans = std::fma(a, b, c);
return (ans > 0) ? ans : 0;
}
__SYCL_EXPORT s::cl_ushort fma_relu(s::cl_ushort a, s::cl_ushort b,
s::cl_ushort c) __NOEXC {
// TODO implement this once bfloat16 datatype is supported on host
throw std::runtime_error(
"fma_relu not supported on host for bfloat16 datatype.");
}
__SYCL_EXPORT s::cl_uint fma_relu(s::cl_uint a, s::cl_uint b,
s::cl_uint c) __NOEXC {
// TODO implement this once bfloat16x2 datatype is supported on host
throw std::runtime_error(
"fma_relu not supported on host for bfloat16x2 datatype.");
}
MAKE_1V_2V_3V(fma_relu, s::cl_ushort, s::cl_ushort, s::cl_ushort, s::cl_ushort)
MAKE_1V_2V_3V(fma_relu, s::cl_uint, s::cl_uint, s::cl_uint, s::cl_uint)
MAKE_1V_2V_3V(fma_relu, s::cl_half, s::cl_half, s::cl_half, s::cl_half)

// fmax
__SYCL_EXPORT s::cl_float fmax(s::cl_float x, s::cl_float y) __NOEXC {
return std::fmax(x, y);