diff --git a/libclc/generic/include/spirv/spirv_builtins.h b/libclc/generic/include/spirv/spirv_builtins.h index ca15fdf3c6547..fdeafa79818b9 100644 --- a/libclc/generic/include/spirv/spirv_builtins.h +++ b/libclc/generic/include/spirv/spirv_builtins.h @@ -14146,6 +14146,22 @@ _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t __spirv_ocl_fma(__clc_vec16_fp16_t, __clc_vec16_fp16_t, __clc_vec16_fp16_t); #endif +#ifdef cl_khr_fp16 +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_fp16_t __clc_fma_relu(__clc_fp16_t, + __clc_fp16_t, + __clc_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp16_t + __clc_fma_relu(__clc_vec2_fp16_t, __clc_vec2_fp16_t, __clc_vec2_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec3_fp16_t + __clc_fma_relu(__clc_vec3_fp16_t, __clc_vec3_fp16_t, __clc_vec3_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec4_fp16_t + __clc_fma_relu(__clc_vec4_fp16_t, __clc_vec4_fp16_t, __clc_vec4_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec8_fp16_t + __clc_fma_relu(__clc_vec8_fp16_t, __clc_vec8_fp16_t, __clc_vec8_fp16_t); +_CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec16_fp16_t + __clc_fma_relu(__clc_vec16_fp16_t, __clc_vec16_fp16_t, __clc_vec16_fp16_t); +#endif + _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_fp32_t __spirv_ocl_fmax(__clc_fp32_t, __clc_fp32_t); _CLC_OVERLOAD _CLC_DECL _CLC_CONSTFN __clc_vec2_fp32_t diff --git a/libclc/generic/libspirv/float16.cl b/libclc/generic/libspirv/float16.cl index 28f5b65ac80e2..f8a45db707fac 100644 --- a/libclc/generic/libspirv/float16.cl +++ b/libclc/generic/libspirv/float16.cl @@ -4570,6 +4570,42 @@ __spirv_ocl_fma(__clc_vec16_float16_t args_0, __clc_vec16_float16_t args_1, as_half16(args_2)); } +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t __clc_fma_relu( + __clc_float16_t args_0, __clc_float16_t args_1, __clc_float16_t args_2) { + return __clc_fma_relu(as_half(args_0), as_half(args_1), as_half(args_2)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec2_fp16_t +__clc_fma_relu(__clc_vec2_float16_t args_0, __clc_vec2_float16_t args_1, + __clc_vec2_float16_t args_2) { + return __clc_fma_relu(as_half2(args_0), as_half2(args_1), as_half2(args_2)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec3_fp16_t +__clc_fma_relu(__clc_vec3_float16_t args_0, __clc_vec3_float16_t args_1, + __clc_vec3_float16_t args_2) { + return __clc_fma_relu(as_half3(args_0), as_half3(args_1), as_half3(args_2)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec4_fp16_t +__clc_fma_relu(__clc_vec4_float16_t args_0, __clc_vec4_float16_t args_1, + __clc_vec4_float16_t args_2) { + return __clc_fma_relu(as_half4(args_0), as_half4(args_1), as_half4(args_2)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec8_fp16_t +__clc_fma_relu(__clc_vec8_float16_t args_0, __clc_vec8_float16_t args_1, + __clc_vec8_float16_t args_2) { + return __clc_fma_relu(as_half8(args_0), as_half8(args_1), as_half8(args_2)); +} + +_CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_vec16_fp16_t +__clc_fma_relu(__clc_vec16_float16_t args_0, __clc_vec16_float16_t args_1, + __clc_vec16_float16_t args_2) { + return __clc_fma_relu(as_half16(args_0), as_half16(args_1), + as_half16(args_2)); +} + _CLC_OVERLOAD _CLC_DEF _CLC_CONSTFN __clc_fp16_t __spirv_ocl_fmax(__clc_float16_t args_0, __clc_float16_t args_1) { return __spirv_ocl_fmax(as_half(args_0), as_half(args_1)); diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc new file mode 100644 index 0000000000000..175219e23c47f --- 
/dev/null
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16.asciidoc
@@ -0,0 +1,336 @@
+= sycl_ext_oneapi_bfloat16
+
+:source-highlighter: coderay
+:coderay-linenums-mode: table
+
+// This section needs to be after the document title.
+:doctype: book
+:toc2:
+:toc: left
+:encoding: utf-8
+:lang: en
+
+:blank: pass:[ +]
+
+// Set the default source code type in this document to C++,
+// for syntax highlighting purposes. This is needed because
+// docbook uses c++ and html5 uses cpp.
+:language: {basebackend@docbook:c++:cpp}
+
+// This is necessary for asciidoc, but not for asciidoctor
+:cpp: C++
+
+== Notice
+
+IMPORTANT: This specification is a draft.
+
+Copyright (c) 2021-2022 Intel Corporation. All rights reserved.
+
+NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are
+trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc.
+used by permission by Khronos.
+
+== Dependencies
+
+This extension is written against the SYCL 2020 specification, Revision 4.
+
+== Status
+
+Draft
+
+This is a preview extension specification, intended to provide early access to
+a feature for review and community feedback. When the feature matures, this
+specification may be released as a formal extension.
+
+Because the interfaces defined by this specification are not final and are
+subject to change, they are not intended to be used by shipping software
+products.
+
+== Version
+
+Revision: 4
+
+== Introduction
+
+This extension adds functionality to convert values of single-precision
+floating-point type (`float`) to `bfloat16` type and vice versa. The extension
+doesn't add support for the `bfloat16` type as such; instead, it uses a 16-bit
+integer type (`uint16_t`) as storage for `bfloat16` values.
+
+The purpose of the conversion from float to bfloat16 is to reduce the amount
+of memory required to store floating-point numbers. Computations are expected
+to be done with 32-bit floating-point values.
+
+This extension is an optional kernel feature as described in
+https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:optional-kernel-features[section 5.7]
+of the SYCL 2020 spec. Therefore, attempting to submit a kernel using this
+feature to a device that does not support it should cause a synchronous
+`errc::kernel_not_supported` exception to be thrown from the kernel invocation
+command (e.g. from `parallel_for`).
+
+== Feature test macro
+
+This extension provides a feature-test macro as described in the core SYCL
+specification section 6.3.3 "Feature test macros". Therefore, an implementation
+supporting this extension must predefine the macro
+`SYCL_EXT_ONEAPI_BFLOAT16` to one of the values defined in the table
+below. Applications can test for the existence of this macro to determine if
+the implementation supports this feature, or applications can test the macro's
+value to determine which of the extension's APIs the implementation supports.
+
+[%header,cols="1,5"]
+|===
+|Value |Description
+|1 |Initial extension version. Base features are supported.
+|===
+
+== Extension to `enum class aspect`
+
+[source]
+----
+namespace sycl {
+enum class aspect {
+  ...
+  ext_oneapi_bfloat16
+}
+}
+----
+
+If a SYCL device has the `ext_oneapi_bfloat16` aspect, then it natively
+supports conversion of values of `float` type to `bfloat16` and back.
+
+If the device doesn't have the aspect, objects of the `bfloat16` class must
+not be used in device code.
+
+**NOTE**: The `ext_oneapi_bfloat16` aspect is not yet supported.
The `bfloat16` class is currently supported only on Xe HP GPU and Nvidia A100 GPU.
+
+== New `bfloat16` class
+
+The `bfloat16` class below provides the conversion functionality. Conversion
+from `float` to `bfloat16` is done with the round-to-nearest-even (RTE)
+rounding mode. (A non-normative, bit-level sketch of this conversion is given
+near the end of this document, after the example.)
+
+[source]
+----
+namespace sycl {
+namespace ext {
+namespace oneapi {
+namespace experimental {
+
+class bfloat16 {
+  using storage_t = uint16_t;
+  storage_t value;
+
+public:
+  bfloat16() = default;
+  bfloat16(const bfloat16 &) = default;
+  ~bfloat16() = default;
+
+  // Explicit conversion functions
+  static storage_t from_float(const float &a);
+  static float to_float(const storage_t &a);
+
+  // Convert from float to bfloat16
+  bfloat16(const float &a);
+  bfloat16 &operator=(const float &a);
+
+  // Convert from bfloat16 to float
+  operator float() const;
+
+  // Get bfloat16 as uint16.
+  operator storage_t() const;
+
+  // Convert to bool type
+  explicit operator bool();
+
+  friend bfloat16 operator-(bfloat16 &bf) { /* ... */ }
+
+  // OP is: prefix ++, --
+  friend bfloat16 &operatorOP(bfloat16 &bf) { /* ... */ }
+
+  // OP is: postfix ++, --
+  friend bfloat16 operatorOP(bfloat16 &bf, int) { /* ... */ }
+
+  // OP is: +=, -=, *=, /=
+  friend bfloat16 &operatorOP(bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
+
+  // OP is: +, -, *, /
+  friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
+  template <typename T>
+  friend bfloat16 operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }
+  template <typename T>
+  friend bfloat16 operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }
+
+  // OP is: ==, !=, <, >, <=, >=
+  friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }
+  template <typename T>
+  friend bool operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }
+  template <typename T>
+  friend bool operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }
+};
+
+} // namespace experimental
+} // namespace oneapi
+} // namespace ext
+} // namespace sycl
+----
+
+Table 1. Member functions of the `bfloat16` class.
+|===
+| Member Function | Description
+
+| `static storage_t from_float(const float &a);`
+| Explicitly convert from `float` to `bfloat16`.
+
+| `static float to_float(const storage_t &a);`
+| Interpret `a` as `bfloat16` and explicitly convert it to `float`.
+
+| `bfloat16(const float &a);`
+| Construct `bfloat16` from `float`. Converts `float` to `bfloat16`.
+
+| `bfloat16 &operator=(const float &a);`
+| Replace the value with `a` converted to `bfloat16`.
+
+| `operator float() const;`
+| Return the `bfloat16` value converted to `float`.
+
+| `operator storage_t() const;`
+| Return the `uint16_t` value whose bits represent the `bfloat16` value.
+
+| `explicit operator bool() { /* ... */ }`
+| Convert `bfloat16` to `bool` type. Return `false` if the value is equal to
+  zero, `true` otherwise.
+
+| `friend bfloat16 operator-(bfloat16 &bf) { /* ... */ }`
+| Construct a new instance of the `bfloat16` class with the negated value of
+  `bf`.
+
+| `friend bfloat16 &operatorOP(bfloat16 &bf) { /* ... */ }`
+| Perform an in-place prefix `OP` arithmetic operation on `bf`, assigning the
+  result to `bf`, and return `bf`.
+
+  OP is: `++, --`
+
+| `friend bfloat16 operatorOP(bfloat16 &bf, int) { /* ... */ }`
+| Perform an in-place postfix `OP` arithmetic operation on `bf`, assigning the
+  result to `bf`, and return a copy of `bf` taken before the operation.
+
+  OP is: `++, --`
+
+| `friend bfloat16 &operatorOP(bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Perform an in-place `OP` arithmetic operation between `lhs` and `rhs`,
+  assigning the result to `lhs`, and return `lhs`.
+
+  OP is: `+=, -=, *=, /=`
+
+| `friend bfloat16 operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Construct a new instance of the `bfloat16` class with the value of the new
+  `bfloat16` instance being the result of an OP arithmetic operation between
+  the `lhs` `bfloat16` and `rhs` `bfloat16` values.
+
+  OP is: `+, -, *, /`
+
+| `template <typename T>
+  friend bfloat16 operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }`
+| Construct a new instance of the `bfloat16` class with the value of the new
+  `bfloat16` instance being the result of an OP arithmetic operation between
+  the `lhs` `bfloat16` value and `rhs` of template type `T`. Type `T` must be
+  convertible to `float`.
+
+  OP is: `+, -, *, /`
+
+| `template <typename T>
+  friend bfloat16 operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Construct a new instance of the `bfloat16` class with the value of the new
+  `bfloat16` instance being the result of an OP arithmetic operation between
+  the `lhs` of template type `T` and the `rhs` `bfloat16` value. Type `T` must
+  be convertible to `float`.
+
+  OP is: `+, -, *, /`
+
+| `friend bool operatorOP(const bfloat16 &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Perform comparison operation OP between `lhs` `bfloat16` and `rhs` `bfloat16`
+  values and return the result as a boolean value.
+
+  OP is: `==, !=, <, >, <=, >=`
+
+| `template <typename T>
+  friend bool operatorOP(const bfloat16 &lhs, const T &rhs) { /* ... */ }`
+| Perform comparison operation OP between `lhs` `bfloat16` and `rhs` of
+  template type `T` and return the result as a boolean value. Type `T` must be
+  convertible to `float`.
+
+  OP is: `==, !=, <, >, <=, >=`
+
+| `template <typename T>
+  friend bool operatorOP(const T &lhs, const bfloat16 &rhs) { /* ... */ }`
+| Perform comparison operation OP between `lhs` of template type `T` and `rhs`
+  `bfloat16` value and return the result as a boolean value. Type `T` must be
+  convertible to `float`.
+
+  OP is: `==, !=, <, >, <=, >=`
+|===
+
+== Example
+
+[source]
+----
+#include <sycl/sycl.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16.hpp>
+
+using sycl::ext::oneapi::experimental::bfloat16;
+
+bfloat16 operator+(const bfloat16 &lhs, const bfloat16 &rhs) {
+  return static_cast<float>(lhs) + static_cast<float>(rhs);
+}
+
+float foo(float a, float b) {
+  // Convert from float to bfloat16.
+  bfloat16 A {a};
+  bfloat16 B {b};
+
+  // Convert A and B from bfloat16 to float, do addition on floating-point
+  // numbers, then convert the result to bfloat16 and store it in C.
+  bfloat16 C = A + B;
+
+  // Return the result converted from bfloat16 to float.
+  return C;
+}
+
+int main (int argc, char *argv[]) {
+  float data[3] = {7.0, 8.1, 0.0};
+  sycl::device dev;
+  sycl::queue deviceQueue{dev};
+  sycl::buffer<float, 1> buf {data, sycl::range<1> {3}};
+
+  if (dev.has(sycl::aspect::ext_oneapi_bfloat16)) {
+    deviceQueue.submit ([&] (sycl::handler& cgh) {
+      auto numbers = buf.get_access<sycl::access::mode::read_write> (cgh);
+      cgh.single_task ([=] () {
+        numbers[2] = foo(numbers[0], numbers[1]);
+      });
+    });
+  }
+  return 0;
+}
+----
+
+== Issues
+
+None.
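For reference, the following non-normative sketch (an editorial illustration, not
part of this extension's API) shows one common way to implement the
round-to-nearest-even `float` to `bfloat16` truncation that `from_float` and
`to_float` describe. Real implementations typically map to device intrinsics
instead, and NaN handling is omitted here for brevity.

[source]
----
#include <cstdint>
#include <cstring>

// Round-to-nearest-even conversion from float to the bfloat16 bit pattern.
inline std::uint16_t float_to_bfloat16_rne(float f) {
  std::uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));          // reinterpret float as uint32
  std::uint32_t lsb = (bits >> 16) & 1u;         // last mantissa bit that is kept
  bits += 0x7FFFu + lsb;                         // bias so that ties round to even
  return static_cast<std::uint16_t>(bits >> 16); // keep sign, exponent, 7 mantissa bits
}

// The reverse direction: widen the 16 stored bits back to a float.
inline float bfloat16_to_float(std::uint16_t h) {
  std::uint32_t bits = static_cast<std::uint32_t>(h) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}
----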
+ +== Revision History + +[cols="5,15,15,70"] +[grid="rows"] +[options="header"] +|======================================== +|Rev|Date|Author|Changes +|1|2021-08-02|Alexey Sotkin |Initial public working draft +|2|2021-08-17|Alexey Sotkin |Add explicit conversion functions + + Add operator overloadings + + Apply code review suggestions +|3|2021-08-18|Alexey Sotkin |Remove `uint16_t` constructor +|4|2022-03-07|Aidan Belton and Jack Kirk |Switch from Intel vendor specific to oneapi +|======================================== diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc new file mode 100644 index 0000000000000..1e79434a51815 --- /dev/null +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_fma_relu.asciidoc @@ -0,0 +1,122 @@ += sycl_ext_oneapi_fma_relu + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + + +== Notice + +[%hardbreaks] +Copyright (C) 2022-2022 Intel Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues + +or contact hugh 'dot' delaney 'at' codeplay 'dot' com. + +== Dependencies + +This extension is written against the SYCL 2020 revision 4 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. + +For the `bfloat16` cases this extension depends on the following other SYCL +extensions: + +* link:./sycl_ext_intel_bf16_conversion.asciidoc[ + sycl_ext_*_bf16_conversion] + +For the `half` cases this extension requires the runtime aspect +`sycl::aspect::fp16`. + +== Contributors + +* Hugh Delaney + +== Status + +This is a proposed extension specification, intended to gather community +feedback. Interfaces defined in this specification may not be implemented yet +or may be in a preliminary state. The specification itself may also change in +incompatible ways before it is finalized. *Shipping software products should +not rely on APIs defined in this specification.* + +[NOTE] +==== +This extension is currently implemented in {dpcpp} only for GPU devices and +only when using the CUDA backend. Attempting to use this extension in +kernels that run on other devices or backends may result in undefined behavior. +Be aware that the compiler is not able to issue a diagnostic to warn you if +this happens. +==== + + +== Overview + +This extension introduces the `fma_relu` function for datatypes `sycl::half`, +`bfloat16` and `bfloat16x2`. `bfloat16` refers to the bfloat16 class from +the `sycl_ext_oneapi_bfloat16` extension. `bfloat16x2` has not yet been +implemented as a class so this `fma_relu` extension uses `uint32_t` as a +storage type for `bfloat16x2`. This will be changed once the `bfloat16x2` +class has been implemented. 
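As an informal illustration only (not part of the specification), the sketch
below shows how the `sycl::half` overload of `fma_relu` might be called from a
kernel. The header paths, queue setup and buffer handling are assumptions made
for this example rather than requirements of the extension.

```c++
#include <sycl/sycl.hpp>
#include <sycl/ext/oneapi/experimental/builtins.hpp>

namespace exp = sycl::ext::oneapi::experimental;

int main() {
  sycl::half data[3] = {2.0f, -3.0f, 0.0f};
  sycl::queue q;
  {
    sycl::buffer<sycl::half, 1> buf{data, sycl::range<1>{3}};
    q.submit([&](sycl::handler &cgh) {
      auto acc = buf.get_access<sycl::access::mode::read_write>(cgh);
      cgh.single_task([=]() {
        sycl::half a = acc[0], b = acc[1], c = acc[2];
        // 2.0 * -3.0 + 0.0 = -6.0, which the ReLU clamps to 0.
        acc[2] = exp::fma_relu(a, b, c);
      });
    });
  } // buffer destructor waits for the kernel and copies the result back
  return 0;
}
```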
+ +== Specification + +```c++ +namespace sycl::ext::oneapi::experimental { + +// Available when T is sycl::half, uint16_t (bfloat16) or uint32_t (bfloat16x2) +template +T fma_relu(T a, T b, T c); +} +``` + +`fma_relu` returns `a * b + c > 0 ? a * b + c : 0`. + +=== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification. An implementation supporting this extension must predefine the +macro `SYCL_EXT_ONEAPI_FMA_RELU` to one of the values defined in the table +below. Applications can test for the existence of this macro to determine if +the implementation supports this feature, or applications can test the macro's +value to determine which of the extension's features the implementation +supports. + +If `fma_relu` is to be used with either the `bf16` or `bf16x2` datatypes, then +an implementation must additionally predefine the macro +`SYCL_EXT_ONEAPI_BFLOAT16`, as detailed in +link:./sycl_ext_intel_bf16_conversion.asciidoc[ + sycl_ext_oneapi_bfloat16]. + + +[%header,cols="1,5"] +|=== +|Value +|Description + +|1 +|The APIs of this experimental extension are not versioned, so the + feature-test macro always has this value. +|=== + diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 3c6ce1c639960..2d0b2f0036cb3 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -755,6 +755,57 @@ __spirv_ocl_printf(const __attribute__((opencl_constant)) char *Format, ...); extern SYCL_EXTERNAL int __spirv_ocl_printf(const char *Format, ...); #endif +extern SYCL_EXTERNAL _Float16 __clc_fma_relu(_Float16, _Float16, _Float16); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 2> + __clc_fma_relu(__ocl_vec_t<_Float16, 2>, __ocl_vec_t<_Float16, 2>, + __ocl_vec_t<_Float16, 2>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 3> + __clc_fma_relu(__ocl_vec_t<_Float16, 3>, __ocl_vec_t<_Float16, 3>, + __ocl_vec_t<_Float16, 3>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 4> + __clc_fma_relu(__ocl_vec_t<_Float16, 4>, __ocl_vec_t<_Float16, 4>, + __ocl_vec_t<_Float16, 4>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 8> + __clc_fma_relu(__ocl_vec_t<_Float16, 8>, __ocl_vec_t<_Float16, 8>, + __ocl_vec_t<_Float16, 8>); +extern SYCL_EXTERNAL __ocl_vec_t<_Float16, 16> + __clc_fma_relu(__ocl_vec_t<_Float16, 16>, __ocl_vec_t<_Float16, 16>, + __ocl_vec_t<_Float16, 16>); + +extern SYCL_EXTERNAL uint16_t __clc_fma_relu(uint16_t, uint16_t, uint16_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); + +extern SYCL_EXTERNAL uint32_t __clc_fma_relu(uint32_t, uint32_t, uint32_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); +extern SYCL_EXTERNAL __ocl_vec_t + __clc_fma_relu(__ocl_vec_t, __ocl_vec_t, + __ocl_vec_t); + // Native builtin extension extern SYCL_EXTERNAL float 
__clc_native_tanh(float); diff --git a/sycl/include/CL/sycl/detail/builtins.hpp b/sycl/include/CL/sycl/detail/builtins.hpp index f4f0475ea905b..a6ca592bacb42 100644 --- a/sycl/include/CL/sycl/detail/builtins.hpp +++ b/sycl/include/CL/sycl/detail/builtins.hpp @@ -20,6 +20,7 @@ #ifdef __SYCL_DEVICE_ONLY__ #define __FUNC_PREFIX_OCL __spirv_ocl_ #define __FUNC_PREFIX_CORE __spirv_ +#define __FUNC_PREFIX_GENERIC __clc_ #define __SYCL_EXTERN_IT1(Ret, prefix, call, Arg1) #define __SYCL_EXTERN_IT2(Ret, prefix, call, Arg1, Arg2) #define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg) @@ -27,6 +28,7 @@ #else #define __FUNC_PREFIX_OCL #define __FUNC_PREFIX_CORE +#define __FUNC_PREFIX_GENERIC #define __SYCL_EXTERN_IT1(Ret, prefix, call, Arg) \ extern Ret __SYCL_PPCAT(prefix, call)(Arg) #define __SYCL_EXTERN_IT2_SAME(Ret, prefix, call, Arg) \ @@ -134,6 +136,7 @@ __SYCL_MAKE_CALL_ARG1(fabs, __FUNC_PREFIX_OCL) __SYCL_MAKE_CALL_ARG2(fdim, __FUNC_PREFIX_OCL) __SYCL_MAKE_CALL_ARG1(floor, __FUNC_PREFIX_OCL) __SYCL_MAKE_CALL_ARG3(fma, __FUNC_PREFIX_OCL) +__SYCL_MAKE_CALL_ARG3(fma_relu, __FUNC_PREFIX_GENERIC) __SYCL_MAKE_CALL_ARG2(fmax, __FUNC_PREFIX_OCL) __SYCL_MAKE_CALL_ARG2(fmin, __FUNC_PREFIX_OCL) __SYCL_MAKE_CALL_ARG2(fmod, __FUNC_PREFIX_OCL) diff --git a/sycl/include/CL/sycl/feature_test.hpp.in b/sycl/include/CL/sycl/feature_test.hpp.in index 6c58b98d29f1f..516464e1b9fe3 100644 --- a/sycl/include/CL/sycl/feature_test.hpp.in +++ b/sycl/include/CL/sycl/feature_test.hpp.in @@ -55,7 +55,7 @@ namespace sycl { #define SYCL_EXT_ONEAPI_SUB_GROUP 1 #define SYCL_EXT_ONEAPI_PROPERTIES 1 #define SYCL_EXT_ONEAPI_NATIVE_MATH 1 -#define SYCL_EXT_INTEL_BF16_CONVERSION 1 +#define SYCL_EXT_ONEAPI_BF16_CONVERSION 1 #define SYCL_EXT_INTEL_DATAFLOW_PIPES 1 #ifdef __clang__ #if __has_extension(sycl_extended_atomics) diff --git a/sycl/include/sycl/ext/intel/experimental/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp similarity index 90% rename from sycl/include/sycl/ext/intel/experimental/bfloat16.hpp rename to sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp index 5a51f3746e225..1190c80631928 100644 --- a/sycl/include/sycl/ext/intel/experimental/bfloat16.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16.hpp @@ -14,10 +14,10 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace ext { -namespace intel { +namespace oneapi { namespace experimental { -class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { +class bfloat16 { using storage_t = uint16_t; storage_t value; @@ -29,7 +29,11 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { // Explicit conversion functions static storage_t from_float(const float &a) { #if defined(__SYCL_DEVICE_ONLY__) +#if defined(__NVPTX__) + return __nvvm_f2bf16_rn(a); +#else return __spirv_ConvertFToBF16INTEL(a); +#endif #else throw exception{errc::feature_not_supported, "Bfloat16 conversion is not supported on host device"}; @@ -37,7 +41,14 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { } static float to_float(const storage_t &a) { #if defined(__SYCL_DEVICE_ONLY__) +#if defined(__NVPTX__) + uint32_t y = a; + y = y << 16; + float *res = reinterpret_cast(&y); + return *res; +#else return __spirv_ConvertBF16ToFINTEL(a); +#endif #else throw exception{errc::feature_not_supported, "Bfloat16 conversion is not supported on host device"}; @@ -70,7 +81,16 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { // Unary minus operator overloading friend bfloat16 
operator-(bfloat16 &lhs) { - return bfloat16{-to_float(lhs.value)}; +#if defined(__SYCL_DEVICE_ONLY__) +#if defined(__NVPTX__) + return from_bits(__nvvm_neg_bf16(lhs.value)); +#else + return bfloat16{-__spirv_ConvertBF16ToFINTEL(lhs.value)}; +#endif +#else + throw exception{errc::feature_not_supported, + "Bfloat16 unary minus is not supported on host device"}; +#endif } // Increment and decrement operators overloading @@ -143,7 +163,7 @@ class [[sycl_detail::uses_aspects(ext_intel_bf16_conversion)]] bfloat16 { }; } // namespace experimental -} // namespace intel +} // namespace oneapi } // namespace ext } // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp index c8fa033d8c79e..5c272e4f8132b 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp @@ -16,6 +16,8 @@ #include +#include + // TODO Decide whether to mark functions with this attribute. #define __NOEXC /*noexcept*/ @@ -25,12 +27,30 @@ #define __SYCL_CONSTANT_AS #endif +// TODO Decide whether to mark functions with this attribute. +#define __NOEXC /*noexcept*/ + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace ext { namespace oneapi { namespace experimental { +// fma_relu returns a * b + c > 0 ? a * b + c : 0 +template +sycl::detail::enable_if_t::value || + sycl::detail::is_ugenint::value || + std::is_same::value, + T> +fma_relu(T a, T b, T c) __NOEXC { + if constexpr (std::is_same::value) { + return bfloat16::from_bits( + __sycl_std::__invoke_fma_relu(a.raw(), b.raw(), c.raw())); + } else { + return __sycl_std::__invoke_fma_relu(a, b, c); + } +} + // Provides functionality to print data from kernels in a C way: // - On non-host devices this function is directly mapped to printf from // OpenCL C @@ -40,9 +60,9 @@ namespace experimental { // Please refer to corresponding section in OpenCL C specification to find // information about format string and its differences from standard C rules. // -// This function is placed under 'experimental' namespace on purpose, because it -// has too much caveats you need to be aware of before using it. Please find -// them below and read carefully before using it: +// This function is placed under 'experimental' namespace on purpose, because +// it has too much caveats you need to be aware of before using it. Please +// find them below and read carefully before using it: // // - According to the OpenCL spec, the format string must be // resolvable at compile time i.e. cannot be dynamically created by the @@ -52,19 +72,19 @@ namespace experimental { // address space. The constant address space declarations might get "tricky", // see test/built-ins/printf.cpp for examples. // In simple cases (compile-time known string contents, direct declaration of -// the format literal inside the printf call, etc.), the compiler should handle -// the automatic address space conversion. +// the format literal inside the printf call, etc.), the compiler should +// handle the automatic address space conversion. // FIXME: Once the extension to generic address space is fully supported, the // constant AS version may need to be deprecated. 
// -// - The format string is interpreted according to the OpenCL C spec, where all -// data types has fixed size, opposed to C++ types which doesn't guarantee +// - The format string is interpreted according to the OpenCL C spec, where +// all data types has fixed size, opposed to C++ types which doesn't guarantee // the exact width of particular data types (except, may be, char). This might // lead to unexpected result, for example: %ld in OpenCL C means that printed -// argument has 'long' type which is 64-bit wide by the OpenCL C spec. However, -// by C++ spec long is just at least 32-bit wide, so, you need to ensure (by -// performing a cast, for example) that if you use %ld specifier, you pass -// 64-bit argument to the cl::sycl::experimental::printf +// argument has 'long' type which is 64-bit wide by the OpenCL C spec. +// However, by C++ spec long is just at least 32-bit wide, so, you need to +// ensure (by performing a cast, for example) that if you use %ld specifier, +// you pass 64-bit argument to the cl::sycl::experimental::printf // // - OpenCL spec defines several additional features, like, for example, 'v' // modifier which allows to print OpenCL vectors: note that these features are diff --git a/sycl/source/detail/builtins_math.cpp b/sycl/source/detail/builtins_math.cpp index 5a78d6cb80a5b..9c7c914db0df1 100644 --- a/sycl/source/detail/builtins_math.cpp +++ b/sycl/source/detail/builtins_math.cpp @@ -359,6 +359,28 @@ MAKE_1V_2V_3V(fma, s::cl_float, s::cl_float, s::cl_float, s::cl_float) MAKE_1V_2V_3V(fma, s::cl_double, s::cl_double, s::cl_double, s::cl_double) MAKE_1V_2V_3V(fma, s::cl_half, s::cl_half, s::cl_half, s::cl_half) +// fma_relu +__SYCL_EXPORT s::cl_half fma_relu(s::cl_half a, s::cl_half b, + s::cl_half c) __NOEXC { + auto ans = std::fma(a, b, c); + return (ans > 0) ? 
ans : 0; +} +__SYCL_EXPORT s::cl_ushort fma_relu(s::cl_ushort a, s::cl_ushort b, + s::cl_ushort c) __NOEXC { + // TODO implement this once bfloat16 datatype is supported on host + throw std::runtime_error( + "fma_relu not supported on host for bfloat16 datatype."); +} +__SYCL_EXPORT s::cl_uint fma_relu(s::cl_uint a, s::cl_uint b, + s::cl_uint c) __NOEXC { + // TODO implement this once bfloat16x2 datatype is supported on host + throw std::runtime_error( + "fma_relu not supported on host for bfloat16x2 datatype."); +} +MAKE_1V_2V_3V(fma_relu, s::cl_ushort, s::cl_ushort, s::cl_ushort, s::cl_ushort) +MAKE_1V_2V_3V(fma_relu, s::cl_uint, s::cl_uint, s::cl_uint, s::cl_uint) +MAKE_1V_2V_3V(fma_relu, s::cl_half, s::cl_half, s::cl_half, s::cl_half) + // fmax __SYCL_EXPORT s::cl_float fmax(s::cl_float x, s::cl_float y) __NOEXC { return std::fmax(x, y); diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 79d58722b4f4f..c2071e1df17a1 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -2932,6 +2932,27 @@ _ZN2cl10__host_std7u_rhaddEhh _ZN2cl10__host_std7u_rhaddEjj _ZN2cl10__host_std7u_rhaddEmm _ZN2cl10__host_std7u_rhaddEtt +_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi3EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi8EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl6detail9half_impl4halfES4_S4_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi2EEES6_S6_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi1EEES6_S6_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi3EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi16EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi4EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi1EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi16EEES6_S6_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi4EEES6_S6_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi3EEES6_S6_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi16EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecINS1_6detail9half_impl4halfELi8EEES6_S6_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi2EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi1EEES3_S3_ +_ZN2cl10__host_std8fma_reluEttt +_ZN2cl10__host_std8fma_reluEjjj +_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi4EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecItLi8EEES3_S3_ +_ZN2cl10__host_std8fma_reluENS_4sycl3vecIjLi2EEES3_S3_ _ZN2cl10__host_std8IsFiniteENS_4sycl3vecINS1_6detail9half_impl4halfELi16EEE _ZN2cl10__host_std8IsFiniteENS_4sycl3vecINS1_6detail9half_impl4halfELi1EEE _ZN2cl10__host_std8IsFiniteENS_4sycl3vecINS1_6detail9half_impl4halfELi2EEE diff --git a/sycl/test/extensions/bfloat16.cpp b/sycl/test/extensions/bfloat16.cpp index 80696140b2dc8..847be9508bdb5 100644 --- a/sycl/test/extensions/bfloat16.cpp +++ b/sycl/test/extensions/bfloat16.cpp @@ -2,10 +2,10 @@ // UNSUPPORTED: cuda || hip_amd -#include +#include #include -using sycl::ext::intel::experimental::bfloat16; +using sycl::ext::oneapi::experimental::bfloat16; SYCL_EXTERNAL uint16_t some_bf16_intrinsic(uint16_t x, uint16_t y); SYCL_EXTERNAL void foo(long x, sycl::half y);