diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp index a8251a244eeb0..43fd8f5484d28 100644 --- a/sycl/include/CL/sycl/builtins.hpp +++ b/sycl/include/CL/sycl/builtins.hpp @@ -46,9 +46,7 @@ namespace __sycl_std = __host_std; // size two as a simple general optimization. A more complex implementation // using larger vectorizations for large marray sizes is possible; however more // testing is required in order to ascertain the performance implications for -// all backends. Currently the compiler does not produce vectorized loads and -// stores from this implementation for all backends. It would be wise to -// investigate how this can be fixed first. +// all backends. #define __SYCL_MATH_FUNCTION_OVERLOAD(NAME) \ template \ inline __SYCL_ALWAYS_INLINE \ diff --git a/sycl/include/CL/sycl/marray.hpp b/sycl/include/CL/sycl/marray.hpp index 6bc2c7bfffe47..f5911e6518d4f 100644 --- a/sycl/include/CL/sycl/marray.hpp +++ b/sycl/include/CL/sycl/marray.hpp @@ -17,6 +17,29 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +template +constexpr std::size_t vecAlignment() { + static_assert(N > 0, "Invalid number of elements."); + static_assert(SizeOfT > 0, "Invalid size of T."); + // First find the "previous" vector num elements. + size_t res = N >= 16 ? 16 + : N >= 8 ? 8 + : N >= 4 ? 4 + : N >= 3 ? 3 + : N >= 2 ? 2 + : 1; + // Then calculate the alignment size in bytes, making sure it's power of 2. + res *= SizeOfT; + res--; + res |= res >> 1; + res |= res >> 2; + res |= res >> 4; + res |= res >> 8; + res |= res >> 16; + res++; + return res; +} + /// Provides a cross-patform math array class template that works on /// SYCL devices as well as in host C++ code. /// @@ -298,7 +321,7 @@ template class marray { } return Ret; } -}; +} __attribute__((aligned(vecAlignment()))); #define __SYCL_MAKE_MARRAY_ALIAS(ALIAS, TYPE, N) \ using ALIAS##N = cl::sycl::marray; diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp index 82c505a4a73d5..53723e64b88da 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp @@ -123,23 +123,22 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t::value || tanh(sycl::marray x) __NOEXC { sycl::marray res; #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) - for (size_t i = 0; i < N / 2; i++) { - auto partial_res = native::tanh(sycl::detail::to_vec2(x, i * 2)); - std::memcpy(&res[i * 2], &partial_res, sizeof(vec)); - } - if (N % 2) { - res[N - 1] = native::tanh(x[N - 1]); - } +#define FUNC_VEC native::tanh +#define FUNC FUNC_VEC #else +#define FUNC_VEC __sycl_std::__invoke_tanh> +#define FUNC __sycl_std::__invoke_tanh +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + for (size_t i = 0; i < N / 2; i++) { - auto partial_res = __sycl_std::__invoke_tanh>( - sycl::detail::to_vec2(x, i * 2)); + auto partial_res = FUNC_VEC(sycl::detail::to_vec2(x, i * 2)); std::memcpy(&res[i * 2], &partial_res, sizeof(vec)); } if (N % 2) { - res[N - 1] = __sycl_std::__invoke_tanh(x[N - 1]); + res[N - 1] = FUNC(x[N - 1]); } -#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#undef FUNC_VEC +#undef FUNC return res; } @@ -163,23 +162,22 @@ inline __SYCL_ALWAYS_INLINE sycl::marray exp2(sycl::marray x) __NOEXC { sycl::marray res; #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) - for (size_t i = 0; i < N / 2; i++) { - auto partial_res = native::exp2(sycl::detail::to_vec2(x, i * 2)); - std::memcpy(&res[i * 2], &partial_res, sizeof(vec)); - } - if (N % 2) { - res[N - 1] = native::exp2(x[N - 1]); - } +#define FUNC_VEC native::exp2 +#define FUNC FUNC_VEC #else +#define FUNC_VEC __sycl_std::__invoke_exp2> +#define FUNC __sycl_std::__invoke_exp2 +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + for (size_t i = 0; i < N / 2; i++) { - auto partial_res = __sycl_std::__invoke_exp2>( - sycl::detail::to_vec2(x, i * 2)); + auto partial_res = FUNC_VEC(sycl::detail::to_vec2(x, i * 2)); std::memcpy(&res[i * 2], &partial_res, sizeof(vec)); } if (N % 2) { - res[N - 1] = __sycl_std::__invoke_exp2(x[N - 1]); + res[N - 1] = FUNC(x[N - 1]); } -#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#undef FUNC_VEC +#undef FUNC return res; }