Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions sycl/include/CL/sycl/builtins.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,7 @@ namespace __sycl_std = __host_std;
// size two as a simple general optimization. A more complex implementation
// using larger vectorizations for large marray sizes is possible; however more
// testing is required in order to ascertain the performance implications for
// all backends. Currently the compiler does not produce vectorized loads and
// stores from this implementation for all backends. It would be wise to
// investigate how this can be fixed first.
// all backends.
#define __SYCL_MATH_FUNCTION_OVERLOAD(NAME) \
template <typename T, size_t N> \
inline __SYCL_ALWAYS_INLINE \
Expand Down
25 changes: 24 additions & 1 deletion sycl/include/CL/sycl/marray.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,29 @@
__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {

/// Computes an alignment in bytes from an element count \p N and an element
/// size \p SizeOfT (in bytes).
///
/// The element count is first rounded down to the nearest of
/// {1, 2, 3, 4, 8, 16}. That count is multiplied by \p SizeOfT and the product
/// is rounded up to the next power of two, which is the value returned.
///
/// \tparam N       number of elements; must be greater than zero.
/// \tparam SizeOfT size of one element in bytes; must be greater than zero.
/// \return a power-of-two byte count that is >= (rounded-down N) * SizeOfT.
template <std::size_t N, std::size_t SizeOfT>
constexpr std::size_t vecAlignment() {
  static_assert(N > 0, "Invalid number of elements.");
  static_assert(SizeOfT > 0, "Invalid size of T.");
  // First find the "previous" vector num elements.
  constexpr std::size_t NumElems = N >= 16  ? 16
                                   : N >= 8 ? 8
                                   : N >= 4 ? 4
                                   : N >= 3 ? 3
                                   : N >= 2 ? 2
                                            : 1;
  // Then calculate the alignment size in bytes, making sure it's power of 2.
  // Subtracting one first keeps exact powers of two unchanged.
  std::size_t Res = NumElems * SizeOfT - 1;
  // OR every right-shift of the value into itself so that all bits below the
  // highest set bit become 1. Looping until the shifted value is zero makes
  // this independent of the width of std::size_t (a fixed 1/2/4/8/16 shift
  // sequence is incomplete when std::size_t is 64 bits wide).
  for (std::size_t Shifted = Res >> 1; Shifted != 0; Shifted >>= 1)
    Res |= Shifted;
  return Res + 1;
}

/// Provides a cross-platform math array class template that works on
/// SYCL devices as well as in host C++ code.
///
Expand Down Expand Up @@ -298,7 +321,7 @@ template <typename Type, std::size_t NumElements> class marray {
}
return Ret;
}
};
} __attribute__((aligned(vecAlignment<NumElements, sizeof(Type)>())));

// Declares a type alias named ALIAS with N appended (token-pasted, e.g.
// ALIAS##N) that refers to cl::sycl::marray<TYPE, N>.
#define __SYCL_MAKE_MARRAY_ALIAS(ALIAS, TYPE, N) \
using ALIAS##N = cl::sycl::marray<TYPE, N>;
Expand Down
42 changes: 20 additions & 22 deletions sycl/include/sycl/ext/oneapi/experimental/builtins.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,23 +123,22 @@ inline __SYCL_ALWAYS_INLINE std::enable_if_t<std::is_same<T, half>::value ||
tanh(sycl::marray<T, N> x) __NOEXC {
sycl::marray<T, N> res;
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
for (size_t i = 0; i < N / 2; i++) {
auto partial_res = native::tanh(sycl::detail::to_vec2(x, i * 2));
std::memcpy(&res[i * 2], &partial_res, sizeof(vec<T, 2>));
}
if (N % 2) {
res[N - 1] = native::tanh(x[N - 1]);
}
#define FUNC_VEC native::tanh
#define FUNC FUNC_VEC
#else
#define FUNC_VEC __sycl_std::__invoke_tanh<sycl::vec<T, 2>>
#define FUNC __sycl_std::__invoke_tanh<T>
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)

for (size_t i = 0; i < N / 2; i++) {
auto partial_res = __sycl_std::__invoke_tanh<sycl::vec<T, 2>>(
sycl::detail::to_vec2(x, i * 2));
auto partial_res = FUNC_VEC(sycl::detail::to_vec2(x, i * 2));
std::memcpy(&res[i * 2], &partial_res, sizeof(vec<T, 2>));
}
if (N % 2) {
res[N - 1] = __sycl_std::__invoke_tanh<T>(x[N - 1]);
res[N - 1] = FUNC(x[N - 1]);
}
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
#undef FUNC_VEC
#undef FUNC
return res;
}

Expand All @@ -163,23 +162,22 @@ inline __SYCL_ALWAYS_INLINE sycl::marray<half, N>
exp2(sycl::marray<half, N> x) __NOEXC {
sycl::marray<half, N> res;
#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
for (size_t i = 0; i < N / 2; i++) {
auto partial_res = native::exp2(sycl::detail::to_vec2(x, i * 2));
std::memcpy(&res[i * 2], &partial_res, sizeof(vec<half, 2>));
}
if (N % 2) {
res[N - 1] = native::exp2(x[N - 1]);
}
#define FUNC_VEC native::exp2
#define FUNC FUNC_VEC
#else
#define FUNC_VEC __sycl_std::__invoke_exp2<sycl::vec<half, 2>>
#define FUNC __sycl_std::__invoke_exp2<half>
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)

for (size_t i = 0; i < N / 2; i++) {
auto partial_res = __sycl_std::__invoke_exp2<sycl::vec<half, 2>>(
sycl::detail::to_vec2(x, i * 2));
auto partial_res = FUNC_VEC(sycl::detail::to_vec2(x, i * 2));
std::memcpy(&res[i * 2], &partial_res, sizeof(vec<half, 2>));
}
if (N % 2) {
res[N - 1] = __sycl_std::__invoke_exp2<half>(x[N - 1]);
res[N - 1] = FUNC(x[N - 1]);
}
#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
#undef FUNC_VEC
#undef FUNC
return res;
}

Expand Down