diff --git a/sycl/include/sycl/detail/generic_type_traits.hpp b/sycl/include/sycl/detail/generic_type_traits.hpp index 4a47ee960ac0f..19602e65c4d00 100644 --- a/sycl/include/sycl/detail/generic_type_traits.hpp +++ b/sycl/include/sycl/detail/generic_type_traits.hpp @@ -450,6 +450,13 @@ struct mptr_or_vec_elem_type> template using mptr_or_vec_elem_type_t = typename mptr_or_vec_elem_type::type; +template +using cl_unsigned = std::conditional_t< + Size == 1, opencl::cl_uchar, + std::conditional_t< + Size == 2, opencl::cl_ushort, + std::conditional_t>>; + // select_apply_cl_scalar_t selects from T8/T16/T32/T64 basing on // sizeof(IN). expected to handle scalar types. template diff --git a/sycl/include/sycl/detail/helpers.hpp b/sycl/include/sycl/detail/helpers.hpp index ad173f45fa6e6..062d8eeb94a82 100644 --- a/sycl/include/sycl/detail/helpers.hpp +++ b/sycl/include/sycl/detail/helpers.hpp @@ -249,6 +249,16 @@ template void loop(F &&f) { loop_impl(std::make_index_sequence{}, std::forward(f)); } +template +void loop_unroll_up_to(F &&f) { + if constexpr (count > limit) + for (size_t i = 0; i < count; ++i) + f(i); + else + loop([&](auto i) { f(i); }); +} + +inline constexpr bool is_power_of_two(int x) { return (x & (x - 1)) == 0; } } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index 2e229cb0de69d..50aa869942eb6 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -33,6 +33,8 @@ template struct is_group_helper : std::false_type {}; template struct is_group_helper> : std::true_type { }; +template +inline constexpr bool is_group_helper_v = is_group_helper::value; } // namespace detail } // namespace experimental } // namespace ext::oneapi @@ -52,6 +54,8 @@ template struct is_generic_group : std::integral_constant::value || is_sub_group::value> {}; +template +inline constexpr bool is_generic_group_v = is_generic_group::value; namespace half_impl { class half; @@ -280,6 +284,17 @@ struct is_vector_bool template struct is_bool : bool_constant>::value> {}; +// is_multi_ptr +template struct is_multi_ptr_impl : public std::false_type {}; + +template +struct is_multi_ptr_impl> + : public std::true_type {}; + +template +constexpr bool is_multi_ptr_v = is_multi_ptr_impl>::value; + // is_pointer template struct is_pointer_impl : std::false_type {}; diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp new file mode 100644 index 0000000000000..2d1a5756fab09 --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp @@ -0,0 +1,1006 @@ +//==---- group_load_store.hpp --- SYCL extension for group loads/stores ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Implements https://github.com/intel/llvm/pull/7593 + +#pragma once + +#include + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { +namespace ext::oneapi::experimental { + +// TODO: Should that go into other place (shared between different extensions)? 
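+//
+// blocked: each work item owns a contiguous run of ElementsPerWorkItem
+// elements in memory; striped: consecutive memory elements belong to
+// consecutive work items (stride == the group's local linear range).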
+enum class group_algorithm_data_placement { blocked, striped }; + +namespace property { +struct data_placement_key { + template + using value_t = + property_value(Placement)>>; +}; + +template +inline constexpr data_placement_key::value_t data_placement; + +// TODO: Include into the extension spec or remove. If latter, we'd probably +// introduce internal macro to make the same assumption globaly for the testing +// purposes. +struct full_sg_key { + using value_t = property_value; +}; + +inline constexpr full_sg_key::value_t full_sg; +} // namespace property + +template <> +struct is_property_key : std::true_type {}; + +template <> struct is_property_key : std::true_type {}; + +namespace detail { +using namespace sycl::detail; + +template <> struct PropertyToKind { + static constexpr PropKind Kind = PropKind::DataPlacement; +}; +template <> +struct IsCompileTimeProperty : std::true_type {}; + +template <> struct PropertyToKind { + static constexpr PropKind Kind = PropKind::FullSG; +}; +template <> +struct IsCompileTimeProperty : std::true_type {}; + +// Implementation helpers. + +#ifdef __SYCL_DEVICE_ONLY__ +template +constexpr bool is_blocked(Properties properties) { + namespace property = ext::oneapi::experimental::property; + if constexpr (properties + .template has_property()) + return properties.template get_property() == + property::data_placement< + ext::oneapi::experimental::group_algorithm_data_placement:: + blocked>; + else + return true; +} + +template +int get_mem_idx(GroupTy g, int vec_or_array_idx) { + if constexpr (IsBlocked) + return g.get_local_linear_id() * VEC_OR_ARRAY_SIZE + vec_or_array_idx; + else + return g.get_local_linear_id() + + g.get_local_linear_range() * vec_or_array_idx; +} + +template +inline constexpr bool no_shuffle_impl_available = + is_power_of_two(type_size) && type_size <= 8 && + (!blocked || (is_power_of_two(ElementsPerWorkItem) && + (ElementsPerWorkItem <= 8 || + (ElementsPerWorkItem == 16 && type_size == 1)))); + +// A helper for group_load/store implementations outlining the common logic: +// +// - fallback to "generic" if block loads/stores aren't supported (non SPIRV +// device, needs polishing though). +// - fallback to "generic" if not properly aligned. +// - fallback to "generic" if iterator isn't a pointer/multi_ptr. +// - fallback to "generic" if there are masked-out SIMD lanes (i.e., +// sg.get_local_range() isn't equal to SIMD width). +// - ensure multi_ptr is "delegated" as a plain annotated pointer. +// - use dynamic address space cast for the pointers in generic space then +// either "delegate" or fallback to "generic". +// - finally, if the pointer is in the global address space and satisfies +// alignment conditions, use "impl" callback to perform optimized load/store. +// +// Note that the tests for the functionality assume existence of this helper to +// avoid combinatorial explosion of scenarios to test. 
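+//
+// Roughly, the dispatch below boils down to (illustrative summary of the
+// code that follows, not a normative contract):
+//
+//   non-SPIR-V target or "unsupported"       -> generic()
+//   iterator is a multi_ptr                  -> delegate(iter.get_decorated())
+//   iterator is not a raw pointer            -> generic()
+//   runtime alignment check fails            -> generic()
+//   no full_sg property and group not "full" -> generic()
+//   pointer in global address space          -> impl(iter)
+//   pointer in generic address space         -> dynamic cast to global, then
+//                                               delegate(global_ptr) or generic()
+//   any other address space                  -> generic()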
+template +void dispatch_ptr(GroupHelper gh, IteratorT iter, GenericTy generic, + DelegateTy delegate, ImplTy impl) { + auto group = [&]() { + if constexpr (ext::oneapi::experimental::detail::is_group_helper_v< + GroupHelper>) + return gh.get_group(); + else + return gh; + }(); + using GroupTy = decltype(group); + + using value_type = + remove_decoration_t::value_type>; + using iter_no_cv = std::remove_cv_t; + +#if defined(__SPIR__) + constexpr bool is_spir = true; +#else + constexpr bool is_spir = false; +#endif + + if constexpr (!is_spir || unsupported) { + return generic(); + } else if constexpr (detail::is_multi_ptr_v) { + return delegate(iter.get_decorated()); + } else if constexpr (!std::is_pointer_v) { + return generic(); + } else { + // TODO: Handle annotated_ptr? + if constexpr (alignof(value_type) < required_align) { + if ((reinterpret_cast(iter) % required_align) != 0) { + return generic(); + } + } + + if constexpr (!assume_full_sg) { + if constexpr (detail::is_sub_group::value) { + if (group.get_local_range() != group.get_max_local_range()) + // Sub-group is not "full". + return generic(); + } else if constexpr (detail::is_group::value) { + // TODO: Use get_child_group from + // sycl_ext_oneapi_root_group extension once it is + // implemented instead of this free function. + auto ndi = sycl::ext::oneapi::experimental::this_nd_item< + GroupTy::dimensions>(); + auto sg = ndi.get_sub_group(); + + auto wg_size = group.get_local_range().size(); + auto simd_width = sg.get_max_local_range().size(); + + if (wg_size % simd_width != 0) { + return generic(); + } + } + } + + // TODO: verify that sizeof * sg.get_max_local_range() * + // sg_offset_per_wi is a multiple of required alignment for + // detail::is_group. + + constexpr auto AS = detail::deduce_AS::value; + if constexpr (AS == access::address_space::global_space) { + // The only customization point - to be handled by the + // caller. + return impl(iter); + } else if constexpr (AS == access::address_space::generic_space) { + if (auto global_ptr = + __SYCL_GenericCastToPtrExplicit_ToGlobal(iter)) + return delegate(global_ptr); + else + return generic(); + } else { + return generic(); + } + } + __builtin_unreachable(); +} +#endif +} // namespace detail + +#ifdef __SYCL_DEVICE_ONLY__ +// Load API scalar. +template +std::enable_if_t< + std::is_convertible_v::value_type>, + OutputT> && + detail::is_generic_group_v> +group_load(Group g, InputIteratorT in_ptr, OutputT &out, + Properties properties = {}) { + group_load(g, in_ptr, span(&out, 1), properties); +} + +// Store API scalar. +template +std::enable_if_t::value_type>> && + detail::is_generic_group_v> +group_store(Group g, const InputT &in, OutputIteratorT out_ptr, + Properties properties = {}) { + group_store(g, span(&in, 1), out_ptr, properties); +} + +// Load API sycl::vec overload. +template +std::enable_if_t< + std::is_convertible_v::value_type>, + OutputT> && + detail::is_generic_group_v> +group_load(Group g, InputIteratorT in_ptr, sycl::vec &out, + Properties properties = {}) { + group_load(g, in_ptr, span(&out[0], N), properties); +} + +// Store API sycl::vec overload. +template +std::enable_if_t::value_type>> && + detail::is_generic_group_v> +group_store(Group g, const sycl::vec &in, OutputIteratorT out_ptr, + Properties properties = {}) { + group_store(g, span(&in[0], N), out_ptr, properties); +} + +// Load API span + group/sub_group overload. 
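+//
+// Illustrative example of the two placements for ElementsPerWorkItem == 2
+// and a group of four work items (wN.eM == element M of work item N's span):
+//
+//   memory index | 0     | 1     | 2     | 3     | 4     | 5     | 6     | 7     |
+//   blocked      | w0.e0 | w0.e1 | w1.e0 | w1.e1 | w2.e0 | w2.e1 | w3.e0 | w3.e1 |
+//   striped      | w0.e0 | w1.e0 | w2.e0 | w3.e0 | w0.e1 | w1.e1 | w2.e1 | w3.e1 |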
+template +std::enable_if_t< + std::is_convertible_v::value_type>, + OutputT> && + detail::is_generic_group_v> +group_load(GroupHelper gh, InputIteratorT in_ptr, + span out, Properties properties = {}) { + constexpr bool blocked = detail::is_blocked(properties); + + using value_type = remove_decoration_t< + typename std::iterator_traits::value_type>; + + // See std::enable_if_t above restricting this implementation. + using GroupTy = GroupHelper; + auto g = gh; + + auto generic = [&]() { + group_barrier(g); + detail::loop_unroll_up_to([&](size_t i) { + auto idx = detail::get_mem_idx(g, i); + out[i] = in_ptr[idx]; + }); + group_barrier(g); + }; + + auto delegate = [&](auto unwrapped_ptr) { + group_load(gh, unwrapped_ptr, out, properties); + }; + + constexpr int BlockSize = + sizeof(value_type) * (blocked ? ElementsPerWorkItem : 1); + constexpr int NumBlocks = blocked ? 1 : ElementsPerWorkItem; + + constexpr auto hw_block_size = [&]() { + size_t size = 8; + while (BlockSize % size != 0) + size /= 2; + return size; + }(); + using HWBlockTy = detail::cl_unsigned; + + constexpr const size_t hw_blocks_per_block = BlockSize / hw_block_size; + + auto impl_sg = [&](sub_group sg, auto *in_ptr) { + value_type v[ElementsPerWorkItem]; + + auto priv_ptr = reinterpret_cast(&v); + + // Needs to be 4 bytes aligned (16 for writes). + + // Native is strided! + // Available native HWBlockSizes: uchar, ushort, uint, ulong (1, 2, 4, 8). + // Available native NumHWBlocks: + // 1, 2, 4, 8, or 16 uchars + // 1, 2, 4, or 8 ulongs/uints/ushorts + + size_t sg_lid = sg.get_local_linear_id(); + size_t sg_size = sg.get_max_local_range().size(); // Assume "full" SG. + + // We selected "HWBlockTy" such that sizeof(Block) % sizeof(HWBlockTy) == 0. + + // s -> SG, w-> WI, b->hw block + // + // memory reads: + // | s0.w0.b0 | s0.w1.b0 | s0.w2.b0 | ... | sN.wS.bV | sN.wS.bV | + // + // After the read, we need to rearrange data between work items and + // write it onto each work item's own destination. For example: + // + // Idx\WI | 0 | 1 | + // 0 | s0.w0.b0 | s0.w0.b2 | + // 1 | s0.w1.b0 | s0.w1.b2 | + // 2 | s0.w0.b1 | s0.w0.b3 | + // 3 | s0.w1.b1 | s0.w1.b3 | + // +----------+----------+ + // 0 | s0.w0.b4 | s0.w0.b6 | + // 1 | s0.w1.b4 | s0.w1.b6 | + // 2 | s0.w0.b5 | s0.w0.b7 | + // 3 | s0.w1.b5 | s0.w1.b7 | + + size_t cur_hw_blocks_start_idx = 0; + + size_t cur_write_index = 0; + + // Index to the memory pointed to by the incoming argument. + size_t needed_global_idx = sg_lid * hw_blocks_per_block; + + // select next vec_size for the load. + // 1 == 2^0, 16 == 2^4 + constexpr size_t max_vec_pwr_of_two = hw_block_size == 1 ? 4 : 3; + detail::loop([&](auto i) { + // Use bigger sizes first. + constexpr int vec_size = 1 << (max_vec_pwr_of_two - i); + + constexpr auto iterations = + i == 0 + ? 
hw_blocks_per_block * NumBlocks / vec_size + : (hw_blocks_per_block * NumBlocks % (vec_size * 2)) / vec_size; + + detail::loop_unroll_up_to([&](auto) { + const size_t hw_blocks_per_iter = sg_size * vec_size; + + using LoadT = std::conditional_t< + vec_size == 1, HWBlockTy, + detail::ConvertToOpenCLType_t>>; + + using PtrT = typename detail::DecoratedType< + HWBlockTy, access::address_space::global_space>::type *; + LoadT load = __spirv_SubgroupBlockReadINTEL( + reinterpret_cast(in_ptr) + cur_hw_blocks_start_idx); + + if constexpr (hw_blocks_per_block == 1) { + std::memcpy(priv_ptr + cur_write_index * hw_block_size, &load, + sizeof(load)); + cur_write_index += vec_size; + needed_global_idx += hw_blocks_per_block; + } else if constexpr (detail::is_power_of_two(hw_blocks_per_block) && + vec_size >= hw_blocks_per_block) { + // Idx\WI | 0 | 1 | + // 0 | s0.w0.b0 | s0.w0.b2 | + // 1 | s0.w1.b0 | s0.w1.b2 | + // 2 | s0.w0.b1 | s0.w0.b3 | + // 3 | s0.w1.b1 | s0.w1.b3 | + + // Idx\WI | 0 | 1 | + // 0 | s0.w0.b0 | s0.w0.b2 | + // 1 | s0.w1.b0 | s0.w1.b2 | + // 2 | s0.w0.b1 | s0.w0.b3 | + // 3 | s0.w1.b1 | s0.w1.b3 | + // +----------+----------+ + // 0 | s0.w0.b4 | s0.w0.b6 | + // 1 | s0.w1.b4 | s0.w1.b6 | + // 2 | s0.w0.b5 | s0.w0.b7 | + // 3 | s0.w1.b5 | s0.w1.b7 | + detail::loop([&](auto i) { + size_t BlockIdx = i / hw_blocks_per_block; + size_t block_idx = i % hw_blocks_per_block; + size_t idx = sg_lid * hw_blocks_per_block + block_idx + + BlockIdx * sg_size * hw_blocks_per_block; + + size_t wi = idx % sg_size; + size_t block = idx / sg_size; + + auto val = select_from_group(sg, load, wi)[block]; + std::memcpy(priv_ptr + i * hw_block_size, &val, hw_block_size); + }); + } else { + // TODO: Verify that those shuffles are worth doing at all. + // clang-format off + // Idx\WI | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | + // 0 | s0.w0.b0 | s0.w7.b0 | s0.w6.b1 | s0.w5.b2 | s0.w4.b3 | s1.w3.b0 | s1.w2.b1 | s1.w1.b2 | + // 1 | s0.w1.b0 | s0.w0.b1 | s0.w7.b1 | s0.w6.b2 | s0.w5.b3 | s1.w4.b0 | s1.w3.b1 | s1.w2.b2 | + // 2 | s0.w2.b0 | s0.w1.b1 | s0.w0.b2 | s0.w7.b2 | s0.w6.b3 | s1.w5.b0 | s1.w4.b1 | s1.w3.b2 | + // 3 | s0.w3.b0 | s0.w2.b1 | s0.w1.b2 | s0.w0.b3 | s0.w7.b3 | s1.w6.b0 | s1.w5.b1 | s1.w4.b2 | + // 4 | s0.w4.b0 | s0.w3.b1 | s0.w2.b2 | s0.w1.b3 | s1.w0.b0 | s1.w7.b0 | s1.w6.b1 | s1.w5.b2 | + // 5 | s0.w5.b0 | s0.w4.b1 | s0.w3.b2 | s0.w2.b3 | s1.w1.b0 | s1.w0.b1 | s1.w7.b1 | s1.w6.b2 | + // 6 | s0.w6.b0 | s0.w5.b1 | s0.w4.b2 | s0.w3.b3 | s1.w2.b0 | s1.w1.b1 | s1.w0.b2 | s1.w7.b2 | + // ------- +----------+----------+----------+----------+----------+----------+----------+----------+ + // 0 | s1.w0.b3 | s1.w7.b3 | s2.w6.b0 | s2.w5.b1 | s2.w4.b2 | s2.w3.b3 | + // 1 | s1.w1.b3 | s2.w0.b0 | s2.w7.b0 | s2.w6.b1 | s2.w5.b2 | s2.w4.b3 | + // 2 | s1.w2.b3 | s2.w1.b0 | s2.w0.b1 | s2.w7.b1 | s2.w6.b2 | s2.w5.b3 | + // 3 | s1.w3.b3 | s2.w2.b0 | s2.w1.b1 | s2.w0.b2 | s2.w7.b2 | s2.w6.b3 | + // 4 | s1.w4.b3 | s2.w3.b0 | s2.w2.b1 | s2.w1.b2 | s2.w0.b3 | s2.w7.b3 | + // 5 | s1.w5.b3 | s2.w4.b0 | s2.w3.b1 | s2.w2.b2 | s2.w1.b3 | Remainder, 16 elems + // 6 | s1.w6.b3 | s2.w5.b0 | s2.w4.b1 | s2.w3.b2 | s2.w2.b3 | + // clang-format on + while (true) { + int needed_idx = needed_global_idx - cur_hw_blocks_start_idx; + + int wi = needed_idx % sg_size; + int block = needed_idx / sg_size; + + // Shuffle has to be in the convergent control flow. 
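+ // That is why select_from_group below is executed by every work item
+ // before the none_of_group early-exit check, even when this work item
+ // does not use the shuffled value on the current iteration.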
+ auto val = select_from_group(sg, load, wi); + + bool write_needed = needed_global_idx >= cur_hw_blocks_start_idx && + needed_global_idx < cur_hw_blocks_start_idx + + hw_blocks_per_iter; + + if (none_of_group(sg, write_needed)) + break; + + if (write_needed) { + std::memcpy(priv_ptr + cur_write_index * hw_block_size, + reinterpret_cast(&val) + block, + hw_block_size); + ++cur_write_index; + needed_global_idx += + cur_write_index % hw_blocks_per_block == 0 + ? 1 + sg_size * hw_blocks_per_block - hw_blocks_per_block + : 1; + } + }; + } + + cur_hw_blocks_start_idx += hw_blocks_per_iter; + }); + }); + + // Now perform the required implicit conversion. + detail::loop_unroll_up_to( + [&](size_t i) { out[i] = v[i]; }); + }; + + auto impl = [&](auto *in_ptr) { + group_barrier(g); + if constexpr (detail::is_sub_group::value) { + return impl_sg(g, in_ptr); + } else { + // TODO: Use get_child_group from sycl_ext_oneapi_root_group extension + // once it is implemented instead of this free function. + auto ndi = + sycl::ext::oneapi::experimental::this_nd_item(); + auto sg = ndi.get_sub_group(); + if constexpr (blocked) { + return impl_sg(sg, in_ptr + sg.get_group_id() * + sg.get_max_local_range() * + ElementsPerWorkItem); + } else { + // For striped layout the stride between elements in a vector is + // expressed in terms of WG's size, not SG. As such, each index has + // to be implemented using scalar SG block_load. + auto vec_elem_stride = g.get_local_linear_range(); + detail::loop_unroll_up_to([&](size_t i) { + OutputT scalar; + group_load(sg, + in_ptr + sg.get_group_id() * sg.get_max_local_range() + + vec_elem_stride * i, + scalar, properties); + out[i] = scalar; + }); + } + } + group_barrier(g); + }; + + constexpr bool assume_full_sg = + properties.template has_property(); + // We'd need too much private memory. + constexpr bool unsupported = hw_blocks_per_block > 16; + detail::dispatch_ptr<4 /* read align in bytes */, assume_full_sg, + unsupported>(g, in_ptr, generic, delegate, impl); +} + +// Load API span + group_helper overload. +template +std::enable_if_t< + std::is_convertible_v::value_type>, + OutputT> && + is_group_helper_v> +group_load(GroupHelper gh, InputIteratorT in_ptr, + span out, Properties properties) { + constexpr bool blocked = detail::is_blocked(properties); + + using value_type = remove_decoration_t< + typename std::iterator_traits::value_type>; + + auto g = gh.get_group(); + using GroupTy = decltype(g); + + if constexpr (detail::no_shuffle_impl_available< + sizeof(value_type), ElementsPerWorkItem, blocked>) { + return group_load(g, in_ptr, out, properties); + } else { + constexpr bool is_sg = detail::is_sub_group::value; + + auto generic = [&]() { + group_barrier(g); + for (int i = 0; i < ElementsPerWorkItem; ++i) + out[i] = + in_ptr[detail::get_mem_idx(g, i)]; + group_barrier(g); + }; + + auto delegate = [&](auto unwrapped_ptr) { + group_load(gh, unwrapped_ptr, out, properties); + }; + + constexpr int BlockSize = + sizeof(value_type) * (blocked ? ElementsPerWorkItem : 1); + constexpr int NumBlocks = blocked ? 
1 : ElementsPerWorkItem; + + constexpr auto hw_block_size = [&]() { + size_t size = 8; + while (BlockSize % size != 0) + size /= 2; + return size; + }(); + + using HWBlockTy = detail::cl_unsigned; + + constexpr const size_t hw_blocks_per_block = BlockSize / hw_block_size; + + auto impl = [&](auto *in_ptr) { + auto sg = [&]() { + if constexpr (is_sg) + return g; + else { + // TODO: Use get_child_group from + // sycl_ext_oneapi_root_group extension once it is + // implemented instead of this free function. + auto ndi = sycl::ext::oneapi::experimental::this_nd_item< + GroupTy::dimensions>(); + return ndi.get_sub_group(); + } + }(); + auto sg_lid = sg.get_local_linear_id(); + size_t sg_size = sg.get_max_local_range().size(); + size_t g_size = g.get_local_linear_range(); + + group_barrier(g); + auto scratch_span = gh.get_memory(); + // select next vec_size for the load. + // 1 == 2^0, 16 == 2^4 + constexpr size_t max_vec_pwr_of_two = hw_block_size == 1 ? 4 : 3; + + size_t cur_hw_blocks_start_idx = 0; + + detail::loop([&](auto i) { + // Use bigger sizes first. + constexpr int vec_size = 1 << (max_vec_pwr_of_two - i); + + constexpr auto iterations = + i == 0 + ? hw_blocks_per_block * NumBlocks / vec_size + : (hw_blocks_per_block * NumBlocks % (vec_size * 2)) / vec_size; + detail::loop_unroll_up_to([&](auto) { + const size_t hw_blocks_per_iter = + g.get_local_linear_range() * vec_size; + + using LoadT = std::conditional_t< + vec_size == 1, HWBlockTy, + detail::ConvertToOpenCLType_t>>; + + using PtrT = typename detail::DecoratedType< + HWBlockTy, access::address_space::global_space>::type *; + auto this_sg_offset = cur_hw_blocks_start_idx; + if constexpr (!is_sg) { + this_sg_offset += sg.get_group_id() * vec_size * sg_size; + } + LoadT load = __spirv_SubgroupBlockReadINTEL( + reinterpret_cast(in_ptr) + this_sg_offset); + + detail::loop([&](auto idx) { + // Operate in terms of a low-level "block" (HWBlockTy). + auto mem_idx = + this_sg_offset + idx * sg.get_local_linear_range() + sg_lid; + + auto BlockN = mem_idx / (g_size * hw_blocks_per_block); + auto InBlockIdx = mem_idx % (g_size * hw_blocks_per_block); + + auto result_wi = InBlockIdx / hw_blocks_per_block; + auto result_idx = InBlockIdx % hw_blocks_per_block; + auto scratch_idx = result_wi * hw_blocks_per_block * NumBlocks + + BlockN * hw_blocks_per_block + result_idx; + std::memcpy(scratch_span.data() + hw_block_size * scratch_idx, + reinterpret_cast(&load) + idx * hw_block_size, + hw_block_size); + }); + + cur_hw_blocks_start_idx += hw_blocks_per_iter; + }); + }); + group_barrier(g); + auto scratch_idx = + g.get_local_linear_id() * sizeof(value_type) * ElementsPerWorkItem; + std::memcpy(out.data(), scratch_span.data() + scratch_idx, + sizeof(value_type) * ElementsPerWorkItem); + group_barrier(g); + }; + + constexpr bool assume_full_sg = + properties.template has_property(); + detail::dispatch_ptr<4 /* read align in bytes */, assume_full_sg, + false /* unsupported */>(g, in_ptr, generic, delegate, + impl); + } +} + +// Store API span + group/sub_group overload. 
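+//
+// A minimal usage sketch of the load/store pair (pointer and variable names
+// here are hypothetical, for illustration only):
+//
+//   auto sg = ndi.get_sub_group();
+//   int v[4];
+//   auto s = span<int, 4>(v, 4);
+//   auto props = properties(property::data_placement<
+//       group_algorithm_data_placement::striped>);
+//   group_load(sg, in_ptr, s, props);
+//   // ... operate on v ...
+//   group_store(sg, s, out_ptr, props);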
+template +std::enable_if_t::value_type>> && + detail::is_generic_group_v> +group_store(GroupHelper gh, const span in, + OutputIteratorT out_ptr, Properties properties = {}) { + constexpr bool blocked = detail::is_blocked(properties); + + using value_type = remove_decoration_t< + typename std::iterator_traits::value_type>; + + using GroupTy = GroupHelper; + auto g = gh; + + auto generic = [&]() { + group_barrier(g); + for (int i = 0; i < in.size(); ++i) + out_ptr[detail::get_mem_idx(gh, i)] = in[i]; + group_barrier(g); + }; + + auto delegate = [&](auto unwrapped_ptr) { + group_store(g, in, unwrapped_ptr, properties); + }; + + constexpr int BlockSize = + sizeof(value_type) * (blocked ? ElementsPerWorkItem : 1); + constexpr int NumBlocks = blocked ? 1 : ElementsPerWorkItem; + + constexpr auto hw_block_size = [&]() { + size_t size = 8; + while (BlockSize % size != 0) + size /= 2; + return size; + }(); + using HWBlockTy = detail::cl_unsigned; + + constexpr const size_t hw_blocks_per_block = BlockSize / hw_block_size; + + auto impl_sg = [&](sub_group sg, auto *out_ptr) { + value_type v[ElementsPerWorkItem]; + // Perform the required implicit conversion first. + detail::loop_unroll_up_to( + [&](size_t i) { v[i] = in[i]; }); + + auto priv_ptr = reinterpret_cast(&v); + + // Needs to be 16 bytes aligned (4 for reads). + + // Native is strided! + // Available native BlockSizes: uchar, ushort, uint, ulong (1, 2, 4, 8). + // Available native NumBlocks: + // 1, 2, 4, 8, or 16 uchars + // 1, 2, 4, or 8 ulongs/uints/ushorts + + size_t sg_lid = sg.get_local_linear_id(); + size_t sg_size = sg.get_max_local_range().size(); // Assume "full" SG. + + size_t cur_hw_blocks_start_idx = 0; + + size_t cur_read_index = 0; + + // select next vec_size for the load. + // 1 == 2^0, 16 == 2^4 + constexpr size_t max_vec_pwr_of_two = hw_block_size == 1 ? 4 : 3; + detail::loop([&](auto i) { + // Use bigger sizes first. + constexpr int vec_size = 1 << (max_vec_pwr_of_two - i); + + constexpr auto iterations = + i == 0 + ? 
hw_blocks_per_block * NumBlocks / vec_size + : (hw_blocks_per_block * NumBlocks % (vec_size * 2)) / vec_size; + + detail::loop_unroll_up_to([&](auto) { + const size_t hw_blocks_per_iter = sg_size * vec_size; + + using StoreT = std::conditional_t< + vec_size == 1, HWBlockTy, + detail::ConvertToOpenCLType_t>>; + + using PtrT = typename detail::DecoratedType< + HWBlockTy, access::address_space::global_space>::type *; + StoreT store_val; + + if constexpr (hw_blocks_per_block == 1) { + std::memcpy(&store_val, priv_ptr + cur_read_index * hw_block_size, + sizeof(store_val)); + cur_read_index += vec_size; + } else if constexpr (detail::is_power_of_two(hw_blocks_per_block) && + vec_size >= hw_blocks_per_block) { + // Idx\WI | 0 | 1 | + // 0 | s0.w0.b0 | s0.w0.b2 | + // 1 | s0.w1.b0 | s0.w1.b2 | + // 2 | s0.w0.b1 | s0.w0.b3 | + // 3 | s0.w1.b1 | s0.w1.b3 | + + // reverse mapping: + + // SG.Idx\WI | 0 | 1 | + // s0.b0 | w0.0 | w0.1 | + // s0.b1 | w0.2 | w0.3 | + // s0.b2 | w1.0 | w1.1 | + // s0.b3 | w1.2 | w1.3 | + + // Idx\WI | 0 | 1 | + // 0|.0 | s0.w0.b0 | s0.w0.b2 | + // |.1 | s0.w1.b0 | s0.w1.b2 | + // |.2 | s0.w0.b1 | s0.w0.b3 | + // |.3 | s0.w1.b1 | s0.w1.b3 | + // +----------+----------+ + // 1|.0 | s0.w0.b4 | s0.w0.b6 | + // |.1 | s0.w1.b4 | s0.w1.b6 | + // |.2 | s0.w0.b5 | s0.w0.b7 | + // |.3 | s0.w1.b5 | s0.w1.b7 | + + // reverse mapping: + + // SG.Idx\WI | 0 | 1 | + // s0.b0 | w0.0.0 | w0.0.1 | + // s0.b1 | w0.0.2 | w0.0.3 | + // s0.b2 | w1.0.0 | w1.0.1 | + // s0.b3 | w1.0.2 | w1.0.3 | + // s0.b4 | w0.1.0 | w0.1.1 | + // s0.b5 | w0.1.2 | w0.1.3 | + // s0.b6 | w1.1.0 | w1.1.1 | + // s0.b7 | w1.1.2 | w1.1.3 | + + // Idx\WI | 0 | 1 | 2 | 3 | + // 0 | s0.w0.b0 | s0.w2.b0 | s0.w0.b1 | s0.w2.b1 | + // 1 | s0.w1.b0 | s0.w3.b0 | s0.w1.b1 | s0.w3.b1 | + + // reverse mapping: + // SG.Idx\WI | 0 | 1 | 2 | 3 | + // s0.b0 | w0.0 | w0.1 | w1.0 | w1.1 | + // s0.b1 | w2.0 | w2.1 | w3.0 | w3.1 | + + detail::loop([&](auto i) { + size_t idx = i * sg_size + sg_lid; + + size_t block_idx = idx % hw_blocks_per_block; + size_t wi = (idx / hw_blocks_per_block) % sg_size; + size_t BlockIdx = i / hw_blocks_per_block; // uniform + + HWBlockTy Block[hw_blocks_per_block]; + std::memcpy(&Block, + reinterpret_cast(v) + sizeof(Block) * BlockIdx, + sizeof(Block)); + + HWBlockTy ShuffledBlock[hw_blocks_per_block]; + // TODO: Report a bug? + // undefined reference to + // `__builtin_spirv_OpSubgroupShuffleINTEL_v2i64_i32' + detail::loop_unroll_up_to([&](auto i) { + ShuffledBlock[i] = select_from_group(sg, Block[i], wi); + }); + HWBlockTy val = ShuffledBlock[block_idx]; + + std::memcpy(reinterpret_cast(&store_val) + + i * hw_block_size, + &val, hw_block_size); + }); + } else { + // See "unsupported" below in detail::dispatch_ptr invocation. + static_assert(hw_blocks_per_block == 0, + "Should have bailed out earlier!"); + } + + __spirv_SubgroupBlockWriteINTEL(reinterpret_cast(out_ptr) + + cur_hw_blocks_start_idx, + store_val); + + cur_hw_blocks_start_idx += hw_blocks_per_iter; + }); + }); + }; + + auto impl = [&](auto *out_ptr) { + group_barrier(g); + if constexpr (detail::is_sub_group::value) { + return impl_sg(g, out_ptr); + } else { + // TODO: Use get_child_group from sycl_ext_oneapi_root_group extension + // once it is implemented instead of this free function. 
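+ // Until then, recover the sub-group from the enclosing nd_item.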
+ auto ndi = + sycl::ext::oneapi::experimental::this_nd_item(); + auto sg = ndi.get_sub_group(); + if constexpr (blocked) { + return impl_sg(sg, out_ptr + sg.get_group_id() * + sg.get_max_local_range() * + ElementsPerWorkItem); + } else { + // For striped layout the stride between elements in a vector is + // expressed in terms of WG's size, not SG. As such, each index has + // to be implemented using scalar SG block load. + auto vec_elem_stride = g.get_local_linear_range(); + detail::loop_unroll_up_to([&](size_t i) { + value_type scalar = in[i]; // implicit conversion. + group_store(sg, scalar, + out_ptr + sg.get_group_id() * sg.get_max_local_range() + + vec_elem_stride * i, + properties); + }); + } + } + group_barrier(g); + }; + + constexpr bool assume_full_sg = + properties.template has_property(); + constexpr bool unsupported = + !detail::is_power_of_two(hw_blocks_per_block) || + hw_blocks_per_block > 16 || + (hw_blocks_per_block == 16 && hw_block_size != 1); + detail::dispatch_ptr<16 /* read align in bytes */, assume_full_sg, + unsupported>(g, out_ptr, generic, delegate, impl); +} + +// Store API span + group_helper overload. +template +std::enable_if_t::value_type>> && + is_group_helper_v> +group_store(GroupHelper gh, const span in, + OutputIteratorT out_ptr, Properties properties = {}) { + constexpr bool blocked = detail::is_blocked(properties); + + using value_type = remove_decoration_t< + typename std::iterator_traits::value_type>; + + auto g = gh.get_group(); + using GroupTy = decltype(g); + + if constexpr (detail::no_shuffle_impl_available< + sizeof(value_type), ElementsPerWorkItem, blocked>) { + return group_store(g, in, out_ptr, properties); + } else { + constexpr bool is_sg = detail::is_sub_group::value; + + auto generic = [&]() { + group_barrier(g); + for (int i = 0; i < in.size(); ++i) + out_ptr[detail::get_mem_idx(g, i)] = + in[i]; + group_barrier(g); + }; + + auto delegate = [&](auto unwrapped_ptr) { + group_store(gh, in, unwrapped_ptr, properties); + }; + + constexpr int BlockSize = + sizeof(value_type) * (blocked ? ElementsPerWorkItem : 1); + constexpr int NumBlocks = blocked ? 1 : ElementsPerWorkItem; + + constexpr auto hw_block_size = [&]() { + size_t size = 8; + while (BlockSize % size != 0) + size /= 2; + return size; + }(); + + using HWBlockTy = detail::cl_unsigned; + + constexpr const size_t hw_blocks_per_block = BlockSize / hw_block_size; + + auto impl = [&](auto *out_ptr) { + auto sg = [&]() { + if constexpr (is_sg) + return g; + else { + // TODO: Use get_child_group from + // sycl_ext_oneapi_root_group extension once it is + // implemented instead of this free function. + auto ndi = sycl::ext::oneapi::experimental::this_nd_item< + GroupTy::dimensions>(); + return ndi.get_sub_group(); + } + }(); + auto sg_lid = sg.get_local_linear_id(); + auto g_lid = g.get_local_linear_id(); + size_t sg_size = sg.get_max_local_range().size(); + size_t g_size = g.get_local_linear_range(); + + group_barrier(g); + auto scratch_span = gh.get_memory(); + + for (int elem = 0; elem < NumBlocks; ++elem) { + for (int block = 0; block < hw_blocks_per_block; ++block) { + auto total_order_idx = + blocked ? block + elem * hw_blocks_per_block + + g_lid * hw_blocks_per_block * NumBlocks + : block + g_lid * hw_blocks_per_block + + elem * hw_blocks_per_block * g_size; + std::memcpy(scratch_span.data() + hw_block_size * total_order_idx, + reinterpret_cast(&in[elem]) + + hw_block_size * block, + hw_block_size); + } + } + group_barrier(g); + // select next vec_size for the load. 
+ // 1 == 2^0, 16 == 2^4 + constexpr size_t max_vec_pwr_of_two = hw_block_size == 1 ? 4 : 3; + + size_t cur_hw_blocks_start_idx = 0; + + detail::loop([&](auto i) { + // Use bigger sizes first. + constexpr int vec_size = 1 << (max_vec_pwr_of_two - i); + + constexpr auto iterations = + i == 0 + ? hw_blocks_per_block * NumBlocks / vec_size + : (hw_blocks_per_block * NumBlocks % (vec_size * 2)) / vec_size; + detail::loop_unroll_up_to([&](auto) { + const size_t hw_blocks_per_iter = + g.get_local_linear_range() * vec_size; + using StoreT = std::conditional_t< + vec_size == 1, HWBlockTy, + detail::ConvertToOpenCLType_t>>; + using PtrT = typename detail::DecoratedType< + HWBlockTy, access::address_space::global_space>::type *; + auto this_sg_offset = cur_hw_blocks_start_idx; + if constexpr (!is_sg) { + this_sg_offset += sg.get_group_id() * vec_size * sg_size; + } + + StoreT tmp; + for (int i = 0; i < vec_size; ++i) { + std::memcpy(reinterpret_cast(&tmp) + i * hw_block_size, + scratch_span.data() + + (this_sg_offset + sg_lid + i * sg_size) * + hw_block_size, + hw_block_size); + } + + __spirv_SubgroupBlockWriteINTEL( + reinterpret_cast(out_ptr) + this_sg_offset, tmp); + cur_hw_blocks_start_idx += hw_blocks_per_iter; + }); + }); + + group_barrier(g); + }; + + constexpr bool assume_full_sg = + properties.template has_property(); + detail::dispatch_ptr<16 /* write align in bytes */, assume_full_sg, + false /* unsupported */>(g, out_ptr, generic, delegate, + impl); + } +} +#else +template void group_load(Args...) { + throw sycl::exception( + std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), + "Group loads/stores are not supported on host."); +} +template void group_store(Args...) { + throw sycl::exception( + std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()), + "Group loads/stores are not supported on host."); +} +#endif +} // namespace ext::oneapi::experimental +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/properties/property.hpp b/sycl/include/sycl/ext/oneapi/properties/property.hpp index 7c2e3063ace13..9305d6f068be1 100644 --- a/sycl/include/sycl/ext/oneapi/properties/property.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/property.hpp @@ -195,8 +195,10 @@ enum PropKind : uint32_t { ReadyLatency = 29, UsesReady = 30, UsesValid = 31, + DataPlacement = 32, + FullSG = 33, // PropKindSize must always be the last value. 
- PropKindSize = 32, + PropKindSize = 34, }; // This trait must be specialized for all properties and must have a unique diff --git a/sycl/include/sycl/range.hpp b/sycl/include/sycl/range.hpp index f8d8860ba8706..54bad053a0945 100644 --- a/sycl/include/sycl/range.hpp +++ b/sycl/include/sycl/range.hpp @@ -49,9 +49,7 @@ template class range : public detail::array { size_t size() const { size_t size = 1; - for (int i = 0; i < dimensions; ++i) { - size *= this->get(i); - } + detail::loop([&](auto i) { size *= this->get(i); }); return size; } diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp index 4e243e56ff52f..bf9b0faff0bac 100644 --- a/sycl/include/sycl/sycl.hpp +++ b/sycl/include/sycl/sycl.hpp @@ -76,6 +76,7 @@ #include #include #include +#include #include #include #include diff --git a/sycl/test-e2e/GroupAlgorithm/group_load_store.cpp b/sycl/test-e2e/GroupAlgorithm/group_load_store.cpp new file mode 100644 index 0000000000000..ff2a2f5f538df --- /dev/null +++ b/sycl/test-e2e/GroupAlgorithm/group_load_store.cpp @@ -0,0 +1,215 @@ +// Use per-kernel split as a workaroud for a miscompilation bug in IGC. +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-device-code-split=per_kernel %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include + +#include +#include + +using namespace sycl; +namespace sycl_ext = sycl::ext::oneapi::experimental; + +constexpr int SG_SIZE = 16; +constexpr int N_WGS = 3; + +template +void test(size_t wg_size) { + std::cout << __PRETTY_FUNCTION__ << std::endl; + + size_t global_size = wg_size * N_WGS; + queue q; + + buffer b(global_size * ELEMS_PER_WI); + buffer result(global_size); + + q.submit([&](handler &cgh) { + accessor acc{b, cgh}; + accessor res{result, cgh}; + + static constexpr auto scratch_mem_per_wi = ELEMS_PER_WI * sizeof(MemType); + // sycl_ext::memory_required(scope, block_size); + local_accessor scratch(scratch_mem_per_wi * wg_size, cgh); + + cgh.parallel_for( + nd_range{range<1>{global_size}, range<1>{wg_size}}, + [=](nd_item<1> ndi) [[intel::reqd_sub_group_size(SG_SIZE)]] { + bool success = true; + + auto gid = ndi.get_global_id(0); + auto init = static_cast(gid) % + (1 << (std::min(sizeof(MemType), 4) * 8 - 2)); + auto *global_mem = + acc.template get_multi_ptr() + .get_decorated(); + auto *group_mem = + global_mem + ndi.get_group(0) * wg_size * ELEMS_PER_WI; + + auto *scratch_mem = &scratch[0]; + if constexpr (scope == memory_scope::sub_group) { + auto sg = ndi.get_sub_group(); + group_mem += sg.get_group_id() * SG_SIZE * ELEMS_PER_WI; + scratch_mem += sg.get_group_id() * SG_SIZE * scratch_mem_per_wi; + } + auto g = [&]() { + if constexpr (scope == memory_scope::sub_group) + return ndi.get_sub_group(); + else + return ndi.get_group(); + }(); + + auto group_lid = g.get_local_linear_id(); + + constexpr bool blocked = + data_placement == + sycl_ext::group_algorithm_data_placement::blocked; + + auto idx = [&](int elem_idx) { + if constexpr (blocked) { + return group_lid * ELEMS_PER_WI + elem_idx; + } else { + return group_lid + g.get_local_range().size() * elem_idx; + } + }; + + SpanType arr[ELEMS_PER_WI]; + auto s = span(arr, ELEMS_PER_WI); + + auto data_placement_prop = + sycl_ext::property::data_placement; + auto props = sycl_ext::properties(data_placement_prop); + + { + for (int i = 0; i < ELEMS_PER_WI; ++i) + group_mem[idx(i)] = init - i; + + group_load(g, group_mem, s, props); + + for (int i = 0; i < ELEMS_PER_WI; ++i) + success &= (s[i] == init 
- i); + } + + { + for (int i = 0; i < ELEMS_PER_WI; ++i) + s[i] = init - i + 1; + + group_store(g, s, group_mem, props); + + for (int i = 0; i < ELEMS_PER_WI; ++i) + success &= group_mem[idx(i)] == init - i + 1; + } + + sycl_ext::group_with_scratchpad gh{ + g, span(scratch_mem, + g.get_local_range().size() * scratch_mem_per_wi)}; + + { + for (int i = 0; i < ELEMS_PER_WI; ++i) + group_mem[idx(i)] = init - i; + + group_load(gh, group_mem, s, props); + + for (int i = 0; i < ELEMS_PER_WI; ++i) + success &= (s[i] == init - i); + } + + { + for (int i = 0; i < ELEMS_PER_WI; ++i) + s[i] = init - i + 1; + + group_store(gh, s, group_mem, props); + + for (int i = 0; i < ELEMS_PER_WI; ++i) + success &= group_mem[idx(i)] == init - i + 1; + } + + res[gid] = success; + }); + }); + + host_accessor res_acc{result}; + bool success = + std::all_of(res_acc.begin(), res_acc.end(), [](bool r) { return r; }); + if constexpr (true) + assert(success); + else + std::cout << "Test success: " << std::boolalpha << success << std::endl; +} + +struct S1 { + S1() = default; + S1(int i) : i(i) {} + operator int() { return i; } + void operator+=(int inc) { i += inc; } + int i = 0; + int j = 2; +}; +static_assert(sizeof(S1) == 8); + +struct S2 { + S2() = default; + S2(int i) : i(i) {} + operator int() { return i; } + void operator+=(int inc) { i += inc; } + int i = 0; + int j = 2; + int k = 3; +}; +static_assert(sizeof(S2) == 12); + +struct __attribute__((packed)) S3 { + S3() = default; + S3(int i) : i(i) {} + operator int() { return i; } + void operator+=(int inc) { i += inc; } + int i = 42; + int j = 2; + char k = 3; +}; +static_assert(sizeof(S3) == 9); + +template +void test_type_combo(size_t wg_size) { + constexpr auto blocked = sycl_ext::group_algorithm_data_placement::blocked; + constexpr auto striped = sycl_ext::group_algorithm_data_placement::striped; + constexpr auto sg = memory_scope::sub_group; + constexpr auto wg = memory_scope::work_group; + + test(wg_size); + test(wg_size); + test(wg_size); + test(wg_size); +} + +int main() { +#ifdef SINGLE + using T = char; + test(SG_SIZE * 3); +#else + size_t wg_sizes[] = {SG_SIZE / 2, SG_SIZE, SG_SIZE * 3 / 2, SG_SIZE * 3}; + for (auto wg_size : wg_sizes) { + std::cout << "WG_SIZE: " << wg_size << std::endl; + constexpr int sizes[] = {1, 2, 3, 4, 7, 8, 16, 17, 31, 32, 64, 67}; + sycl::detail::loop([&](auto i) { + constexpr int size = sizes[i]; + test_type_combo(wg_size); + test_type_combo(wg_size); + test_type_combo(wg_size); + test_type_combo(wg_size); + test_type_combo(wg_size); + test_type_combo(wg_size); + + // Disabled due to an IGC bug resulting in miscompilations. + // test_type_combo(wg_size); + }); + } +#endif + + return 0; +} diff --git a/sycl/test/check_device_code/group_load_store.cpp b/sycl/test/check_device_code/group_load_store.cpp new file mode 100644 index 0000000000000..80c960968cd6a --- /dev/null +++ b/sycl/test/check_device_code/group_load_store.cpp @@ -0,0 +1,129 @@ +// DEFINE: %{fcflags} = --check-prefixes %if windows %{CHECK,CHECK-WIN%} %else %{CHECK,CHECK-LIN%} + +// TODO: Remove -opaque-pointers when they are on by default. 
+// We want to use them right away for two reasons: +// 1) Less maintenance during the future switch +// 2) Generated LLVM IR is nicer and more readable in this mode +// RUN: %clangxx -fsycl-device-only -S -emit-llvm -Xclang -opaque-pointers -fno-sycl-instrument-device-code -o - %s | FileCheck %s %{fcflags} +#include + +using namespace sycl; + +using namespace sycl::ext::oneapi::experimental; + +using empty_props = decltype(properties()); +using full_sg_props = + decltype(properties(sycl::ext::oneapi::experimental::property::full_sg)); + +template +using plain_global_ptr = typename sycl::detail::DecoratedType< + T, access::address_space::global_space>::type *; + +template +using plain_local_ptr = typename sycl::detail::DecoratedType< + T, access::address_space::local_space>::type *; + +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sub_group, plain_global_ptr, int, full_sg_props>(sub_group, + plain_global_ptr, + int &, full_sg_props); +// CHECK-LABEL: define {{.*}}group_load +// CHECK-NEXT: entry: +// CHECK-NEXT: call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) +// CHECK-NEXT: [[BLOCK_LOAD:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef [[IN_PTR:%.*]]) +// CHECK-NEXT: store i32 [[BLOCK_LOAD]], ptr addrspace(4) [[OUT:%.*]] +// CHECK-NEXT: ret void + +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sub_group, global_ptr, int, full_sg_props>( + sub_group, global_ptr, int &, full_sg_props); +// CHECK-LABEL: define {{.*}}group_load +// CHECK-NEXT: entry: +// sycl::multi_ptr is passed via "ptr noundef +// byval(%"class.sycl::_V1::multi_ptr") align 8 %in_ptr" +// CHECK-NEXT: [[IN_PTR_LOAD:%.*]] = load i64 +// CHECK-NEXT: [[IN_PTR:%.*]] = inttoptr i64 [[IN_PTR_LOAD]] to ptr addrspace(1) +// CHECK-NEXT: call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) +// CHECK-NEXT: [[BLOCK_LOAD:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef [[IN_PTR]]) +// CHECK-NEXT: store i32 [[BLOCK_LOAD]], ptr addrspace(4) [[OUT:%.*]] +// CHECK-NEXT: ret void + +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sub_group, plain_local_ptr, int, full_sg_props>(sub_group, + plain_local_ptr, + int &, full_sg_props); +// CHECK-LABEL: define {{.*}}group_load +// CHECK-NEXT: entry: +// CHECK-NEXT: call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) +// CHECK-NEXT: [[SG_LID:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId +// CHECK-NEXT: [[SG_LID_64:%.*]] = sext i32 [[SG_LID]] to i64 +// CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[IN_PTR:%.*]], i64 [[SG_LID_64]] +// CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) [[GEP]] +// CHECK-NEXT: store i32 [[LD]], ptr addrspace(4) [[OUT:%.*]] +// CHECK-NEXT: call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) +// CHECK-NEXT: ret void + +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sub_group, int *, int, full_sg_props>(sub_group, int *, int &, + full_sg_props); +// CHECK-LABEL: define {{.*}}group_load +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TRY_CAST:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef [[IN_PTR:%.*]], i32 noundef 5) +// 
CHECK-NEXT: [[COND:%.*]] = icmp eq ptr addrspace(1) [[TRY_CAST]], null +// CHECK-NEXT: br i1 [[COND]], label %[[GENERIC:.*]], label %[[GLOBAL:.*]] +// CHECK-EMPTY: +// CHECK-NEXT: [[GLOBAL]]: +// CHECK-NEXT: call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) +// CHECK-NEXT: [[BLOCK_LOAD:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[TRY_CAST]]) +// CHECK-NEXT: store i32 [[BLOCK_LOAD]], ptr addrspace(4) [[OUT:%.*]] +// CHECK-NEXT: br label %[[RET:.*]] +// CHECK-EMPTY: +// CHECK-NEXT: [[GENERIC]]: +// CHECK-NEXT: call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) +// CHECK-NEXT: [[SG_LID:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId +// CHECK-NEXT: [[SG_LID_64:%.*]] = sext i32 [[SG_LID]] to i64 +// CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[IN_PTR]], i64 [[SG_LID_64]] +// CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(4) [[GEP]] +// CHECK-NEXT: store i32 [[LD]], ptr addrspace(4) [[OUT]] +// CHECK-NEXT: call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) +// CHECK-NEXT: br label %[[RET]] +// CHECK-EMPTY: +// CHECK-NEXT: [[RET]]: +// CHECK-NEXT: ret void + +using full_sg_striped_props = decltype(properties( + sycl::ext::oneapi::experimental::property::full_sg, + sycl::ext::oneapi::experimental::property::data_placement< + group_algorithm_data_placement::striped>)); +template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< + sub_group, plain_global_ptr, long long, 2, + full_sg_striped_props>(sub_group, plain_global_ptr, + vec &, full_sg_striped_props); +// CHECK-LABEL: define {{.*}}group_load +// CHECK-NEXT: entry: +// CHECK-NEXT: call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) +// CHECK-LIN-NEXT: [[BLOCK_LOAD:%.*]] = tail call spir_func noundef <2 x i64> @_Z30__spirv_SubgroupBlockReadINTELIDv2_mET_PU3AS1Km(ptr addrspace(1) noundef [[IN_PTR:%.*]]) +// CHECK-WIN-NEXT: [[BLOCK_LOAD:%.*]] = tail call spir_func noundef <2 x i64> @_Z30__spirv_SubgroupBlockReadINTELIDv2_yET_PU3AS1Ky(ptr addrspace(1) noundef [[IN_PTR:%.*]]) +// CHECK-NEXT: [[EXTRACT_0:%.*]] = extractelement <2 x i64> [[BLOCK_LOAD]], i64 0 +// CHECK-NEXT: store i64 [[EXTRACT_0]], ptr addrspace(4) [[OUT_PTR:%[^,]*]] +// CHECK-NEXT: [[EXTRACT_1:%.*]] = extractelement <2 x i64> [[BLOCK_LOAD]], i64 1 +// CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr addrspace(4) [[OUT_PTR]], i64 1 +// CHECK-NEXT: store i64 [[EXTRACT_1]], ptr addrspace(4) [[GEP]] +// CHECK-NEXT: ret void + +template SYCL_EXTERNAL void +sycl::ext::oneapi::experimental::group_load, + int, 16, full_sg_striped_props>( + sub_group, plain_global_ptr, vec &, full_sg_striped_props); +// CHECK-LABEL: define {{.*}}group_load +// CHECK-NEXT: entry: +// CHECK-NEXT: call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) +// CHECK-NEXT: [[SG_SIZE:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupMaxSize +// CHECK-NEXT: [[SG_SIZE_64:%.*]] = zext i32 [[SG_SIZE]] to i64 +// CHECK-NEXT: [[BYTES_PER_SG:%.*]] = shl nuw nsw i64 [[SG_SIZE_64]], 3 +// CHECK-NEXT: [[BLOCK_LOAD_0:%.*]] = tail call spir_func noundef <8 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv8_jET_PU3AS1Kj(ptr addrspace(1) noundef [[IN_PTR:%.*]]) +// CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) 
[[IN_PTR]], i64 [[BYTES_PER_SG]] +// CHECK-NEXT: [[BLOCK_LOAD_1:%.*]] = tail call spir_func noundef <8 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv8_jET_PU3AS1Kj(ptr addrspace(1) noundef [[GEP]]) +// 16 * (extract/gep/store) - 1 (0-th gep) +// CHECK-COUNT-47: {{extractelement|getelementptr|store i32}} +// CHECK-NEXT: ret void diff --git a/sycl/test/lit.cfg.py b/sycl/test/lit.cfg.py index 3ca752c46b182..8ab6e5e74cc4d 100644 --- a/sycl/test/lit.cfg.py +++ b/sycl/test/lit.cfg.py @@ -42,6 +42,9 @@ # Propagate some variables from the host environment. llvm_config.with_system_environment(['PATH', 'OCL_ICD_FILENAMES', 'SYCL_DEVICE_ALLOWLIST', 'SYCL_CONFIG_FILE_NAME']) +# Allow expanding substitutions that are based on other substitutions +config.recursiveExpansionLimit = 10 + config.substitutions.append(('%python', '"%s"' % (sys.executable))) # Propagate extra environment variables