From dea4506e9bed8dba9046f25a38055b85d7e65e84 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Wed, 17 May 2023 03:36:23 +0800 Subject: [PATCH 01/15] Add overloaded version of __builtin_amdgcn_readfirstlane() --- include/ck/utility/common_header.hpp | 1 + include/ck/utility/readfirstlane.hpp | 79 ++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 include/ck/utility/readfirstlane.hpp diff --git a/include/ck/utility/common_header.hpp b/include/ck/utility/common_header.hpp index 1378bbe448e..3f3ef88affb 100644 --- a/include/ck/utility/common_header.hpp +++ b/include/ck/utility/common_header.hpp @@ -40,6 +40,7 @@ #include "ck/utility/amd_address_space.hpp" #include "ck/utility/static_buffer.hpp" #include "ck/utility/dynamic_buffer.hpp" +#include "ck/utility/readfirstlane.hpp" // TODO: remove this #if CK_USE_AMD_INLINE_ASM diff --git a/include/ck/utility/readfirstlane.hpp b/include/ck/utility/readfirstlane.hpp new file mode 100644 index 00000000000..9b521db4a27 --- /dev/null +++ b/include/ck/utility/readfirstlane.hpp @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/ck.hpp" +#include "ck/utility/functional2.hpp" +#include "ck/utility/math.hpp" + +#include +#include + +namespace ck { +namespace detail { + +template +struct get_signed_int; + +template <> +struct get_signed_int<1> +{ + using type = std::int8_t; +}; + +template <> +struct get_signed_int<2> +{ + using type = std::int16_t; +}; + +template <> +struct get_signed_int<4> +{ + using type = std::int32_t; +}; + +template +using get_signed_int_t = typename get_signed_int::type; + +} // namespace detail + +__device__ std::int32_t readfirstlane(std::int32_t value) +{ + return __builtin_amdgcn_readfirstlane(value); +} + +template < + typename Object, + typename = std::enable_if_t && std::is_trivially_copyable_v>> +__device__ auto readfirstlane(const Object& obj) +{ + static constexpr std::size_t SgprSize = 4; + static constexpr std::size_t ObjectSize = sizeof(Object); + + using Sgpr = detail::get_signed_int_t; + + alignas(Object) unsigned char memory[ObjectSize]; + + const auto* from = reinterpret_cast(&obj); + static_for<0, ObjectSize, SgprSize>{}([&](auto offset) { + *reinterpret_cast(memory + offset) = + readfirstlane(*reinterpret_cast(from + offset)); + }); + + static constexpr std::size_t RemainedSize = ObjectSize % SgprSize; + if constexpr(0 < RemainedSize) + { + using Carrier = detail::get_signed_int_t; + + constexpr std::size_t offset = SgprSize * math::integer_divide_floor(ObjectSize, SgprSize); + + *reinterpret_cast(memory + offset) = + readfirstlane(*reinterpret_cast(from + offset)); + } + + return *reinterpret_cast(memory); +} + +} // namespace ck From 55a81942b4879984a938dfe98af7eb4d39eeaee1 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Wed, 17 May 2023 03:46:28 +0800 Subject: [PATCH 02/15] Remove 'static' specifiers --- include/ck/utility/readfirstlane.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/utility/readfirstlane.hpp b/include/ck/utility/readfirstlane.hpp index 9b521db4a27..43f4771f520 100644 --- a/include/ck/utility/readfirstlane.hpp +++ b/include/ck/utility/readfirstlane.hpp @@ -49,8 +49,8 @@ template < typename = std::enable_if_t && std::is_trivially_copyable_v>> __device__ auto readfirstlane(const Object& obj) { - static constexpr std::size_t SgprSize = 4; - static constexpr std::size_t ObjectSize = sizeof(Object); + constexpr std::size_t SgprSize = 4; + constexpr std::size_t ObjectSize = sizeof(Object); using Sgpr = detail::get_signed_int_t; From a8d4294c813c44803717c41352f7f2649b70cd72 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Wed, 17 May 2023 03:50:28 +0800 Subject: [PATCH 03/15] Remove more 'static' specifier --- include/ck/utility/readfirstlane.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/utility/readfirstlane.hpp b/include/ck/utility/readfirstlane.hpp index 43f4771f520..2aeffbe731d 100644 --- a/include/ck/utility/readfirstlane.hpp +++ b/include/ck/utility/readfirstlane.hpp @@ -62,7 +62,7 @@ __device__ auto readfirstlane(const Object& obj) readfirstlane(*reinterpret_cast(from + offset)); }); - static constexpr std::size_t RemainedSize = ObjectSize % SgprSize; + constexpr std::size_t RemainedSize = ObjectSize % SgprSize; if constexpr(0 < RemainedSize) { using Carrier = detail::get_signed_int_t; From a609bfaad2f1217f4b798ca11ebc048e0d48c3d1 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Wed, 17 May 2023 06:18:26 +0800 Subject: [PATCH 04/15] Replace unsigne char by std::byte --- include/ck/utility/readfirstlane.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/ck/utility/readfirstlane.hpp b/include/ck/utility/readfirstlane.hpp index 2aeffbe731d..c00045e717d 100644 --- a/include/ck/utility/readfirstlane.hpp +++ b/include/ck/utility/readfirstlane.hpp @@ -7,6 +7,7 @@ #include "ck/utility/functional2.hpp" #include "ck/utility/math.hpp" +#include #include #include @@ -54,9 +55,9 @@ __device__ auto readfirstlane(const Object& obj) using Sgpr = detail::get_signed_int_t; - alignas(Object) unsigned char memory[ObjectSize]; + alignas(Object) std::byte memory[ObjectSize]; - const auto* from = reinterpret_cast(&obj); + const auto* from = reinterpret_cast(&obj); static_for<0, ObjectSize, SgprSize>{}([&](auto offset) { *reinterpret_cast(memory + offset) = readfirstlane(*reinterpret_cast(from + offset)); From fb51f338d706b4bad2e84d230e0ecff43752e649 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Wed, 17 May 2023 06:21:59 +0800 Subject: [PATCH 05/15] Add 'const' specifier to never changing variable --- include/ck/utility/readfirstlane.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ck/utility/readfirstlane.hpp b/include/ck/utility/readfirstlane.hpp index c00045e717d..a993be62375 100644 --- a/include/ck/utility/readfirstlane.hpp +++ b/include/ck/utility/readfirstlane.hpp @@ -57,7 +57,7 @@ __device__ auto readfirstlane(const Object& obj) alignas(Object) std::byte memory[ObjectSize]; - const auto* from = reinterpret_cast(&obj); + auto* const from = reinterpret_cast(&obj); static_for<0, ObjectSize, SgprSize>{}([&](auto offset) { *reinterpret_cast(memory + offset) = readfirstlane(*reinterpret_cast(from + offset)); From 8b7ea41ddb6c68fa20a29c4c9a6efd60a55547c9 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Wed, 17 May 2023 15:19:05 +0800 Subject: [PATCH 06/15] Add 'inline' specifier to funcion definition --- include/ck/utility/readfirstlane.hpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/include/ck/utility/readfirstlane.hpp b/include/ck/utility/readfirstlane.hpp index a993be62375..6f3e608d35e 100644 --- a/include/ck/utility/readfirstlane.hpp +++ b/include/ck/utility/readfirstlane.hpp @@ -40,7 +40,7 @@ using get_signed_int_t = typename get_signed_int::type; } // namespace detail -__device__ std::int32_t readfirstlane(std::int32_t value) +__device__ inline std::int32_t readfirstlane(std::int32_t value) { return __builtin_amdgcn_readfirstlane(value); } @@ -55,12 +55,12 @@ __device__ auto readfirstlane(const Object& obj) using Sgpr = detail::get_signed_int_t; - alignas(Object) std::byte memory[ObjectSize]; + alignas(Object) std::byte to_obj[ObjectSize]; - auto* const from = reinterpret_cast(&obj); + auto* const from_obj = reinterpret_cast(&obj); static_for<0, ObjectSize, SgprSize>{}([&](auto offset) { - *reinterpret_cast(memory + offset) = - readfirstlane(*reinterpret_cast(from + offset)); + *reinterpret_cast(to_obj + offset) = + readfirstlane(*reinterpret_cast(from_obj + offset)); }); constexpr std::size_t RemainedSize = ObjectSize % SgprSize; @@ -70,11 +70,13 @@ __device__ auto readfirstlane(const Object& obj) constexpr std::size_t offset = SgprSize * math::integer_divide_floor(ObjectSize, SgprSize); - *reinterpret_cast(memory + offset) = - readfirstlane(*reinterpret_cast(from + offset)); + *reinterpret_cast(to_obj + offset) = + readfirstlane(*reinterpret_cast(from_obj + offset)); } - return *reinterpret_cast(memory); + /// NOTE: Implicitly start object lifetime. It's better to use + // std::start_lifetime_at() in this scenario + return *reinterpret_cast(to_obj); } } // namespace ck From ccebca5e4d56b1c14907f4c8a4417d60b8536b99 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Thu, 18 May 2023 08:29:06 +0800 Subject: [PATCH 07/15] Fix wrong boundar calculation logic --- include/ck/utility/readfirstlane.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/ck/utility/readfirstlane.hpp b/include/ck/utility/readfirstlane.hpp index 6f3e608d35e..0e333761e2b 100644 --- a/include/ck/utility/readfirstlane.hpp +++ b/include/ck/utility/readfirstlane.hpp @@ -58,24 +58,24 @@ __device__ auto readfirstlane(const Object& obj) alignas(Object) std::byte to_obj[ObjectSize]; auto* const from_obj = reinterpret_cast(&obj); - static_for<0, ObjectSize, SgprSize>{}([&](auto offset) { + + constexpr std::size_t RemainedSize = ObjectSize % SgprSize; + constexpr std::size_t CompleteSgprCopyBoundary = ObjectSize - RemainedSize; + static_for<0, CompleteSgprCopyBoundary, SgprSize>{}([&](auto offset) { *reinterpret_cast(to_obj + offset) = readfirstlane(*reinterpret_cast(from_obj + offset)); }); - constexpr std::size_t RemainedSize = ObjectSize % SgprSize; if constexpr(0 < RemainedSize) { using Carrier = detail::get_signed_int_t; - constexpr std::size_t offset = SgprSize * math::integer_divide_floor(ObjectSize, SgprSize); - - *reinterpret_cast(to_obj + offset) = - readfirstlane(*reinterpret_cast(from_obj + offset)); + *reinterpret_cast(to_obj + CompleteSgprCopyBoundary) = + readfirstlane(*reinterpret_cast(from_obj + CompleteSgprCopyBoundary)); } /// NOTE: Implicitly start object lifetime. It's better to use - // std::start_lifetime_at() in this scenario + /// std::start_lifetime_at() in this scenario return *reinterpret_cast(to_obj); } From 48feb2867da8d024bad9d1fc201b1fd1b7c6a698 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Mon, 29 May 2023 14:21:31 +0800 Subject: [PATCH 08/15] Rename type trait --- include/ck/utility/readfirstlane.hpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/ck/utility/readfirstlane.hpp b/include/ck/utility/readfirstlane.hpp index 0e333761e2b..123dce6c5f0 100644 --- a/include/ck/utility/readfirstlane.hpp +++ b/include/ck/utility/readfirstlane.hpp @@ -15,28 +15,28 @@ namespace ck { namespace detail { template -struct get_signed_int; +struct get_unsigned_int; template <> -struct get_signed_int<1> +struct get_unsigned_int<1> { - using type = std::int8_t; + using type = std::uint8_t; }; template <> -struct get_signed_int<2> +struct get_unsigned_int<2> { - using type = std::int16_t; + using type = std::uint16_t; }; template <> -struct get_signed_int<4> +struct get_unsigned_int<4> { - using type = std::int32_t; + using type = std::uint32_t; }; template -using get_signed_int_t = typename get_signed_int::type; +using get_unsigned_int_t = typename get_unsigned_int::type; } // namespace detail @@ -53,7 +53,7 @@ __device__ auto readfirstlane(const Object& obj) constexpr std::size_t SgprSize = 4; constexpr std::size_t ObjectSize = sizeof(Object); - using Sgpr = detail::get_signed_int_t; + using Sgpr = detail::get_unsigned_int_t; alignas(Object) std::byte to_obj[ObjectSize]; @@ -68,7 +68,7 @@ __device__ auto readfirstlane(const Object& obj) if constexpr(0 < RemainedSize) { - using Carrier = detail::get_signed_int_t; + using Carrier = detail::get_unsigned_int_t; *reinterpret_cast(to_obj + CompleteSgprCopyBoundary) = readfirstlane(*reinterpret_cast(from_obj + CompleteSgprCopyBoundary)); From fc3df3ba3eda9691b5d959138a14c1b3d18f365b Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Mon, 29 May 2023 14:23:05 +0800 Subject: [PATCH 09/15] Remove std:: qualifier from standard types --- include/ck/utility/readfirstlane.hpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/ck/utility/readfirstlane.hpp b/include/ck/utility/readfirstlane.hpp index 123dce6c5f0..10dd2142b9c 100644 --- a/include/ck/utility/readfirstlane.hpp +++ b/include/ck/utility/readfirstlane.hpp @@ -14,33 +14,33 @@ namespace ck { namespace detail { -template +template struct get_unsigned_int; template <> struct get_unsigned_int<1> { - using type = std::uint8_t; + using type = uint8_t; }; template <> struct get_unsigned_int<2> { - using type = std::uint16_t; + using type = uint16_t; }; template <> struct get_unsigned_int<4> { - using type = std::uint32_t; + using type = uint32_t; }; -template +template using get_unsigned_int_t = typename get_unsigned_int::type; } // namespace detail -__device__ inline std::int32_t readfirstlane(std::int32_t value) +__device__ inline int32_t readfirstlane(int32_t value) { return __builtin_amdgcn_readfirstlane(value); } @@ -50,8 +50,8 @@ template < typename = std::enable_if_t && std::is_trivially_copyable_v>> __device__ auto readfirstlane(const Object& obj) { - constexpr std::size_t SgprSize = 4; - constexpr std::size_t ObjectSize = sizeof(Object); + constexpr size_t SgprSize = 4; + constexpr size_t ObjectSize = sizeof(Object); using Sgpr = detail::get_unsigned_int_t; @@ -59,8 +59,8 @@ __device__ auto readfirstlane(const Object& obj) auto* const from_obj = reinterpret_cast(&obj); - constexpr std::size_t RemainedSize = ObjectSize % SgprSize; - constexpr std::size_t CompleteSgprCopyBoundary = ObjectSize - RemainedSize; + constexpr size_t RemainedSize = ObjectSize % SgprSize; + constexpr size_t CompleteSgprCopyBoundary = ObjectSize - RemainedSize; static_for<0, CompleteSgprCopyBoundary, SgprSize>{}([&](auto offset) { *reinterpret_cast(to_obj + offset) = readfirstlane(*reinterpret_cast(from_obj + offset)); From 813d40678da2ccc12e1a5fe0166de93f2888ad8b Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Mon, 29 May 2023 14:24:38 +0800 Subject: [PATCH 10/15] Replace 'size_t' by 'unsigned' --- include/ck/utility/readfirstlane.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/ck/utility/readfirstlane.hpp b/include/ck/utility/readfirstlane.hpp index 10dd2142b9c..c676e505efb 100644 --- a/include/ck/utility/readfirstlane.hpp +++ b/include/ck/utility/readfirstlane.hpp @@ -14,7 +14,7 @@ namespace ck { namespace detail { -template +template struct get_unsigned_int; template <> @@ -35,7 +35,7 @@ struct get_unsigned_int<4> using type = uint32_t; }; -template +template using get_unsigned_int_t = typename get_unsigned_int::type; } // namespace detail @@ -50,8 +50,8 @@ template < typename = std::enable_if_t && std::is_trivially_copyable_v>> __device__ auto readfirstlane(const Object& obj) { - constexpr size_t SgprSize = 4; - constexpr size_t ObjectSize = sizeof(Object); + constexpr unsigned SgprSize = 4; + constexpr unsigned ObjectSize = sizeof(Object); using Sgpr = detail::get_unsigned_int_t; @@ -59,8 +59,8 @@ __device__ auto readfirstlane(const Object& obj) auto* const from_obj = reinterpret_cast(&obj); - constexpr size_t RemainedSize = ObjectSize % SgprSize; - constexpr size_t CompleteSgprCopyBoundary = ObjectSize - RemainedSize; + constexpr unsigned RemainedSize = ObjectSize % SgprSize; + constexpr unsigned CompleteSgprCopyBoundary = ObjectSize - RemainedSize; static_for<0, CompleteSgprCopyBoundary, SgprSize>{}([&](auto offset) { *reinterpret_cast(to_obj + offset) = readfirstlane(*reinterpret_cast(from_obj + offset)); @@ -74,8 +74,8 @@ __device__ auto readfirstlane(const Object& obj) readfirstlane(*reinterpret_cast(from_obj + CompleteSgprCopyBoundary)); } - /// NOTE: Implicitly start object lifetime. It's better to use - /// std::start_lifetime_at() in this scenario + /// NOTE: Implicitly start object lifetime. It's better to use std::start_lifetime_at() in this + /// scenario return *reinterpret_cast(to_obj); } From 0840e01628cbb15b32c22382104db6413c1ebd02 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Mon, 29 May 2023 14:40:14 +0800 Subject: [PATCH 11/15] Use type alias to hint usage --- include/ck/utility/readfirstlane.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/ck/utility/readfirstlane.hpp b/include/ck/utility/readfirstlane.hpp index c676e505efb..defeeb76199 100644 --- a/include/ck/utility/readfirstlane.hpp +++ b/include/ck/utility/readfirstlane.hpp @@ -50,8 +50,9 @@ template < typename = std::enable_if_t && std::is_trivially_copyable_v>> __device__ auto readfirstlane(const Object& obj) { - constexpr unsigned SgprSize = 4; - constexpr unsigned ObjectSize = sizeof(Object); + using Size = unsigned; + constexpr Size SgprSize = 4; + constexpr Size ObjectSize = sizeof(Object); using Sgpr = detail::get_unsigned_int_t; @@ -59,8 +60,8 @@ __device__ auto readfirstlane(const Object& obj) auto* const from_obj = reinterpret_cast(&obj); - constexpr unsigned RemainedSize = ObjectSize % SgprSize; - constexpr unsigned CompleteSgprCopyBoundary = ObjectSize - RemainedSize; + constexpr Size RemainedSize = ObjectSize % SgprSize; + constexpr Size CompleteSgprCopyBoundary = ObjectSize - RemainedSize; static_for<0, CompleteSgprCopyBoundary, SgprSize>{}([&](auto offset) { *reinterpret_cast(to_obj + offset) = readfirstlane(*reinterpret_cast(from_obj + offset)); From ad8bc60bc8674f82cfd59ec60e9f8cedb1ff15fb Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Mon, 29 May 2023 14:41:32 +0800 Subject: [PATCH 12/15] Replace static_for<> by ordinary 'for' loop --- include/ck/utility/readfirstlane.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/ck/utility/readfirstlane.hpp b/include/ck/utility/readfirstlane.hpp index defeeb76199..f122d5fd8d3 100644 --- a/include/ck/utility/readfirstlane.hpp +++ b/include/ck/utility/readfirstlane.hpp @@ -62,10 +62,11 @@ __device__ auto readfirstlane(const Object& obj) constexpr Size RemainedSize = ObjectSize % SgprSize; constexpr Size CompleteSgprCopyBoundary = ObjectSize - RemainedSize; - static_for<0, CompleteSgprCopyBoundary, SgprSize>{}([&](auto offset) { + for(Size offset = 0; offset < CompleteSgprCopyBoundary; offset += SgprSize) + { *reinterpret_cast(to_obj + offset) = readfirstlane(*reinterpret_cast(from_obj + offset)); - }); + } if constexpr(0 < RemainedSize) { From e698fdb414fc45e373cd75b7024bbd8348a8acd1 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Wed, 31 May 2023 21:01:11 +0800 Subject: [PATCH 13/15] Rename readfirstlane() to amd_wave_read_first_lane() --- include/ck/utility/readfirstlane.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/ck/utility/readfirstlane.hpp b/include/ck/utility/readfirstlane.hpp index f122d5fd8d3..f92a5ebcaab 100644 --- a/include/ck/utility/readfirstlane.hpp +++ b/include/ck/utility/readfirstlane.hpp @@ -40,7 +40,7 @@ using get_unsigned_int_t = typename get_unsigned_int::type; } // namespace detail -__device__ inline int32_t readfirstlane(int32_t value) +__device__ inline int32_t amd_wave_read_first_lane(int32_t value) { return __builtin_amdgcn_readfirstlane(value); } @@ -48,7 +48,7 @@ __device__ inline int32_t readfirstlane(int32_t value) template < typename Object, typename = std::enable_if_t && std::is_trivially_copyable_v>> -__device__ auto readfirstlane(const Object& obj) +__device__ auto amd_wave_read_first_lane(const Object& obj) { using Size = unsigned; constexpr Size SgprSize = 4; @@ -65,7 +65,7 @@ __device__ auto readfirstlane(const Object& obj) for(Size offset = 0; offset < CompleteSgprCopyBoundary; offset += SgprSize) { *reinterpret_cast(to_obj + offset) = - readfirstlane(*reinterpret_cast(from_obj + offset)); + amd_wave_read_first_lane(*reinterpret_cast(from_obj + offset)); } if constexpr(0 < RemainedSize) @@ -73,7 +73,7 @@ __device__ auto readfirstlane(const Object& obj) using Carrier = detail::get_unsigned_int_t; *reinterpret_cast(to_obj + CompleteSgprCopyBoundary) = - readfirstlane(*reinterpret_cast(from_obj + CompleteSgprCopyBoundary)); + amd_wave_read_first_lane(*reinterpret_cast(from_obj + CompleteSgprCopyBoundary)); } /// NOTE: Implicitly start object lifetime. It's better to use std::start_lifetime_at() in this From 232972e4a605760389366a31e5c9252f6dccfb43 Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Wed, 31 May 2023 21:02:25 +0800 Subject: [PATCH 14/15] Rename file readfirstlance.hpp as amd_wave_read_first_lane.hpp --- .../utility/{readfirstlane.hpp => amd_wave_read_first_lane.hpp} | 0 include/ck/utility/common_header.hpp | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename include/ck/utility/{readfirstlane.hpp => amd_wave_read_first_lane.hpp} (100%) diff --git a/include/ck/utility/readfirstlane.hpp b/include/ck/utility/amd_wave_read_first_lane.hpp similarity index 100% rename from include/ck/utility/readfirstlane.hpp rename to include/ck/utility/amd_wave_read_first_lane.hpp diff --git a/include/ck/utility/common_header.hpp b/include/ck/utility/common_header.hpp index 3f3ef88affb..8da87c8766e 100644 --- a/include/ck/utility/common_header.hpp +++ b/include/ck/utility/common_header.hpp @@ -33,6 +33,7 @@ #include "ck/utility/debug.hpp" #include "ck/utility/amd_buffer_addressing.hpp" +#include "ck/utility/amd_wave_read_first_lane.hpp" #include "ck/utility/generic_memory_space_atomic.hpp" #include "ck/utility/get_id.hpp" #include "ck/utility/thread_group.hpp" @@ -40,7 +41,6 @@ #include "ck/utility/amd_address_space.hpp" #include "ck/utility/static_buffer.hpp" #include "ck/utility/dynamic_buffer.hpp" -#include "ck/utility/readfirstlane.hpp" // TODO: remove this #if CK_USE_AMD_INLINE_ASM From 1001c73115f09ac7e0c5eb3d58b36c5f9a0a6ade Mon Sep 17 00:00:00 2001 From: "Po-Yen, Chen" Date: Wed, 31 May 2023 21:09:12 +0800 Subject: [PATCH 15/15] Reorder statements --- include/ck/utility/amd_wave_read_first_lane.hpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/ck/utility/amd_wave_read_first_lane.hpp b/include/ck/utility/amd_wave_read_first_lane.hpp index f92a5ebcaab..4652ce7a747 100644 --- a/include/ck/utility/amd_wave_read_first_lane.hpp +++ b/include/ck/utility/amd_wave_read_first_lane.hpp @@ -54,16 +54,15 @@ __device__ auto amd_wave_read_first_lane(const Object& obj) constexpr Size SgprSize = 4; constexpr Size ObjectSize = sizeof(Object); - using Sgpr = detail::get_unsigned_int_t; - - alignas(Object) std::byte to_obj[ObjectSize]; - auto* const from_obj = reinterpret_cast(&obj); + alignas(Object) std::byte to_obj[ObjectSize]; constexpr Size RemainedSize = ObjectSize % SgprSize; constexpr Size CompleteSgprCopyBoundary = ObjectSize - RemainedSize; for(Size offset = 0; offset < CompleteSgprCopyBoundary; offset += SgprSize) { + using Sgpr = detail::get_unsigned_int_t; + *reinterpret_cast(to_obj + offset) = amd_wave_read_first_lane(*reinterpret_cast(from_obj + offset)); } @@ -72,8 +71,8 @@ __device__ auto amd_wave_read_first_lane(const Object& obj) { using Carrier = detail::get_unsigned_int_t; - *reinterpret_cast(to_obj + CompleteSgprCopyBoundary) = - amd_wave_read_first_lane(*reinterpret_cast(from_obj + CompleteSgprCopyBoundary)); + *reinterpret_cast(to_obj + CompleteSgprCopyBoundary) = amd_wave_read_first_lane( + *reinterpret_cast(from_obj + CompleteSgprCopyBoundary)); } /// NOTE: Implicitly start object lifetime. It's better to use std::start_lifetime_at() in this