Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/ck/utility/common_header.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "ck/utility/amd_address_space.hpp"
#include "ck/utility/static_buffer.hpp"
#include "ck/utility/dynamic_buffer.hpp"
#include "ck/utility/readfirstlane.hpp"

// TODO: remove this
#if CK_USE_AMD_INLINE_ASM
Expand Down
82 changes: 82 additions & 0 deletions include/ck/utility/readfirstlane.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/ck.hpp"
#include "ck/utility/functional2.hpp"
#include "ck/utility/math.hpp"

#include <cstddef>
#include <cstdint>
#include <type_traits>

namespace ck {
namespace detail {

template <std::size_t Size>
Comment thread
asroy marked this conversation as resolved.
Outdated
struct get_signed_int;

template <>
struct get_signed_int<1>
{
using type = std::int8_t;
Comment thread
asroy marked this conversation as resolved.
Outdated
};

template <>
struct get_signed_int<2>
{
using type = std::int16_t;
};

template <>
struct get_signed_int<4>
{
using type = std::int32_t;
};

template <std::size_t Size>
using get_signed_int_t = typename get_signed_int<Size>::type;

} // namespace detail

__device__ inline std::int32_t readfirstlane(std::int32_t value)
{
return __builtin_amdgcn_readfirstlane(value);
}

template <
typename Object,
typename = std::enable_if_t<std::is_class_v<Object> && std::is_trivially_copyable_v<Object>>>
__device__ auto readfirstlane(const Object& obj)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a AMD specific function, not standard HIP. I think better rename to "amd_wave_read_first_lane", similar to "amd_buffer_load"

{
constexpr std::size_t SgprSize = 4;
constexpr std::size_t ObjectSize = sizeof(Object);

using Sgpr = detail::get_signed_int_t<SgprSize>;

alignas(Object) std::byte to_obj[ObjectSize];

auto* const from_obj = reinterpret_cast<const std::byte*>(&obj);

constexpr std::size_t RemainedSize = ObjectSize % SgprSize;
constexpr std::size_t CompleteSgprCopyBoundary = ObjectSize - RemainedSize;
static_for<0, CompleteSgprCopyBoundary, SgprSize>{}([&](auto offset) {
Comment thread
asroy marked this conversation as resolved.
Outdated
*reinterpret_cast<Sgpr*>(to_obj + offset) =
readfirstlane(*reinterpret_cast<const Sgpr*>(from_obj + offset));
});

if constexpr(0 < RemainedSize)
{
using Carrier = detail::get_signed_int_t<RemainedSize>;

*reinterpret_cast<Carrier>(to_obj + CompleteSgprCopyBoundary) =
readfirstlane(*reinterpret_cast<const Carrier*>(from_obj + CompleteSgprCopyBoundary));
}

/// NOTE: Implicitly start object lifetime. It's better to use
/// std::start_lifetime_at() in this scenario
return *reinterpret_cast<Object*>(to_obj);
}

} // namespace ck