Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/ck/utility/common_header.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "ck/utility/amd_address_space.hpp"
#include "ck/utility/static_buffer.hpp"
#include "ck/utility/dynamic_buffer.hpp"
#include "ck/utility/readfirstlane.hpp"

// TODO: remove this
#if CK_USE_AMD_INLINE_ASM
Expand Down
84 changes: 84 additions & 0 deletions include/ck/utility/readfirstlane.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include "ck/ck.hpp"
#include "ck/utility/functional2.hpp"
#include "ck/utility/math.hpp"

#include <cstddef>
#include <cstdint>
#include <type_traits>

namespace ck {
namespace detail {

template <unsigned Size>
struct get_unsigned_int;

template <>
struct get_unsigned_int<1>
{
using type = uint8_t;
};

template <>
struct get_unsigned_int<2>
{
using type = uint16_t;
};

template <>
struct get_unsigned_int<4>
{
using type = uint32_t;
};

template <unsigned Size>
using get_unsigned_int_t = typename get_unsigned_int<Size>::type;

} // namespace detail

__device__ inline int32_t readfirstlane(int32_t value)
{
return __builtin_amdgcn_readfirstlane(value);
}

template <
typename Object,
typename = std::enable_if_t<std::is_class_v<Object> && std::is_trivially_copyable_v<Object>>>
__device__ auto readfirstlane(const Object& obj)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a AMD specific function, not standard HIP. I think better rename to "amd_wave_read_first_lane", similar to "amd_buffer_load"

{
using Size = unsigned;
constexpr Size SgprSize = 4;
constexpr Size ObjectSize = sizeof(Object);

using Sgpr = detail::get_unsigned_int_t<SgprSize>;

alignas(Object) std::byte to_obj[ObjectSize];

auto* const from_obj = reinterpret_cast<const std::byte*>(&obj);

constexpr Size RemainedSize = ObjectSize % SgprSize;
constexpr Size CompleteSgprCopyBoundary = ObjectSize - RemainedSize;
for(Size offset = 0; offset < CompleteSgprCopyBoundary; offset += SgprSize)
{
*reinterpret_cast<Sgpr*>(to_obj + offset) =
readfirstlane(*reinterpret_cast<const Sgpr*>(from_obj + offset));
}

if constexpr(0 < RemainedSize)
{
using Carrier = detail::get_unsigned_int_t<RemainedSize>;

*reinterpret_cast<Carrier>(to_obj + CompleteSgprCopyBoundary) =
readfirstlane(*reinterpret_cast<const Carrier*>(from_obj + CompleteSgprCopyBoundary));
}

/// NOTE: Implicitly start object lifetime. It's better to use std::start_lifetime_at() in this
/// scenario
return *reinterpret_cast<Object*>(to_obj);
}

} // namespace ck