Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

xxhash cleanups #313

Merged
merged 10 commits into from
Jun 5, 2023
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,5 +79,7 @@ ConfigureBench(DYNAMIC_MAP_BENCH
hash_table/dynamic_map/contains_bench.cu
hash_table/dynamic_map/erase_bench.cu)

###################################################################################################
# - hash function benchmarks ----------------------------------------------------------------------
ConfigureBench(HASH_BENCH
hash_bench.cu)
10 changes: 3 additions & 7 deletions benchmarks/hash_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
*/

#include <defaults.hpp>
#include <utils.hpp>

#include <cuco/detail/utils.hpp>
#include <cuco/hash_functions.cuh>
Expand All @@ -26,9 +25,6 @@

#include <cstdint>

using namespace cuco::benchmark;
using namespace cuco::utility;

template <int32_t Words>
struct large_key {
constexpr __host__ __device__ large_key(int32_t seed) noexcept
Expand Down Expand Up @@ -73,8 +69,8 @@ void hash_eval(nvbench::state& state, nvbench::type_list<Hash>)
{
bool const materialize_result = false;
constexpr auto block_size = 128;
auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N * 10);
auto const grid_size = SDIV(num_keys, block_size * 16);
auto const num_keys = state.get_int64_or_default("NumInputs", cuco::benchmark::defaults::N * 10);
auto const grid_size = SDIV(num_keys, block_size * 16);

thrust::device_vector<typename Hash::result_type> hash_values((materialize_result) ? num_keys
: 1);
Expand Down Expand Up @@ -102,4 +98,4 @@ NVBENCH_BENCH_TYPES(
cuco::murmurhash3_fmix_64<nvbench::int64_t>>))
.set_name("hash_function_eval")
.set_type_axes_names({"Hash"})
.set_max_noise(defaults::MAX_NOISE);
.set_max_noise(cuco::benchmark::defaults::MAX_NOISE);
94 changes: 47 additions & 47 deletions include/cuco/detail/hash_functions/xxhash.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -60,22 +60,22 @@ namespace cuco::detail {
template <typename Key>
struct XXHash_32 {
private:
static constexpr uint32_t prime1 = 0x9E3779B1U;
static constexpr uint32_t prime2 = 0x85EBCA77U;
static constexpr uint32_t prime3 = 0xC2B2AE3DU;
static constexpr uint32_t prime4 = 0x27D4EB2FU;
static constexpr uint32_t prime5 = 0x165667B1U;
static constexpr std::uint32_t prime1 = 0x9E3779B1u;
static constexpr std::uint32_t prime2 = 0x85EBCA77u;
static constexpr std::uint32_t prime3 = 0xC2B2AE3Du;
static constexpr std::uint32_t prime4 = 0x27D4EB2Fu;
static constexpr std::uint32_t prime5 = 0x165667B1u;
sleeepyjack marked this conversation as resolved.
Show resolved Hide resolved

public:
using argument_type = Key; ///< The type of the values taken as argument
using result_type = uint32_t; ///< The type of the hash values produced
using argument_type = Key; ///< The type of the values taken as argument
using result_type = std::uint32_t; ///< The type of the hash values produced

/**
* @brief Constructs a XXH32 hash function with the given `seed`.
*
* @param seed A custom number to randomize the resulting hash value
*/
__host__ __device__ constexpr XXHash_32(uint32_t seed = 0) : seed_{seed} {}
__host__ __device__ constexpr XXHash_32(std::uint32_t seed = 0) : seed_{seed} {}

/**
* @brief Returns a hash value for its argument, as a value of type `result_type`.
Expand All @@ -86,20 +86,20 @@ struct XXHash_32 {
constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
{
// TODO do we need to add checks/hints for alignment?
Copy link
Collaborator Author

@sleeepyjack sleeepyjack Jun 1, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can someone comment on this? I'm not sure if the excessive reinterpret_casting leads to pitfalls.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on the offline discussion, downcasting to the char pointer shouldn't cause any problems and blocks (or the uint32_t pointer) won't be used if the key size is smaller than 4 bytes so the current implementation should be safe.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

8c4f167 sprinkles in some [[maybe_unused]] attributes for good measure.

constexpr auto nbytes = sizeof(Key);
char const* const bytes = (char const*)&key; ///< per-byte access
uint32_t const* const blocks = (uint32_t const*)&key; ///< 4-byte word access
constexpr auto nbytes = sizeof(Key);
auto const bytes = reinterpret_cast<char const*>(&key); ///< per-byte access
auto const blocks = reinterpret_cast<std::uint32_t const*>(&key); ///< 4-byte word access

uint32_t offset = 0;
uint32_t h32;
std::size_t offset = 0;
std::uint32_t h32;

// data can be processed in 16-byte chunks
if constexpr (nbytes >= 16) {
constexpr auto limit = nbytes - 16;
uint32_t v1 = seed_ + prime1 + prime2;
uint32_t v2 = seed_ + prime2;
uint32_t v3 = seed_;
uint32_t v4 = seed_ - prime1;
std::uint32_t v1 = seed_ + prime1 + prime2;
std::uint32_t v2 = seed_ + prime2;
std::uint32_t v3 = seed_;
std::uint32_t v4 = seed_ - prime1;

do {
// pipeline 4*4byte computations
Expand Down Expand Up @@ -134,7 +134,7 @@ struct XXHash_32 {
}
}

// the following loop is only needed if the size of the key is no multiple of the block size
// the following loop is only needed if the size of the key is not a multiple of the block size
if constexpr (nbytes % 4) {
while (offset < nbytes) {
h32 += (bytes[offset] & 255) * prime5;
Expand All @@ -147,13 +147,13 @@ struct XXHash_32 {
}

private:
constexpr __host__ __device__ uint32_t rotl(uint32_t h, int8_t r) const noexcept
constexpr __host__ __device__ std::uint32_t rotl(std::uint32_t h, std::int8_t r) const noexcept
{
return ((h << r) | (h >> (32 - r)));
}

// avalanche helper
constexpr __host__ __device__ uint32_t finalize(uint32_t h) const noexcept
constexpr __host__ __device__ std::uint32_t finalize(std::uint32_t h) const noexcept
{
h ^= h >> 15;
h *= prime2;
Expand All @@ -163,7 +163,7 @@ struct XXHash_32 {
return h;
}

uint32_t seed_;
std::uint32_t seed_;
};

/**
Expand Down Expand Up @@ -210,22 +210,22 @@ struct XXHash_32 {
template <typename Key>
struct XXHash_64 {
private:
static constexpr uint64_t prime1 = 11400714785074694791ULL;
static constexpr uint64_t prime2 = 14029467366897019727ULL;
static constexpr uint64_t prime3 = 1609587929392839161ULL;
static constexpr uint64_t prime4 = 9650029242287828579ULL;
static constexpr uint64_t prime5 = 2870177450012600261ULL;
static constexpr std::uint64_t prime1 = 11400714785074694791ull;
static constexpr std::uint64_t prime2 = 14029467366897019727ull;
static constexpr std::uint64_t prime3 = 1609587929392839161ull;
static constexpr std::uint64_t prime4 = 9650029242287828579ull;
static constexpr std::uint64_t prime5 = 2870177450012600261ull;

public:
using argument_type = Key; ///< The type of the values taken as argument
using result_type = uint64_t; ///< The type of the hash values produced
using argument_type = Key; ///< The type of the values taken as argument
using result_type = std::uint64_t; ///< The type of the hash values produced

/**
* @brief Constructs a XXH64 hash function with the given `seed`.
*
* @param seed A custom number to randomize the resulting hash value
*/
__host__ __device__ constexpr XXHash_64(uint64_t seed = 0) : seed_{seed} {}
__host__ __device__ constexpr XXHash_64(std::uint64_t seed = 0) : seed_{seed} {}

/**
* @brief Returns a hash value for its argument, as a value of type `result_type`.
Expand All @@ -236,21 +236,21 @@ struct XXHash_64 {
constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
{
// TODO do we need to add checks/hints for alignment?
constexpr auto nbytes = sizeof(Key);
char const* const bytes = (char const*)&key; ///< per-byte access
uint32_t const* const blocks4 = (uint32_t const*)&key; ///< 4-byte word access
uint64_t const* const blocks8 = (uint64_t const*)&key; ///< 8-byte word access
constexpr auto nbytes = sizeof(Key);
auto const bytes = reinterpret_cast<char const*>(&key); ///< per-byte access
auto const blocks4 = reinterpret_cast<std::uint32_t const*>(&key); ///< 4-byte word access
auto const blocks8 = reinterpret_cast<std::uint64_t const*>(&key); ///< 8-byte word access

uint64_t offset = 0;
uint64_t h64;
std::size_t offset = 0;
std::uint64_t h64;

// data can be processed in 32-byte chunks
if constexpr (nbytes >= 32) {
constexpr auto limit = nbytes - 32;
uint64_t v1 = seed_ + prime1 + prime2;
uint64_t v2 = seed_ + prime2;
uint64_t v3 = seed_;
uint64_t v4 = seed_ - prime1;
std::uint64_t v1 = seed_ + prime1 + prime2;
std::uint64_t v2 = seed_ + prime2;
std::uint64_t v3 = seed_;
std::uint64_t v4 = seed_ - prime1;

do {
// pipeline 4*8byte computations
Expand Down Expand Up @@ -304,8 +304,8 @@ struct XXHash_64 {
// remaining data can be processed in 8-byte chunks
if constexpr ((nbytes % 32) >= 8) {
for (; offset <= nbytes - 8; offset += 8) {
uint64_t k1 = blocks8[offset / 8] * prime2;
k1 = rotl(k1, 31) * prime1;
std::uint64_t k1 = blocks8[offset / 8] * prime2;
k1 = rotl(k1, 31) * prime1;
h64 ^= k1;
h64 = rotl(h64, 27) * prime1 + prime4;
}
Expand All @@ -314,13 +314,13 @@ struct XXHash_64 {
// remaining data can be processed in 4-byte chunks
if constexpr (((nbytes % 32) % 8) >= 4) {
for (; offset <= nbytes - 4; offset += 4) {
h64 ^= (blocks4[offset / 4] & 0xFFFFFFFFULL) * prime1;
h64 ^= (blocks4[offset / 4] & 0xFFFFFFFFull) * prime1;
sleeepyjack marked this conversation as resolved.
Show resolved Hide resolved
h64 = rotl(h64, 23) * prime2 + prime3;
}
}

// the following loop is only needed if the size of the key is no multiple of a previous block
// size
// the following loop is only needed if the size of the key is not a multiple of a previous
// block size
if constexpr (nbytes % 4) {
while (offset < nbytes) {
h64 += (bytes[offset] & 0xFF) * prime5;
Expand All @@ -332,13 +332,13 @@ struct XXHash_64 {
}

private:
constexpr __host__ __device__ uint64_t rotl(uint64_t h, int8_t r) const noexcept
constexpr __host__ __device__ std::uint64_t rotl(std::uint64_t h, std::int8_t r) const noexcept
{
return ((h << r) | (h >> (64 - r)));
}

// avalanche helper
constexpr __host__ __device__ uint64_t finalize(uint64_t h) const noexcept
constexpr __host__ __device__ std::uint64_t finalize(std::uint64_t h) const noexcept
{
h ^= h >> 33;
h *= prime2;
Expand All @@ -348,7 +348,7 @@ struct XXHash_64 {
return h;
}

uint64_t seed_;
std::uint64_t seed_;
};

} // namespace cuco::detail
Loading