Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 6 additions & 19 deletions benchmarks/bloom_filter/add_bench.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -21,12 +21,11 @@
#include <benchmark_utils.hpp>

#include <cuco/bloom_filter.cuh>
#include <cuco/utility/key_generator.cuh>

#include <nvbench/nvbench.cuh>

#include <cuda/std/limits>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>

#include <cstdint>
#include <exception>
Expand Down Expand Up @@ -61,25 +60,19 @@ void bloom_filter_add(nvbench::state& state,
(filter_size_mb * 1024 * 1024) /
(sizeof(typename filter_type::word_type) * filter_type::words_per_block);

thrust::device_vector<Key> keys(num_keys);

key_generator gen;
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
thrust::counting_iterator<Key> keys(0);

state.add_element_count(num_keys);

filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};

state.collect_dram_throughput();
state.collect_l1_hit_rates();
state.collect_l2_hit_rates();
state.collect_loads_efficiency();
state.collect_stores_efficiency();

add_fpr_summary(state, filter);

state.exec([&](nvbench::launch& launch) {
filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
filter.add_async(keys, keys + num_keys, {launch.get_stream()});
});
}

Expand All @@ -106,25 +99,19 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Dist>
// configurations
}

thrust::device_vector<Key> keys(num_keys);

key_generator gen;
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
thrust::counting_iterator<Key> keys(0);

state.add_element_count(num_keys);

filter_type filter{num_sub_filters};

state.collect_dram_throughput();
state.collect_l1_hit_rates();
state.collect_l2_hit_rates();
state.collect_loads_efficiency();
state.collect_stores_efficiency();

add_fpr_summary(state, filter);

state.exec([&](nvbench::launch& launch) {
filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
filter.add_async(keys, keys + num_keys, {launch.get_stream()});
});
}

Expand Down
28 changes: 8 additions & 20 deletions benchmarks/bloom_filter/contains_bench.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -21,12 +21,12 @@
#include <benchmark_utils.hpp>

#include <cuco/bloom_filter.cuh>
#include <cuco/utility/key_generator.cuh>

#include <nvbench/nvbench.cuh>

#include <cuda/std/limits>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>

#include <exception>

Expand Down Expand Up @@ -63,28 +63,22 @@ void bloom_filter_contains(
(filter_size_mb * 1024 * 1024) /
(sizeof(typename filter_type::word_type) * filter_type::words_per_block);

thrust::device_vector<Key> keys(num_keys);
thrust::counting_iterator<Key> keys(0);
thrust::device_vector<bool> result(num_keys, false);

key_generator gen;
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());

state.add_element_count(num_keys);

filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};

state.collect_dram_throughput();
state.collect_l1_hit_rates();
state.collect_l2_hit_rates();
state.collect_loads_efficiency();
state.collect_stores_efficiency();

add_fpr_summary(state, filter);

filter.add(keys.begin(), keys.end());
filter.add(keys, keys + num_keys);

state.exec([&](nvbench::launch& launch) {
filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()});
});
}

Expand Down Expand Up @@ -113,28 +107,22 @@ void arrow_bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key,
// configurations
}

thrust::device_vector<Key> keys(num_keys);
thrust::counting_iterator<Key> keys(0);
thrust::device_vector<bool> result(num_keys, false);

key_generator gen;
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());

state.add_element_count(num_keys);

filter_type filter{num_sub_filters};

state.collect_dram_throughput();
state.collect_l1_hit_rates();
state.collect_l2_hit_rates();
state.collect_loads_efficiency();
state.collect_stores_efficiency();

add_fpr_summary(state, filter);

filter.add(keys.begin(), keys.end());
filter.add(keys, keys + num_keys);

state.exec([&](nvbench::launch& launch) {
filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()});
});
}

Expand Down
2 changes: 1 addition & 1 deletion benchmarks/bloom_filter/defaults.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ using BF_KEY = nvbench::int64_t;
using BF_HASH = cuco::xxhash_64<char>;
using BF_WORD = nvbench::uint32_t;

static constexpr auto BF_N = 400'000'000;
static constexpr auto BF_N = 1'000'000'000;
static constexpr auto BF_SIZE_MB = 2'000;
static constexpr auto BF_WORDS_PER_BLOCK = 8;

Expand Down
42 changes: 37 additions & 5 deletions include/cuco/bloom_filter_ref.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -134,6 +134,22 @@ class bloom_filter_ref {
template <class CG, class ProbeKey>
__device__ void add(CG const& group, ProbeKey const& key);

/**
* @brief Device function that adds all keys in the range `[first, last)` to the filter.
*
* @note Best performance is achieved if the size of the CG is larger than or equal to
* `words_per_block`.
*
* @tparam CG Cooperative Group type
* @tparam InputIt Device-accessible random access input key iterator
*
* @param group The Cooperative Group this operation is executed with
* @param first Beginning of the sequence of keys
* @param last End of the sequence of keys
*/
template <class CG, class InputIt>
__device__ void add(CG const& group, InputIt first, InputIt last);

/**
* @brief Adds all keys in the range `[first, last)` to the filter.
*
Expand Down Expand Up @@ -241,10 +257,26 @@ class bloom_filter_ref {
template <class CG, class ProbeKey>
[[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const;

// TODO
// template <class CG, class InputIt, class OutputIt>
// __device__ void contains(CG const& group, InputIt first, InputIt last, OutputIt output_begin)
// const;
/**
* @brief Device function that tests keys in the range `[first, last)` are present in filter.
*
* @note Best performance is achieved if the size of the CG is larger than or equal to
* `(words_per_block * sizeof(word_type)) / 32`.
*
* @tparam CG Cooperative Group type
* @tparam InputIt Device-accessible random access input key iterator
* @tparam OutputIt Device-accessible output iterator assignable from `bool`
*
* @param group The Cooperative Group this operation is executed with
* @param first Beginning of the sequence of keys
* @param last End of the sequence of keys
* @param output_begin Beginning of the sequence of booleans for the presence of each key
*/
template <class CG, class InputIt, class OutputIt>
__device__ void contains(CG const& group,
InputIt first,
InputIt last,
OutputIt output_begin) const;

/**
* @brief Tests all keys in the range `[first, last)` if their fingerprints are present in the
Expand Down
58 changes: 32 additions & 26 deletions include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -83,10 +83,10 @@ namespace cuco::detail {
template <class Key, template <typename> class XXHash64>
class arrow_filter_policy {
public:
using hasher = XXHash64<Key>; ///< 64-bit XXHash hasher for Arrow bloom filter policy
using word_type = std::uint32_t; ///< uint32_t for Arrow bloom filter policy
using key_type = Key; ///< Hash function input type
using hash_value_type = std::uint64_t; ///< hash function output type
using hasher = XXHash64<Key>; ///< 64-bit XXHash hasher for Arrow bloom filter policy
using word_type = std::uint32_t; ///< uint32_t for Arrow bloom filter policy
using key_type = Key; ///< Hash function input type
using hash_result_type = std::uint64_t; ///< hash function output type

static constexpr uint32_t bits_set_per_block = 8; ///< hardcoded bits set per Arrow filter block
static constexpr uint32_t words_per_block = 8; ///< hardcoded words per Arrow filter block
Expand All @@ -99,21 +99,6 @@ class arrow_filter_policy {
(max_arrow_filter_bytes /
bytes_per_filter_block); ///< Max sub-filter blocks allowed in Arrow bloom filter

private:
// Arrow's block-based bloom filter algorithm needs these eight odd SALT values to calculate
// eight indexes of bit to set, one bit in each 32-bit (uint32_t) word.
__device__ static constexpr cuda::std::array<std::uint32_t, 8> SALT()
{
return {0x47b6137bU,
0x44974d91U,
0x8824ad5bU,
0xa2b7289dU,
0x705495c7U,
0x2df1424bU,
0x9efc4947U,
0x5c6bfb31U};
}

public:
/**
* @brief Constructs the `arrow_filter_policy` object.
Expand All @@ -133,7 +118,7 @@ class arrow_filter_policy {
*
* @return The hash value of the key
*/
__device__ constexpr hash_value_type hash(key_type const& key) const { return hash_(key); }
__device__ constexpr hash_result_type hash(key_type const& key) const { return hash_(key); }

/**
* @brief Determines the filter block a key is added into.
Expand All @@ -150,7 +135,7 @@ class arrow_filter_policy {
* @return The block index for the given key's hash value
*/
template <class Extent>
__device__ constexpr auto block_index(hash_value_type hash, Extent num_blocks) const
__device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const
{
constexpr auto hash_bits = cuda::std::numeric_limits<word_type>::digits;
// TODO: assert if num_blocks > max_filter_blocks
Expand All @@ -168,12 +153,33 @@ class arrow_filter_policy {
*
* @return The bit pattern for the word/segment in the filter block
*/
__device__ constexpr word_type word_pattern(hash_value_type hash, std::uint32_t word_index) const
__device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const
{
// SALT array to calculate bit indexes for the current word
auto constexpr salt = SALT();
word_type const key = static_cast<word_type>(hash);
return word_type{1} << ((key * salt[word_index]) >> 27);
std::uint32_t salt;

// Basically a switch (word_index) { case 0-7 ... }
// First split: 0..3 versus 4..7.
if (word_index < 4) {
// For indices 0..3, further split into 0..1 and 2..3.
if (word_index < 2) {
// word_index is 0 or 1.
salt = (word_index == 0) ? 0x47b6137bU : 0x44974d91U;
} else {
// word_index is 2 or 3.
salt = (word_index == 2) ? 0x8824ad5bU : 0xa2b7289dU;
}
} else {
// For indices 4..7, further split into 4..5 and 6..7.
if (word_index < 6) {
// word_index is 4 or 5.
salt = (word_index == 4) ? 0x705495c7U : 0x2df1424bU;
} else {
// word_index is 6 or 7.
salt = (word_index == 6) ? 0x9efc4947U : 0x5c6bfb31U;
}
}
return word_type{1} << ((key * salt) >> 27);
}

private:
Expand Down
Loading
Loading