NVIDIA · sleeepyjack · Feb 7, 2025 · Feb 7, 2025 · Feb 12, 2025 · Feb 12, 2025
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,12 +21,11 @@
 #include <benchmark_utils.hpp>
 
 #include <cuco/bloom_filter.cuh>
-#include <cuco/utility/key_generator.cuh>
 
 #include <nvbench/nvbench.cuh>
 
 #include <cuda/std/limits>
-#include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
 
 #include <cstdint>
 #include <exception>
@@ -61,25 +60,19 @@ void bloom_filter_add(nvbench::state& state,
     (filter_size_mb * 1024 * 1024) /
     (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
 
-  thrust::device_vector<Key> keys(num_keys);
-
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+  thrust::counting_iterator<Key> keys(0);
 
   state.add_element_count(num_keys);
 
   filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
+    filter.add_async(keys, keys + num_keys, {launch.get_stream()});
   });
 }
 
@@ -106,25 +99,19 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Dist>
                                                                                  // configurations
   }
 
-  thrust::device_vector<Key> keys(num_keys);
-
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+  thrust::counting_iterator<Key> keys(0);
 
   state.add_element_count(num_keys);
 
   filter_type filter{num_sub_filters};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
+    filter.add_async(keys, keys + num_keys, {launch.get_stream()});
   });
 }
 

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,12 +21,12 @@
 #include <benchmark_utils.hpp>
 
 #include <cuco/bloom_filter.cuh>
-#include <cuco/utility/key_generator.cuh>
 
 #include <nvbench/nvbench.cuh>
 
 #include <cuda/std/limits>
 #include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
 
 #include <exception>
 
@@ -63,28 +63,22 @@ void bloom_filter_contains(
     (filter_size_mb * 1024 * 1024) /
     (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
 
-  thrust::device_vector<Key> keys(num_keys);
+  thrust::counting_iterator<Key> keys(0);
   thrust::device_vector<bool> result(num_keys, false);
 
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
-
   state.add_element_count(num_keys);
 
   filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
-  filter.add(keys.begin(), keys.end());
+  filter.add(keys, keys + num_keys);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
+    filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()});
   });
 }
 
@@ -113,28 +107,22 @@ void arrow_bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key,
                                                                                  // configurations
   }
 
-  thrust::device_vector<Key> keys(num_keys);
+  thrust::counting_iterator<Key> keys(0);
   thrust::device_vector<bool> result(num_keys, false);
 
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
-
   state.add_element_count(num_keys);
 
   filter_type filter{num_sub_filters};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
-  filter.add(keys.begin(), keys.end());
+  filter.add(keys, keys + num_keys);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
+    filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()});
   });
 }
 

@@ -30,7 +30,7 @@ using BF_KEY  = nvbench::int64_t;
 using BF_HASH = cuco::xxhash_64<char>;
 using BF_WORD = nvbench::uint32_t;
 
-static constexpr auto BF_N               = 400'000'000;
+static constexpr auto BF_N               = 1'000'000'000;
 static constexpr auto BF_SIZE_MB         = 2'000;
 static constexpr auto BF_WORDS_PER_BLOCK = 8;
 

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -134,6 +134,22 @@ class bloom_filter_ref {
   template <class CG, class ProbeKey>
   __device__ void add(CG const& group, ProbeKey const& key);
 
+  /**
+   * @brief Device function that adds all keys in the range `[first, last)` to the filter.
+   *
+   * @note Best performance is achieved if the size of the CG is larger than or equal to
+   * `words_per_block`.
+   *
+   * @tparam CG Cooperative Group type
+   * @tparam InputIt Device-accessible random access input key iterator
+   *
+   * @param group The Cooperative Group this operation is executed with
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   */
+  template <class CG, class InputIt>
+  __device__ void add(CG const& group, InputIt first, InputIt last);
+
   /**
    * @brief Adds all keys in the range `[first, last)` to the filter.
    *
@@ -241,10 +257,26 @@ class bloom_filter_ref {
   template <class CG, class ProbeKey>
   [[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const;
 
-  // TODO
-  // template <class CG, class InputIt, class OutputIt>
-  // __device__ void contains(CG const& group, InputIt first, InputIt last, OutputIt output_begin)
-  // const;
+  /**
+   * @brief Device function that tests keys in the range `[first, last)` are present in filter.
+   *
+   * @note Best performance is achieved if the size of the CG is larger than or equal to
+   * `(words_per_block * sizeof(word_type)) / 32`.
+   *
+   * @tparam CG Cooperative Group type
+   * @tparam InputIt Device-accessible random access input key iterator
+   * @tparam OutputIt Device-accessible output iterator assignable from `bool`
+   *
+   * @param group The Cooperative Group this operation is executed with
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   * @param output_begin Beginning of the sequence of booleans for the presence of each key
+   */
+  template <class CG, class InputIt, class OutputIt>
+  __device__ void contains(CG const& group,
+                           InputIt first,
+                           InputIt last,
+                           OutputIt output_begin) const;
 
   /**
    * @brief Tests all keys in the range `[first, last)` if their fingerprints are present in the

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -83,10 +83,10 @@ namespace cuco::detail {
 template <class Key, template <typename> class XXHash64>
 class arrow_filter_policy {
  public:
-  using hasher          = XXHash64<Key>;  ///< 64-bit XXHash hasher for Arrow bloom filter policy
-  using word_type       = std::uint32_t;  ///< uint32_t for Arrow bloom filter policy
-  using key_type        = Key;            ///< Hash function input type
-  using hash_value_type = std::uint64_t;  ///< hash function output type
+  using hasher           = XXHash64<Key>;  ///< 64-bit XXHash hasher for Arrow bloom filter policy
+  using word_type        = std::uint32_t;  ///< uint32_t for Arrow bloom filter policy
+  using key_type         = Key;            ///< Hash function input type
+  using hash_result_type = std::uint64_t;  ///< hash function output type
 
   static constexpr uint32_t bits_set_per_block = 8;  ///< hardcoded bits set per Arrow filter block
   static constexpr uint32_t words_per_block    = 8;  ///< hardcoded words per Arrow filter block
@@ -99,21 +99,6 @@ class arrow_filter_policy {
     (max_arrow_filter_bytes /
      bytes_per_filter_block);  ///< Max sub-filter blocks allowed in Arrow bloom filter
 
- private:
-  // Arrow's block-based bloom filter algorithm needs these eight odd SALT values to calculate
-  // eight indexes of bit to set, one bit in each 32-bit (uint32_t) word.
-  __device__ static constexpr cuda::std::array<std::uint32_t, 8> SALT()
-  {
-    return {0x47b6137bU,
-            0x44974d91U,
-            0x8824ad5bU,
-            0xa2b7289dU,
-            0x705495c7U,
-            0x2df1424bU,
-            0x9efc4947U,
-            0x5c6bfb31U};
-  }
-
  public:
   /**
    * @brief Constructs the `arrow_filter_policy` object.
@@ -133,7 +118,7 @@ class arrow_filter_policy {
    *
    * @return The hash value of the key
    */
-  __device__ constexpr hash_value_type hash(key_type const& key) const { return hash_(key); }
+  __device__ constexpr hash_result_type hash(key_type const& key) const { return hash_(key); }
 
   /**
    * @brief Determines the filter block a key is added into.
@@ -150,7 +135,7 @@ class arrow_filter_policy {
    * @return The block index for the given key's hash value
    */
   template <class Extent>
-  __device__ constexpr auto block_index(hash_value_type hash, Extent num_blocks) const
+  __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const
   {
     constexpr auto hash_bits = cuda::std::numeric_limits<word_type>::digits;
     // TODO: assert if num_blocks > max_filter_blocks
@@ -168,12 +153,33 @@ class arrow_filter_policy {
    *
    * @return The bit pattern for the word/segment in the filter block
    */
-  __device__ constexpr word_type word_pattern(hash_value_type hash, std::uint32_t word_index) const
+  __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const
   {
-    // SALT array to calculate bit indexes for the current word
-    auto constexpr salt = SALT();
     word_type const key = static_cast<word_type>(hash);
-    return word_type{1} << ((key * salt[word_index]) >> 27);
+    std::uint32_t salt;
+
+    // Basically a switch (word_index) { case 0-7 ... }
+    // First split: 0..3 versus 4..7.
+    if (word_index < 4) {
+      // For indices 0..3, further split into 0..1 and 2..3.
+      if (word_index < 2) {
+        // word_index is 0 or 1.
+        salt = (word_index == 0) ? 0x47b6137bU : 0x44974d91U;
+      } else {
+        // word_index is 2 or 3.
+        salt = (word_index == 2) ? 0x8824ad5bU : 0xa2b7289dU;
+      }
+    } else {
+      // For indices 4..7, further split into 4..5 and 6..7.
+      if (word_index < 6) {
+        // word_index is 4 or 5.
+        salt = (word_index == 4) ? 0x705495c7U : 0x2df1424bU;
+      } else {
+        // word_index is 6 or 7.
+        salt = (word_index == 6) ? 0x9efc4947U : 0x5c6bfb31U;
+      }
+    }
+    return word_type{1} << ((key * salt) >> 27);
   }
 
  private: