Make legacy multimap thread safe

NVIDIA · Aug 5, 2024 · 02d36a5 · 02d36a5
1 parent bdb7fba
commit 02d36a5
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 96 deletions.
diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <cuco/detail/storage/counter_storage.cuh>
 #include <cuco/detail/utility/cuda.hpp>
 #include <cuco/detail/utils.cuh>
 
@@ -315,12 +316,9 @@ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::static_multimap(
       capacity)},
     empty_key_sentinel_{empty_key_sentinel.value},
     empty_value_sentinel_{empty_value_sentinel.value},
-    counter_allocator_{alloc},
-    slot_allocator_{alloc},
-    delete_counter_{counter_allocator_},
-    delete_slots_{slot_allocator_, capacity_},
-    d_counter_{counter_allocator_.allocate(1), delete_counter_},
-    slots_{slot_allocator_.allocate(capacity_), delete_slots_}
+    allocator_{alloc},
+    delete_slots_{allocator_, capacity_},
+    slots_{allocator_.allocate(capacity_), delete_slots_}
 {
   auto constexpr block_size = 128;
   auto constexpr stride     = 4;
@@ -441,16 +439,13 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::count(
   auto view            = get_device_view();
   auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);
 
-  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
-  std::size_t h_counter;
+  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
+  counter.reset(stream);
 
   detail::count<block_size, cg_size(), is_outer>
-    <<<grid_size, block_size, 0, stream>>>(first, num_keys, d_counter_.get(), view, key_equal);
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
+    <<<grid_size, block_size, 0, stream>>>(first, num_keys, counter.data(), view, key_equal);
 
-  return h_counter;
+  return counter.load_to_host(stream);
 }
 
 template <typename Key,
@@ -472,16 +467,13 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::count_
   auto view            = get_device_view();
   auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);
 
-  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
-  std::size_t h_counter;
+  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
+  counter.reset(stream);
 
   detail::count<block_size, cg_size(), is_outer>
-    <<<grid_size, block_size, 0, stream>>>(first, num_keys, d_counter_.get(), view, key_equal);
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
+    <<<grid_size, block_size, 0, stream>>>(first, num_keys, counter.data(), view, key_equal);
 
-  return h_counter;
+  return counter.load_to_host(stream);
 }
 
 template <typename Key,
@@ -503,16 +495,13 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_c
   auto view            = get_device_view();
   auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
 
-  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
-  std::size_t h_counter;
+  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
+  counter.reset(stream);
 
   detail::pair_count<block_size, cg_size(), is_outer>
-    <<<grid_size, block_size, 0, stream>>>(first, num_pairs, d_counter_.get(), view, pair_equal);
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
+    <<<grid_size, block_size, 0, stream>>>(first, num_pairs, counter.data(), view, pair_equal);
 
-  return h_counter;
+  return counter.load_to_host(stream);
 }
 
 template <typename Key,
@@ -534,16 +523,13 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_c
   auto view            = get_device_view();
   auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
 
-  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
-  std::size_t h_counter;
+  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
+  counter.reset(stream);
 
   detail::pair_count<block_size, cg_size(), is_outer>
-    <<<grid_size, block_size, 0, stream>>>(first, num_pairs, d_counter_.get(), view, pair_equal);
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
+    <<<grid_size, block_size, 0, stream>>>(first, num_pairs, counter.data(), view, pair_equal);
 
-  return h_counter;
+  return counter.load_to_host(stream);
 }
 
 template <typename Key,
@@ -570,19 +556,14 @@ OutputIt static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::retrieve(
 
   auto const grid_size = detail::grid_size(num_keys, cg_size());
 
-  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
-  std::size_t h_counter;
+  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
+  counter.reset(stream);
 
   detail::retrieve<detail::default_block_size(), flushing_cg_size, cg_size(), buffer_size, is_outer>
     <<<grid_size, detail::default_block_size(), 0, stream>>>(
-      first, num_keys, output_begin, d_counter_.get(), view, key_equal);
-
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
+      first, num_keys, output_begin, counter.data(), view, key_equal);
 
-  auto output_end = output_begin + h_counter;
-  return output_end;
+  return output_begin + counter.load_to_host(stream);
 }
 
 template <typename Key,
@@ -609,19 +590,14 @@ OutputIt static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::retrieve_
 
   auto const grid_size = detail::grid_size(num_keys, cg_size());
 
-  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
-  std::size_t h_counter;
+  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
+  counter.reset(stream);
 
   detail::retrieve<detail::default_block_size(), flushing_cg_size, cg_size(), buffer_size, is_outer>
     <<<grid_size, detail::default_block_size(), 0, stream>>>(
-      first, num_keys, output_begin, d_counter_.get(), view, key_equal);
+      first, num_keys, output_begin, counter.data(), view, key_equal);
 
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
-
-  auto output_end = output_begin + h_counter;
-  return output_end;
+  return output_begin + counter.load_to_host(stream);
 }
 
 template <typename Key,
@@ -655,23 +631,20 @@ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_retrieve(
   }();
   auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
 
-  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
-  std::size_t h_counter;
+  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
+  counter.reset(stream);
 
   detail::pair_retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
     <<<grid_size, block_size, 0, stream>>>(first,
                                            num_pairs,
                                            probe_output_begin,
                                            contained_output_begin,
-                                           d_counter_.get(),
+                                           counter.data(),
                                            view,
                                            pair_equal);
 
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
-
-  return std::make_pair(probe_output_begin + h_counter, contained_output_begin + h_counter);
+  auto const h_count = counter.load_to_host(stream);
+  return {probe_output_begin + h_count, contained_output_begin + h_count};
 }
 
 template <typename Key,
@@ -705,23 +678,20 @@ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_retrieve_oute
   }();
   auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);
 
-  CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
-  std::size_t h_counter;
+  auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
+  counter.reset(stream);
 
   detail::pair_retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
     <<<grid_size, block_size, 0, stream>>>(first,
                                            num_pairs,
                                            probe_output_begin,
                                            contained_output_begin,
-                                           d_counter_.get(),
+                                           counter.data(),
                                            view,
                                            pair_equal);
 
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    &h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
-  CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
-
-  return std::make_pair(probe_output_begin + h_counter, contained_output_begin + h_counter);
+  auto const h_count = counter.load_to_host(stream);
+  return {probe_output_begin + h_count, contained_output_begin + h_count};
 }
 
 template <typename Key,

diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh
@@ -510,17 +510,14 @@ class static_multimap {
   using value_type         = cuco::pair<Key, Value>;            ///< Type of key/value pairs
   using key_type           = Key;                               ///< Key type
   using mapped_type        = Value;                             ///< Type of mapped values
+  using size_type          = std::size_t;                       ///< Size type
   using atomic_key_type    = cuda::atomic<key_type, Scope>;     ///< Type of atomic keys
   using atomic_mapped_type = cuda::atomic<mapped_type, Scope>;  ///< Type of atomic mapped values
   using pair_atomic_type =
     cuco::pair<atomic_key_type,
                atomic_mapped_type>;  ///< Pair type of atomic key and atomic mapped value
-  using atomic_ctr_type     = cuda::atomic<std::size_t, Scope>;  ///< Atomic counter type
-  using allocator_type      = Allocator;                         ///< Allocator type
-  using slot_allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<
+  using allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<
     pair_atomic_type>;  ///< Type of the allocator to (de)allocate slots
-  using counter_allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<
-    atomic_ctr_type>;  ///< Type of the allocator to (de)allocate atomic counters
   using probe_sequence_type =
     cuco::legacy::detail::probe_sequence<ProbeSequence, Key, Value, Scope>;  ///< Probe scheme type
 
@@ -923,30 +920,17 @@ class static_multimap {
    */
   static constexpr uint32_t warp_size() noexcept { return 32u; }
 
-  /**
-   * @brief Custom deleter for unique pointer of device counter.
-   */
-  struct counter_deleter {
-    counter_deleter(counter_allocator_type& a) : allocator{a} {}
-
-    counter_deleter(counter_deleter const&) = default;
-
-    void operator()(atomic_ctr_type* ptr) { allocator.deallocate(ptr, 1); }
-
-    counter_allocator_type& allocator;
-  };
-
   /**
    * @brief Custom deleter for unique pointer of slots.
    */
   struct slot_deleter {
-    slot_deleter(slot_allocator_type& a, size_t& c) : allocator{a}, capacity{c} {}
+    slot_deleter(allocator_type& a, size_t& c) : allocator{a}, capacity{c} {}
 
     slot_deleter(slot_deleter const&) = default;
 
     void operator()(pair_atomic_type* ptr) { allocator.deallocate(ptr, capacity); }
 
-    slot_allocator_type& allocator;
+    allocator_type& allocator;
     size_t& capacity;
   };
 
@@ -1699,14 +1683,11 @@ class static_multimap {
   }
 
  private:
-  std::size_t capacity_{};                      ///< Total number of slots
-  Key empty_key_sentinel_{};                    ///< Key value that represents an empty slot
-  Value empty_value_sentinel_{};                ///< Initial value of empty slot
-  slot_allocator_type slot_allocator_{};        ///< Allocator used to allocate slots
-  counter_allocator_type counter_allocator_{};  ///< Allocator used to allocate counters
-  counter_deleter delete_counter_;              ///< Custom counter deleter
-  slot_deleter delete_slots_;                   ///< Custom slots deleter
-  std::unique_ptr<atomic_ctr_type, counter_deleter> d_counter_{};  ///< Preallocated device counter
+  std::size_t capacity_{};        ///< Total number of slots
+  Key empty_key_sentinel_{};      ///< Key value that represents an empty slot
+  Value empty_value_sentinel_{};  ///< Initial value of empty slot
+  allocator_type allocator_{};    ///< Allocator used to allocate slots
+  slot_deleter delete_slots_;     ///< Custom slots deleter
   std::unique_ptr<pair_atomic_type, slot_deleter> slots_{};  ///< Pointer to flat slots storage
 };  // class static_multimap