Skip to content

Commit

Permalink
Make legacy multimap thread safe
Browse files Browse the repository at this point in the history
  • Loading branch information
PointKernel committed Aug 5, 2024
1 parent bdb7fba commit 02d36a5
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 96 deletions.
106 changes: 38 additions & 68 deletions include/cuco/detail/static_multimap/static_multimap.inl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* limitations under the License.
*/

#include <cuco/detail/storage/counter_storage.cuh>
#include <cuco/detail/utility/cuda.hpp>
#include <cuco/detail/utils.cuh>

Expand Down Expand Up @@ -315,12 +316,9 @@ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::static_multimap(
capacity)},
empty_key_sentinel_{empty_key_sentinel.value},
empty_value_sentinel_{empty_value_sentinel.value},
counter_allocator_{alloc},
slot_allocator_{alloc},
delete_counter_{counter_allocator_},
delete_slots_{slot_allocator_, capacity_},
d_counter_{counter_allocator_.allocate(1), delete_counter_},
slots_{slot_allocator_.allocate(capacity_), delete_slots_}
allocator_{alloc},
delete_slots_{allocator_, capacity_},
slots_{allocator_.allocate(capacity_), delete_slots_}
{
auto constexpr block_size = 128;
auto constexpr stride = 4;
Expand Down Expand Up @@ -441,16 +439,13 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::count(
auto view = get_device_view();
auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);

CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
counter.reset(stream);

detail::count<block_size, cg_size(), is_outer>
<<<grid_size, block_size, 0, stream>>>(first, num_keys, d_counter_.get(), view, key_equal);
CUCO_CUDA_TRY(cudaMemcpyAsync(
&h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
<<<grid_size, block_size, 0, stream>>>(first, num_keys, counter.data(), view, key_equal);

return h_counter;
return counter.load_to_host(stream);
}

template <typename Key,
Expand All @@ -472,16 +467,13 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::count_
auto view = get_device_view();
auto const grid_size = (cg_size() * num_keys + stride * block_size - 1) / (stride * block_size);

CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
counter.reset(stream);

detail::count<block_size, cg_size(), is_outer>
<<<grid_size, block_size, 0, stream>>>(first, num_keys, d_counter_.get(), view, key_equal);
CUCO_CUDA_TRY(cudaMemcpyAsync(
&h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
<<<grid_size, block_size, 0, stream>>>(first, num_keys, counter.data(), view, key_equal);

return h_counter;
return counter.load_to_host(stream);
}

template <typename Key,
Expand All @@ -503,16 +495,13 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_c
auto view = get_device_view();
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);

CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
counter.reset(stream);

detail::pair_count<block_size, cg_size(), is_outer>
<<<grid_size, block_size, 0, stream>>>(first, num_pairs, d_counter_.get(), view, pair_equal);
CUCO_CUDA_TRY(cudaMemcpyAsync(
&h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
<<<grid_size, block_size, 0, stream>>>(first, num_pairs, counter.data(), view, pair_equal);

return h_counter;
return counter.load_to_host(stream);
}

template <typename Key,
Expand All @@ -534,16 +523,13 @@ std::size_t static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_c
auto view = get_device_view();
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);

CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
counter.reset(stream);

detail::pair_count<block_size, cg_size(), is_outer>
<<<grid_size, block_size, 0, stream>>>(first, num_pairs, d_counter_.get(), view, pair_equal);
CUCO_CUDA_TRY(cudaMemcpyAsync(
&h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
<<<grid_size, block_size, 0, stream>>>(first, num_pairs, counter.data(), view, pair_equal);

return h_counter;
return counter.load_to_host(stream);
}

template <typename Key,
Expand All @@ -570,19 +556,14 @@ OutputIt static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::retrieve(

auto const grid_size = detail::grid_size(num_keys, cg_size());

CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
counter.reset(stream);

detail::retrieve<detail::default_block_size(), flushing_cg_size, cg_size(), buffer_size, is_outer>
<<<grid_size, detail::default_block_size(), 0, stream>>>(
first, num_keys, output_begin, d_counter_.get(), view, key_equal);

CUCO_CUDA_TRY(cudaMemcpyAsync(
&h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));
first, num_keys, output_begin, counter.data(), view, key_equal);

auto output_end = output_begin + h_counter;
return output_end;
return output_begin + counter.load_to_host(stream);
}

template <typename Key,
Expand All @@ -609,19 +590,14 @@ OutputIt static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::retrieve_

auto const grid_size = detail::grid_size(num_keys, cg_size());

CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
counter.reset(stream);

detail::retrieve<detail::default_block_size(), flushing_cg_size, cg_size(), buffer_size, is_outer>
<<<grid_size, detail::default_block_size(), 0, stream>>>(
first, num_keys, output_begin, d_counter_.get(), view, key_equal);
first, num_keys, output_begin, counter.data(), view, key_equal);

CUCO_CUDA_TRY(cudaMemcpyAsync(
&h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));

auto output_end = output_begin + h_counter;
return output_end;
return output_begin + counter.load_to_host(stream);
}

template <typename Key,
Expand Down Expand Up @@ -655,23 +631,20 @@ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_retrieve(
}();
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);

CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
counter.reset(stream);

detail::pair_retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
<<<grid_size, block_size, 0, stream>>>(first,
num_pairs,
probe_output_begin,
contained_output_begin,
d_counter_.get(),
counter.data(),
view,
pair_equal);

CUCO_CUDA_TRY(cudaMemcpyAsync(
&h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));

return std::make_pair(probe_output_begin + h_counter, contained_output_begin + h_counter);
auto const h_count = counter.load_to_host(stream);
return {probe_output_begin + h_count, contained_output_begin + h_count};
}

template <typename Key,
Expand Down Expand Up @@ -705,23 +678,20 @@ static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::pair_retrieve_oute
}();
auto const grid_size = (cg_size() * num_pairs + stride * block_size - 1) / (stride * block_size);

CUCO_CUDA_TRY(cudaMemsetAsync(d_counter_.get(), 0, sizeof(atomic_ctr_type), stream));
std::size_t h_counter;
auto counter = detail::counter_storage<size_type, Scope, allocator_type>{allocator_};
counter.reset(stream);

detail::pair_retrieve<block_size, flushing_cg_size, cg_size(), buffer_size, is_outer>
<<<grid_size, block_size, 0, stream>>>(first,
num_pairs,
probe_output_begin,
contained_output_begin,
d_counter_.get(),
counter.data(),
view,
pair_equal);

CUCO_CUDA_TRY(cudaMemcpyAsync(
&h_counter, d_counter_.get(), sizeof(atomic_ctr_type), cudaMemcpyDeviceToHost, stream));
CUCO_CUDA_TRY(cudaStreamSynchronize(stream));

return std::make_pair(probe_output_begin + h_counter, contained_output_begin + h_counter);
auto const h_count = counter.load_to_host(stream);
return {probe_output_begin + h_count, contained_output_begin + h_count};
}

template <typename Key,
Expand Down
37 changes: 9 additions & 28 deletions include/cuco/static_multimap.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -510,17 +510,14 @@ class static_multimap {
using value_type = cuco::pair<Key, Value>; ///< Type of key/value pairs
using key_type = Key; ///< Key type
using mapped_type = Value; ///< Type of mapped values
using size_type = std::size_t; ///< Size type
using atomic_key_type = cuda::atomic<key_type, Scope>; ///< Type of atomic keys
using atomic_mapped_type = cuda::atomic<mapped_type, Scope>; ///< Type of atomic mapped values
using pair_atomic_type =
cuco::pair<atomic_key_type,
atomic_mapped_type>; ///< Pair type of atomic key and atomic mapped value
using atomic_ctr_type = cuda::atomic<std::size_t, Scope>; ///< Atomic counter type
using allocator_type = Allocator; ///< Allocator type
using slot_allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<
using allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<
pair_atomic_type>; ///< Type of the allocator to (de)allocate slots
using counter_allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<
atomic_ctr_type>; ///< Type of the allocator to (de)allocate atomic counters
using probe_sequence_type =
cuco::legacy::detail::probe_sequence<ProbeSequence, Key, Value, Scope>; ///< Probe scheme type

Expand Down Expand Up @@ -923,30 +920,17 @@ class static_multimap {
*/
static constexpr uint32_t warp_size() noexcept { return 32u; }

/**
* @brief Custom deleter for unique pointer of device counter.
*/
struct counter_deleter {
counter_deleter(counter_allocator_type& a) : allocator{a} {}

counter_deleter(counter_deleter const&) = default;

void operator()(atomic_ctr_type* ptr) { allocator.deallocate(ptr, 1); }

counter_allocator_type& allocator;
};

/**
* @brief Custom deleter for unique pointer of slots.
*/
struct slot_deleter {
slot_deleter(slot_allocator_type& a, size_t& c) : allocator{a}, capacity{c} {}
slot_deleter(allocator_type& a, size_t& c) : allocator{a}, capacity{c} {}

slot_deleter(slot_deleter const&) = default;

void operator()(pair_atomic_type* ptr) { allocator.deallocate(ptr, capacity); }

slot_allocator_type& allocator;
allocator_type& allocator;
size_t& capacity;
};

Expand Down Expand Up @@ -1699,14 +1683,11 @@ class static_multimap {
}

private:
std::size_t capacity_{}; ///< Total number of slots
Key empty_key_sentinel_{}; ///< Key value that represents an empty slot
Value empty_value_sentinel_{}; ///< Initial value of empty slot
slot_allocator_type slot_allocator_{}; ///< Allocator used to allocate slots
counter_allocator_type counter_allocator_{}; ///< Allocator used to allocate counters
counter_deleter delete_counter_; ///< Custom counter deleter
slot_deleter delete_slots_; ///< Custom slots deleter
std::unique_ptr<atomic_ctr_type, counter_deleter> d_counter_{}; ///< Preallocated device counter
std::size_t capacity_{}; ///< Total number of slots
Key empty_key_sentinel_{}; ///< Key value that represents an empty slot
Value empty_value_sentinel_{}; ///< Initial value of empty slot
allocator_type allocator_{}; ///< Allocator used to allocate slots
slot_deleter delete_slots_; ///< Custom slots deleter
std::unique_ptr<pair_atomic_type, slot_deleter> slots_{}; ///< Pointer to flat slots storage
}; // class static_multimap

Expand Down

0 comments on commit 02d36a5

Please sign in to comment.