-
Notifications
You must be signed in to change notification settings - Fork 100
Priority Queue #105
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
andrewbriand
wants to merge
56
commits into
NVIDIA:dev
Choose a base branch
from
andrewbriand:dev
base: dev
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+2,215
−0
Open
Priority Queue #105
Changes from 26 commits
Commits
Show all changes
56 commits
Select commit
Hold shift + click to select a range
5ab856e
Initial priority queue commit
1f2092c
Add priority queue benchmark
6a9dc99
Class comment
6b263e3
Improve comments and switch to cuco style
0eaaedf
Iterators
249165c
Test for iterators with thrust device_vector
c28a5ad
Add allocator template parameter
e8a9c1e
Allocator
andrewbriand 012ebde
Accept arbitrary comparison
andrewbriand 8cf681a
Accept arbitrary types instead of just pairs
andrewbriand 8485bec
Remove pq_pair.h
andrewbriand da608cc
Start porting priority queue benchmark to gbenchmark
andrewbriand 8a11b7f
Finish porting priority queue benchmark to gbenchmark
andrewbriand d1392b9
Add multiple node sizes to benchmark
andrewbriand 9ee6c8b
Start porting tests to Catch2
andrewbriand e223598
Prevent block size from being larger than node size
andrewbriand dd8c6b7
Continue porting tests to Catch2
andrewbriand d031519
Make generate_element for KVPair generic
andrewbriand ba3a6fd
Finish Catch2 tests
andrewbriand 16db085
Hide kernel launch details
andrewbriand 052cec0
Clean up partial deletion code
andrewbriand a11bea5
Correct test comparisons
andrewbriand e3c4a27
Commenting and cleanup
andrewbriand f6fa484
Commenting for Compare
andrewbriand 599067f
Cleanup, arbitrary number of elements for device API functions
andrewbriand 44db340
Formatting
andrewbriand acfdf7e
Add missing syncs
andrewbriand d870e29
Merge NVIDIA:dev into andrewbriand:dev
andrewbriand 71775b6
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] 9838569
Add copyright to priority_queue_bench.cu
andrewbriand aab4ba0
Add copyright to priority queue files
andrewbriand 0196bde
Order headers from near to far in priority queue files
andrewbriand 4af61ca
Bug fix in priority queue test code
andrewbriand a1d074a
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] bf930dd
Remove unnecessary allocator
andrewbriand 2d9bda9
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] 54dc9f3
Add missing member docs in priority_queue.cuh
andrewbriand a5c169d
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] 4269e9c
Add stream parameter to priority queue ctor
andrewbriand 30cbf83
Snake case in priority queue files
andrewbriand bec63f3
Put priority queue kernels in detail namespace
andrewbriand aa12404
generate_keys_uniform -> generate_kv_pairs_uniform
andrewbriand 55cf2e6
Remove FavorInsertionPerformance template parameter
andrewbriand f4814db
Default node size 64 -> 1024
andrewbriand 89eea18
Avoid c-style expressions in priority queue files
andrewbriand 7d47200
Remove FavorInsertionPerformance in priority queue benchmark
andrewbriand 007316a
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] 192e263
Snake case in priority_queue_test.cu
andrewbriand 66dd359
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] 9da822f
kPBufferIdx -> p_buffer_idx and kRootIdx -> root_idx
andrewbriand 0cfdd94
Use const and constexpr wherever possible in priority queue files
andrewbriand 828b00b
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] 1932418
Add missing const in priority queue
andrewbriand 7c4b1f6
Add docs for stream parameter to priority queue ctor
andrewbriand 838e4ea
Add value_type to priority_queue::device_mutable_view
andrewbriand d58dd9f
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,100 @@ | ||
| #include <vector> | ||
| #include <cstdint> | ||
| #include <random> | ||
|
|
||
| #include <benchmark/benchmark.h> | ||
|
|
||
| #include <cuco/priority_queue.cuh> | ||
| #include <cuco/detail/pair.cuh> | ||
|
|
||
| #include <thrust/device_vector.h> | ||
|
|
||
| using namespace cuco; | ||
|
|
||
| template <typename T> | ||
| struct pair_less { | ||
| __host__ __device__ bool operator()(const T& a, const T& b) const { | ||
| return a.first < b.first; | ||
| } | ||
| }; | ||
|
|
||
// Fills [output_begin, output_end) with random key/value pairs.
// Key and value are drawn independently from a Mersenne Twister seeded by
// std::random_device, then narrowed to Key / Value via static_cast.
template <typename Key, typename Value, typename OutputIt>
static void generate_keys_uniform(OutputIt output_begin, OutputIt output_end)
{
  std::random_device rd;
  std::mt19937 gen{rd()};

  // Iterate with the iterator itself: the original indexed with an `int`
  // counter (`auto i = 0`), which could overflow for ranges longer than
  // INT_MAX and needlessly required a random-access iterator.
  for (auto it = output_begin; it != output_end; ++it) {
    *it = {static_cast<Key>(gen()), static_cast<Value>(gen())};
  }
}
|
|
||
| template <typename Key, typename Value, int NumKeys, | ||
| bool FavorInsertionPerformance> | ||
| static void BM_insert(::benchmark::State& state) | ||
| { | ||
| for (auto _ : state) { | ||
| state.PauseTiming(); | ||
|
|
||
| priority_queue<pair<Key, Value>, pair_less<pair<Key, Value>>, | ||
| FavorInsertionPerformance> pq(NumKeys); | ||
|
|
||
| std::vector<pair<Key, Value>> h_pairs(NumKeys); | ||
| generate_keys_uniform<Key, Value>(h_pairs.begin(), h_pairs.end()); | ||
| thrust::device_vector<pair<Key, Value>> d_pairs(h_pairs); | ||
|
|
||
| state.ResumeTiming(); | ||
| pq.push(d_pairs.begin(), d_pairs.end()); | ||
| cudaDeviceSynchronize(); | ||
| } | ||
|
|
||
| } | ||
|
|
||
| template <typename Key, typename Value, int NumKeys, | ||
| bool FavorInsertionPerformance> | ||
| static void BM_delete(::benchmark::State& state) | ||
| { | ||
| for (auto _ : state) { | ||
| state.PauseTiming(); | ||
|
|
||
| priority_queue<pair<Key, Value>, pair_less<pair<Key, Value>>, | ||
| FavorInsertionPerformance> pq(NumKeys); | ||
|
|
||
| std::vector<pair<Key, Value>> h_pairs(NumKeys); | ||
| generate_keys_uniform<Key, Value>(h_pairs.begin(), h_pairs.end()); | ||
| thrust::device_vector<pair<Key, Value>> d_pairs(h_pairs); | ||
|
|
||
| pq.push(d_pairs.begin(), d_pairs.end()); | ||
| cudaDeviceSynchronize(); | ||
|
|
||
| state.ResumeTiming(); | ||
| pq.pop(d_pairs.begin(), d_pairs.end()); | ||
| cudaDeviceSynchronize(); | ||
| } | ||
|
|
||
| } | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, false) | ||
| ->Unit(benchmark::kMillisecond); | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, false) | ||
| ->Unit(benchmark::kMillisecond); | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, false) | ||
| ->Unit(benchmark::kMillisecond); | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, false) | ||
| ->Unit(benchmark::kMillisecond); | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, true) | ||
| ->Unit(benchmark::kMillisecond); | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, true) | ||
| ->Unit(benchmark::kMillisecond); | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, true) | ||
| ->Unit(benchmark::kMillisecond); | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, true) | ||
| ->Unit(benchmark::kMillisecond); | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,175 @@ | ||
#pragma once

#include <cuco/detail/error.hpp>
#include <cuco/detail/priority_queue_kernels.cuh>

#include <algorithm>
#include <cmath>
#include <cstddef>
|
|
||
| namespace cuco { | ||
|
|
||
| template <typename T, typename Compare, bool FavorInsertionPerformance, | ||
| typename Allocator> | ||
| priority_queue<T, Compare, FavorInsertionPerformance, | ||
| Allocator>::priority_queue | ||
| (size_t initial_capacity, | ||
| Allocator const& allocator) : | ||
| allocator_{allocator}, | ||
| int_allocator_{allocator}, | ||
| t_allocator_{allocator}, | ||
| size_t_allocator_{allocator} { | ||
|
|
||
| node_size_ = NodeSize; | ||
|
|
||
| // Round up to the nearest multiple of node size | ||
| int nodes = ((initial_capacity + node_size_ - 1) / node_size_); | ||
|
|
||
| node_capacity_ = nodes; | ||
| lowest_level_start_ = 1 << (int)log2(nodes); | ||
|
|
||
| // Allocate device variables | ||
|
|
||
| d_size_ = std::allocator_traits<int_allocator_type>::allocate(int_allocator_, | ||
| 1); | ||
|
|
||
| CUCO_CUDA_TRY(cudaMemset(d_size_, 0, sizeof(int))); | ||
|
|
||
| d_p_buffer_size_ = std::allocator_traits<size_t_allocator_type> | ||
| ::allocate(size_t_allocator_, 1); | ||
|
|
||
| CUCO_CUDA_TRY(cudaMemset(d_p_buffer_size_, 0, sizeof(size_t))); | ||
|
|
||
| d_heap_ = std::allocator_traits<t_allocator_type> | ||
| ::allocate(t_allocator_, | ||
| node_capacity_ * node_size_ + node_size_); | ||
|
|
||
| d_locks_ = std::allocator_traits<int_allocator_type> | ||
| ::allocate(int_allocator_, node_capacity_ + 1); | ||
|
|
||
| CUCO_CUDA_TRY(cudaMemset(d_locks_, 0, | ||
| sizeof(int) * (node_capacity_ + 1))); | ||
|
|
||
|
|
||
| } | ||
|
|
||
| template <typename T, typename Compare, bool FavorInsertionPerformance, | ||
| typename Allocator> | ||
| priority_queue<T, Compare, FavorInsertionPerformance, | ||
| Allocator>::~priority_queue() { | ||
| std::allocator_traits<int_allocator_type>::deallocate(int_allocator_, | ||
| d_size_, 1); | ||
| std::allocator_traits<size_t_allocator_type>::deallocate(size_t_allocator_, | ||
| d_p_buffer_size_, 1); | ||
| std::allocator_traits<t_allocator_type>::deallocate(t_allocator_, | ||
| d_heap_, | ||
| node_capacity_ * node_size_ + node_size_); | ||
| std::allocator_traits<int_allocator_type>::deallocate(int_allocator_, | ||
| d_locks_, | ||
| node_capacity_ + 1); | ||
| } | ||
|
|
||
|
|
||
| template <typename T, typename Compare, bool FavorInsertionPerformance, | ||
| typename Allocator> | ||
| template <typename InputIt> | ||
| void priority_queue<T, Compare, FavorInsertionPerformance, | ||
| Allocator>::push(InputIt first, | ||
| InputIt last, | ||
| cudaStream_t stream) { | ||
|
|
||
| const int kBlockSize = min(256, (int)node_size_); | ||
PointKernel marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| const int kNumBlocks = min(64000, | ||
| max(1, (int)((last - first) / node_size_))); | ||
|
|
||
| PushKernel<<<kNumBlocks, kBlockSize, | ||
| get_shmem_size(kBlockSize), stream>>> | ||
| (first, last - first, d_heap_, d_size_, | ||
| node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_, | ||
| compare_); | ||
|
|
||
| CUCO_CUDA_TRY(cudaGetLastError()); | ||
| } | ||
|
|
||
| template <typename T, typename Compare, bool FavorInsertionPerformance, | ||
| typename Allocator> | ||
| template <typename OutputIt> | ||
| void priority_queue<T, Compare, FavorInsertionPerformance, | ||
| Allocator>::pop(OutputIt first, | ||
| OutputIt last, | ||
| cudaStream_t stream) { | ||
|
|
||
| int pop_size = last - first; | ||
| const int partial = pop_size % node_size_; | ||
|
|
||
| const int kBlockSize = min(256, (int)node_size_); | ||
| const int kNumBlocks = min(64000, | ||
| max(1, (int)((pop_size - partial) / node_size_))); | ||
|
|
||
| PopKernel<<<kNumBlocks, kBlockSize, | ||
| get_shmem_size(kBlockSize), stream>>> | ||
| (first, pop_size, d_heap_, d_size_, | ||
| node_size_, d_locks_, d_p_buffer_size_, | ||
| lowest_level_start_, node_capacity_, compare_); | ||
|
|
||
| CUCO_CUDA_TRY(cudaGetLastError()); | ||
| } | ||
|
|
||
| template <typename T, typename Compare, bool FavorInsertionPerformance, | ||
| typename Allocator> | ||
| template <typename CG, typename InputIt> | ||
| __device__ void priority_queue<T, Compare, | ||
| FavorInsertionPerformance, Allocator> | ||
| ::device_mutable_view::push( | ||
| CG const& g, | ||
| InputIt first, | ||
| InputIt last, | ||
| void *temp_storage) { | ||
|
|
||
| SharedMemoryLayout<T> shmem = | ||
| GetSharedMemoryLayout<T>((int*)temp_storage, | ||
| g.size(), node_size_); | ||
|
|
||
| auto push_size = last - first; | ||
| for (size_t i = 0; i < push_size / node_size_; i++) { | ||
| PushSingleNode(g, first + i * node_size_, d_heap_, d_size_, node_size_, | ||
| d_locks_, lowest_level_start_, shmem, compare_); | ||
| } | ||
|
|
||
| if (push_size % node_size_ != 0) { | ||
| PushPartialNode(g, first + (push_size / node_size_) * node_size_, | ||
| push_size % node_size_, d_heap_, | ||
| d_size_, node_size_, d_locks_, | ||
| d_p_buffer_size_, lowest_level_start_, shmem, | ||
| compare_); | ||
| } | ||
| } | ||
|
|
||
| template <typename T, typename Compare, bool FavorInsertionPerformance, | ||
| typename Allocator> | ||
| template <typename CG, typename OutputIt> | ||
| __device__ void priority_queue<T, Compare, | ||
| FavorInsertionPerformance, Allocator> | ||
| ::device_mutable_view::pop( | ||
| CG const& g, | ||
| OutputIt first, | ||
| OutputIt last, | ||
| void *temp_storage) { | ||
| SharedMemoryLayout<T> shmem = | ||
| GetSharedMemoryLayout<T>((int*)temp_storage, | ||
| g.size(), node_size_); | ||
|
|
||
| auto pop_size = last - first; | ||
| for (size_t i = 0; i < pop_size / node_size_; i++) { | ||
| PopSingleNode(g, first + i * node_size_, | ||
| d_heap_, d_size_, node_size_, d_locks_, | ||
| d_p_buffer_size_, lowest_level_start_, | ||
| node_capacity_, shmem, compare_); | ||
| } | ||
|
|
||
| if (pop_size % node_size_ != 0) { | ||
| PopPartialNode(g, first + (pop_size / node_size_) * node_size_, | ||
| last - first, d_heap_, d_size_, node_size_, | ||
| d_locks_, d_p_buffer_size_, lowest_level_start_, | ||
| node_capacity_, shmem, compare_); | ||
| } | ||
| } | ||
|
|
||
| } | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.