From 5ab856e8031996c5e85188e382ffd2eedf5fc0cc Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Thu, 9 Sep 2021 15:20:11 -0700 Subject: [PATCH 01/55] Initial priority queue commit --- include/cuco/detail/pq_pair.cuh | 21 + include/cuco/detail/priority_queue.inl | 158 ++ .../cuco/detail/priority_queue_kernels.cuh | 1348 +++++++++++++++++ include/cuco/priority_queue.cuh | 163 ++ tests/CMakeLists.txt | 4 + tests/priority_queue/priority_queue_test.cu | 1185 +++++++++++++++ 6 files changed, 2879 insertions(+) create mode 100644 include/cuco/detail/pq_pair.cuh create mode 100644 include/cuco/detail/priority_queue.inl create mode 100644 include/cuco/detail/priority_queue_kernels.cuh create mode 100644 include/cuco/priority_queue.cuh create mode 100644 tests/priority_queue/priority_queue_test.cu diff --git a/include/cuco/detail/pq_pair.cuh b/include/cuco/detail/pq_pair.cuh new file mode 100644 index 000000000..5edf31dfa --- /dev/null +++ b/include/cuco/detail/pq_pair.cuh @@ -0,0 +1,21 @@ +#pragma once + +namespace cuco { + +template +struct Pair { + Key key; + Value value; +}; + +/* +* Check if two Pairs have the same key and value +* @param a The first pair +* @param b The second pair +*/ +template +bool operator==(const Pair &a, const Pair &b) { + return a.key == b.key && a.value == b.value; +} + +} diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl new file mode 100644 index 000000000..d9cfa7f76 --- /dev/null +++ b/include/cuco/detail/priority_queue.inl @@ -0,0 +1,158 @@ +#pragma once +#include + +#include +#include + +namespace cuco { + +template +priority_queue::priority_queue(size_t initial_capacity, + size_t node_size) { + + node_size_ = node_size; + + // Round up to the nearest multiple of node size + int nodes = ((initial_capacity + node_size_ - 1) / node_size_); + + node_capacity_ = nodes; + lowest_level_start_ = 1 << (int)log2(nodes); + + // Allocate device variables + + CUCO_CUDA_TRY(cudaMalloc((void**)&d_size_, sizeof(int))); + + CUCO_CUDA_TRY(cudaMemset(d_size_, 0, sizeof(int))); + + CUCO_CUDA_TRY(cudaMalloc((void**)&d_p_buffer_size_, sizeof(size_t))); + + CUCO_CUDA_TRY(cudaMemset(d_p_buffer_size_, 0, sizeof(size_t))); + + CUCO_CUDA_TRY(cudaMalloc((void**)&d_heap_, + sizeof(Pair) + * (node_capacity_ * node_size_ + node_size_))); + + CUCO_CUDA_TRY(cudaMalloc((void**)&d_locks_, + sizeof(int) * (node_capacity_ + 1))); + + CUCO_CUDA_TRY(cudaMemset(d_locks_, 0, + sizeof(int) * (node_capacity_ + 1))); + + CUCO_CUDA_TRY(cudaMalloc((void**)&d_pop_tracker_, sizeof(int))); + +} + +template +priority_queue::~priority_queue() { + CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_size_)); + CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_p_buffer_size_)); + CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_heap_)); + CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_locks_)); + CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_pop_tracker_)); +} + + +template +void priority_queue::push(Pair *elements, + size_t num_elements, + int block_size, + int grid_size, + bool warp_level, + cudaStream_t stream) { + + const int kBlockSize = block_size; + const int kNumBlocks = grid_size; + + if (!warp_level) { + PushKernel<<>> + (elements, num_elements, d_heap_, d_size_, + node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_); + } else { + PushKernelWarp<<>> + (elements, num_elements, d_heap_, d_size_, + node_size_, d_locks_, d_p_buffer_size_, + lowest_level_start_, get_shmem_size(32)); + } + + CUCO_CUDA_TRY(cudaGetLastError()); +} + +template +void priority_queue::pop(Pair *out, + size_t num_elements, + int block_size, + int 
grid_size, + bool warp_level, + cudaStream_t stream) { + + const int kBlockSize = block_size; + const int kNumBlocks = grid_size; + + cudaMemset(d_pop_tracker_, 0, sizeof(int)); + if (!warp_level) { + PopKernel<<>> + (out, num_elements, d_heap_, d_size_, + node_size_, d_locks_, d_p_buffer_size_, + d_pop_tracker_, lowest_level_start_, node_capacity_); + } else { + PopKernelWarp<<>> + (out, num_elements, d_heap_, d_size_, + node_size_, d_locks_, d_p_buffer_size_, + d_pop_tracker_, lowest_level_start_, + node_capacity_, get_shmem_size(32)); + + } + + CUCO_CUDA_TRY(cudaGetLastError()); +} + +template +template +__device__ void priority_queue::device_mutable_view::push( + CG const& g, + Pair *elements, + size_t num_elements, + void *temp_storage) { + + SharedMemoryLayout shmem = + GetSharedMemoryLayout((int*)temp_storage, + g.size(), node_size_); + if (num_elements == node_size_) { + PushSingleNode(g, elements, d_heap_, d_size_, node_size_, + d_locks_, lowest_level_start_, shmem); + } else if (num_elements < node_size_) { + PushPartialNode(g, elements, num_elements, d_heap_, + d_size_, node_size_, d_locks_, + d_p_buffer_size_, lowest_level_start_, shmem); + } +} + +template +template +__device__ void priority_queue::device_mutable_view::pop( + CG const& g, + Pair *out, + size_t num_elements, + void *temp_storage) { + int pop_tracker = 0; + + SharedMemoryLayout shmem = + GetSharedMemoryLayout((int*)temp_storage, + g.size(), node_size_); + + if (num_elements == node_size_) { + PopSingleNode(g, out, d_heap_, d_size_, node_size_, d_locks_, + d_p_buffer_size_, &pop_tracker, lowest_level_start_, + node_capacity_, shmem); + } else { + PopPartialNode(g, out, num_elements, d_heap_, d_size_, node_size_, + d_locks_, d_p_buffer_size_, lowest_level_start_, + node_capacity_, shmem); + } +} + +} diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh new file mode 100644 index 000000000..1d9886f68 --- /dev/null +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -0,0 +1,1348 @@ +#pragma once + +#include +#include +#include + +using namespace cooperative_groups; + +namespace cuco { + +constexpr int kPBufferIdx = 0; +constexpr int kRootIdx = 1; + +/* +* Struct to hold pointers to the temp storage used by the priority +* queue's kernels and functions. +* Ideally, this temp storage is in shared memory +*/ +template +struct SharedMemoryLayout { + int *intersections; + Pair *A; + Pair *B; +}; + +/* +* Get the shared memory layout for a given group dimension +* and node size. 
+* +* @param s Pointer to the beginning of the section of shared memory to +* partition +* @param dim Size of the cooperative group the memory will be used by +* @param node_size Size of the nodes in this priority queue +* @returns The memory layout for the given group dimension and node size +*/ +template +__device__ SharedMemoryLayout GetSharedMemoryLayout( + int *s, int dim, size_t node_size) { + + SharedMemoryLayout result; + result.intersections = s; + result.A = (Pair*)(s + 2 * (dim + 1)); + result.B = result.A + node_size; + return result; +} + +/** +* Acquires lock l for the current thread block +* The entire thread block must call the function +* +* @param g The cooperative group that will acquire the lock +* @param l Pointer to the lock to be acquired +*/ +template +__device__ void AcquireLock(CG const& g, int *l) { + if (g.thread_rank() == 0) { + while (atomicCAS(l, 0, 1) != 0); + } + __threadfence(); + g.sync(); +} + +/** +* Releases lock l for the current thread block +* +* @param g The cooperative group that will release the lock +* @param l Pointer to the lock to be released +*/ +template +__device__ void ReleaseLock(CG const& g, int *l) { + if (g.thread_rank() == 0) { + atomicExch(l, 0); + } +} + +/** +* Copy node_size pairs from src to dst +* +* @param g The cooperative group that will perform the copy +* @param dst Pointer to the beginning of the destination array +* @param src Pointer to the beginning of the source array +*/ +template +__device__ void CopyPairs(CG const& g, Pair *dst, + Pair *src, + size_t node_size) { + for (size_t i = g.thread_rank(); i < node_size; i += g.size()) { + dst[i] = src[i]; + } +} + +/* +* Compare two elements depending on whether this is a max or +* min queue. +* +* @param a The first element to be compared +* @param b The second element to be compared +* @returns If Max, returns true iff a.key >= b.key, +* otherwise returns true iff a.key <= b.key. +*/ +template +__device__ bool compare(const Pair& a, const Pair& b) { + if (Max) { + return a.key >= b.key; + } else { + return a.key <= b.key; + } +} + +/** +* Merge arrays a and b of size node_size by key, putting the +* node_size elements with the lowest keys in lo, sorted by key, and the +* node_size elements with the highest keys in hi, sorted by key +* +* @param g The cooperative group that will perform the merge and sort +* @param a The first array of pairs to be merged, sorted by key +* @param b The second array of pairs to be merged, sorted by key +* @param lo The array in which the node_size elements with the lowest keys +* will be placed when the merge is completed +* @param hi The array in which the node_size elements with the highest keys +* will be placed when the merge is completed +* @param node_size The size of arrays a, b, lo, and hi +* @param shmem The shared memory layout for this cooperative group +*/ +template +__device__ void MergeAndSort(CG const& g, + Pair *a, + Pair *b, + Pair *lo, + Pair *hi, + size_t node_size, + SharedMemoryLayout shmem) { + MergeAndSort(g, a, b, lo, hi, node_size, node_size, node_size, shmem); +} + +/** +* Merge array a of size num_elements_a and array b of size num_elements_b +* by key. If num_elements_a + num_elements_b <= node_size, all merged elements +* will be placed in lo. Otherwise, the node_size lowest merged elements will +* be placed in lo, and the rest of the elements will be placed in hi. 
+* +* @param g The cooperative group that will perform the merge and sort +* @param a The first array of pairs to be merged, sorted by key +* @param b The second array of pairs to be merged, sorted by key +* @param lo The array in which the node_size elements with the lowest keys +* will be placed when the merge is completed +* @param hi The array in which the node_size elements with the highest keys +* will be placed when the merge is completed, +* if num_elements_a + num_elements_b > node_size. May be nullptr in +* the case that num_elements_a + num_elements_b < node_size. +* @param num_elements_a The number of pairs in array a +* @param num_elements_b The number of pairs in array b +* @param node_size The size of arrays hi and lo, in other words how many +* elements to insert into lo before starting insertion into +* hi +* @param shmem The shared memory layout for this cooperative group +*/ +template +__device__ void MergeAndSort(CG const& g, + Pair *a, + Pair *b, + Pair *lo, + Pair *hi, + size_t num_elements_a, + size_t num_elements_b, + size_t node_size, + SharedMemoryLayout shmem) { + + int lane = g.thread_rank(); + int dim = g.size(); + + if (num_elements_a == node_size && + compare(a[node_size - 1], b[0])) { + + CopyPairs(g, lo, a, num_elements_a); + + CopyPairs(g, hi, b, num_elements_b); + return; + } + + if (num_elements_b == node_size && + compare(b[node_size - 1], a[0])) { + + CopyPairs(g, hi, a, num_elements_a); + + CopyPairs(g, lo, b, num_elements_b); + return; + } + + // Array of size 2 * (blockDim.x + 1) + int *intersections = shmem.intersections; + + + if (lane == 0) { + intersections[0] = 0; + intersections[1] = 0; + + intersections[2 * dim] = node_size; + intersections[2 * dim + 1] = node_size; + } + + // Calculate the diagonal spacing + int p = 2 * node_size / dim; + + // There will be one less diagonal than threads + if (threadIdx.x != 0) { + // i + j = (p * threadIdx.x - 1) + int j_bl = min((int)node_size - 1, p * lane - 1); + int i_bl = (p * lane - 1) - j_bl; + + int diag_len = min(p * lane, (int)node_size - i_bl); + + // Will be the location of the rightmost one + // in the merge-path grid in terms of array a + int rightmost_one = i_bl - 1; + + // Location of leftmost zero + int leftmost_zero = i_bl + diag_len; + + // Binary search along the diagonal + while (leftmost_zero - rightmost_one > 1) { + + int i = (rightmost_one + leftmost_zero) / 2; + int j = (p * lane - 1) - i; + + if (i >= num_elements_a) { + leftmost_zero = i; + } else if (j >= num_elements_b || compare(a[i], b[j])) { + rightmost_one = i; + } else { + leftmost_zero = i; + } + + } + + intersections[2 * lane] = leftmost_zero; + intersections[2 * lane + 1] = (p * lane - 1) + - leftmost_zero + 1; + + } + + g.sync(); + + // Get the intersection that starts this partition + int i = intersections[2 * lane]; + int j = intersections[2 * lane + 1]; + + // Get the intersection that ends this partition + int i_max = min(intersections[2 * (lane + 1)], (int)num_elements_a); + int j_max = min(intersections[2 * (lane + 1) + 1], + (int)num_elements_b); + + // Insert location into the output array + int ins_loc = lane * p; + + // Merge our partition into the output arrays + while (i < i_max && j < j_max) { + Pair next_element; + if (compare(a[i], b[j])) { + next_element = a[i]; + i++; + } else { + next_element = b[j]; + j++; + } + if (ins_loc < node_size) { + lo[ins_loc] = next_element; + } else { + hi[ins_loc - node_size] = next_element; + } + ins_loc++; + } + + // Insert the any remaining elements in a + while (i < 
i_max) { + if (ins_loc < node_size) { + lo[ins_loc] = a[i]; + i++; + } else { + hi[ins_loc - node_size] = a[i]; + i++; + } + ins_loc++; + } + + // Insert any remaining elements in b + while (j < j_max) { + if (ins_loc < node_size) { + lo[ins_loc] = b[j]; + j++; + } else { + hi[ins_loc - node_size] = b[j]; + j++; + } + ins_loc++; + } +} + +/** +* Sorts the len pairs at start by key +* +* @param g The cooperative group that will perform the sort +* @param start Pointer to the array to be sorted +* @param len Number of pairs to be sorted +* @param node_size A power of two corresponding to the number of pairs +* temp can contain +* @param temp A temporary array containing space for at least the nearest +* power of two greater than len pairs +*/ +template +__device__ void PBSort(CG const& g, Pair *start, size_t len, + size_t node_size, + Pair *temp) { + + + int lane = g.thread_rank(); + int dim = g.size(); + + char *mask = (char*)temp; + + for (int i = lane; i < node_size; i += dim) { + mask[i] = i < len; + } + g.sync(); + + // Build a bitonic sequence + for (int width = 2; width < node_size; width *= 2) { + for (int jump = width / 2; jump >= 1; jump /= 2) { + for (int i = lane; i < node_size / 2; i += dim) { + int start_jump = width / 2; + int left = (i / jump) * jump * 2 + i % jump; + int right = left + jump; + if ((i / start_jump) % 2 == 0) { + if (!mask[left] || (mask[right] && + !compare(start[left], start[right]))) { + auto temp = start[left]; + start[left] = start[right]; + start[right] = temp; + + auto temp_mask = mask[left]; + mask[left] = mask[right]; + mask[right] = temp_mask; + } + } else { + if (!mask[right] || (mask[left] + && compare(start[left], start[right]))) { + auto temp = start[left]; + start[left] = start[right]; + start[right] = temp; + + auto temp_mask = mask[left]; + mask[left] = mask[right]; + mask[right] = temp_mask; + } + } + } + g.sync(); + } + } + + // Merge to get the sorted result + for (int jump = node_size / 2; jump >= 1; jump /= 2) { + for (int i = lane; i < node_size / 2; i += dim) { + int left = (i / jump) * jump * 2 + i % jump; + int right = left + jump; + if (!mask[left] || (mask[right] + && !compare(start[left], start[right]))) { + auto temp = start[left]; + start[left] = start[right]; + start[right] = temp; + + auto temp_mask = mask[left]; + mask[left] = mask[right]; + mask[right] = temp_mask; + } + } + g.sync(); + } +} + +/** +* Reverses the bits after the most significant set bit in x +* i.e. if x is 1abc..xyz in binary returns 1zyx...cba +* +* @param x The number whose lower bits will be reversed +* @return The number with all bits after the most significant +* set bit reversed +*/ +__device__ int BitReversePerm(int x) { + int clz = __clz(x); + + int bits = sizeof(int) * 8; + int high_bit = 1 << ((bits - 1) - clz); + int mask = high_bit - 1; + + int masked = x & mask; + int rev = __brev(masked) >> (clz + 1); + + return high_bit | rev; +} + +/** +* Given x, the idx of a node, return when that node is inserted, +* i.e. if x is 6 and lowest_level_start > 6, return 5 since the node +* at element 6 will be the 5th to be inserted with the bit reversal +* permutation. This operation is its own inverse. 
+* +* @param x The index to operate on +* @param lowest_level_start Index of the first node in the last level of the +* heap +*/ +__device__ int InsertionOrderIndex(int x, int lowest_level_start) { + assert(x > 0); + + if (x >= lowest_level_start) { + return x; + } + + return BitReversePerm(x); +} + +/** +* Find the index of the parent of the node at index x +* +* @param x The index to operate on +* @param lowest_level_start Index of the first node in the last level of the +* heap +* @return The index of the parent of x +*/ +__device__ int Parent(int x, int lowest_level_start) { + + assert(x > 0); + if (x >= lowest_level_start) { + return BitReversePerm(x) / 2; + } + + return x / 2; +} + +/** +* Find the index of the left child of the node at index x +* +* @param x The index to operate on +* @param lowest_level_start Index of the first node in the last level of the +* heap +* @return The index of the left child of x +*/ +__device__ int LeftChild(int x, int lowest_level_start) { + assert(x > 0); + int result = x * 2; + + if (result >= lowest_level_start) { + result = BitReversePerm(result); + } + + return result; +} + +/** +* Find the index of the right child of the node at index x +* +* @param x The index to operate on +* @param lowest_level_start Index of the first node in the last level of the +* heap +* @return The index of the right child of x +*/ +__device__ int RightChild(int x, int lowest_level_start) { + + assert(x > 0); + int result = x * 2 + 1; + + if (result >= lowest_level_start) { + result = BitReversePerm(result); + } + + return result; +} + +/** +* Swim node cur_node up the heap +* Pre: g must hold the lock corresponding to cur_node +* +* @param g The cooperative group that will perform the operation +* @param cur_node Index of the node to swim +* @param heap The array of pairs that stores the heap itself +* @param size Pointer to the number of pairs currently in the heap +* @param node_size Size of the nodes in the heap +* @param locks Array of locks, one for each node in the heap +* @param lowest_level_start Index of the first node in the last level of the +* heap +* @param shmem The shared memory layout for this cooperative group +*/ +template +__device__ void Swim(CG const& g, + int cur_node, + Pair *heap, + int *size, + size_t node_size, + int *locks, + int lowest_level_start, + SharedMemoryLayout shmem) { + + int lane = g.thread_rank(); + int dim = g.size(); + + int parent = Parent(cur_node, lowest_level_start); + + // Swim the new node up the tree + while (cur_node != 1) { + AcquireLock(g, &(locks[parent])); + + // If the heap property is already satisfied for this node and its + // parent we are done + if (!compare(heap[cur_node * node_size], + heap[parent * node_size + node_size - 1])) { + ReleaseLock(g, &(locks[parent])); + break; + } + + MergeAndSort(g, &heap[parent * node_size], + &heap[cur_node * node_size], + shmem.A, + shmem.B, + node_size, + shmem); + + g.sync(); + + CopyPairs(g, &heap[parent * node_size], shmem.A, node_size); + CopyPairs(g, &heap[cur_node * node_size], shmem.B, node_size); + + ReleaseLock(g, &(locks[cur_node])); + cur_node = parent; + parent = Parent(cur_node, lowest_level_start); + } + + ReleaseLock(g, &(locks[cur_node])); + +} + +/** +* Sink the root down the heap +* Pre: g must hold the root's lock +* +* @param g The cooperative group that will perform the operation +* @param heap The array of pairs that stores the heap itself +* @param size Pointer to the number of pairs currently in the heap +* @param node_size Size of the nodes in the 
heap +* @param locks Array of locks, one for each node in the heap +* @param lowest_level_start Index of the first node in the last level of the +* heap +* @param node_capacity Max capacity of the heap in nodes +* @param shmem The shared memory layout for this cooperative group +*/ +template +__device__ void Sink(CG const& g, + Pair *heap, + int *size, + size_t node_size, + int *locks, + size_t *p_buffer_size, + int lowest_level_start, + int node_capacity, + SharedMemoryLayout shmem) { + + size_t cur = kRootIdx; + + int dim = g.size(); + + // Sink the node + while (InsertionOrderIndex(LeftChild(cur, lowest_level_start), + lowest_level_start) <= node_capacity) { + + size_t left = LeftChild(cur, lowest_level_start); + size_t right = RightChild(cur, lowest_level_start); + + AcquireLock(g, &locks[left]); + + // The left node might have been removed + // since the while loop condition, in which + // case we are already at the bottom of the heap + if (InsertionOrderIndex(left, lowest_level_start) > *size) { + ReleaseLock(g, &locks[left]); + break; + } + + size_t lo; + + if (InsertionOrderIndex(right, lowest_level_start) <= node_capacity) { + AcquireLock(g, &locks[right]); + + // Note that even with the bit reversal permutation, + // we can never have a right child without a left child + // + // If we have both children, merge and sort them + if (InsertionOrderIndex(right, lowest_level_start) <= *size) { + + size_t hi; + + // In order to ensure we preserve the heap property, + // we put the largest node_size elements in the child + // that previously contained the largest element + if (!compare(heap[(left+1) * node_size - 1], + heap[(right+1) * node_size - 1])) { + hi = left; + lo = right; + } else { + lo = left; + hi = right; + } + + // Skip the merge and sort if the nodes are already correctly + // sorted + if (!compare(heap[(lo+1) * node_size - 1], + heap[hi * node_size])) { + MergeAndSort(g, &heap[left * node_size], + &heap[right * node_size], + shmem.A, + shmem.B, + node_size, + shmem); + + g.sync(); + + CopyPairs(g, &heap[hi * node_size], shmem.B, node_size); + CopyPairs(g, &heap[lo * node_size], shmem.A, node_size); + + g.sync(); + } + ReleaseLock(g, &locks[hi]); + } else { + lo = left; + ReleaseLock(g, &locks[right]); + } + } else { + lo = left; + } + + // If the heap property is already satisfied between the current + // node and the lower child, we are done return + // + // TODO: can this ever even occur? 
In the paper this is done because + // a max placeholder value is used to indicate unused nodes in the heap + if (!compare(heap[lo * node_size], + heap[(cur + 1) * node_size - 1])) { + ReleaseLock(g, &locks[lo]); + ReleaseLock(g, &locks[cur]); + return; + } + + MergeAndSort(g, &heap[lo * node_size], + &heap[cur * node_size], + shmem.A, + shmem.B, + node_size, + shmem); + + g.sync(); + + CopyPairs(g, &heap[lo * node_size], shmem.B, node_size); + CopyPairs(g, &heap[cur * node_size], shmem.A, node_size); + + g.sync(); + + ReleaseLock(g, &locks[cur]); + + cur = lo; + + } + ReleaseLock(g, &locks[cur]); + +} + +/** +* Add exactly node_size elements into the heap from +* elements +* +* @param g The cooperative group that will perform the push +* @param elements The array of elements to add +* @param heap The array of pairs that stores the heap itself +* @param size Pointer to the number of pairs currently in the heap +* @param node_size Size of the nodes in the heap +* @param locks Array of locks, one for each node in the heap +* @param lowest_level_start Index of the first node in the last level of the +* heap +* @param shmem The shared memory layout for this cooperative group +*/ +template +__device__ void PushSingleNode(CG const& g, + Pair *elements, + Pair *heap, + int *size, + size_t node_size, + int *locks, + int lowest_level_start, + SharedMemoryLayout shmem) { + + int lane = g.thread_rank(); + int dim = g.size(); + + CopyPairs(g, shmem.A, elements, node_size); + + g.sync(); + + PBSort(g, shmem.A, node_size, node_size, shmem.B); + + int *cur_node_temp = (int*)shmem.intersections; + if (lane == 0) { + *cur_node_temp = atomicAdd(size, 1) + 1; + } + g.sync(); + + int cur_node = InsertionOrderIndex(*cur_node_temp, lowest_level_start); + + AcquireLock(g, &(locks[cur_node])); + + CopyPairs(g, &heap[cur_node * node_size], shmem.A, node_size); + + g.sync(); + + Swim(g, cur_node, heap, size, node_size, locks, + lowest_level_start, shmem); + +} + +/** +* Remove exactly node_size elements from the heap and place them +* in elements +* +* @param elements The array of elements to insert into +* @param heap The array of pairs that stores the heap itself +* @param size Pointer to the number of pairs currently in the heap +* @param node_size Size of the nodes in the heap +* @param locks Array of locks, one for each node in the heap +* @param p_buffer_size Number of pairs in the heap's partial buffer +* @param pop_tracker The pop tracker for this concurrent pop operation +* (see PopKernel) +* @param lowest_level_start Index of the first node in the last level of the +* heap +* @param node_capacity Maximum capacity of the heap in nodes +* @param shmem The shared memory layout for this cooperative group +*/ +template +__device__ void PopSingleNode(CG const& g, + Pair *elements, + Pair *heap, + int *size, + size_t node_size, + int *locks, + size_t *p_buffer_size, + int *pop_tracker, + int lowest_level_start, + int node_capacity, + SharedMemoryLayout shmem) { + + int lane = g.thread_rank(); + int dim = g.size(); + + AcquireLock(g, &locks[kRootIdx]); + + // Find the target node (the last one inserted) and + // decrement the size + + size_t tar = InsertionOrderIndex(*size, lowest_level_start); + + if (tar != 1) { + AcquireLock(g, &locks[tar]); + } + + // pop_tracker determines our location in the output array, + // since it tells us how many other nodes have been previously been + // extracted by this block or by other blocks + int out_idx = *pop_tracker; + g.sync(); + + if (lane == 0) { + *size -= 1; + 
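+    // Still under the root lock: shrink the heap by one node and advance
+    // pop_tracker so the next concurrent pop writes to the following slot
+    // of the output array.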
*pop_tracker += 1; + } + g.sync(); + + // Copy the root to the output array + + CopyPairs(g, &elements[out_idx * node_size], &heap[node_size], + node_size); + + g.sync(); + + // Copy the target node to the root + + if (tar != kRootIdx) { + CopyPairs(g, &heap[node_size], &heap[tar * node_size], + node_size); + + ReleaseLock(g, &locks[tar]); + + g.sync(); + } + + // Merge and sort the root and the partial buffer + + MergeAndSort(g, &heap[node_size], + &heap[kPBufferIdx], + shmem.A, + shmem.B, + node_size, + *p_buffer_size, + node_size, + shmem); + + g.sync(); + + CopyPairs(g, &heap[node_size], shmem.A, node_size); + + CopyPairs(g, heap, shmem.B, *p_buffer_size); + + g.sync(); + + Sink(g, heap, size, node_size, locks, p_buffer_size, + lowest_level_start, node_capacity, shmem); + +} + +/** +* Remove num_elements < node_size elements from the heap and place them +* in elements +* +* @param elements The array of elements to insert into +* @param num_elements The number of elements to remove +* @param heap The array of pairs that stores the heap itself +* @param size Pointer to the number of pairs currently in the heap +* @param node_size Size of the nodes in the heap +* @param locks Array of locks, one for each node in the heap +* @param p_buffer_size Number of pairs in the heap's partial buffer +* @param lowest_level_start Index of the first node in the last level of the +* heap +* @param node_capacity Maximum capacity of the heap in nodes +* @param shmem The shared memory layout for this cooperative group +*/ +template +__device__ void PopPartialNode(CG const& g, + Pair *elements, + size_t num_elements, + Pair *heap, + int *size, + size_t node_size, + int *locks, + size_t *p_buffer_size, + int lowest_level_start, + int node_capacity, + SharedMemoryLayout shmem) { + int lane = g.thread_rank(); + int dim = g.size(); + + AcquireLock(g, &locks[kRootIdx]); + + if (*size == 0) { + CopyPairs(g, elements, heap, num_elements); + g.sync(); + + size_t n_p_buffer_size = *p_buffer_size - num_elements; + + if (n_p_buffer_size > 0) { + size_t remaining = n_p_buffer_size; + size_t index = 0; + while (remaining > 0) { + size_t this_round = min(remaining, num_elements); + CopyPairs(g, heap + index, heap + index + num_elements, + this_round); + remaining -= this_round; + index += this_round; + g.sync(); + } + } + + if (lane == 0) { + *p_buffer_size = n_p_buffer_size; + } + ReleaseLock(g, &locks[kRootIdx]); + } else { + + CopyPairs(g, elements, &heap[kRootIdx * node_size], num_elements); + g.sync(); + + if (*p_buffer_size >= num_elements) { + + + MergeAndSort(g, &heap[kPBufferIdx], + &heap[kRootIdx * node_size] + num_elements, + shmem.A, + shmem.B, + *p_buffer_size, + node_size - num_elements, + node_size, + shmem); + + if (lane == 0) { + *p_buffer_size = *p_buffer_size - num_elements; + } + + g.sync(); + + CopyPairs(g, &heap[kRootIdx * node_size], shmem.A, node_size); + CopyPairs(g, &heap[kPBufferIdx], shmem.B, *p_buffer_size); + + g.sync(); + + Sink(g, heap, size, node_size, locks, p_buffer_size, + lowest_level_start, node_capacity, shmem); + } else { + + MergeAndSort(g, &heap[kPBufferIdx], + &heap[kRootIdx * node_size] + num_elements, + shmem.A, + (Pair*)nullptr, + *p_buffer_size, + node_size - num_elements, + node_size, + shmem); + + g.sync(); + + CopyPairs(g, &heap[kPBufferIdx], shmem.A, + *p_buffer_size + node_size - num_elements); + + int tar = InsertionOrderIndex(*size, lowest_level_start); + g.sync(); + + if (lane == 0) { + *size -= 1; + } + + if (tar != kRootIdx) { + AcquireLock(g, &locks[tar]); + + 
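+      // Fill the hole at the root with the last node in insertion order,
+      // then re-sort the root against the partial buffer and sink it to
+      // restore the heap property.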
CopyPairs(g, &heap[kRootIdx * node_size], + &heap[tar * node_size], node_size); + + g.sync(); + + ReleaseLock(g, &locks[tar]); + + MergeAndSort(g, &heap[node_size], + &heap[kPBufferIdx], + shmem.A, + shmem.B, + node_size, + *p_buffer_size, + node_size, + shmem); + + g.sync(); + + CopyPairs(g, &heap[node_size], shmem.A, node_size); + + CopyPairs(g, heap, shmem.B, *p_buffer_size); + + g.sync(); + + Sink(g, heap, size, node_size, locks, + p_buffer_size, lowest_level_start, node_capacity, shmem); + } else { + ReleaseLock(g, &locks[kRootIdx]); + } + } + } +} + +/** +* Add p_ins_size < node_size elements into the heap from +* elements +* +* @param g The cooperative group that will perform the push +* @param elements The array of elements to add +* @param p_ins_size The number of elements to be inserted +* @param heap The array of pairs that stores the heap itself +* @param size Pointer to the number of pairs currently in the heap +* @param node_size Size of the nodes in the heap +* @param locks Array of locks, one for each node in the heap +* @param p_buffer_size The size of the partial buffer +* @param lowest_level_start Index of the first node in the last level of the +* heap +* @param shmem The shared memory layout for this cooperative group +*/ +template +__device__ void PushPartialNode(CG const& g, + Pair *elements, + size_t p_ins_size, + Pair *heap, + int *size, + size_t node_size, + int *locks, + size_t *p_buffer_size, + int lowest_level_start, + SharedMemoryLayout shmem) { + + int lane = g.thread_rank(); + int dim = g.size(); + + AcquireLock(g, &locks[kRootIdx]); + + CopyPairs(g, shmem.B, elements, p_ins_size); + + PBSort(g, shmem.B, p_ins_size, node_size, shmem.A); + + // There is enough data for a new node, in which case we + // construct a new node and insert it + if (*p_buffer_size + p_ins_size >= node_size) { + + int *cur_node_temp = shmem.intersections; + if (lane == 0) { + *cur_node_temp = atomicAdd(size, 1) + 1; + } + g.sync(); + + int cur_node = InsertionOrderIndex(*cur_node_temp, lowest_level_start); + + if (cur_node != kRootIdx) { + AcquireLock(g, &(locks[cur_node])); + } + + g.sync(); + + MergeAndSort(g, shmem.B, + &heap[kPBufferIdx], + &heap[cur_node * node_size], + shmem.A, + p_ins_size, + *p_buffer_size, + node_size, + shmem); + + if (lane == 0) { + *p_buffer_size = (*p_buffer_size + p_ins_size) - node_size; + } + + g.sync(); + + CopyPairs(g, heap, shmem.A, *p_buffer_size); + + if (cur_node != kRootIdx) { + ReleaseLock(g, &locks[kRootIdx]); + } + + Swim(g, cur_node, heap, size, node_size, + locks, lowest_level_start, shmem); + + } else { + // There are not enough elements for a new node, + // in which case we merge and sort the root and + // the elements to be inserted and then the root + // and the partial buffer + + MergeAndSort(g, shmem.B, + &heap[kPBufferIdx], + shmem.A, + (Pair *)nullptr, + p_ins_size, + *p_buffer_size, + node_size, + shmem); + + g.sync(); + + if (lane == 0) { + *p_buffer_size += p_ins_size; + } + + g.sync(); + + CopyPairs(g, heap, shmem.A, *p_buffer_size); + + g.sync(); + + if (*size > 0) { + MergeAndSort(g, &heap[node_size], + &heap[kPBufferIdx], + shmem.A, + shmem.B, + node_size, + *p_buffer_size, + node_size, + shmem); + g.sync(); + + CopyPairs(g, heap, shmem.B, *p_buffer_size); + + CopyPairs(g, &heap[node_size], shmem.A, node_size); + } + ReleaseLock(g, &locks[kRootIdx]); + } + +} + +/** +* Add num_elements elements into the heap from +* elements +* @param elements The array of elements to add +* @param num_elements The number of elements to be 
inserted +* @param heap The array of pairs that stores the heap itself +* @param size Pointer to the number of pairs currently in the heap +* @param node_size Size of the nodes in the heap +* @param locks Array of locks, one for each node in the heap +* @param p_buffer_size Number of pairs in the heap's partial buffer +* @param temp_node A temporary array large enough to store + sizeof(Pair) * node_size bytes +*/ +template +__global__ void PushKernel(Pair *elements, + size_t num_elements, + Pair *heap, + int *size, + size_t node_size, + int *locks, + size_t *p_buffer_size, + int lowest_level_start) { + + extern __shared__ int s[]; + + SharedMemoryLayout shmem = GetSharedMemoryLayout(s, + blockDim.x, node_size); + + // We push as many elements as possible as full nodes, + // then deal with the remaining elements as a partial insertion + // below + thread_block g = this_thread_block(); + for (size_t i = blockIdx.x * node_size; + i + node_size <= num_elements; + i += gridDim.x * node_size) { + PushSingleNode(g, elements + i, heap, size, node_size, locks, + lowest_level_start, shmem); + } + + // We only need one block for partial insertion + if (blockIdx.x != 0) { + return; + } + + // If node_size does not divide num_elements, there are some leftover + // elements for which we must perform a partial insertion + size_t first_not_inserted = (num_elements / node_size) + * node_size; + + if (first_not_inserted < num_elements) { + size_t p_ins_size = num_elements - first_not_inserted; + PushPartialNode(g, elements + first_not_inserted, p_ins_size, + heap, size, node_size, locks, p_buffer_size, + lowest_level_start, shmem); + } +} + +/** +* Add num_elements elements into the heap from +* elements, using a warp to handle each node rather than a block +* @param elements The array of elements to add +* @param num_elements The number of elements to be inserted +* @param heap The array of pairs that stores the heap itself +* @param size Pointer to the number of pairs currently in the heap +* @param node_size Size of the nodes in the heap +* @param locks Array of locks, one for each node in the heap +* @param p_buffer_size Number of pairs in the heap's partial buffer +* @param temp_node A temporary array large enough to store + sizeof(Pair) * node_size bytes +*/ +template +__global__ void PushKernelWarp(Pair *elements, + size_t num_elements, + Pair *heap, + int *size, + size_t node_size, + int *locks, + size_t *p_buffer_size, + int lowest_level_start, + int bytes_shmem_per_warp) { + + extern __shared__ char sh[]; + + // We push as many elements as possible as full nodes, + // then deal with the remaining elements as a partial insertion + // below + thread_block block = this_thread_block(); + thread_block_tile<32> warp = tiled_partition<32>(block); + + SharedMemoryLayout shmem = GetSharedMemoryLayout( + (int*)(sh + bytes_shmem_per_warp * warp.meta_group_rank()), + 32, node_size); + + for (size_t i = warp.meta_group_rank() * node_size + + blockIdx.x * node_size * (blockDim.x / 32); + i + node_size <= num_elements; + i += (blockDim.x / 32) * node_size * gridDim.x) { + PushSingleNode(warp, elements + i, heap, size, node_size, locks, + lowest_level_start, shmem); + } + + // We only need one block for partial insertion + if (blockIdx.x != 0 || warp.meta_group_rank() != 0) { + return; + } + + // If node_size does not divide num_elements, there are some leftover + // elements for which we must perform a partial insertion + size_t first_not_inserted = (num_elements / node_size) + * node_size; + + if 
(first_not_inserted < num_elements) { + size_t p_ins_size = num_elements - first_not_inserted; + PushPartialNode(warp, elements + first_not_inserted, p_ins_size, + heap, size, node_size, locks, p_buffer_size, + lowest_level_start, shmem); + } +} + +/** +* Remove exactly node_size elements from the heap and place them +* in elements, using a warp to handle each node rather than a block +* @param elements The array of elements to insert into +* @param num_elements The number of elements to remove +* @param heap The array of pairs that stores the heap itself +* @param size Pointer to the number of pairs currently in the heap +* @param node_size Size of the nodes in the heap +* @param locks Array of locks, one for each node in the heap +* @param p_buffer_size Number of pairs in the heap's partial buffer +* @param pop_tracker Pointer to an integer in global memory initialized to 0 +*/ +template +__global__ void PopKernelWarp(Pair *elements, + size_t num_elements, + Pair *heap, + int *size, + size_t node_size, + int *locks, + size_t *p_buffer_size, + int *pop_tracker, + int lowest_level_start, + int node_capacity, + int bytes_shmem_per_warp) { + + // We use pop_tracker to ensure that each thread block inserts its node + // at the correct location in the output array + // Since we do not know which block will extract which node + + extern __shared__ char sh[]; + + thread_block block = this_thread_block(); + thread_block_tile<32> warp = tiled_partition<32>(block); + + SharedMemoryLayout shmem = GetSharedMemoryLayout( + (int*)(sh + bytes_shmem_per_warp * warp.meta_group_rank()), + 32, node_size); + + for (size_t i = warp.meta_group_rank() + (blockDim.x / 32) * blockIdx.x; + i < num_elements / node_size; + i += gridDim.x * blockDim.x / 32) { + PopSingleNode(warp, elements, heap, size, node_size, locks, + p_buffer_size, pop_tracker, lowest_level_start, + node_capacity, shmem); + } + + AcquireLock(warp, &locks[kRootIdx]); + // Remove from the partial buffer if there are no nodes + // Only one thread will attempt this deletion because we have acquired + // the root and will increment pop_tracker once we begin the deletion + if (*pop_tracker == num_elements / node_size + && num_elements % node_size != 0) { + + if (warp.thread_rank() == 0) { + *pop_tracker += 1; + } + + size_t p_del_size = num_elements % node_size; + + ReleaseLock(warp, &locks[kRootIdx]); + + PopPartialNode(warp, + elements + (num_elements / node_size) * node_size, + p_del_size, heap, size, node_size, locks, p_buffer_size, + lowest_level_start, node_capacity, shmem); + + } else { + ReleaseLock(warp, &locks[kRootIdx]); + } +} + +/** +* Remove exactly node_size elements from the heap and place them +* in elements +* @param elements The array of elements to insert into +* @param num_elements The number of elements to remove +* @param heap The array of pairs that stores the heap itself +* @param size Pointer to the number of pairs currently in the heap +* @param node_size Size of the nodes in the heap +* @param locks Array of locks, one for each node in the heap +* @param p_buffer_size Number of pairs in the heap's partial buffer +* @param pop_tracker Pointer to an integer in global memory initialized to 0 +*/ +template +__global__ void PopKernel(Pair *elements, + size_t num_elements, + Pair *heap, + int *size, + size_t node_size, + int *locks, + size_t *p_buffer_size, + int *pop_tracker, + int lowest_level_start, + int node_capacity) { + + // We use pop_tracker to ensure that each thread block inserts its node + // at the correct location 
in the output array + // Since we do not know which block will extract which node + + extern __shared__ int s[]; + + SharedMemoryLayout shmem = GetSharedMemoryLayout(s, + blockDim.x, node_size); + + thread_block g = this_thread_block(); + for (size_t i = blockIdx.x; i < num_elements / node_size; i += gridDim.x) { + PopSingleNode(g, elements, heap, size, node_size, locks, + p_buffer_size, pop_tracker, lowest_level_start, + node_capacity, shmem); + } + + AcquireLock(g, &locks[kRootIdx]); + // Remove from the partial buffer if there are no nodes + // Only one thread will attempt this deletion because we have acquired + // the root and will increment pop_tracker once we begin the deletion + if (*pop_tracker == num_elements / node_size + && num_elements % node_size != 0) { + + if (g.thread_rank() == 0) { + *pop_tracker += 1; + } + + size_t p_del_size = num_elements % node_size; + + ReleaseLock(g, &locks[kRootIdx]); + + PopPartialNode(g, elements + (num_elements / node_size) * node_size, + p_del_size, heap, size, node_size, locks, p_buffer_size, + lowest_level_start, node_capacity, shmem); + + } else { + ReleaseLock(g, &locks[kRootIdx]); + } +} +} diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh new file mode 100644 index 000000000..7d0a5536e --- /dev/null +++ b/include/cuco/priority_queue.cuh @@ -0,0 +1,163 @@ +#pragma once + +#include +#include +#include + +namespace cuco { + +template +class priority_queue { + public: + /** + * Construct a priority queue + * @param initial_capacity The number of elements the priority queue can hold + * @param node_size The size of the nodes in the underlying heap data + * structure + */ + priority_queue(size_t initial_capacity, size_t node_size = 1024); + + /** + * Push num_elements elements into the priority queue + * @param elements Array of elements to add to the queue + * @param num_elements Number of elements to add to the queue + * @param block_size Block size to use for the internal kernel launch + * @param grid_size Grid size for the internal kernel launch + * @param warp_size If true, each node is handled by a single warp, otherwise + * by a single block + * @param stream The stream in which the underlying GPU operations will be + * run + */ + void push(Pair *elements, size_t num_elements, + int block_size = 256, int grid_size = 64000, + bool warp_level = false, + cudaStream_t stream = 0); + + /** + * Remove the num_elements elements with the lowest keys from the priority + * queue and place them in out in ascending sorted order by key + * @param out The array in which the removed elements will be placed + * @param num_elements The number of elements to be removed + * @param block_size Block size to use for the internal kernel launch + * @param grid_size Grid size for the internal kernel launch + * @param warp_size If true, each node is handled by a single warp, otherwise + * by a single block + * @param stream The stream in which the underlying GPU operations will be + * run + */ + void pop(Pair *out, size_t num_elements, + int block_size = 512, int grid_size = 32000, + bool warp_level = false, + cudaStream_t stream = 0); + + /* + * Return the amount of shared memory required for operations on the queue + * with a thread block size of block_size + * + * @param block_size Size of the blocks to calculate storage for + */ + int get_shmem_size(int block_size) { + int intersection_bytes = 2 * (block_size + 1) * sizeof(int); + int node_bytes = node_size_ * sizeof(Pair); + return intersection_bytes + 2 * node_bytes; + } + + 
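+  // Illustrative sizing for get_shmem_size() above (example values, not part
+  // of the API): with the default node_size_ of 1024 and 8-byte Pair<int, int>
+  // elements, a 256-thread block needs 2 * (256 + 1) * 4 = 2056 bytes for the
+  // merge-path intersections plus 2 * 1024 * 8 = 16384 bytes for the two
+  // scratch nodes, i.e. 18440 bytes of dynamic shared memory.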
~priority_queue(); + + class device_mutable_view { + public: + + /** + * Push a single node or less elements into the priority queue + * + * @param g The cooperative group that will perform the operation + * @param elements Array of elements to add to the queue + * @param num_elements Number of elements to add to the queue + * @param Pointer to a contiguous section of memory large enough + * to hold get_shmem_size(g.size()) bytes + */ + template + __device__ void push(CG const& g, Pair *elements, + size_t num_elements, void *temp_storage); + + /** + * Pop a single node or less elements from the priority queue + * + * @param g The cooperative group that will perform the operation + * @param out Array of elements to put the removed elements in + * @param num_elements Number of elements to remove from the queue + * @param Pointer to a contiguous section of memory large enough + * to hold get_shmem_size(g.size()) bytes + */ + template + __device__ void pop(CG const& g, Pair *out, + size_t num_elements, void *temp_storage); + + __device__ size_t get_node_size() { + return node_size_; + } + + /* + * Return the amount of temporary storage required for operations + * on the queue with a cooperative group size of block_size + * + * @param block_size Size of the cooperative groups to calculate storage for + */ + __device__ int get_shmem_size(int block_size) { + int intersection_bytes = 2 * (block_size + 1) * sizeof(int); + int node_bytes = node_size_ * sizeof(Pair); + return intersection_bytes + 2 * node_bytes; + } + + __host__ __device__ device_mutable_view(size_t node_size, + Pair *d_heap, + int *d_size, + size_t *d_p_buffer_size, + int *d_locks, + int lowest_level_start, + int node_capacity) + : node_size_(node_size), + d_heap_(d_heap), + d_size_(d_size), + d_p_buffer_size_(d_p_buffer_size), + d_locks_(d_locks), + lowest_level_start_(lowest_level_start), + node_capacity_(node_capacity) + { + } + + private: + size_t node_size_; + int lowest_level_start_; + int node_capacity_; + + Pair *d_heap_; + int *d_size_; + size_t *d_p_buffer_size_; + int *d_locks_; + }; + + /* + * Return a class that can be used to perform insertion and deletion + * of single nodes in device code with cooperative groups + */ + device_mutable_view get_mutable_device_view() { + return device_mutable_view(node_size_, d_heap_, d_size_, d_p_buffer_size_, + d_locks_, lowest_level_start_, node_capacity_); + } + + private: + size_t node_size_; + int lowest_level_start_; + int node_capacity_; + + Pair *d_heap_; + int *d_size_; + size_t *d_p_buffer_size_; + int *d_locks_; + int *d_pop_tracker_; +}; + +} + +#include diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 40bd2b30a..b54b38d00 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -45,3 +45,7 @@ set(DYNAMIC_MAP_TEST_SRC ConfigureTest(DYNAMIC_MAP_TEST "${DYNAMIC_MAP_TEST_SRC}") #################################################################################################### +# TODO: Port priority_queue tests to Catch2 +add_executable(PRIORITY_QUEUE_TEST + "${CMAKE_CURRENT_SOURCE_DIR}/priority_queue/priority_queue_test.cu") +target_link_libraries(PRIORITY_QUEUE_TEST cuco) diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu new file mode 100644 index 000000000..c982f46d5 --- /dev/null +++ b/tests/priority_queue/priority_queue_test.cu @@ -0,0 +1,1185 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +using namespace cooperative_groups; 
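+// Minimal host-side usage sketch (illustrative only; it assumes the same
+// <Key, Value> template arguments the tests below use and that the caller
+// owns the device allocation holding the pairs):
+//
+//   cuco::priority_queue<int, int> pq(10000);
+//   cuco::Pair<int, int> *d_pairs;
+//   cudaMalloc((void**)&d_pairs, sizeof(cuco::Pair<int, int>) * 10000);
+//   // ... fill d_pairs with 10000 key/value pairs ...
+//   pq.push(d_pairs, 10000);  // bulk insert using the default launch parameters
+//   pq.pop(d_pairs, 10000);   // d_pairs now holds the pairs in ascending key order
+//   cudaFree(d_pairs);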
+using namespace cuco; + +// Inserts elements into pq, managing memory allocation +// and copying to the device +template +void Insert(priority_queue &pq, + const std::vector> &elements, + bool warp_level = false) { + Pair *d_elements; + + size_t num_bytes = sizeof(Pair) * elements.size(); + + CUCO_CUDA_TRY(cudaMalloc((void**)&d_elements, num_bytes)); + + CUCO_CUDA_TRY(cudaMemcpy(d_elements, &elements[0], num_bytes, + cudaMemcpyHostToDevice)); + + pq.push(d_elements, elements.size(), 512, 32000, warp_level); + + CUCO_CUDA_TRY(cudaFree(d_elements)); +} + +// Deletes num_elements elements from pq and returns them, +// managing device memory +template +std::vector> Delete(priority_queue &pq, + size_t num_elements, + bool warp_level = false) { + Pair *d_elements; + + size_t num_bytes = sizeof(Pair) * num_elements; + + CUCO_CUDA_TRY(cudaMalloc((void**)&d_elements, num_bytes)); + + pq.pop(d_elements, num_elements, 512, 32, warp_level); + + std::vector> result(num_elements); + + CUCO_CUDA_TRY(cudaMemcpy(&result[0], d_elements, num_bytes, + cudaMemcpyDeviceToHost)); + + CUCO_CUDA_TRY(cudaFree(d_elements)); + + return result; +} + +template +__global__ void DeviceAPIInsert( + typename priority_queue::device_mutable_view view, + Pair *elements, + size_t num_elements) { + extern __shared__ int shmem[]; + thread_block g = this_thread_block(); + for (size_t i = blockIdx.x * view.get_node_size(); + i < num_elements; i += gridDim.x * view.get_node_size()) { + view.push(g, elements + i, min(view.get_node_size(), num_elements - i), + shmem); + } +} + +template +__global__ void DeviceAPIDelete( + typename priority_queue::device_mutable_view view, + Pair *out, + size_t num_elements) { + + extern __shared__ int shmem[]; + thread_block g = this_thread_block(); + for (size_t i = blockIdx.x * view.get_node_size(); + i < num_elements; i += gridDim.x * view.get_node_size()) { + view.pop(g, out + i, min(view.get_node_size(), num_elements - i), shmem); + } +} + +template +__global__ void DeviceAPIInsertWarp( + typename priority_queue::device_mutable_view view, + Pair *elements, + size_t num_elements) { + extern __shared__ int shmem[]; + const int kWarpSize = 32; + thread_block g = this_thread_block(); + thread_block_tile warp = tiled_partition(g); + for (size_t i = blockIdx.x * view.get_node_size() * (blockDim.x / kWarpSize) + + warp.meta_group_rank() * view.get_node_size(); + i < num_elements; + i += gridDim.x * view.get_node_size() * blockDim.x / kWarpSize) { + view.push(warp, elements + i, min(view.get_node_size(), + num_elements - i), (char*)shmem + warp.meta_group_rank() + * view.get_shmem_size(kWarpSize)); + } +} + +template +__global__ void DeviceAPIDeleteWarp( + typename priority_queue::device_mutable_view view, + Pair *out, + size_t num_elements) { + extern __shared__ int shmem[]; + thread_block g = this_thread_block(); + for (size_t i = blockIdx.x * view.get_node_size(); + i < num_elements; i += gridDim.x * view.get_node_size()) { + view.pop(g, out + i, min(view.get_node_size(), num_elements - i), shmem); + } +} +// Each test case is composed of a name +// and a function that returns true when the test +// passes and false when it fails +struct TestCase { + std::string name; + bool (*func)(); +}; + +using IntIntVector = std::vector>; + +using IntLongVector = std::vector>; + +using FloatIntVector = std::vector>; + +TestCase cases[] = { + + {"test_insert_1", []() { + priority_queue pq(1000); + IntIntVector result = {{1, 1}}; + Insert(pq, {{1, 1}}); + return Delete(pq, 1) == result; + } + }, + + 
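+  // Each case below constructs an input, pushes it into a fresh queue (in bulk
+  // or element by element), and checks that popping returns the expected keys
+  // and values in sorted order.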
{"test_insert_descending_seq", []() { + const int kNodeSize = 1024; + + srand(0); + + // Choose some reasonably large number of elements + int count = rand() % 1000000 + 10000; + + priority_queue pq(count, kNodeSize); + + IntIntVector input; + + for (int i = count - 1; i >= 0; i--) { + input.push_back({i, i}); + } + + IntIntVector result; + + for (int i = 0; i < count; i++) { + result.push_back({i, i}); + } + + for (auto e : input) { + Insert(pq, {e}); + } + + return Delete(pq, count) == result; + } + }, + + {"test_delete_from_p_buffer", []() { + const int kNodeSize = 1024; + + // Choose some number of elements less than the node size + int count = rand() % kNodeSize; + + priority_queue pq(count, kNodeSize); + + IntIntVector input; + + for (int i = count - 1; i >= 0; i--) { + input.push_back({i, i}); + } + + IntIntVector result; + + for (int i = 0; i < count; i++) { + result.push_back({i, i}); + } + + for (auto e : input) { + Insert(pq, {e}); + } + + bool pass = true; + for (int i = 0; i < count; i++) { + auto next_el = Delete(pq, 1)[0]; + bool next = next_el == result[i]; + if (!pass || !next) { + std::cout << "i=" << i << ": expected " << result[i].key + << " " << result[i].value << " got " << next_el.key + << " " << next_el.value << std::endl; + } + pass = pass && next; + } + + return pass; + } + }, + + {"test_partial_insert_new_node", []() { + const int kNodeSize = 1024; + + // We choose count = 600 so that two partial insertions + // of size count will cause a new node to be created + // (600 + 600) = 1200 > 1024 so not all elements can fit in the partial + // buffer + int count = 600; + + priority_queue pq(count * 2, kNodeSize); + + IntIntVector input; + IntIntVector input2; + for (int i = 0; i < count; i++) { + input.push_back({i, i}); + input2.push_back({i + count, i + count}); + } + + Insert(pq, input); + Insert(pq, input2); + + auto delete1 = Delete(pq, kNodeSize); + for (int i = 0; i < kNodeSize; i++) { + if (delete1[i].key != i) { + std::cout << "Error at i = " << i + kNodeSize << std::endl; + return false; + } + } + + auto delete2 = Delete(pq, count * 2 - kNodeSize); + for (int i = 0; i < count * 2 - kNodeSize; i++) { + if (delete2[i].key != i + kNodeSize) { + std::cout << "Error at i = " << i + kNodeSize << std::endl; + return false; + } + } + + return true; + } + }, + + {"test_insert_descending_bulk", []() { + const int kNodeSize = 1024; + + srand(0); + + // Choose some reasonably large number of keys, + // less than node size to test partial insertion of + // individual elements + int count = rand() % kNodeSize; + + priority_queue pq(count, kNodeSize); + + IntIntVector input; + + for (int i = count - 1; i >= 0; i--) { + input.push_back({i, i}); + } + + IntIntVector result; + + for (int i = 0; i < count; i++) { + result.push_back({i, i}); + } + + Insert(pq, input); + + return Delete(pq, count) == result; + } + }, + + {"test_insert_random_seq", []() { + const int kNodeSize = 1024; + srand(0); + + // Choose some reasonably large number of keys + int count = rand() % 1000000 + 10000; + + priority_queue pq(count, kNodeSize); + + IntIntVector input; + + for (int i = 0; i < count; i++) { + input.push_back({rand(), i}); + } + + IntIntVector result = input; + + std::sort(result.begin(), result.end(), + [](const Pair &a, + const Pair &b) { + return a.key < b.key; + } + ); + + for (auto e : input) { + Insert(pq, {e}); + } + + auto output = Delete(pq, count); + bool pass = true; + for (int i = 0; i < count; i++) { + if (output[i].key != result[i].key) { + std::cout << "Expected " << 
result[i].key << " " << result[i].value + << " got " << output[i].key << " " << output[i].value + << std::endl; + pass = false; + } + } + return pass; + } + }, + + {"test_insert_random_bulk", []() { + const int kNodeSize = 1024; + srand(0); + + // Choose some reasonably large number of keys, + // A multiple of node size so that this test only + // tests full node insertion + int count = rand() % kNodeSize * 100 + 10 * kNodeSize; + + priority_queue pq(count, kNodeSize); + + IntIntVector input; + + for (int i = 0; i < count; i++) { + input.push_back({rand(), i}); + } + + IntIntVector result = input; + + std::sort(result.begin(), result.end(), + [](const Pair &a, + const Pair &b) { + return a.key < b.key; + } + ); + + Insert(pq, input); + + auto output = Delete(pq, count); + bool pass = true; + for (int i = 0; i < count; i++) { + if (output[i].key != result[i].key) { + std::cout << "Expected " << result[i].key << " " << result[i].value + << " got " << output[i].key << " " << output[i].value + << std::endl; + pass = false; + } + } + return pass; + } + }, + + {"test_insert_descending_bulk_2", []() { + srand(0); + + const int kNodeSize = 1024; + + // Choose some reasonably large number of nodes + const int kNodes = rand() % 1000 + 50; + + priority_queue pq(kNodeSize * kNodes, kNodeSize); + + for (int i = kNodes - 1; i >= 0; i--) { + + IntIntVector input; + for (int j = kNodeSize - 1; j >= 0; j--) { + input.push_back({i * kNodeSize + j, 1}); + } + Insert(pq, input); + } + + IntIntVector deletion = Delete(pq, kNodeSize); + + bool result = true; + + for (int i = 0; i < kNodeSize; i++) { + result = result && (deletion[i].key == i); + } + + deletion = Delete(pq, kNodeSize * (kNodes - 1)); + + for (int i = kNodeSize; i < kNodes * kNodeSize; i++) { + result = result && (deletion[i - kNodeSize].key == i); + } + + return result; + } + }, + + {"test_insert_shuffled_bulk_2", []() { + srand(0); + const int kNodeSize = 1024; + // Choose some reasonably large number of nodes + const int kNodes = rand() % 1000 + 50; + priority_queue pq(kNodeSize * kNodes, kNodeSize); + + for (int i = kNodes - 1; i >= 0; i--) { + + IntIntVector input(kNodeSize); + for (int j = kNodeSize - 1; j >= 0; j--) { + // Shuffle each input vector by putting even numbers + // in the first half and odd numbers in the second half + if (j % 2 == 0) { + input[j / 2] = {i * kNodeSize + j, 1}; + } else { + input[kNodeSize / 2 + (j / 2)] = {i * kNodeSize + j, 1}; + } + } + Insert(pq, input); + } + + IntIntVector deletion = Delete(pq, kNodeSize); + + bool result = true; + + for (int i = 0; i < kNodeSize; i++) { + result = result && (deletion[i].key == i); + } + + deletion = Delete(pq, kNodeSize * (kNodes - 1)); + + for (int i = kNodeSize; i < kNodes * kNodeSize; i++) { + result = result && (deletion[i - kNodeSize].key == i); + } + + return result; + } + }, + + {"test_insert_random_seq_long_val", []() { + srand(0); + + // Choose some reasonably large number of keys + int count = rand() % 100000 + 10000; + + priority_queue pq(count); + + IntLongVector input; + + for (int i = 0; i < count; i++) { + input.push_back({rand(), i}); + } + + IntLongVector result = input; + + std::sort(result.begin(), result.end(), + [](const Pair &a, + const Pair &b) { + return a.key < b.key; + } + ); + + for (auto e : input) { + Insert(pq, {e}); + } + + auto output = Delete(pq, count); + bool pass = true; + for (int i = 0; i < count; i++) { + if (output[i].key != result[i].key) { + std::cout << "Expected " << result[i].key << " " << result[i].value + << " got " << 
output[i].key << " " << output[i].value + << std::endl; + pass = false; + } + } + return pass; + } + }, + + {"test_insert_random_seq_float", []() { + srand(0); + + // Choose some reasonably large number of keys + int count = rand() % 100000 + 10000; + + priority_queue pq(count); + + FloatIntVector input; + + for (int i = 0; i < count; i++) { + input.push_back({(float)rand() / RAND_MAX, i}); + } + + FloatIntVector result = input; + + std::sort(result.begin(), result.end(), + [](const Pair &a, + const Pair &b) { + return a.key < b.key; + } + ); + + for (auto e : input) { + Insert(pq, {e}); + } + + auto output = Delete(pq, count); + bool pass = true; + for (int i = 0; i < count; i++) { + if (output[i].key != result[i].key) { + std::cout << "Expected " << result[i].key << " " << result[i].value + << " got " << output[i].key << " " << output[i].value + << std::endl; + pass = false; + } + } + return pass; + } + }, + + {"test_insert_all_same_key", []() { + srand(0); + // Choose some reasonably large number of keys + int count = rand() % 100000 + 10000; + + priority_queue pq(count); + + IntIntVector input(count); + for (int i = 0; i < count; i++) { + input[i] = {1, i}; + } + + Insert(pq, input); + + IntIntVector result = Delete(pq, count); + + // Check if all the values were retained + std::vector values(count, false); + + for (auto r : result) { + values[r.value] = true; + } + + bool pass = true; + for (bool b : values) { + pass = pass && b; + } + + return pass; + } + }, + + {"test_insert_negatives_and_limits", []() { + + srand(0); + + // Choose some reasonably large number of keys + int count = rand() % 100000 + 10000; + + priority_queue pq(count); + + // Create some elements with negative and very large + // and very small keys + IntIntVector elements = {{INT32_MAX, 1}, {-100, 1}, {100, 1}, {0, 1}, + {INT32_MIN, 1}, {-1000000, 1}}; + + IntIntVector input; + + for (int i = 0; i < count; i++) { + input.push_back(elements[rand() % elements.size()]); + } + + IntIntVector result = input; + + std::sort(result.begin(), result.end(), + [](const Pair &a, + const Pair &b) { + return a.key < b.key; + } + ); + + Insert(pq, input); + + auto output = Delete(pq, count); + bool pass = true; + for (int i = 0; i < count; i++) { + if (output[i].key != result[i].key) { + std::cout << "Expected " << result[i].key << " " << result[i].value + << " got " << output[i].key << " " << output[i].value + << std::endl; + pass = false; + } + } + return pass; + } + }, + + {"test_insert_2000_keys", []() { + int num_keys = 2000; + + srand(0); + + priority_queue pq(num_keys); + + std::vector std_vec; + + IntIntVector input(num_keys); + for (int i = 0; i < num_keys; i++) { + int32_t next = rand(); + input[i] = {next, 1}; + std_vec.push_back(next); + } + + Insert(pq, input); + + std::sort(std_vec.begin(), std_vec.end()); + + auto result_vec = Delete(pq, num_keys); + + bool result = true; + for (int i = 0; i < num_keys; i++) { + bool next = result_vec[i].key == std_vec[i]; + if (result && !next) { + std::cout << i << ": " << " expected " << std_vec[i] << " got " + << result_vec[i].key << std::endl; + } + result = result && next; + } + + return result; + } + }, + + {"test_insert_10M_keys", []() { + int num_keys = 10e6; + + srand(0); + + priority_queue pq(num_keys); + + std::vector std_vec; + + IntIntVector input(num_keys); + for (int i = 0; i < num_keys; i++) { + int32_t next = rand(); + input[i] = {next, 1}; + std_vec.push_back(next); + } + + Insert(pq, input); + + std::sort(std_vec.begin(), std_vec.end()); + + auto result_vec = 
Delete(pq, num_keys); + + bool result = true; + for (int i = 0; i < num_keys; i++) { + bool next = result_vec[i].key == std_vec[i]; + if (result && !next) { + std::cout << i << ": " << " expected " << std_vec[i] << " got " + << result_vec[i].key << std::endl; + } + result = result && next; + } + + return result; + } + }, + + {"test_insert_10M_keys_warp_level", []() { + int num_keys = 10e6; + const int kNodeSize = 32; + + srand(0); + + priority_queue pq(num_keys, kNodeSize); + + std::vector std_vec; + + IntIntVector input(num_keys); + for (int i = 0; i < num_keys; i++) { + int32_t next = rand(); + input[i] = {next, 1}; + std_vec.push_back(next); + } + + Insert(pq, input); + + std::sort(std_vec.begin(), std_vec.end()); + + auto result_vec = Delete(pq, num_keys, true); + + bool result = true; + for (int i = 0; i < num_keys; i++) { + bool next = result_vec[i].key == std_vec[i]; + if (result && !next) { + std::cout << i << ": " << " expected " << std_vec[i] << " got " + << result_vec[i].key << std::endl; + } + result = result && next; + } + + return result; + } + }, + + {"test_insert_10M_keys_max", []() { + int num_keys = 10e6; + + srand(0); + + priority_queue pq(num_keys); + + std::vector std_vec; + + IntIntVector input(num_keys); + for (int i = 0; i < num_keys; i++) { + int32_t next = rand(); + input[i] = {next, 1}; + std_vec.push_back(next); + } + + Insert(pq, input); + + std::sort(std_vec.begin(), std_vec.end(), std::greater()); + + auto result_vec = Delete(pq, num_keys); + + bool result = true; + for (int i = 0; i < num_keys; i++) { + bool next = result_vec[i].key == std_vec[i]; + if (result && !next) { + std::cout << i << ": " << " expected " << std_vec[i] << " got " + << result_vec[i].key << std::endl; + } + result = result && next; + } + + return result; + } + }, + + {"test_insert_10M_keys_unbatched", []() { + int num_keys = 10e6; + const int kNodeSize = 1024; + + srand(0); + + priority_queue pq(num_keys, kNodeSize); + + std::vector std_vec; + + for (int j = 0; j < num_keys; j += kNodeSize) { + IntIntVector input(min(num_keys - j, kNodeSize)); + for (size_t i = 0; i < input.size(); i++) { + int32_t next = rand(); + input[i] = {next, 1}; + std_vec.push_back(next); + } + Insert(pq, input); + } + + std::sort(std_vec.begin(), std_vec.end()); + + auto result_vec = Delete(pq, num_keys); + + bool result = true; + for (int i = 0; i < num_keys; i++) { + bool next = result_vec[i].key == std_vec[i]; + if (result && !next) { + std::cout << i << ": " << " expected " << std_vec[i] << " got " + << result_vec[i].key << std::endl; + } + result = result && next; + } + + return result; + } + }, + + {"test_insert_1024e4_keys", []() { + int num_keys = 1024e4; + + srand(0); + + priority_queue pq(num_keys); + + std::vector std_vec; + + IntIntVector input(num_keys); + for (int i = 0; i < num_keys; i++) { + int32_t next = rand(); + input[i] = {next, 1}; + std_vec.push_back(next); + } + + Insert(pq, input); + + std::sort(std_vec.begin(), std_vec.end()); + + auto result_vec = Delete(pq, num_keys); + + bool result = true; + for (int i = 0; i < num_keys; i++) { + bool next = result_vec[i].key == std_vec[i]; + if (result && !next) { + std::cout << i << ": expected " << std_vec[i] << " got " + << result_vec[i].key << std::endl; + } + result = result && next; + } + + return result; + } + }, + + {"test_insert_1024", []() { + int node_size = 1024; + int num_keys = node_size * 2; + + srand(0); + + priority_queue pq(num_keys); + + std::vector std_vec; + + for (int j = 0; j < num_keys / node_size; j++) { + IntIntVector 
input(node_size); + for (int i = 0; i < node_size; i++) { + int32_t next = rand(); + input[i] = {next, 1}; + std_vec.push_back(next); + } + + Insert(pq, input); + } + + std::sort(std_vec.begin(), std_vec.end()); + + auto result_vec = Delete(pq, num_keys); + + bool result = true; + for (int i = 0; i < num_keys; i++) { + bool next = result_vec[i].key == std_vec[i]; + if (result && !next) { + std::cout << i << ": expected " << std_vec[i] << " got " + << result_vec[i].key << std::endl; + } + result = result && next; + } + + return result; + } + }, + + {"test_partial_deletion_1", []() { + int node_size = 1024; + int num_nodes_before = 1; + int num_nodes_after = 1; + int num_keys = node_size * num_nodes_before + + node_size * num_nodes_after + 1; + + srand(0); + + priority_queue pq(num_keys); + + std::vector std_vec; + IntIntVector input; + IntIntVector result_vec; + + for (int i = 0; i < num_nodes_before * node_size; i++) { + int32_t next = rand(); + std_vec.push_back(next); + input.push_back({next, i}); + } + + int32_t partial = rand(); + std_vec.push_back(partial); + input.push_back({partial, 1}); + + for (int i = 0; i < num_nodes_after * node_size; i++) { + int32_t next = rand(); + std_vec.push_back(next); + input.push_back({next, i}); + } + + Insert(pq, input); + + for (auto i : Delete(pq, node_size * num_nodes_before)) { + result_vec.push_back(i); + } + + result_vec.push_back(Delete(pq, 1)[0]); + + for (auto i : Delete(pq, node_size * num_nodes_after)) { + result_vec.push_back(i); + } + + std::sort(std_vec.begin(), std_vec.end()); + + bool result = true; + for (int i = 0; i < num_keys; i++) { + //std::sort(std_vec.begin(), std_vec.end()); + bool next = result_vec[i].key == std_vec[i]; + if (result && !next) { + std::cout << i << ": expected " << std_vec[i] << " got " + << result_vec[i].key << std::endl; + } + result = result && next; + } + + return result; + } + }, + + {"test_insert_1024e4_keys_device_API", []() { + int num_keys = 1024e4; + + srand(0); + + priority_queue pq(num_keys); + + IntIntVector std_vec; + + IntIntVector input(num_keys); + for (int i = 0; i < num_keys; i++) { + int32_t next = rand(); + input[i] = {next, i}; + std_vec.push_back({next, i}); + } + + Pair *elements; + cudaMalloc(&elements, sizeof(Pair) * num_keys); + + cudaMemcpy(elements, &std_vec[0], + sizeof(Pair) * num_keys, + cudaMemcpyHostToDevice); + + const int kBlockSize = 512; + const int kNumBlocks = 512; + + std::sort(std_vec.begin(), std_vec.end(), [](auto a, auto b) { + return a.key < b.key; + }); + + DeviceAPIInsert<<>> + (pq.get_mutable_device_view(), elements, num_keys); + + DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize)>>> + (pq.get_mutable_device_view(), elements, num_keys); + + IntIntVector result_vec(num_keys); + + cudaMemcpy(&result_vec[0], elements, + sizeof(Pair) * num_keys, + cudaMemcpyDeviceToHost); + + bool result = true; + for (int i = 0; i < num_keys; i++) { + bool next = result_vec[i].key == std_vec[i].key; + if (result && !next) { + std::cout << i << ": expected " << std_vec[i].key + << " " << std_vec[i].value + << " got " + << result_vec[i].key << " " << result_vec[i].value + << std::endl; + } + result = result && next; + } + + return result; + } + }, + + {"test_insert_1000e4_keys_device_API_warp", []() { + int num_keys = 1000e4 + 1; + const int kNodeSize = 64; + + srand(0); + + priority_queue pq(num_keys, kNodeSize); + + IntIntVector std_vec; + + IntIntVector input(num_keys); + for (int i = 0; i < num_keys; i++) { + int32_t next = rand(); + input[i] = {next, i}; + 
std_vec.push_back({next, i}); + } + + Pair *elements; + cudaMalloc(&elements, sizeof(Pair) * num_keys); + + cudaMemcpy(elements, &std_vec[0], + sizeof(Pair) * num_keys, + cudaMemcpyHostToDevice); + + const int kBlockSize = 512; + const int kNumBlocks = 512; + + std::sort(std_vec.begin(), std_vec.end(), [](auto a, auto b) { + return a.key < b.key; + }); + + DeviceAPIInsertWarp<<>> + (pq.get_mutable_device_view(), elements, num_keys); + + DeviceAPIDeleteWarp<<<1, 32, pq.get_shmem_size(32)>>> + (pq.get_mutable_device_view(), elements, num_keys); + + IntIntVector result_vec(num_keys); + + cudaMemcpy(&result_vec[0], elements, + sizeof(Pair) * num_keys, + cudaMemcpyDeviceToHost); + + bool result = true; + for (int i = 0; i < num_keys; i++) { + bool next = result_vec[i].key == std_vec[i].key; + if (result && !next) { + std::cout << i << ": expected " << std_vec[i].key + << " " << std_vec[i].value + << " got " + << result_vec[i].key << " " << + result_vec[i].value << std::endl; + } + result = result && next; + } + + return result; + } + }, + + {"test_insert_1000e4_keys_device_API", []() { + int num_keys = 1000e4 + 1; + + srand(0); + + priority_queue pq(num_keys); + + IntIntVector std_vec; + + IntIntVector input(num_keys); + for (int i = 0; i < num_keys; i++) { + int32_t next = rand(); + input[i] = {next, i}; + std_vec.push_back({next, i}); + } + + Pair *elements; + cudaMalloc(&elements, sizeof(Pair) * num_keys); + + cudaMemcpy(elements, &std_vec[0], + sizeof(Pair) * num_keys, + cudaMemcpyHostToDevice); + + const int kBlockSize = 512; + const int kNumBlocks = 512; + + std::sort(std_vec.begin(), std_vec.end(), [](auto a, auto b) { + return a.key < b.key; + }); + + DeviceAPIInsert<<>> + (pq.get_mutable_device_view(), elements, num_keys); + + DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize)>>> + (pq.get_mutable_device_view(), elements, num_keys); + + IntIntVector result_vec(num_keys); + + cudaMemcpy(&result_vec[0], elements, + sizeof(Pair) * num_keys, + cudaMemcpyDeviceToHost); + + bool result = true; + for (int i = 0; i < num_keys; i++) { + bool next = result_vec[i].key == std_vec[i].key; + if (result && !next) { + std::cout << i << ": expected " << std_vec[i].key + << " " << std_vec[i].value + << " got " + << result_vec[i].key << " " << + result_vec[i].value << std::endl; + } + result = result && next; + } + + return result; + } + }, + + {"test_insert_1024e4_keys_device_API_warp", []() { + int num_keys = 1024e4; + const int kNodeSize = 64; + + srand(0); + + priority_queue pq(num_keys, kNodeSize); + + IntIntVector std_vec; + + IntIntVector input(num_keys); + for (int i = 0; i < num_keys; i++) { + int32_t next = rand(); + input[i] = {next, i}; + std_vec.push_back({next, i}); + } + + Pair *elements; + cudaMalloc(&elements, sizeof(Pair) * num_keys); + + cudaMemcpy(elements, &std_vec[0], + sizeof(Pair) * num_keys, + cudaMemcpyHostToDevice); + + const int kBlockSize = 512; + const int kNumBlocks = 512; + + std::sort(std_vec.begin(), std_vec.end(), [](auto a, auto b) { + return a.key < b.key; + }); + + DeviceAPIInsertWarp<<>> + (pq.get_mutable_device_view(), elements, num_keys); + + DeviceAPIDeleteWarp<<<1, 32, pq.get_shmem_size(32)>>> + (pq.get_mutable_device_view(), elements, num_keys); + + IntIntVector result_vec(num_keys); + + cudaMemcpy(&result_vec[0], elements, + sizeof(Pair) * num_keys, + cudaMemcpyDeviceToHost); + + bool result = true; + for (int i = 0; i < num_keys; i++) { + bool next = result_vec[i].key == std_vec[i].key; + if (result && !next) { + std::cout << i << ": expected " << 
std_vec[i].key + << " " << std_vec[i].value + << " got " + << result_vec[i].key << " " << + result_vec[i].value << std::endl; + } + result = result && next; + } + + return result; + } + }, +}; + +int main() { + + int failures = 0; + + for (auto c : cases) { + std::cout << c.name << "....."; + if (c.func()) { + std::cout << "PASS" << std::endl; + } else { + std::cout << "FAIL" << std::endl; + failures++; + } + } + + std::cout << "Failures: " << failures << std::endl; + + return 0; +} From 1f2092cf62b962283ebc2b8e0a7fc57f9cf75e30 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Thu, 9 Sep 2021 15:37:40 -0700 Subject: [PATCH 02/55] Add priority queue benchmark --- benchmarks/CMakeLists.txt | 4 + .../priority_queue/priority_queue_bench.cu | 187 ++++++++++++++++++ 2 files changed, 191 insertions(+) create mode 100644 benchmarks/priority_queue/priority_queue_bench.cu diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 57c0c337d..70e7084ba 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -50,3 +50,7 @@ ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}") ################################################################################################### set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu") ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}") + +#TODO: Port priority_queue benchmark to google benchmark +add_executable(PRIORITY_QUEUE_BENCH "${CMAKE_CURRENT_SOURCE_DIR}/priority_queue/priority_queue_bench.cu") +target_link_libraries(PRIORITY_QUEUE_BENCH cuco) diff --git a/benchmarks/priority_queue/priority_queue_bench.cu b/benchmarks/priority_queue/priority_queue_bench.cu new file mode 100644 index 000000000..66cddf96c --- /dev/null +++ b/benchmarks/priority_queue/priority_queue_bench.cu @@ -0,0 +1,187 @@ +#include +#include +#include +#include + +#include + +#include +#include + +using namespace cuco; + +template +__global__ void DeviceAPIInsert( + typename priority_queue::device_mutable_view view, + Pair *elements, + size_t num_elements) { + extern __shared__ int shmem[]; + thread_block g = this_thread_block(); + for (size_t i = blockIdx.x * view.get_node_size(); + i < num_elements; i += gridDim.x * view.get_node_size()) { + view.push(g, elements + i, min(view.get_node_size(), num_elements - i), + shmem); + } +} + +template +__global__ void DeviceAPIDelete( + typename priority_queue::device_mutable_view view, + Pair *out, + size_t num_elements) { + + extern __shared__ int shmem[]; + thread_block g = this_thread_block(); + for (size_t i = blockIdx.x * view.get_node_size(); + i < num_elements; i += gridDim.x * view.get_node_size()) { + view.pop(g, out + i, min(view.get_node_size(), num_elements - i), shmem); + } +} + +// Use CUDA events to time the code in the lambda function +template +float TimeCode(F f) { + cudaEvent_t t1; + CUCO_CUDA_TRY(cudaEventCreate(&t1)); + + cudaEvent_t t2; + CUCO_CUDA_TRY(cudaEventCreate(&t2)); + + CUCO_CUDA_TRY(cudaEventRecord(t1)); + f(); + CUCO_CUDA_TRY(cudaEventRecord(t2)); + + CUCO_CUDA_TRY(cudaEventSynchronize(t1)); + CUCO_CUDA_TRY(cudaEventSynchronize(t2)); + + float result; + CUCO_CUDA_TRY(cudaEventElapsedTime(&result, t1, t2)); + return result; +} + +// Time the insertion of the num_keys elements at d_elements into pq in ms +float TimeInsert(priority_queue &pq, + Pair *d_elements, + size_t num_keys) { + return TimeCode([&]() { + pq.push(d_elements, num_keys); + }); +} + +// Time insert of the num_keys elements with the device API at d_elements +// into pq in ms +float 
TimeInsertDeviceAPI(priority_queue &pq, + Pair *d_elements, + size_t num_keys) { + return TimeCode([&]() { + DeviceAPIInsert<<<64000, 256, pq.get_shmem_size(256)>>> + (pq.get_mutable_device_view(), d_elements, num_keys); + }); +} + +// Time the deletion of num_keys elements from pq in ms +float TimeDeleteDeviceAPI(priority_queue &pq, + Pair *d_elements, + size_t num_keys) { + return TimeCode([&]() { + DeviceAPIDelete<<<32000, 512, pq.get_shmem_size(512)>>> + (pq.get_mutable_device_view(), d_elements, num_keys); + }); +} + +// Time the deletion of num_keys elements from pq in ms +float TimeDelete(priority_queue &pq, + Pair *d_elements, + size_t num_keys) { + return TimeCode([&]() { + pq.pop(d_elements, num_keys); + }); +} + +// Follow the first experiment in the paper, +// inserting 512 million 4-byte keys and then deleting them all +// Repeat in ascending, descending and random key order +void InsertThenDelete() { + + std::cout << "==Insert then delete==" << std::endl; + + size_t num_keys = 512e6; + + std::cout << num_keys << " keys" << std::endl; + + std::cout << "Order\t\tInsertion (ms)\t\tDeletion (ms)" << std::endl; + + // Allocate GPU memory to store the keys that will be inserted + Pair *d_elements; + size_t num_bytes = num_keys * sizeof(Pair); + CUCO_CUDA_TRY(cudaMalloc((void**)&d_elements, num_bytes)); + + priority_queue pq(num_keys); + + // Ascending + std::vector> ascending(num_keys); + + for (uint32_t i = 0; i < num_keys; i++) { + ascending[i] = {i, i}; + } + + CUCO_CUDA_TRY(cudaMemcpy(d_elements, &ascending[0], + num_bytes, cudaMemcpyHostToDevice)); + + auto time_elapsed_insert = TimeInsert(pq, d_elements, num_keys); + auto time_elapsed_delete = TimeDelete(pq, d_elements, num_keys); + + std::cout << "Ascend\t\t" << time_elapsed_insert << "\t\t" + << time_elapsed_delete << std::endl; + + // Descending + std::vector> descending(num_keys); + + for (uint32_t i = 0; i < num_keys; i++) { + descending[num_keys - i - 1] = {i, i}; + } + + CUCO_CUDA_TRY(cudaMemcpy(d_elements, &descending[0], + num_bytes, cudaMemcpyHostToDevice)); + + time_elapsed_insert = TimeInsert(pq, d_elements, num_keys); + time_elapsed_delete = TimeDelete(pq, d_elements, num_keys); + + std::cout << "Descend\t\t" << time_elapsed_insert << "\t\t" + << time_elapsed_delete << std::endl; + + // Random + std::vector> random(num_keys); + + for (uint32_t i = 0; i < num_keys; i++) { + random[i] = {(uint32_t)rand(), i}; + } + + CUCO_CUDA_TRY(cudaMemcpy(d_elements, &random[0], + num_bytes, cudaMemcpyHostToDevice)); + + time_elapsed_insert = TimeInsert(pq, d_elements, num_keys); + time_elapsed_delete = TimeDelete(pq, d_elements, num_keys); + + std::cout << "Random\t\t" << time_elapsed_insert << "\t\t" + << time_elapsed_delete << std::endl; + + CUCO_CUDA_TRY(cudaMemcpy(d_elements, &random[0], + num_bytes, cudaMemcpyHostToDevice)); + + time_elapsed_insert = TimeInsertDeviceAPI(pq, d_elements, num_keys); + time_elapsed_delete = TimeDeleteDeviceAPI(pq, d_elements, num_keys); + + std::cout << "Random Dev. 
API\t\t" << time_elapsed_insert << "\t\t" + << time_elapsed_delete << std::endl; + + CUCO_CUDA_TRY(cudaFree(d_elements)); +} + + +int main() { + + InsertThenDelete(); + + return 0; +} From 6a9dc998417e3d245bad7d412b4bd2903331d24e Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Thu, 9 Sep 2021 16:17:41 -0700 Subject: [PATCH 03/55] Class comment --- include/cuco/priority_queue.cuh | 40 +++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 7d0a5536e..550c73f0e 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -6,8 +6,48 @@ namespace cuco { +/* +* @brief A GPU-accelerated priority queue of key-value pairs +* +* Allows for multiple concurrent insertions as well as multiple concurrent +* deletions +* +* Current limitations: +* - Only supports trivially comparable key types +* - Does not support insertion and deletion at the same time +* - The implementation of the priority queue is based on +* https://arxiv.org/pdf/1906.06504.pdf, which provides a way to allow +* concurrent insertion and deletion, so this could be added later if useful +* - Capacity is fixed and the queue does not automatically resize +* - Deletion from the queue is much slower than insertion into the queue +* due to congestion at the underlying heap's root node +* +* The queue supports two operations: +* `push`: Add elements into the queue +* `pop`: Remove the element(s) with the lowest (when Max == false) or highest +* (when Max == true) keys +* +* The priority queue supports bulk host-side operations and more fine-grained +* device-side operations. +* +* The host-side bulk operations `push` and `pop` allow an arbitrary number of +* elements to be pushed to or popped from the queue. +* +* The device-side operations allow a cooperative group to push or pop +* some number of elements less than or equal to node_size. These device side +* operations are invoked with a trivially-copyable device view, +* `device_mutable_view` which can be obtained with the host function +* `get_mutable_device_view` and passed to the device. 
+* +* @tparam Key Trivially comparable type used for keys +* @tparam Value Type of the value to be stored +* @tparam Max When false, pop operations yield the elements with the smallest +* keys in the queue, otherwise, pop operations yeild the elements +* with the largest keys +*/ template class priority_queue { + public: /** * Construct a priority queue From 6b263e3e906cddbf02de7c41a4793ce382a65587 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Thu, 9 Sep 2021 16:40:33 -0700 Subject: [PATCH 04/55] Improve comments and switch to cuco style --- include/cuco/priority_queue.cuh | 66 ++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 18 deletions(-) diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 550c73f0e..3bf0fa1d3 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -50,7 +50,8 @@ class priority_queue { public: /** - * Construct a priority queue + * @brief Construct a priority queue + * * @param initial_capacity The number of elements the priority queue can hold * @param node_size The size of the nodes in the underlying heap data * structure @@ -58,7 +59,8 @@ class priority_queue { priority_queue(size_t initial_capacity, size_t node_size = 1024); /** - * Push num_elements elements into the priority queue + * @brief Push num_elements elements into the priority queue + * * @param elements Array of elements to add to the queue * @param num_elements Number of elements to add to the queue * @param block_size Block size to use for the internal kernel launch @@ -74,8 +76,9 @@ class priority_queue { cudaStream_t stream = 0); /** - * Remove the num_elements elements with the lowest keys from the priority + * @brief Remove the num_elements elements with the lowest keys from the priority * queue and place them in out in ascending sorted order by key + * * @param out The array in which the removed elements will be placed * @param num_elements The number of elements to be removed * @param block_size Block size to use for the internal kernel launch @@ -91,10 +94,11 @@ class priority_queue { cudaStream_t stream = 0); /* - * Return the amount of shared memory required for operations on the queue + * @brief Return the amount of shared memory required for operations on the queue * with a thread block size of block_size * * @param block_size Size of the blocks to calculate storage for + * @return The amount of temporary storage required in bytes */ int get_shmem_size(int block_size) { int intersection_bytes = 2 * (block_size + 1) * sizeof(int); @@ -102,14 +106,18 @@ class priority_queue { return intersection_bytes + 2 * node_bytes; } + /** + * @brief Destroys the queue and frees its contents + */ ~priority_queue(); class device_mutable_view { public: /** - * Push a single node or less elements into the priority queue + * @brief Push a single node or less elements into the priority queue * + * @tparam CG Cooperative Group type * @param g The cooperative group that will perform the operation * @param elements Array of elements to add to the queue * @param num_elements Number of elements to add to the queue @@ -121,8 +129,9 @@ class priority_queue { size_t num_elements, void *temp_storage); /** - * Pop a single node or less elements from the priority queue + * @brief Pop a single node or less elements from the priority queue * + * @tparam CG Cooperative Group type * @param g The cooperative group that will perform the operation * @param out Array of elements to put the removed elements in * @param num_elements Number of 
elements to remove from the queue @@ -133,15 +142,24 @@ class priority_queue { __device__ void pop(CG const& g, Pair *out, size_t num_elements, void *temp_storage); + /** + * @brief Returns the node size of the queue's underlying heap + * representation, i.e. the maximum number of elements + * pushable or poppable with a call to the device push + * and pop functions + * + * @return The underlying node size + */ __device__ size_t get_node_size() { return node_size_; } /* - * Return the amount of temporary storage required for operations + * @brief Return the amount of temporary storage required for operations * on the queue with a cooperative group size of block_size * * @param block_size Size of the cooperative groups to calculate storage for + * @return The amount of temporary storage required in bytes */ __device__ int get_shmem_size(int block_size) { int intersection_bytes = 2 * (block_size + 1) * sizeof(int); @@ -178,8 +196,11 @@ class priority_queue { }; /* - * Return a class that can be used to perform insertion and deletion - * of single nodes in device code with cooperative groups + * @brief Returns a trivailly-copyable class that can be used to perform + * insertion and deletion of single nodes in device code with + * cooperative groups + * + * @return A device view */ device_mutable_view get_mutable_device_view() { return device_mutable_view(node_size_, d_heap_, d_size_, d_p_buffer_size_, @@ -187,15 +208,24 @@ class priority_queue { } private: - size_t node_size_; - int lowest_level_start_; - int node_capacity_; - - Pair *d_heap_; - int *d_size_; - size_t *d_p_buffer_size_; - int *d_locks_; - int *d_pop_tracker_; + size_t node_size_; ///< Size of the heap's nodes + int lowest_level_start_; ///< Index in `d_heap_` of the first node in the + /// heap's lowest level + int node_capacity_; ///< Capacity of the heap in nodes + + Pair *d_heap_; ///< Pointer to an array of nodes, the 0th node + /// being the heap's partial buffer, and nodes + /// 1..(node_capacity_) being the heap, where the + /// 1st node is the root + int *d_size_; ///< Number of nodes currently in the heap + size_t *d_p_buffer_size_; ///< Number of elements currently in the partial + /// buffer + int *d_locks_; ///< Array of locks where `d_locks_[i]` is the + /// lock for the node starting at + /// 1d_heap_[node_size * i]` + int *d_pop_tracker_; ///< Variable used to track where in its output + /// array a pop operation should place a given + /// popped node }; } From 0eaaedf77c50dd04a6271a690fae69c2727a3b50 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Thu, 16 Sep 2021 20:51:38 -0700 Subject: [PATCH 05/55] Iterators --- .../priority_queue/priority_queue_bench.cu | 10 ++- include/cuco/detail/priority_queue.inl | 44 +++++----- .../cuco/detail/priority_queue_kernels.cuh | 82 ++++++++++++------- include/cuco/priority_queue.cuh | 46 +++++++---- tests/priority_queue/priority_queue_test.cu | 15 ++-- 5 files changed, 119 insertions(+), 78 deletions(-) diff --git a/benchmarks/priority_queue/priority_queue_bench.cu b/benchmarks/priority_queue/priority_queue_bench.cu index 66cddf96c..9d85b5f27 100644 --- a/benchmarks/priority_queue/priority_queue_bench.cu +++ b/benchmarks/priority_queue/priority_queue_bench.cu @@ -19,7 +19,8 @@ __global__ void DeviceAPIInsert( thread_block g = this_thread_block(); for (size_t i = blockIdx.x * view.get_node_size(); i < num_elements; i += gridDim.x * view.get_node_size()) { - view.push(g, elements + i, min(view.get_node_size(), num_elements - i), + view.push(g, elements + i, + elements + i + 
min(view.get_node_size(), num_elements - i), shmem); } } @@ -34,7 +35,8 @@ __global__ void DeviceAPIDelete( thread_block g = this_thread_block(); for (size_t i = blockIdx.x * view.get_node_size(); i < num_elements; i += gridDim.x * view.get_node_size()) { - view.pop(g, out + i, min(view.get_node_size(), num_elements - i), shmem); + view.pop(g, out + i, + out + i + min(view.get_node_size(), num_elements - i), shmem); } } @@ -64,7 +66,7 @@ float TimeInsert(priority_queue &pq, Pair *d_elements, size_t num_keys) { return TimeCode([&]() { - pq.push(d_elements, num_keys); + pq.push(d_elements, d_elements + num_keys); }); } @@ -94,7 +96,7 @@ float TimeDelete(priority_queue &pq, Pair *d_elements, size_t num_keys) { return TimeCode([&]() { - pq.pop(d_elements, num_keys); + pq.pop(d_elements, d_elements + num_keys); }); } diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index d9cfa7f76..367709a04 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -53,8 +53,9 @@ priority_queue::~priority_queue() { template -void priority_queue::push(Pair *elements, - size_t num_elements, +template +void priority_queue::push(InputIt first, + InputIt last, int block_size, int grid_size, bool warp_level, @@ -66,12 +67,12 @@ void priority_queue::push(Pair *elements, if (!warp_level) { PushKernel<<>> - (elements, num_elements, d_heap_, d_size_, + (first, last - first, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_); } else { PushKernelWarp<<>> - (elements, num_elements, d_heap_, d_size_, + (first, last - first, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_, get_shmem_size(32)); } @@ -80,8 +81,9 @@ void priority_queue::push(Pair *elements, } template -void priority_queue::pop(Pair *out, - size_t num_elements, +template +void priority_queue::pop(OutputIt first, + OutputIt last, int block_size, int grid_size, bool warp_level, @@ -94,13 +96,13 @@ void priority_queue::pop(Pair *out, if (!warp_level) { PopKernel<<>> - (out, num_elements, d_heap_, d_size_, + (first, last - first, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, d_pop_tracker_, lowest_level_start_, node_capacity_); } else { PopKernelWarp<<>> - (out, num_elements, d_heap_, d_size_, + (first, last - first, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, d_pop_tracker_, lowest_level_start_, node_capacity_, get_shmem_size(32)); @@ -111,32 +113,32 @@ void priority_queue::pop(Pair *out, } template -template +template __device__ void priority_queue::device_mutable_view::push( CG const& g, - Pair *elements, - size_t num_elements, + InputIt first, + InputIt last, void *temp_storage) { SharedMemoryLayout shmem = GetSharedMemoryLayout((int*)temp_storage, g.size(), node_size_); - if (num_elements == node_size_) { - PushSingleNode(g, elements, d_heap_, d_size_, node_size_, + if (last - first == node_size_) { + PushSingleNode(g, first, d_heap_, d_size_, node_size_, d_locks_, lowest_level_start_, shmem); - } else if (num_elements < node_size_) { - PushPartialNode(g, elements, num_elements, d_heap_, + } else if (last - first < node_size_) { + PushPartialNode(g, first, last - first, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_, shmem); } } template -template +template __device__ void priority_queue::device_mutable_view::pop( CG const& g, - Pair *out, - size_t num_elements, + OutputIt first, + OutputIt last, void *temp_storage) { int pop_tracker = 0; @@ -144,12 +146,12 @@ 
__device__ void priority_queue::device_mutable_view::pop( GetSharedMemoryLayout((int*)temp_storage, g.size(), node_size_); - if (num_elements == node_size_) { - PopSingleNode(g, out, d_heap_, d_size_, node_size_, d_locks_, + if (last - first == node_size_) { + PopSingleNode(g, first, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, &pop_tracker, lowest_level_start_, node_capacity_, shmem); } else { - PopPartialNode(g, out, num_elements, d_heap_, d_size_, node_size_, + PopPartialNode(g, first, last - first, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_, node_capacity_, shmem); } diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index 1d9886f68..5ba688e46 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -74,21 +74,37 @@ __device__ void ReleaseLock(CG const& g, int *l) { } /** -* Copy node_size pairs from src to dst +* Copy pairs from src to dst * * @param g The cooperative group that will perform the copy -* @param dst Pointer to the beginning of the destination array -* @param src Pointer to the beginning of the source array +* @param dst_start Iterator to the beginning of the destination array +* @param src_start Iterator to the beginning of the source array +* @param src_end Iterator to the end of the source array */ -template -__device__ void CopyPairs(CG const& g, Pair *dst, - Pair *src, - size_t node_size) { - for (size_t i = g.thread_rank(); i < node_size; i += g.size()) { - dst[i] = src[i]; +template +__device__ void CopyPairs(CG const& g, InputIt1 dst_start, + InputIt2 src_start, InputIt2 src_end) { + auto dst = dst_start + g.thread_rank(); + for (auto src = src_start + g.thread_rank(); + src < src_end; dst += g.size(), src += g.size()) { + *dst = *src; } } +/** +* Copy node_size pairs from src to dst +* +* @param g The cooperative group that will perform the copy +* @param dst_start Iterator to the beginning of the destination array +* @param src_start Iterator to the beginning of the source array +* @param num_pairs Number of pairs to copy +*/ +template +__device__ void CopyPairs(CG const& g, InputIt1 dst_start, + InputIt2 src_start, size_t num_pairs) { + CopyPairs(g, dst_start, src_start, src_start + num_pairs); +} + /* * Compare two elements depending on whether this is a max or * min queue. 
@@ -676,7 +692,7 @@ __device__ void Sink(CG const& g, * elements * * @param g The cooperative group that will perform the push -* @param elements The array of elements to add +* @param elements Iterator for the elements to be inserted * @param heap The array of pairs that stores the heap itself * @param size Pointer to the number of pairs currently in the heap * @param node_size Size of the nodes in the heap @@ -685,9 +701,10 @@ __device__ void Sink(CG const& g, * heap * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void PushSingleNode(CG const& g, - Pair *elements, + InputIt elements, Pair *heap, int *size, size_t node_size, @@ -698,7 +715,7 @@ __device__ void PushSingleNode(CG const& g, int lane = g.thread_rank(); int dim = g.size(); - CopyPairs(g, shmem.A, elements, node_size); + CopyPairs(g, shmem.A, elements, elements + node_size); g.sync(); @@ -727,7 +744,8 @@ __device__ void PushSingleNode(CG const& g, * Remove exactly node_size elements from the heap and place them * in elements * -* @param elements The array of elements to insert into +* @param g The cooperative group that will perform the pop +* @param elements Iterator to the elements to write to * @param heap The array of pairs that stores the heap itself * @param size Pointer to the number of pairs currently in the heap * @param node_size Size of the nodes in the heap @@ -740,9 +758,10 @@ __device__ void PushSingleNode(CG const& g, * @param node_capacity Maximum capacity of the heap in nodes * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void PopSingleNode(CG const& g, - Pair *elements, + OutputIt elements, Pair *heap, int *size, size_t node_size, @@ -781,8 +800,8 @@ __device__ void PopSingleNode(CG const& g, // Copy the root to the output array - CopyPairs(g, &elements[out_idx * node_size], &heap[node_size], - node_size); + CopyPairs(g, elements + out_idx * node_size, &heap[node_size], + &heap[node_size] + node_size); g.sync(); @@ -837,9 +856,10 @@ __device__ void PopSingleNode(CG const& g, * @param node_capacity Maximum capacity of the heap in nodes * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void PopPartialNode(CG const& g, - Pair *elements, + InputIt elements, size_t num_elements, Pair *heap, int *size, @@ -982,9 +1002,10 @@ __device__ void PopPartialNode(CG const& g, * heap * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void PushPartialNode(CG const& g, - Pair *elements, + InputIt elements, size_t p_ins_size, Pair *heap, int *size, @@ -1105,8 +1126,8 @@ __device__ void PushPartialNode(CG const& g, * @param temp_node A temporary array large enough to store sizeof(Pair) * node_size bytes */ -template -__global__ void PushKernel(Pair *elements, +template +__global__ void PushKernel(OutputIt elements, size_t num_elements, Pair *heap, int *size, @@ -1162,8 +1183,9 @@ __global__ void PushKernel(Pair *elements, * @param temp_node A temporary array large enough to store sizeof(Pair) * node_size bytes */ -template -__global__ void PushKernelWarp(Pair *elements, +template +__global__ void PushKernelWarp(InputIt elements, size_t num_elements, Pair *heap, int *size, @@ -1223,8 +1245,8 @@ __global__ void PushKernelWarp(Pair *elements, * @param p_buffer_size Number of pairs in the heap's partial buffer * @param pop_tracker Pointer to an integer in global memory initialized to 0 */ -template -__global__ void 
PopKernelWarp(Pair *elements, +template +__global__ void PopKernelWarp(OutputIt elements, size_t num_elements, Pair *heap, int *size, @@ -1294,8 +1316,8 @@ __global__ void PopKernelWarp(Pair *elements, * @param p_buffer_size Number of pairs in the heap's partial buffer * @param pop_tracker Pointer to an integer in global memory initialized to 0 */ -template -__global__ void PopKernel(Pair *elements, +template +__global__ void PopKernel(OutputIt elements, size_t num_elements, Pair *heap, int *size, diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 3bf0fa1d3..bcc0315bb 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -59,9 +59,12 @@ class priority_queue { priority_queue(size_t initial_capacity, size_t node_size = 1024); /** - * @brief Push num_elements elements into the priority queue + * @brief Push elements into the priority queue * - * @param elements Array of elements to add to the queue + * @tparam InputIt Device accessible input iterator whose `value_type` + * can be converted to Pair + * @param first Beginning of the sequence of elements + * @param last End of the sequence of elements * @param num_elements Number of elements to add to the queue * @param block_size Block size to use for the internal kernel launch * @param grid_size Grid size for the internal kernel launch @@ -70,16 +73,20 @@ class priority_queue { * @param stream The stream in which the underlying GPU operations will be * run */ - void push(Pair *elements, size_t num_elements, + template + void push(InputIt first, InputIt last, int block_size = 256, int grid_size = 64000, bool warp_level = false, cudaStream_t stream = 0); /** - * @brief Remove the num_elements elements with the lowest keys from the priority - * queue and place them in out in ascending sorted order by key + * @brief Remove a sequence of the lowest (when Max == false) or the + * highest (when Max == true) elements * - * @param out The array in which the removed elements will be placed + * @tparam OutputIt Device accessible output iterator whose `value_type` + * can be converted to Pair + * @param first Beginning of the sequence of output elements + * @param last End of the sequence of output elements * @param num_elements The number of elements to be removed * @param block_size Block size to use for the internal kernel launch * @param grid_size Grid size for the internal kernel launch @@ -88,7 +95,8 @@ class priority_queue { * @param stream The stream in which the underlying GPU operations will be * run */ - void pop(Pair *out, size_t num_elements, + template + void pop(OutputIt first, OutputIt last, int block_size = 512, int grid_size = 32000, bool warp_level = false, cudaStream_t stream = 0); @@ -118,29 +126,33 @@ class priority_queue { * @brief Push a single node or less elements into the priority queue * * @tparam CG Cooperative Group type + * @tparam Device accessible iterator whose `value_type` is convertible + * to Pair * @param g The cooperative group that will perform the operation - * @param elements Array of elements to add to the queue - * @param num_elements Number of elements to add to the queue + * @param first The beginning of the sequence of elements to insert + * @param last The end of the sequence of elements to insert * @param Pointer to a contiguous section of memory large enough * to hold get_shmem_size(g.size()) bytes */ - template - __device__ void push(CG const& g, Pair *elements, - size_t num_elements, void *temp_storage); + template + __device__ void 
push(CG const& g, InputIt first, + InputIt last, void *temp_storage); /** * @brief Pop a single node or less elements from the priority queue * * @tparam CG Cooperative Group type + * @tparam Device accessible iterator whose `value_type` is convertible to + Pair * @param g The cooperative group that will perform the operation - * @param out Array of elements to put the removed elements in - * @param num_elements Number of elements to remove from the queue + * @param first The beginning of the sequence of elements to output into + * @param last The end of the sequence of elements to output into * @param Pointer to a contiguous section of memory large enough * to hold get_shmem_size(g.size()) bytes */ - template - __device__ void pop(CG const& g, Pair *out, - size_t num_elements, void *temp_storage); + template + __device__ void pop(CG const& g, OutputIt first, + OutputIt last, void *temp_storage); /** * @brief Returns the node size of the queue's underlying heap diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index c982f46d5..1ca4b96ab 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -31,7 +31,7 @@ void Insert(priority_queue &pq, CUCO_CUDA_TRY(cudaMemcpy(d_elements, &elements[0], num_bytes, cudaMemcpyHostToDevice)); - pq.push(d_elements, elements.size(), 512, 32000, warp_level); + pq.push(d_elements, d_elements + elements.size(), 512, 32000, warp_level); CUCO_CUDA_TRY(cudaFree(d_elements)); } @@ -48,7 +48,7 @@ std::vector> Delete(priority_queue &pq, CUCO_CUDA_TRY(cudaMalloc((void**)&d_elements, num_bytes)); - pq.pop(d_elements, num_elements, 512, 32, warp_level); + pq.pop(d_elements, d_elements + num_elements, 512, 32, warp_level); std::vector> result(num_elements); @@ -69,7 +69,8 @@ __global__ void DeviceAPIInsert( thread_block g = this_thread_block(); for (size_t i = blockIdx.x * view.get_node_size(); i < num_elements; i += gridDim.x * view.get_node_size()) { - view.push(g, elements + i, min(view.get_node_size(), num_elements - i), + view.push(g, elements + i, elements + i + + min(view.get_node_size(), num_elements - i), shmem); } } @@ -84,7 +85,8 @@ __global__ void DeviceAPIDelete( thread_block g = this_thread_block(); for (size_t i = blockIdx.x * view.get_node_size(); i < num_elements; i += gridDim.x * view.get_node_size()) { - view.pop(g, out + i, min(view.get_node_size(), num_elements - i), shmem); + view.pop(g, out + i, out + + i + min(view.get_node_size(), num_elements - i), shmem); } } @@ -101,7 +103,7 @@ __global__ void DeviceAPIInsertWarp( + warp.meta_group_rank() * view.get_node_size(); i < num_elements; i += gridDim.x * view.get_node_size() * blockDim.x / kWarpSize) { - view.push(warp, elements + i, min(view.get_node_size(), + view.push(warp, elements + i, elements + i + min(view.get_node_size(), num_elements - i), (char*)shmem + warp.meta_group_rank() * view.get_shmem_size(kWarpSize)); } @@ -116,7 +118,8 @@ __global__ void DeviceAPIDeleteWarp( thread_block g = this_thread_block(); for (size_t i = blockIdx.x * view.get_node_size(); i < num_elements; i += gridDim.x * view.get_node_size()) { - view.pop(g, out + i, min(view.get_node_size(), num_elements - i), shmem); + view.pop(g, out + i, out + i + min(view.get_node_size(), + num_elements - i), shmem); } } // Each test case is composed of a name From 249165c03e6868c127a114f34b68c60a6c263eb8 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Thu, 16 Sep 2021 21:10:27 -0700 Subject: [PATCH 06/55] Test for iterators 
with thrust device_vector --- tests/priority_queue/priority_queue_test.cu | 40 +++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 1ca4b96ab..62c1ace20 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -13,6 +13,8 @@ #include +#include + using namespace cooperative_groups; using namespace cuco; @@ -1166,6 +1168,44 @@ TestCase cases[] = { return result; } }, + + {"test_insert_10M_thrust_device_vec_iterator", []() { + int num_keys = 10e6; + + srand(0); + + priority_queue pq(num_keys); + + std::vector std_vec; + + IntIntVector input(num_keys); + for (int i = 0; i < num_keys; i++) { + int32_t next = rand(); + input[i] = {next, 1}; + std_vec.push_back(next); + } + + thrust::device_vector> d_input(input); + + pq.push(d_input.begin(), d_input.end()); + + std::sort(std_vec.begin(), std_vec.end()); + + auto result_vec = Delete(pq, num_keys); + + bool result = true; + for (int i = 0; i < num_keys; i++) { + bool next = result_vec[i].key == std_vec[i]; + if (result && !next) { + std::cout << i << ": " << " expected " << std_vec[i] << " got " + << result_vec[i].key << std::endl; + } + result = result && next; + } + + return result; + } + }, }; int main() { From c28a5ad824bac1cd708e8d2b60d8dd9ab97f7bb4 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 19 Oct 2021 19:54:51 +0000 Subject: [PATCH 07/55] Add allocator template parameter --- include/cuco/detail/priority_queue.inl | 65 +++++++++++++++++--------- include/cuco/priority_queue.cuh | 8 +++- 2 files changed, 50 insertions(+), 23 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index 367709a04..abd360919 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -6,11 +6,27 @@ namespace cuco { -template -priority_queue::priority_queue(size_t initial_capacity, - size_t node_size) { +template +priority_queue::priority_queue + (size_t initial_capacity, + size_t node_size, + Allocator const& allocator) : + allocator_{allocator} { node_size_ = node_size; + + using int_allocator_type = typename std::allocator_traits + ::rebind_alloc; + + using pair_allocator_type = typename std::allocator_traits + ::rebind_alloc>; + + using size_t_allocator_type = typename std::allocator_traits + ::rebind_alloc; + + int_allocator_type int_allocator{allocator}; + pair_allocator_type pair_allocator{allocator}; + size_t_allocator_type size_t_allocator{allocator}; // Round up to the nearest multiple of node size int nodes = ((initial_capacity + node_size_ - 1) / node_size_); @@ -20,30 +36,35 @@ priority_queue::priority_queue(size_t initial_capacity, // Allocate device variables - CUCO_CUDA_TRY(cudaMalloc((void**)&d_size_, sizeof(int))); + d_size_ = std::allocator_traits::allocate(int_allocator, + (size_t)sizeof(int)); CUCO_CUDA_TRY(cudaMemset(d_size_, 0, sizeof(int))); - CUCO_CUDA_TRY(cudaMalloc((void**)&d_p_buffer_size_, sizeof(size_t))); + d_p_buffer_size_ = std::allocator_traits::allocate( + size_t_allocator, + sizeof(size_t)); CUCO_CUDA_TRY(cudaMemset(d_p_buffer_size_, 0, sizeof(size_t))); - CUCO_CUDA_TRY(cudaMalloc((void**)&d_heap_, - sizeof(Pair) - * (node_capacity_ * node_size_ + node_size_))); + d_heap_ = std::allocator_traits::allocate(pair_allocator, + sizeof(Pair) + * (node_capacity_ * node_size_ + node_size_)); - CUCO_CUDA_TRY(cudaMalloc((void**)&d_locks_, - sizeof(int) * (node_capacity_ + 1))); + 
d_locks_ = std::allocator_traits::allocate(int_allocator, + sizeof(int) * (node_capacity_ + 1)); CUCO_CUDA_TRY(cudaMemset(d_locks_, 0, sizeof(int) * (node_capacity_ + 1))); - CUCO_CUDA_TRY(cudaMalloc((void**)&d_pop_tracker_, sizeof(int))); + d_pop_tracker_ = std::allocator_traits::allocate( + int_allocator, + sizeof(int)); } -template -priority_queue::~priority_queue() { +template +priority_queue::~priority_queue() { CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_size_)); CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_p_buffer_size_)); CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_heap_)); @@ -52,9 +73,9 @@ priority_queue::~priority_queue() { } -template +template template -void priority_queue::push(InputIt first, +void priority_queue::push(InputIt first, InputIt last, int block_size, int grid_size, @@ -80,9 +101,9 @@ void priority_queue::push(InputIt first, CUCO_CUDA_TRY(cudaGetLastError()); } -template +template template -void priority_queue::pop(OutputIt first, +void priority_queue::pop(OutputIt first, OutputIt last, int block_size, int grid_size, @@ -112,9 +133,10 @@ void priority_queue::pop(OutputIt first, CUCO_CUDA_TRY(cudaGetLastError()); } -template +template template -__device__ void priority_queue::device_mutable_view::push( +__device__ void priority_queue + ::device_mutable_view::push( CG const& g, InputIt first, InputIt last, @@ -133,9 +155,10 @@ __device__ void priority_queue::device_mutable_view::push( } } -template +template template -__device__ void priority_queue::device_mutable_view::pop( +__device__ void priority_queue + ::device_mutable_view::pop( CG const& g, OutputIt first, OutputIt last, diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index bcc0315bb..b5d8c93b0 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -3,6 +3,7 @@ #include #include #include +#include namespace cuco { @@ -45,7 +46,8 @@ namespace cuco { * keys in the queue, otherwise, pop operations yeild the elements * with the largest keys */ -template +template > class priority_queue { public: @@ -56,7 +58,8 @@ class priority_queue { * @param node_size The size of the nodes in the underlying heap data * structure */ - priority_queue(size_t initial_capacity, size_t node_size = 1024); + priority_queue(size_t initial_capacity, size_t node_size = 1024, + Allocator const& alloc = Allocator{}); /** * @brief Push elements into the priority queue @@ -238,6 +241,7 @@ class priority_queue { int *d_pop_tracker_; ///< Variable used to track where in its output /// array a pop operation should place a given /// popped node + Allocator allocator_; }; } From e8a9c1e247a05718fd1c142461225db7e6c3de0b Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Wed, 20 Oct 2021 06:45:35 +0000 Subject: [PATCH 08/55] Allocator --- include/cuco/detail/priority_queue.inl | 57 ++++++++++++-------------- include/cuco/priority_queue.cuh | 12 ++++++ 2 files changed, 39 insertions(+), 30 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index abd360919..b042356a2 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -11,23 +11,13 @@ priority_queue::priority_queue (size_t initial_capacity, size_t node_size, Allocator const& allocator) : - allocator_{allocator} { + allocator_{allocator}, + int_allocator_{allocator}, + pair_allocator_{allocator}, + size_t_allocator_{allocator} { node_size_ = node_size; - using int_allocator_type = typename std::allocator_traits - ::rebind_alloc; - - using pair_allocator_type = 
typename std::allocator_traits - ::rebind_alloc>; - - using size_t_allocator_type = typename std::allocator_traits - ::rebind_alloc; - - int_allocator_type int_allocator{allocator}; - pair_allocator_type pair_allocator{allocator}; - size_t_allocator_type size_t_allocator{allocator}; - // Round up to the nearest multiple of node size int nodes = ((initial_capacity + node_size_ - 1) / node_size_); @@ -36,40 +26,47 @@ priority_queue::priority_queue // Allocate device variables - d_size_ = std::allocator_traits::allocate(int_allocator, - (size_t)sizeof(int)); + d_size_ = std::allocator_traits::allocate(int_allocator_, + 1); CUCO_CUDA_TRY(cudaMemset(d_size_, 0, sizeof(int))); d_p_buffer_size_ = std::allocator_traits::allocate( - size_t_allocator, - sizeof(size_t)); + size_t_allocator_, + 1); CUCO_CUDA_TRY(cudaMemset(d_p_buffer_size_, 0, sizeof(size_t))); - d_heap_ = std::allocator_traits::allocate(pair_allocator, - sizeof(Pair) - * (node_capacity_ * node_size_ + node_size_)); + d_heap_ = std::allocator_traits::allocate(pair_allocator_, + node_capacity_ * node_size_ + node_size_); - d_locks_ = std::allocator_traits::allocate(int_allocator, - sizeof(int) * (node_capacity_ + 1)); + d_locks_ = std::allocator_traits::allocate(int_allocator_, + node_capacity_ + 1); CUCO_CUDA_TRY(cudaMemset(d_locks_, 0, sizeof(int) * (node_capacity_ + 1))); d_pop_tracker_ = std::allocator_traits::allocate( - int_allocator, - sizeof(int)); + int_allocator_, + 1); } template priority_queue::~priority_queue() { - CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_size_)); - CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_p_buffer_size_)); - CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_heap_)); - CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_locks_)); - CUCO_ASSERT_CUDA_SUCCESS(cudaFree(d_pop_tracker_)); + std::allocator_traits::deallocate(int_allocator_, + d_size_, 1); + std::allocator_traits::deallocate(size_t_allocator_, + d_p_buffer_size_, 1); + std::allocator_traits::deallocate(pair_allocator_, + d_heap_, + node_capacity_ * node_size_ + node_size_); + std::allocator_traits::deallocate(int_allocator_, + d_locks_, + node_capacity_ + 1); + std::allocator_traits::deallocate(int_allocator_, + d_pop_tracker_, + 1); } diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index b5d8c93b0..be82952bf 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -50,6 +50,15 @@ template > class priority_queue { + using int_allocator_type = typename std::allocator_traits + ::rebind_alloc; + + using pair_allocator_type = typename std::allocator_traits + ::rebind_alloc>; + + using size_t_allocator_type = typename std::allocator_traits + ::rebind_alloc; + public: /** * @brief Construct a priority queue @@ -242,6 +251,9 @@ class priority_queue { /// array a pop operation should place a given /// popped node Allocator allocator_; + int_allocator_type int_allocator_; + pair_allocator_type pair_allocator_; + size_t_allocator_type size_t_allocator_; }; } From 012ebdedfcafaa7ff4ed956192923ba5577f3687 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Wed, 20 Oct 2021 19:44:01 +0000 Subject: [PATCH 09/55] Accept arbitrary comparison --- include/cuco/detail/priority_queue.inl | 58 ++--- .../cuco/detail/priority_queue_kernels.cuh | 233 +++++++++--------- include/cuco/priority_queue.cuh | 16 +- tests/priority_queue/priority_queue_test.cu | 10 +- 4 files changed, 167 insertions(+), 150 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index b042356a2..ce20fc7bc 100644 --- 
a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -6,8 +6,8 @@ namespace cuco { -template -priority_queue::priority_queue +template +priority_queue::priority_queue (size_t initial_capacity, size_t node_size, Allocator const& allocator) : @@ -52,8 +52,8 @@ priority_queue::priority_queue } -template -priority_queue::~priority_queue() { +template +priority_queue::~priority_queue() { std::allocator_traits::deallocate(int_allocator_, d_size_, 1); std::allocator_traits::deallocate(size_t_allocator_, @@ -70,9 +70,9 @@ priority_queue::~priority_queue() { } -template +template template -void priority_queue::push(InputIt first, +void priority_queue::push(InputIt first, InputIt last, int block_size, int grid_size, @@ -83,24 +83,25 @@ void priority_queue::push(InputIt first, const int kNumBlocks = grid_size; if (!warp_level) { - PushKernel<<>> (first, last - first, d_heap_, d_size_, - node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_); + node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_, + compare_); } else { - PushKernelWarp<<>> (first, last - first, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, - lowest_level_start_, get_shmem_size(32)); + lowest_level_start_, get_shmem_size(32), compare_); } CUCO_CUDA_TRY(cudaGetLastError()); } -template +template template -void priority_queue::pop(OutputIt first, +void priority_queue::pop(OutputIt first, OutputIt last, int block_size, int grid_size, @@ -112,27 +113,27 @@ void priority_queue::pop(OutputIt first, cudaMemset(d_pop_tracker_, 0, sizeof(int)); if (!warp_level) { - PopKernel<<>> (first, last - first, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, - d_pop_tracker_, lowest_level_start_, node_capacity_); + d_pop_tracker_, lowest_level_start_, node_capacity_, compare_); } else { - PopKernelWarp<<>> (first, last - first, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, d_pop_tracker_, lowest_level_start_, - node_capacity_, get_shmem_size(32)); + node_capacity_, get_shmem_size(32), compare_); } CUCO_CUDA_TRY(cudaGetLastError()); } -template +template template -__device__ void priority_queue +__device__ void priority_queue ::device_mutable_view::push( CG const& g, InputIt first, @@ -143,18 +144,19 @@ __device__ void priority_queue GetSharedMemoryLayout((int*)temp_storage, g.size(), node_size_); if (last - first == node_size_) { - PushSingleNode(g, first, d_heap_, d_size_, node_size_, - d_locks_, lowest_level_start_, shmem); + PushSingleNode(g, first, d_heap_, d_size_, node_size_, + d_locks_, lowest_level_start_, shmem, compare_); } else if (last - first < node_size_) { - PushPartialNode(g, first, last - first, d_heap_, + PushPartialNode(g, first, last - first, d_heap_, d_size_, node_size_, d_locks_, - d_p_buffer_size_, lowest_level_start_, shmem); + d_p_buffer_size_, lowest_level_start_, shmem, + compare_); } } -template +template template -__device__ void priority_queue +__device__ void priority_queue ::device_mutable_view::pop( CG const& g, OutputIt first, @@ -167,13 +169,13 @@ __device__ void priority_queue g.size(), node_size_); if (last - first == node_size_) { - PopSingleNode(g, first, d_heap_, d_size_, node_size_, d_locks_, + PopSingleNode(g, first, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, &pop_tracker, lowest_level_start_, - node_capacity_, shmem); + node_capacity_, shmem, compare_); } else { - PopPartialNode(g, first, last - first, d_heap_, d_size_, node_size_, + PopPartialNode(g, first, last - first, d_heap_, d_size_, node_size_, d_locks_, 
d_p_buffer_size_, lowest_level_start_, - node_capacity_, shmem); + node_capacity_, shmem, compare_); } } diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index 5ba688e46..4485d4f89 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -105,24 +105,6 @@ __device__ void CopyPairs(CG const& g, InputIt1 dst_start, CopyPairs(g, dst_start, src_start, src_start + num_pairs); } -/* -* Compare two elements depending on whether this is a max or -* min queue. -* -* @param a The first element to be compared -* @param b The second element to be compared -* @returns If Max, returns true iff a.key >= b.key, -* otherwise returns true iff a.key <= b.key. -*/ -template -__device__ bool compare(const Pair& a, const Pair& b) { - if (Max) { - return a.key >= b.key; - } else { - return a.key <= b.key; - } -} - /** * Merge arrays a and b of size node_size by key, putting the * node_size elements with the lowest keys in lo, sorted by key, and the @@ -138,15 +120,17 @@ __device__ bool compare(const Pair& a, const Pair& b) { * @param node_size The size of arrays a, b, lo, and hi * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void MergeAndSort(CG const& g, Pair *a, Pair *b, Pair *lo, Pair *hi, size_t node_size, - SharedMemoryLayout shmem) { - MergeAndSort(g, a, b, lo, hi, node_size, node_size, node_size, shmem); + SharedMemoryLayout shmem, + Compare const& compare) { + MergeAndSort(g, a, b, lo, hi, node_size, + node_size, node_size, shmem, compare); } /** @@ -171,7 +155,7 @@ __device__ void MergeAndSort(CG const& g, * hi * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void MergeAndSort(CG const& g, Pair *a, Pair *b, @@ -180,13 +164,14 @@ __device__ void MergeAndSort(CG const& g, size_t num_elements_a, size_t num_elements_b, size_t node_size, - SharedMemoryLayout shmem) { + SharedMemoryLayout shmem, + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); if (num_elements_a == node_size && - compare(a[node_size - 1], b[0])) { + compare(a[node_size - 1].key, b[0].key)) { CopyPairs(g, lo, a, num_elements_a); @@ -195,7 +180,7 @@ __device__ void MergeAndSort(CG const& g, } if (num_elements_b == node_size && - compare(b[node_size - 1], a[0])) { + compare(b[node_size - 1].key, a[0].key)) { CopyPairs(g, hi, a, num_elements_a); @@ -241,7 +226,7 @@ __device__ void MergeAndSort(CG const& g, if (i >= num_elements_a) { leftmost_zero = i; - } else if (j >= num_elements_b || compare(a[i], b[j])) { + } else if (j >= num_elements_b || compare(a[i].key, b[j].key)) { rightmost_one = i; } else { leftmost_zero = i; @@ -272,7 +257,7 @@ __device__ void MergeAndSort(CG const& g, // Merge our partition into the output arrays while (i < i_max && j < j_max) { Pair next_element; - if (compare(a[i], b[j])) { + if (compare(a[i].key, b[j].key)) { next_element = a[i]; i++; } else { @@ -323,10 +308,11 @@ __device__ void MergeAndSort(CG const& g, * @param temp A temporary array containing space for at least the nearest * power of two greater than len pairs */ -template +template __device__ void PBSort(CG const& g, Pair *start, size_t len, size_t node_size, - Pair *temp) { + Pair *temp, + Compare const& compare) { int lane = g.thread_rank(); @@ -348,7 +334,7 @@ __device__ void PBSort(CG const& g, Pair *start, size_t len, int right = left + jump; if ((i / start_jump) % 2 == 0) { if (!mask[left] || 
(mask[right] && - !compare(start[left], start[right]))) { + !compare(start[left].key, start[right].key))) { auto temp = start[left]; start[left] = start[right]; start[right] = temp; @@ -359,7 +345,7 @@ __device__ void PBSort(CG const& g, Pair *start, size_t len, } } else { if (!mask[right] || (mask[left] - && compare(start[left], start[right]))) { + && compare(start[left].key, start[right].key))) { auto temp = start[left]; start[left] = start[right]; start[right] = temp; @@ -380,7 +366,7 @@ __device__ void PBSort(CG const& g, Pair *start, size_t len, int left = (i / jump) * jump * 2 + i % jump; int right = left + jump; if (!mask[left] || (mask[right] - && !compare(start[left], start[right]))) { + && !compare(start[left].key, start[right].key))) { auto temp = start[left]; start[left] = start[right]; start[right] = temp; @@ -506,7 +492,7 @@ __device__ int RightChild(int x, int lowest_level_start) { * heap * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void Swim(CG const& g, int cur_node, Pair *heap, @@ -514,7 +500,8 @@ __device__ void Swim(CG const& g, size_t node_size, int *locks, int lowest_level_start, - SharedMemoryLayout shmem) { + SharedMemoryLayout shmem, + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); @@ -527,18 +514,19 @@ __device__ void Swim(CG const& g, // If the heap property is already satisfied for this node and its // parent we are done - if (!compare(heap[cur_node * node_size], - heap[parent * node_size + node_size - 1])) { + if (!compare(heap[cur_node * node_size].key, + heap[parent * node_size + node_size - 1].key)) { ReleaseLock(g, &(locks[parent])); break; } - MergeAndSort(g, &heap[parent * node_size], + MergeAndSort(g, &heap[parent * node_size], &heap[cur_node * node_size], shmem.A, shmem.B, node_size, - shmem); + shmem, + compare); g.sync(); @@ -568,7 +556,7 @@ __device__ void Swim(CG const& g, * @param node_capacity Max capacity of the heap in nodes * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void Sink(CG const& g, Pair *heap, int *size, @@ -577,7 +565,8 @@ __device__ void Sink(CG const& g, size_t *p_buffer_size, int lowest_level_start, int node_capacity, - SharedMemoryLayout shmem) { + SharedMemoryLayout shmem, + Compare const& compare) { size_t cur = kRootIdx; @@ -616,8 +605,8 @@ __device__ void Sink(CG const& g, // In order to ensure we preserve the heap property, // we put the largest node_size elements in the child // that previously contained the largest element - if (!compare(heap[(left+1) * node_size - 1], - heap[(right+1) * node_size - 1])) { + if (!compare(heap[(left+1) * node_size - 1].key, + heap[(right+1) * node_size - 1].key)) { hi = left; lo = right; } else { @@ -627,14 +616,15 @@ __device__ void Sink(CG const& g, // Skip the merge and sort if the nodes are already correctly // sorted - if (!compare(heap[(lo+1) * node_size - 1], - heap[hi * node_size])) { - MergeAndSort(g, &heap[left * node_size], + if (!compare(heap[(lo+1) * node_size - 1].key, + heap[hi * node_size].key)) { + MergeAndSort(g, &heap[left * node_size], &heap[right * node_size], shmem.A, shmem.B, node_size, - shmem); + shmem, + compare); g.sync(); @@ -657,19 +647,20 @@ __device__ void Sink(CG const& g, // // TODO: can this ever even occur? 
In the paper this is done because // a max placeholder value is used to indicate unused nodes in the heap - if (!compare(heap[lo * node_size], - heap[(cur + 1) * node_size - 1])) { + if (!compare(heap[lo * node_size].key, + heap[(cur + 1) * node_size - 1].key)) { ReleaseLock(g, &locks[lo]); ReleaseLock(g, &locks[cur]); return; } - MergeAndSort(g, &heap[lo * node_size], + MergeAndSort(g, &heap[lo * node_size], &heap[cur * node_size], shmem.A, shmem.B, node_size, - shmem); + shmem, + compare); g.sync(); @@ -701,8 +692,8 @@ __device__ void Sink(CG const& g, * heap * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void PushSingleNode(CG const& g, InputIt elements, Pair *heap, @@ -710,7 +701,8 @@ __device__ void PushSingleNode(CG const& g, size_t node_size, int *locks, int lowest_level_start, - SharedMemoryLayout shmem) { + SharedMemoryLayout shmem, + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); @@ -719,7 +711,7 @@ __device__ void PushSingleNode(CG const& g, g.sync(); - PBSort(g, shmem.A, node_size, node_size, shmem.B); + PBSort(g, shmem.A, node_size, node_size, shmem.B, compare); int *cur_node_temp = (int*)shmem.intersections; if (lane == 0) { @@ -735,8 +727,8 @@ __device__ void PushSingleNode(CG const& g, g.sync(); - Swim(g, cur_node, heap, size, node_size, locks, - lowest_level_start, shmem); + Swim(g, cur_node, heap, size, node_size, locks, + lowest_level_start, shmem, compare); } @@ -758,8 +750,8 @@ __device__ void PushSingleNode(CG const& g, * @param node_capacity Maximum capacity of the heap in nodes * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void PopSingleNode(CG const& g, OutputIt elements, Pair *heap, @@ -770,7 +762,8 @@ __device__ void PopSingleNode(CG const& g, int *pop_tracker, int lowest_level_start, int node_capacity, - SharedMemoryLayout shmem) { + SharedMemoryLayout shmem, + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); @@ -818,14 +811,15 @@ __device__ void PopSingleNode(CG const& g, // Merge and sort the root and the partial buffer - MergeAndSort(g, &heap[node_size], + MergeAndSort(g, &heap[node_size], &heap[kPBufferIdx], shmem.A, shmem.B, node_size, *p_buffer_size, node_size, - shmem); + shmem, + compare); g.sync(); @@ -835,8 +829,8 @@ __device__ void PopSingleNode(CG const& g, g.sync(); - Sink(g, heap, size, node_size, locks, p_buffer_size, - lowest_level_start, node_capacity, shmem); + Sink(g, heap, size, node_size, locks, p_buffer_size, + lowest_level_start, node_capacity, shmem, compare); } @@ -856,8 +850,8 @@ __device__ void PopSingleNode(CG const& g, * @param node_capacity Maximum capacity of the heap in nodes * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void PopPartialNode(CG const& g, InputIt elements, size_t num_elements, @@ -868,7 +862,8 @@ __device__ void PopPartialNode(CG const& g, size_t *p_buffer_size, int lowest_level_start, int node_capacity, - SharedMemoryLayout shmem) { + SharedMemoryLayout shmem, + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); @@ -905,14 +900,15 @@ __device__ void PopPartialNode(CG const& g, if (*p_buffer_size >= num_elements) { - MergeAndSort(g, &heap[kPBufferIdx], + MergeAndSort(g, &heap[kPBufferIdx], &heap[kRootIdx * node_size] + num_elements, shmem.A, shmem.B, *p_buffer_size, node_size - num_elements, node_size, - shmem); + shmem, + compare); if (lane == 0) { *p_buffer_size = *p_buffer_size 
- num_elements; @@ -925,18 +921,19 @@ __device__ void PopPartialNode(CG const& g, g.sync(); - Sink(g, heap, size, node_size, locks, p_buffer_size, - lowest_level_start, node_capacity, shmem); + Sink(g, heap, size, node_size, locks, p_buffer_size, + lowest_level_start, node_capacity, shmem, compare); } else { - MergeAndSort(g, &heap[kPBufferIdx], + MergeAndSort(g, &heap[kPBufferIdx], &heap[kRootIdx * node_size] + num_elements, shmem.A, (Pair*)nullptr, *p_buffer_size, node_size - num_elements, node_size, - shmem); + shmem, + compare); g.sync(); @@ -960,14 +957,15 @@ __device__ void PopPartialNode(CG const& g, ReleaseLock(g, &locks[tar]); - MergeAndSort(g, &heap[node_size], + MergeAndSort(g, &heap[node_size], &heap[kPBufferIdx], shmem.A, shmem.B, node_size, *p_buffer_size, node_size, - shmem); + shmem, + compare); g.sync(); @@ -977,8 +975,9 @@ __device__ void PopPartialNode(CG const& g, g.sync(); - Sink(g, heap, size, node_size, locks, - p_buffer_size, lowest_level_start, node_capacity, shmem); + Sink(g, heap, size, node_size, locks, + p_buffer_size, lowest_level_start, node_capacity, shmem, + compare); } else { ReleaseLock(g, &locks[kRootIdx]); } @@ -1002,8 +1001,8 @@ __device__ void PopPartialNode(CG const& g, * heap * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void PushPartialNode(CG const& g, InputIt elements, size_t p_ins_size, @@ -1013,7 +1012,8 @@ __device__ void PushPartialNode(CG const& g, int *locks, size_t *p_buffer_size, int lowest_level_start, - SharedMemoryLayout shmem) { + SharedMemoryLayout shmem, + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); @@ -1022,7 +1022,7 @@ __device__ void PushPartialNode(CG const& g, CopyPairs(g, shmem.B, elements, p_ins_size); - PBSort(g, shmem.B, p_ins_size, node_size, shmem.A); + PBSort(g, shmem.B, p_ins_size, node_size, shmem.A, compare); // There is enough data for a new node, in which case we // construct a new node and insert it @@ -1042,14 +1042,15 @@ __device__ void PushPartialNode(CG const& g, g.sync(); - MergeAndSort(g, shmem.B, + MergeAndSort(g, shmem.B, &heap[kPBufferIdx], &heap[cur_node * node_size], shmem.A, p_ins_size, *p_buffer_size, node_size, - shmem); + shmem, + compare); if (lane == 0) { *p_buffer_size = (*p_buffer_size + p_ins_size) - node_size; @@ -1063,8 +1064,8 @@ __device__ void PushPartialNode(CG const& g, ReleaseLock(g, &locks[kRootIdx]); } - Swim(g, cur_node, heap, size, node_size, - locks, lowest_level_start, shmem); + Swim(g, cur_node, heap, size, node_size, + locks, lowest_level_start, shmem, compare); } else { // There are not enough elements for a new node, @@ -1072,14 +1073,15 @@ __device__ void PushPartialNode(CG const& g, // the elements to be inserted and then the root // and the partial buffer - MergeAndSort(g, shmem.B, + MergeAndSort(g, shmem.B, &heap[kPBufferIdx], shmem.A, (Pair *)nullptr, p_ins_size, *p_buffer_size, node_size, - shmem); + shmem, + compare); g.sync(); @@ -1094,14 +1096,15 @@ __device__ void PushPartialNode(CG const& g, g.sync(); if (*size > 0) { - MergeAndSort(g, &heap[node_size], + MergeAndSort(g, &heap[node_size], &heap[kPBufferIdx], shmem.A, shmem.B, node_size, *p_buffer_size, node_size, - shmem); + shmem, + compare); g.sync(); CopyPairs(g, heap, shmem.B, *p_buffer_size); @@ -1126,7 +1129,7 @@ __device__ void PushPartialNode(CG const& g, * @param temp_node A temporary array large enough to store sizeof(Pair) * node_size bytes */ -template +template __global__ void PushKernel(OutputIt elements, size_t 
num_elements, Pair *heap, @@ -1134,7 +1137,8 @@ __global__ void PushKernel(OutputIt elements, size_t node_size, int *locks, size_t *p_buffer_size, - int lowest_level_start) { + int lowest_level_start, + Compare const& compare) { extern __shared__ int s[]; @@ -1148,8 +1152,8 @@ __global__ void PushKernel(OutputIt elements, for (size_t i = blockIdx.x * node_size; i + node_size <= num_elements; i += gridDim.x * node_size) { - PushSingleNode(g, elements + i, heap, size, node_size, locks, - lowest_level_start, shmem); + PushSingleNode(g, elements + i, heap, size, node_size, locks, + lowest_level_start, shmem, compare); } // We only need one block for partial insertion @@ -1164,9 +1168,9 @@ __global__ void PushKernel(OutputIt elements, if (first_not_inserted < num_elements) { size_t p_ins_size = num_elements - first_not_inserted; - PushPartialNode(g, elements + first_not_inserted, p_ins_size, + PushPartialNode(g, elements + first_not_inserted, p_ins_size, heap, size, node_size, locks, p_buffer_size, - lowest_level_start, shmem); + lowest_level_start, shmem, compare); } } @@ -1183,8 +1187,8 @@ __global__ void PushKernel(OutputIt elements, * @param temp_node A temporary array large enough to store sizeof(Pair) * node_size bytes */ -template +template __global__ void PushKernelWarp(InputIt elements, size_t num_elements, Pair *heap, @@ -1193,7 +1197,8 @@ __global__ void PushKernelWarp(InputIt elements, int *locks, size_t *p_buffer_size, int lowest_level_start, - int bytes_shmem_per_warp) { + int bytes_shmem_per_warp, + Compare const& compare) { extern __shared__ char sh[]; @@ -1211,8 +1216,8 @@ __global__ void PushKernelWarp(InputIt elements, + blockIdx.x * node_size * (blockDim.x / 32); i + node_size <= num_elements; i += (blockDim.x / 32) * node_size * gridDim.x) { - PushSingleNode(warp, elements + i, heap, size, node_size, locks, - lowest_level_start, shmem); + PushSingleNode(warp, elements + i, heap, size, node_size, locks, + lowest_level_start, shmem, compare); } // We only need one block for partial insertion @@ -1227,9 +1232,9 @@ __global__ void PushKernelWarp(InputIt elements, if (first_not_inserted < num_elements) { size_t p_ins_size = num_elements - first_not_inserted; - PushPartialNode(warp, elements + first_not_inserted, p_ins_size, + PushPartialNode(warp, elements + first_not_inserted, p_ins_size, heap, size, node_size, locks, p_buffer_size, - lowest_level_start, shmem); + lowest_level_start, shmem, compare); } } @@ -1245,7 +1250,7 @@ __global__ void PushKernelWarp(InputIt elements, * @param p_buffer_size Number of pairs in the heap's partial buffer * @param pop_tracker Pointer to an integer in global memory initialized to 0 */ -template +template __global__ void PopKernelWarp(OutputIt elements, size_t num_elements, Pair *heap, @@ -1256,7 +1261,8 @@ __global__ void PopKernelWarp(OutputIt elements, int *pop_tracker, int lowest_level_start, int node_capacity, - int bytes_shmem_per_warp) { + int bytes_shmem_per_warp, + Compare const& compare) { // We use pop_tracker to ensure that each thread block inserts its node // at the correct location in the output array @@ -1274,9 +1280,9 @@ __global__ void PopKernelWarp(OutputIt elements, for (size_t i = warp.meta_group_rank() + (blockDim.x / 32) * blockIdx.x; i < num_elements / node_size; i += gridDim.x * blockDim.x / 32) { - PopSingleNode(warp, elements, heap, size, node_size, locks, + PopSingleNode(warp, elements, heap, size, node_size, locks, p_buffer_size, pop_tracker, lowest_level_start, - node_capacity, shmem); + node_capacity, shmem, 
compare); } AcquireLock(warp, &locks[kRootIdx]); @@ -1294,10 +1300,10 @@ __global__ void PopKernelWarp(OutputIt elements, ReleaseLock(warp, &locks[kRootIdx]); - PopPartialNode(warp, + PopPartialNode(warp, elements + (num_elements / node_size) * node_size, p_del_size, heap, size, node_size, locks, p_buffer_size, - lowest_level_start, node_capacity, shmem); + lowest_level_start, node_capacity, shmem, compare); } else { ReleaseLock(warp, &locks[kRootIdx]); @@ -1316,7 +1322,7 @@ __global__ void PopKernelWarp(OutputIt elements, * @param p_buffer_size Number of pairs in the heap's partial buffer * @param pop_tracker Pointer to an integer in global memory initialized to 0 */ -template +template __global__ void PopKernel(OutputIt elements, size_t num_elements, Pair *heap, @@ -1326,7 +1332,8 @@ __global__ void PopKernel(OutputIt elements, size_t *p_buffer_size, int *pop_tracker, int lowest_level_start, - int node_capacity) { + int node_capacity, + Compare const& compare) { // We use pop_tracker to ensure that each thread block inserts its node // at the correct location in the output array @@ -1339,9 +1346,9 @@ __global__ void PopKernel(OutputIt elements, thread_block g = this_thread_block(); for (size_t i = blockIdx.x; i < num_elements / node_size; i += gridDim.x) { - PopSingleNode(g, elements, heap, size, node_size, locks, + PopSingleNode(g, elements, heap, size, node_size, locks, p_buffer_size, pop_tracker, lowest_level_start, - node_capacity, shmem); + node_capacity, shmem, compare); } AcquireLock(g, &locks[kRootIdx]); @@ -1359,9 +1366,9 @@ __global__ void PopKernel(OutputIt elements, ReleaseLock(g, &locks[kRootIdx]); - PopPartialNode(g, elements + (num_elements / node_size) * node_size, + PopPartialNode(g, elements + (num_elements / node_size) * node_size, p_del_size, heap, size, node_size, locks, p_buffer_size, - lowest_level_start, node_capacity, shmem); + lowest_level_start, node_capacity, shmem, compare); } else { ReleaseLock(g, &locks[kRootIdx]); diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index be82952bf..94fd2f27f 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -5,6 +5,8 @@ #include #include +#include + namespace cuco { /* @@ -46,7 +48,7 @@ namespace cuco { * keys in the queue, otherwise, pop operations yeild the elements * with the largest keys */ -template , typename Allocator = cuco::cuda_allocator> class priority_queue { @@ -197,14 +199,16 @@ class priority_queue { size_t *d_p_buffer_size, int *d_locks, int lowest_level_start, - int node_capacity) + int node_capacity, + Compare const& compare) : node_size_(node_size), d_heap_(d_heap), d_size_(d_size), d_p_buffer_size_(d_p_buffer_size), d_locks_(d_locks), lowest_level_start_(lowest_level_start), - node_capacity_(node_capacity) + node_capacity_(node_capacity), + compare_(compare) { } @@ -217,6 +221,7 @@ class priority_queue { int *d_size_; size_t *d_p_buffer_size_; int *d_locks_; + Compare compare_; }; /* @@ -228,7 +233,8 @@ class priority_queue { */ device_mutable_view get_mutable_device_view() { return device_mutable_view(node_size_, d_heap_, d_size_, d_p_buffer_size_, - d_locks_, lowest_level_start_, node_capacity_); + d_locks_, lowest_level_start_, node_capacity_, + compare_); } private: @@ -254,6 +260,8 @@ class priority_queue { int_allocator_type int_allocator_; pair_allocator_type pair_allocator_; size_t_allocator_type size_t_allocator_; + + Compare compare_{}; }; } diff --git a/tests/priority_queue/priority_queue_test.cu 
b/tests/priority_queue/priority_queue_test.cu index 62c1ace20..5e364a5c8 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -20,8 +20,8 @@ using namespace cuco; // Inserts elements into pq, managing memory allocation // and copying to the device -template -void Insert(priority_queue &pq, +template +void Insert(priority_queue &pq, const std::vector> &elements, bool warp_level = false) { Pair *d_elements; @@ -40,8 +40,8 @@ void Insert(priority_queue &pq, // Deletes num_elements elements from pq and returns them, // managing device memory -template -std::vector> Delete(priority_queue &pq, +template +std::vector> Delete(priority_queue &pq, size_t num_elements, bool warp_level = false) { Pair *d_elements; @@ -723,7 +723,7 @@ TestCase cases[] = { srand(0); - priority_queue pq(num_keys); + priority_queue> pq(num_keys); std::vector std_vec; From 8cf681a7a619730dcca520bad266c12844cd1c28 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Sun, 24 Oct 2021 22:03:29 +0000 Subject: [PATCH 10/55] Accept arbitrary types instead of just pairs --- include/cuco/detail/pq_pair.cuh | 13 ++ include/cuco/detail/priority_queue.inl | 38 ++--- .../cuco/detail/priority_queue_kernels.cuh | 157 +++++++++--------- include/cuco/priority_queue.cuh | 26 +-- tests/priority_queue/priority_queue_test.cu | 74 +++++---- 5 files changed, 161 insertions(+), 147 deletions(-) diff --git a/include/cuco/detail/pq_pair.cuh b/include/cuco/detail/pq_pair.cuh index 5edf31dfa..8208b87b5 100644 --- a/include/cuco/detail/pq_pair.cuh +++ b/include/cuco/detail/pq_pair.cuh @@ -1,5 +1,6 @@ #pragma once + namespace cuco { template @@ -18,4 +19,16 @@ bool operator==(const Pair &a, const Pair &b) { return a.key == b.key && a.value == b.value; } + +template +__device__ __host__ bool operator>(const Pair &a, const Pair &b) { + return a.key > b.key; } + +template +__device__ __host__ bool operator<(const Pair &a, const Pair &b) { + return a.key < b.key; +} + +} + diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index ce20fc7bc..0092c9c91 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -6,14 +6,14 @@ namespace cuco { -template -priority_queue::priority_queue +template +priority_queue::priority_queue (size_t initial_capacity, size_t node_size, Allocator const& allocator) : allocator_{allocator}, int_allocator_{allocator}, - pair_allocator_{allocator}, + t_allocator_{allocator}, size_t_allocator_{allocator} { node_size_ = node_size; @@ -37,7 +37,7 @@ priority_queue::priority_queue CUCO_CUDA_TRY(cudaMemset(d_p_buffer_size_, 0, sizeof(size_t))); - d_heap_ = std::allocator_traits::allocate(pair_allocator_, + d_heap_ = std::allocator_traits::allocate(t_allocator_, node_capacity_ * node_size_ + node_size_); d_locks_ = std::allocator_traits::allocate(int_allocator_, @@ -52,13 +52,13 @@ priority_queue::priority_queue } -template -priority_queue::~priority_queue() { +template +priority_queue::~priority_queue() { std::allocator_traits::deallocate(int_allocator_, d_size_, 1); std::allocator_traits::deallocate(size_t_allocator_, d_p_buffer_size_, 1); - std::allocator_traits::deallocate(pair_allocator_, + std::allocator_traits::deallocate(t_allocator_, d_heap_, node_capacity_ * node_size_ + node_size_); std::allocator_traits::deallocate(int_allocator_, @@ -70,9 +70,9 @@ priority_queue::~priority_queue() { } -template +template template -void priority_queue::push(InputIt first, +void priority_queue::push(InputIt 
first, InputIt last, int block_size, int grid_size, @@ -99,9 +99,9 @@ void priority_queue::push(InputIt first, CUCO_CUDA_TRY(cudaGetLastError()); } -template +template template -void priority_queue::pop(OutputIt first, +void priority_queue::pop(OutputIt first, OutputIt last, int block_size, int grid_size, @@ -131,17 +131,17 @@ void priority_queue::pop(OutputIt first, CUCO_CUDA_TRY(cudaGetLastError()); } -template +template template -__device__ void priority_queue +__device__ void priority_queue ::device_mutable_view::push( CG const& g, InputIt first, InputIt last, void *temp_storage) { - SharedMemoryLayout shmem = - GetSharedMemoryLayout((int*)temp_storage, + SharedMemoryLayout shmem = + GetSharedMemoryLayout((int*)temp_storage, g.size(), node_size_); if (last - first == node_size_) { PushSingleNode(g, first, d_heap_, d_size_, node_size_, @@ -154,9 +154,9 @@ __device__ void priority_queue } } -template +template template -__device__ void priority_queue +__device__ void priority_queue ::device_mutable_view::pop( CG const& g, OutputIt first, @@ -164,8 +164,8 @@ __device__ void priority_queue void *temp_storage) { int pop_tracker = 0; - SharedMemoryLayout shmem = - GetSharedMemoryLayout((int*)temp_storage, + SharedMemoryLayout shmem = + GetSharedMemoryLayout((int*)temp_storage, g.size(), node_size_); if (last - first == node_size_) { diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index 4485d4f89..f33ba8c13 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -16,11 +16,11 @@ constexpr int kRootIdx = 1; * queue's kernels and functions. * Ideally, this temp storage is in shared memory */ -template +template struct SharedMemoryLayout { int *intersections; - Pair *A; - Pair *B; + T *A; + T *B; }; /* @@ -33,13 +33,13 @@ struct SharedMemoryLayout { * @param node_size Size of the nodes in this priority queue * @returns The memory layout for the given group dimension and node size */ -template -__device__ SharedMemoryLayout GetSharedMemoryLayout( +template +__device__ SharedMemoryLayout GetSharedMemoryLayout( int *s, int dim, size_t node_size) { - SharedMemoryLayout result; + SharedMemoryLayout result; result.intersections = s; - result.A = (Pair*)(s + 2 * (dim + 1)); + result.A = (T*)(s + 2 * (dim + 1)); result.B = result.A + node_size; return result; } @@ -120,14 +120,14 @@ __device__ void CopyPairs(CG const& g, InputIt1 dst_start, * @param node_size The size of arrays a, b, lo, and hi * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void MergeAndSort(CG const& g, - Pair *a, - Pair *b, - Pair *lo, - Pair *hi, + T *a, + T *b, + T *lo, + T *hi, size_t node_size, - SharedMemoryLayout shmem, + SharedMemoryLayout shmem, Compare const& compare) { MergeAndSort(g, a, b, lo, hi, node_size, node_size, node_size, shmem, compare); @@ -155,23 +155,23 @@ __device__ void MergeAndSort(CG const& g, * hi * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void MergeAndSort(CG const& g, - Pair *a, - Pair *b, - Pair *lo, - Pair *hi, + T *a, + T *b, + T *lo, + T *hi, size_t num_elements_a, size_t num_elements_b, size_t node_size, - SharedMemoryLayout shmem, + SharedMemoryLayout shmem, Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); if (num_elements_a == node_size && - compare(a[node_size - 1].key, b[0].key)) { + compare(a[node_size - 1], b[0])) { CopyPairs(g, lo, a, 
num_elements_a); @@ -180,7 +180,7 @@ __device__ void MergeAndSort(CG const& g, } if (num_elements_b == node_size && - compare(b[node_size - 1].key, a[0].key)) { + compare(b[node_size - 1], a[0])) { CopyPairs(g, hi, a, num_elements_a); @@ -226,7 +226,7 @@ __device__ void MergeAndSort(CG const& g, if (i >= num_elements_a) { leftmost_zero = i; - } else if (j >= num_elements_b || compare(a[i].key, b[j].key)) { + } else if (j >= num_elements_b || compare(a[i], b[j])) { rightmost_one = i; } else { leftmost_zero = i; @@ -256,8 +256,8 @@ __device__ void MergeAndSort(CG const& g, // Merge our partition into the output arrays while (i < i_max && j < j_max) { - Pair next_element; - if (compare(a[i].key, b[j].key)) { + T next_element; + if (compare(a[i], b[j])) { next_element = a[i]; i++; } else { @@ -308,10 +308,10 @@ __device__ void MergeAndSort(CG const& g, * @param temp A temporary array containing space for at least the nearest * power of two greater than len pairs */ -template -__device__ void PBSort(CG const& g, Pair *start, size_t len, +template +__device__ void PBSort(CG const& g, T *start, size_t len, size_t node_size, - Pair *temp, + T *temp, Compare const& compare) { @@ -334,7 +334,7 @@ __device__ void PBSort(CG const& g, Pair *start, size_t len, int right = left + jump; if ((i / start_jump) % 2 == 0) { if (!mask[left] || (mask[right] && - !compare(start[left].key, start[right].key))) { + !compare(start[left], start[right]))) { auto temp = start[left]; start[left] = start[right]; start[right] = temp; @@ -345,7 +345,7 @@ __device__ void PBSort(CG const& g, Pair *start, size_t len, } } else { if (!mask[right] || (mask[left] - && compare(start[left].key, start[right].key))) { + && compare(start[left], start[right]))) { auto temp = start[left]; start[left] = start[right]; start[right] = temp; @@ -366,7 +366,7 @@ __device__ void PBSort(CG const& g, Pair *start, size_t len, int left = (i / jump) * jump * 2 + i % jump; int right = left + jump; if (!mask[left] || (mask[right] - && !compare(start[left].key, start[right].key))) { + && !compare(start[left], start[right]))) { auto temp = start[left]; start[left] = start[right]; start[right] = temp; @@ -492,15 +492,15 @@ __device__ int RightChild(int x, int lowest_level_start) { * heap * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void Swim(CG const& g, int cur_node, - Pair *heap, + T *heap, int *size, size_t node_size, int *locks, int lowest_level_start, - SharedMemoryLayout shmem, + SharedMemoryLayout shmem, Compare const& compare) { int lane = g.thread_rank(); @@ -514,8 +514,8 @@ __device__ void Swim(CG const& g, // If the heap property is already satisfied for this node and its // parent we are done - if (!compare(heap[cur_node * node_size].key, - heap[parent * node_size + node_size - 1].key)) { + if (!compare(heap[cur_node * node_size], + heap[parent * node_size + node_size - 1])) { ReleaseLock(g, &(locks[parent])); break; } @@ -556,16 +556,16 @@ __device__ void Swim(CG const& g, * @param node_capacity Max capacity of the heap in nodes * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void Sink(CG const& g, - Pair *heap, + T *heap, int *size, size_t node_size, int *locks, size_t *p_buffer_size, int lowest_level_start, int node_capacity, - SharedMemoryLayout shmem, + SharedMemoryLayout shmem, Compare const& compare) { size_t cur = kRootIdx; @@ -605,8 +605,8 @@ __device__ void Sink(CG const& g, // In order to ensure we preserve the heap 
property, // we put the largest node_size elements in the child // that previously contained the largest element - if (!compare(heap[(left+1) * node_size - 1].key, - heap[(right+1) * node_size - 1].key)) { + if (!compare(heap[(left+1) * node_size - 1], + heap[(right+1) * node_size - 1])) { hi = left; lo = right; } else { @@ -616,8 +616,8 @@ __device__ void Sink(CG const& g, // Skip the merge and sort if the nodes are already correctly // sorted - if (!compare(heap[(lo+1) * node_size - 1].key, - heap[hi * node_size].key)) { + if (!compare(heap[(lo+1) * node_size - 1], + heap[hi * node_size])) { MergeAndSort(g, &heap[left * node_size], &heap[right * node_size], shmem.A, @@ -647,8 +647,8 @@ __device__ void Sink(CG const& g, // // TODO: can this ever even occur? In the paper this is done because // a max placeholder value is used to indicate unused nodes in the heap - if (!compare(heap[lo * node_size].key, - heap[(cur + 1) * node_size - 1].key)) { + if (!compare(heap[lo * node_size], + heap[(cur + 1) * node_size - 1])) { ReleaseLock(g, &locks[lo]); ReleaseLock(g, &locks[cur]); return; @@ -692,16 +692,15 @@ __device__ void Sink(CG const& g, * heap * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void PushSingleNode(CG const& g, InputIt elements, - Pair *heap, + T *heap, int *size, size_t node_size, int *locks, int lowest_level_start, - SharedMemoryLayout shmem, + SharedMemoryLayout shmem, Compare const& compare) { int lane = g.thread_rank(); @@ -750,11 +749,10 @@ __device__ void PushSingleNode(CG const& g, * @param node_capacity Maximum capacity of the heap in nodes * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void PopSingleNode(CG const& g, OutputIt elements, - Pair *heap, + T *heap, int *size, size_t node_size, int *locks, @@ -762,7 +760,7 @@ __device__ void PopSingleNode(CG const& g, int *pop_tracker, int lowest_level_start, int node_capacity, - SharedMemoryLayout shmem, + SharedMemoryLayout shmem, Compare const& compare) { int lane = g.thread_rank(); @@ -850,19 +848,18 @@ __device__ void PopSingleNode(CG const& g, * @param node_capacity Maximum capacity of the heap in nodes * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void PopPartialNode(CG const& g, InputIt elements, size_t num_elements, - Pair *heap, + T *heap, int *size, size_t node_size, int *locks, size_t *p_buffer_size, int lowest_level_start, int node_capacity, - SharedMemoryLayout shmem, + SharedMemoryLayout shmem, Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); @@ -928,7 +925,7 @@ __device__ void PopPartialNode(CG const& g, MergeAndSort(g, &heap[kPBufferIdx], &heap[kRootIdx * node_size] + num_elements, shmem.A, - (Pair*)nullptr, + (T*)nullptr, *p_buffer_size, node_size - num_elements, node_size, @@ -1001,18 +998,17 @@ __device__ void PopPartialNode(CG const& g, * heap * @param shmem The shared memory layout for this cooperative group */ -template +template __device__ void PushPartialNode(CG const& g, InputIt elements, size_t p_ins_size, - Pair *heap, + T *heap, int *size, size_t node_size, int *locks, size_t *p_buffer_size, int lowest_level_start, - SharedMemoryLayout shmem, + SharedMemoryLayout shmem, Compare const& compare) { int lane = g.thread_rank(); @@ -1076,7 +1072,7 @@ __device__ void PushPartialNode(CG const& g, MergeAndSort(g, shmem.B, &heap[kPBufferIdx], shmem.A, - (Pair *)nullptr, + (T*)nullptr, p_ins_size, *p_buffer_size, node_size, 
@@ -1127,22 +1123,22 @@ __device__ void PushPartialNode(CG const& g, * @param locks Array of locks, one for each node in the heap * @param p_buffer_size Number of pairs in the heap's partial buffer * @param temp_node A temporary array large enough to store - sizeof(Pair) * node_size bytes + sizeof(T) * node_size bytes */ -template +template __global__ void PushKernel(OutputIt elements, size_t num_elements, - Pair *heap, + T *heap, int *size, size_t node_size, int *locks, size_t *p_buffer_size, int lowest_level_start, - Compare const& compare) { + Compare compare) { extern __shared__ int s[]; - SharedMemoryLayout shmem = GetSharedMemoryLayout(s, + SharedMemoryLayout shmem = GetSharedMemoryLayout(s, blockDim.x, node_size); // We push as many elements as possible as full nodes, @@ -1185,20 +1181,19 @@ __global__ void PushKernel(OutputIt elements, * @param locks Array of locks, one for each node in the heap * @param p_buffer_size Number of pairs in the heap's partial buffer * @param temp_node A temporary array large enough to store - sizeof(Pair) * node_size bytes + sizeof(T) * node_size bytes */ -template +template __global__ void PushKernelWarp(InputIt elements, size_t num_elements, - Pair *heap, + T *heap, int *size, size_t node_size, int *locks, size_t *p_buffer_size, int lowest_level_start, int bytes_shmem_per_warp, - Compare const& compare) { + Compare compare) { extern __shared__ char sh[]; @@ -1208,7 +1203,7 @@ __global__ void PushKernelWarp(InputIt elements, thread_block block = this_thread_block(); thread_block_tile<32> warp = tiled_partition<32>(block); - SharedMemoryLayout shmem = GetSharedMemoryLayout( + SharedMemoryLayout shmem = GetSharedMemoryLayout( (int*)(sh + bytes_shmem_per_warp * warp.meta_group_rank()), 32, node_size); @@ -1250,10 +1245,10 @@ __global__ void PushKernelWarp(InputIt elements, * @param p_buffer_size Number of pairs in the heap's partial buffer * @param pop_tracker Pointer to an integer in global memory initialized to 0 */ -template +template __global__ void PopKernelWarp(OutputIt elements, size_t num_elements, - Pair *heap, + T *heap, int *size, size_t node_size, int *locks, @@ -1262,7 +1257,7 @@ __global__ void PopKernelWarp(OutputIt elements, int lowest_level_start, int node_capacity, int bytes_shmem_per_warp, - Compare const& compare) { + Compare compare) { // We use pop_tracker to ensure that each thread block inserts its node // at the correct location in the output array @@ -1273,7 +1268,7 @@ __global__ void PopKernelWarp(OutputIt elements, thread_block block = this_thread_block(); thread_block_tile<32> warp = tiled_partition<32>(block); - SharedMemoryLayout shmem = GetSharedMemoryLayout( + SharedMemoryLayout shmem = GetSharedMemoryLayout( (int*)(sh + bytes_shmem_per_warp * warp.meta_group_rank()), 32, node_size); @@ -1322,10 +1317,10 @@ __global__ void PopKernelWarp(OutputIt elements, * @param p_buffer_size Number of pairs in the heap's partial buffer * @param pop_tracker Pointer to an integer in global memory initialized to 0 */ -template +template __global__ void PopKernel(OutputIt elements, size_t num_elements, - Pair *heap, + T *heap, int *size, size_t node_size, int *locks, @@ -1333,7 +1328,7 @@ __global__ void PopKernel(OutputIt elements, int *pop_tracker, int lowest_level_start, int node_capacity, - Compare const& compare) { + Compare compare) { // We use pop_tracker to ensure that each thread block inserts its node // at the correct location in the output array @@ -1341,7 +1336,7 @@ __global__ void PopKernel(OutputIt elements, extern 
__shared__ int s[]; - SharedMemoryLayout shmem = GetSharedMemoryLayout(s, + SharedMemoryLayout shmem = GetSharedMemoryLayout(s, blockDim.x, node_size); thread_block g = this_thread_block(); diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 94fd2f27f..160db412d 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -48,15 +48,15 @@ namespace cuco { * keys in the queue, otherwise, pop operations yeild the elements * with the largest keys */ -template , +template , typename Allocator = cuco::cuda_allocator> class priority_queue { using int_allocator_type = typename std::allocator_traits ::rebind_alloc; - using pair_allocator_type = typename std::allocator_traits - ::rebind_alloc>; + using t_allocator_type = typename std::allocator_traits + ::rebind_alloc; using size_t_allocator_type = typename std::allocator_traits ::rebind_alloc; @@ -76,7 +76,7 @@ class priority_queue { * @brief Push elements into the priority queue * * @tparam InputIt Device accessible input iterator whose `value_type` - * can be converted to Pair + * can be converted to T * @param first Beginning of the sequence of elements * @param last End of the sequence of elements * @param num_elements Number of elements to add to the queue @@ -98,7 +98,7 @@ class priority_queue { * highest (when Max == true) elements * * @tparam OutputIt Device accessible output iterator whose `value_type` - * can be converted to Pair + * can be converted to T * @param first Beginning of the sequence of output elements * @param last End of the sequence of output elements * @param num_elements The number of elements to be removed @@ -124,7 +124,7 @@ class priority_queue { */ int get_shmem_size(int block_size) { int intersection_bytes = 2 * (block_size + 1) * sizeof(int); - int node_bytes = node_size_ * sizeof(Pair); + int node_bytes = node_size_ * sizeof(T); return intersection_bytes + 2 * node_bytes; } @@ -141,7 +141,7 @@ class priority_queue { * * @tparam CG Cooperative Group type * @tparam Device accessible iterator whose `value_type` is convertible - * to Pair + * to T * @param g The cooperative group that will perform the operation * @param first The beginning of the sequence of elements to insert * @param last The end of the sequence of elements to insert @@ -157,7 +157,7 @@ class priority_queue { * * @tparam CG Cooperative Group type * @tparam Device accessible iterator whose `value_type` is convertible to - Pair + T * @param g The cooperative group that will perform the operation * @param first The beginning of the sequence of elements to output into * @param last The end of the sequence of elements to output into @@ -189,12 +189,12 @@ class priority_queue { */ __device__ int get_shmem_size(int block_size) { int intersection_bytes = 2 * (block_size + 1) * sizeof(int); - int node_bytes = node_size_ * sizeof(Pair); + int node_bytes = node_size_ * sizeof(T); return intersection_bytes + 2 * node_bytes; } __host__ __device__ device_mutable_view(size_t node_size, - Pair *d_heap, + T *d_heap, int *d_size, size_t *d_p_buffer_size, int *d_locks, @@ -217,7 +217,7 @@ class priority_queue { int lowest_level_start_; int node_capacity_; - Pair *d_heap_; + T *d_heap_; int *d_size_; size_t *d_p_buffer_size_; int *d_locks_; @@ -243,7 +243,7 @@ class priority_queue { /// heap's lowest level int node_capacity_; ///< Capacity of the heap in nodes - Pair *d_heap_; ///< Pointer to an array of nodes, the 0th node + T *d_heap_; ///< Pointer to an array of nodes, the 0th node /// being the heap's 
partial buffer, and nodes /// 1..(node_capacity_) being the heap, where the /// 1st node is the root @@ -258,7 +258,7 @@ class priority_queue { /// popped node Allocator allocator_; int_allocator_type int_allocator_; - pair_allocator_type pair_allocator_; + t_allocator_type t_allocator_; size_t_allocator_type size_t_allocator_; Compare compare_{}; diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 5e364a5c8..86517df2c 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -21,7 +21,7 @@ using namespace cuco; // Inserts elements into pq, managing memory allocation // and copying to the device template -void Insert(priority_queue &pq, +void Insert(priority_queue, Compare> &pq, const std::vector> &elements, bool warp_level = false) { Pair *d_elements; @@ -41,7 +41,7 @@ void Insert(priority_queue &pq, // Deletes num_elements elements from pq and returns them, // managing device memory template -std::vector> Delete(priority_queue &pq, +std::vector> Delete(priority_queue, Compare> &pq, size_t num_elements, bool warp_level = false) { Pair *d_elements; @@ -64,7 +64,7 @@ std::vector> Delete(priority_queue &pq, template __global__ void DeviceAPIInsert( - typename priority_queue::device_mutable_view view, + typename priority_queue>::device_mutable_view view, Pair *elements, size_t num_elements) { extern __shared__ int shmem[]; @@ -79,7 +79,7 @@ __global__ void DeviceAPIInsert( template __global__ void DeviceAPIDelete( - typename priority_queue::device_mutable_view view, + typename priority_queue>::device_mutable_view view, Pair *out, size_t num_elements) { @@ -94,7 +94,7 @@ __global__ void DeviceAPIDelete( template __global__ void DeviceAPIInsertWarp( - typename priority_queue::device_mutable_view view, + typename priority_queue>::device_mutable_view view, Pair *elements, size_t num_elements) { extern __shared__ int shmem[]; @@ -113,7 +113,7 @@ __global__ void DeviceAPIInsertWarp( template __global__ void DeviceAPIDeleteWarp( - typename priority_queue::device_mutable_view view, + typename priority_queue>::device_mutable_view view, Pair *out, size_t num_elements) { extern __shared__ int shmem[]; @@ -141,7 +141,7 @@ using FloatIntVector = std::vector>; TestCase cases[] = { {"test_insert_1", []() { - priority_queue pq(1000); + priority_queue> pq(1000); IntIntVector result = {{1, 1}}; Insert(pq, {{1, 1}}); return Delete(pq, 1) == result; @@ -149,14 +149,15 @@ TestCase cases[] = { }, {"test_insert_descending_seq", []() { - const int kNodeSize = 1024; + const int kNodeSize = 8; srand(0); // Choose some reasonably large number of elements int count = rand() % 1000000 + 10000; + //int count = 9; - priority_queue pq(count, kNodeSize); + priority_queue> pq(count, kNodeSize); IntIntVector input; @@ -174,7 +175,11 @@ TestCase cases[] = { Insert(pq, {e}); } - return Delete(pq, count) == result; + auto out = Delete(pq, count); + //for (int i = 0; i < out.size(); i++) { + // std::cout << out[i].key << " " << result[i].key << std::endl; + //} + return out == result; } }, @@ -184,7 +189,7 @@ TestCase cases[] = { // Choose some number of elements less than the node size int count = rand() % kNodeSize; - priority_queue pq(count, kNodeSize); + priority_queue> pq(count, kNodeSize); IntIntVector input; @@ -227,7 +232,7 @@ TestCase cases[] = { // buffer int count = 600; - priority_queue pq(count * 2, kNodeSize); + priority_queue> pq(count * 2, kNodeSize); IntIntVector input; IntIntVector input2; @@ -269,7 +274,7 
@@ TestCase cases[] = { // individual elements int count = rand() % kNodeSize; - priority_queue pq(count, kNodeSize); + priority_queue> pq(count, kNodeSize); IntIntVector input; @@ -296,7 +301,7 @@ TestCase cases[] = { // Choose some reasonably large number of keys int count = rand() % 1000000 + 10000; - priority_queue pq(count, kNodeSize); + priority_queue> pq(count, kNodeSize); IntIntVector input; @@ -340,7 +345,7 @@ TestCase cases[] = { // tests full node insertion int count = rand() % kNodeSize * 100 + 10 * kNodeSize; - priority_queue pq(count, kNodeSize); + priority_queue> pq(count, kNodeSize); IntIntVector input; @@ -381,7 +386,7 @@ TestCase cases[] = { // Choose some reasonably large number of nodes const int kNodes = rand() % 1000 + 50; - priority_queue pq(kNodeSize * kNodes, kNodeSize); + priority_queue> pq(kNodeSize * kNodes, kNodeSize); for (int i = kNodes - 1; i >= 0; i--) { @@ -415,7 +420,7 @@ TestCase cases[] = { const int kNodeSize = 1024; // Choose some reasonably large number of nodes const int kNodes = rand() % 1000 + 50; - priority_queue pq(kNodeSize * kNodes, kNodeSize); + priority_queue> pq(kNodeSize * kNodes, kNodeSize); for (int i = kNodes - 1; i >= 0; i--) { @@ -456,7 +461,7 @@ TestCase cases[] = { // Choose some reasonably large number of keys int count = rand() % 100000 + 10000; - priority_queue pq(count); + priority_queue> pq(count); IntLongVector input; @@ -497,7 +502,7 @@ TestCase cases[] = { // Choose some reasonably large number of keys int count = rand() % 100000 + 10000; - priority_queue pq(count); + priority_queue> pq(count); FloatIntVector input; @@ -537,7 +542,7 @@ TestCase cases[] = { // Choose some reasonably large number of keys int count = rand() % 100000 + 10000; - priority_queue pq(count); + priority_queue> pq(count); IntIntVector input(count); for (int i = 0; i < count; i++) { @@ -571,7 +576,7 @@ TestCase cases[] = { // Choose some reasonably large number of keys int count = rand() % 100000 + 10000; - priority_queue pq(count); + priority_queue> pq(count); // Create some elements with negative and very large // and very small keys @@ -614,7 +619,7 @@ TestCase cases[] = { srand(0); - priority_queue pq(num_keys); + priority_queue> pq(num_keys); std::vector std_vec; @@ -650,7 +655,7 @@ TestCase cases[] = { srand(0); - priority_queue pq(num_keys); + priority_queue> pq(num_keys); std::vector std_vec; @@ -687,7 +692,7 @@ TestCase cases[] = { srand(0); - priority_queue pq(num_keys, kNodeSize); + priority_queue> pq(num_keys, kNodeSize); std::vector std_vec; @@ -723,7 +728,8 @@ TestCase cases[] = { srand(0); - priority_queue> pq(num_keys); + priority_queue, + thrust::greater>> pq(num_keys); std::vector std_vec; @@ -760,7 +766,7 @@ TestCase cases[] = { srand(0); - priority_queue pq(num_keys, kNodeSize); + priority_queue> pq(num_keys, kNodeSize); std::vector std_vec; @@ -797,7 +803,7 @@ TestCase cases[] = { srand(0); - priority_queue pq(num_keys); + priority_queue> pq(num_keys); std::vector std_vec; @@ -834,7 +840,7 @@ TestCase cases[] = { srand(0); - priority_queue pq(num_keys); + priority_queue> pq(num_keys); std::vector std_vec; @@ -876,7 +882,7 @@ TestCase cases[] = { srand(0); - priority_queue pq(num_keys); + priority_queue> pq(num_keys); std::vector std_vec; IntIntVector input; @@ -932,7 +938,7 @@ TestCase cases[] = { srand(0); - priority_queue pq(num_keys); + priority_queue> pq(num_keys); IntIntVector std_vec; @@ -993,7 +999,7 @@ TestCase cases[] = { srand(0); - priority_queue pq(num_keys, kNodeSize); + priority_queue> pq(num_keys, kNodeSize); 
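
For reference, a minimal host-side usage sketch of the comparator-parameterized priority_queue introduced by this patch, written against the iterator-based push/pop API exercised by the tests and benchmark in this series; the element type KV and the comparator KVLess are illustrative names only and are not part of the change:

#include <cuco/priority_queue.cuh>

#include <thrust/copy.h>
#include <thrust/device_vector.h>

#include <vector>

// Element type ordered by key; KVLess plays the role of the queue's Compare parameter.
struct KV {
  int key;
  int value;
};

struct KVLess {
  __host__ __device__ bool operator()(const KV& a, const KV& b) const { return a.key < b.key; }
};

int main() {
  std::vector<KV> h_elems{{3, 30}, {1, 10}, {2, 20}};
  thrust::device_vector<KV> d_elems(h_elems.begin(), h_elems.end());

  // Capacity for 1024 elements, default node size and allocator.
  cuco::priority_queue<KV, KVLess> pq(1024);

  pq.push(d_elems.begin(), d_elems.end());  // bulk insert from device memory
  pq.pop(d_elems.begin(), d_elems.end());   // pop the three highest-priority (smallest-key) elements
  cudaDeviceSynchronize();

  thrust::copy(d_elems.begin(), d_elems.end(), h_elems.begin());
  return 0;
}
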
IntIntVector std_vec; @@ -1053,7 +1059,7 @@ TestCase cases[] = { srand(0); - priority_queue pq(num_keys); + priority_queue> pq(num_keys); IntIntVector std_vec; @@ -1114,7 +1120,7 @@ TestCase cases[] = { srand(0); - priority_queue pq(num_keys, kNodeSize); + priority_queue> pq(num_keys, kNodeSize); IntIntVector std_vec; @@ -1174,7 +1180,7 @@ TestCase cases[] = { srand(0); - priority_queue pq(num_keys); + priority_queue> pq(num_keys); std::vector std_vec; From 8485bece591d6193a77ab97415a6016d540bd775 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Tue, 2 Nov 2021 23:39:50 +0000 Subject: [PATCH 11/55] Remove pq_pair.h --- .../cuco/detail/priority_queue_kernels.cuh | 1 - include/cuco/priority_queue.cuh | 1 - tests/priority_queue/priority_queue_test.cu | 27 +++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index f33ba8c13..aa2bbc26f 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -1,6 +1,5 @@ #pragma once -#include #include #include diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 160db412d..3f4a3cb5a 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -2,7 +2,6 @@ #include #include -#include #include #include diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 86517df2c..9ce393735 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -15,6 +15,33 @@ #include +template +struct Pair { + Key key; + Value value; +}; + +/* +* Check if two Pairs have the same key and value +* @param a The first Pair +* @param b The second Pair +*/ +template +bool operator==(const Pair &a, const Pair &b) { + return a.key == b.key && a.value == b.value; +} + + +template +__device__ __host__ bool operator>(const Pair &a, const Pair &b) { + return a.key > b.key; +} + +template +__device__ __host__ bool operator<(const Pair &a, const Pair &b) { + return a.key < b.key; +} + using namespace cooperative_groups; using namespace cuco; From da608cc91b5de143e9fe25a9aaa40d59b32f9677 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Tue, 2 Nov 2021 23:40:16 +0000 Subject: [PATCH 12/55] Start porting priority queue benchmark to gbenchmark --- benchmarks/CMakeLists.txt | 6 +- .../priority_queue/priority_queue_bench.cu | 213 +++++------------- include/cuco/detail/pq_pair.cuh | 34 --- 3 files changed, 55 insertions(+), 198 deletions(-) delete mode 100644 include/cuco/detail/pq_pair.cuh diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 70e7084ba..adb21a117 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -51,6 +51,6 @@ ConfigureBench(STATIC_MAP_BENCH "${STATIC_MAP_BENCH_SRC}") set(RBK_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/reduce_by_key/reduce_by_key.cu") ConfigureBench(RBK_BENCH "${RBK_BENCH_SRC}") -#TODO: Port priority_queue benchmark to google benchmark -add_executable(PRIORITY_QUEUE_BENCH "${CMAKE_CURRENT_SOURCE_DIR}/priority_queue/priority_queue_bench.cu") -target_link_libraries(PRIORITY_QUEUE_BENCH cuco) +################################################################################################### +set(PRIORITY_QUEUE_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/priority_queue/priority_queue_bench.cu") +ConfigureBench(PRIORITY_QUEUE_BENCH "${PRIORITY_QUEUE_BENCH_SRC}") diff --git 
a/benchmarks/priority_queue/priority_queue_bench.cu b/benchmarks/priority_queue/priority_queue_bench.cu index 9d85b5f27..1e41031cd 100644 --- a/benchmarks/priority_queue/priority_queue_bench.cu +++ b/benchmarks/priority_queue/priority_queue_bench.cu @@ -3,187 +3,78 @@ #include #include +#include + #include #include #include +#include + +#include using namespace cuco; -template -__global__ void DeviceAPIInsert( - typename priority_queue::device_mutable_view view, - Pair *elements, - size_t num_elements) { - extern __shared__ int shmem[]; - thread_block g = this_thread_block(); - for (size_t i = blockIdx.x * view.get_node_size(); - i < num_elements; i += gridDim.x * view.get_node_size()) { - view.push(g, elements + i, - elements + i + min(view.get_node_size(), num_elements - i), - shmem); +template +struct pair_less { + __host__ __device__ bool operator()(const T& a, const T& b) const { + return a.first < b.first; } -} +}; -template -__global__ void DeviceAPIDelete( - typename priority_queue::device_mutable_view view, - Pair *out, - size_t num_elements) { +constexpr int NUM_KEYS = 128e6; - extern __shared__ int shmem[]; - thread_block g = this_thread_block(); - for (size_t i = blockIdx.x * view.get_node_size(); - i < num_elements; i += gridDim.x * view.get_node_size()) { - view.pop(g, out + i, - out + i + min(view.get_node_size(), num_elements - i), shmem); +template +static void BM_insert(::benchmark::State& state) +{ + srand(0); + for (auto _ : state) { + state.PauseTiming(); + priority_queue, pair_less>> pq(NUM_KEYS); + std::vector> h_pairs(NUM_KEYS); + for (auto &p : h_pairs) { + p = {rand(), rand()}; + } + thrust::device_vector> d_pairs(h_pairs); + state.ResumeTiming(); + pq.push(d_pairs.begin(), d_pairs.end()); + cudaDeviceSynchronize(); } + } -// Use CUDA events to time the code in the lambda function -template -float TimeCode(F f) { - cudaEvent_t t1; - CUCO_CUDA_TRY(cudaEventCreate(&t1)); - - cudaEvent_t t2; - CUCO_CUDA_TRY(cudaEventCreate(&t2)); - - CUCO_CUDA_TRY(cudaEventRecord(t1)); - f(); - CUCO_CUDA_TRY(cudaEventRecord(t2)); - - CUCO_CUDA_TRY(cudaEventSynchronize(t1)); - CUCO_CUDA_TRY(cudaEventSynchronize(t2)); - - float result; - CUCO_CUDA_TRY(cudaEventElapsedTime(&result, t1, t2)); - return result; -} - -// Time the insertion of the num_keys elements at d_elements into pq in ms -float TimeInsert(priority_queue &pq, - Pair *d_elements, - size_t num_keys) { - return TimeCode([&]() { - pq.push(d_elements, d_elements + num_keys); - }); -} - -// Time insert of the num_keys elements with the device API at d_elements -// into pq in ms -float TimeInsertDeviceAPI(priority_queue &pq, - Pair *d_elements, - size_t num_keys) { - return TimeCode([&]() { - DeviceAPIInsert<<<64000, 256, pq.get_shmem_size(256)>>> - (pq.get_mutable_device_view(), d_elements, num_keys); - }); -} - -// Time the deletion of num_keys elements from pq in ms -float TimeDeleteDeviceAPI(priority_queue &pq, - Pair *d_elements, - size_t num_keys) { - return TimeCode([&]() { - DeviceAPIDelete<<<32000, 512, pq.get_shmem_size(512)>>> - (pq.get_mutable_device_view(), d_elements, num_keys); - }); -} - -// Time the deletion of num_keys elements from pq in ms -float TimeDelete(priority_queue &pq, - Pair *d_elements, - size_t num_keys) { - return TimeCode([&]() { - pq.pop(d_elements, d_elements + num_keys); - }); -} - -// Follow the first experiment in the paper, -// inserting 512 million 4-byte keys and then deleting them all -// Repeat in ascending, descending and random key order -void InsertThenDelete() { - - std::cout << 
"==Insert then delete==" << std::endl; - - size_t num_keys = 512e6; - - std::cout << num_keys << " keys" << std::endl; - - std::cout << "Order\t\tInsertion (ms)\t\tDeletion (ms)" << std::endl; - - // Allocate GPU memory to store the keys that will be inserted - Pair *d_elements; - size_t num_bytes = num_keys * sizeof(Pair); - CUCO_CUDA_TRY(cudaMalloc((void**)&d_elements, num_bytes)); - - priority_queue pq(num_keys); - - // Ascending - std::vector> ascending(num_keys); - - for (uint32_t i = 0; i < num_keys; i++) { - ascending[i] = {i, i}; - } - - CUCO_CUDA_TRY(cudaMemcpy(d_elements, &ascending[0], - num_bytes, cudaMemcpyHostToDevice)); - - auto time_elapsed_insert = TimeInsert(pq, d_elements, num_keys); - auto time_elapsed_delete = TimeDelete(pq, d_elements, num_keys); - - std::cout << "Ascend\t\t" << time_elapsed_insert << "\t\t" - << time_elapsed_delete << std::endl; - - // Descending - std::vector> descending(num_keys); - - for (uint32_t i = 0; i < num_keys; i++) { - descending[num_keys - i - 1] = {i, i}; - } - - CUCO_CUDA_TRY(cudaMemcpy(d_elements, &descending[0], - num_bytes, cudaMemcpyHostToDevice)); - - time_elapsed_insert = TimeInsert(pq, d_elements, num_keys); - time_elapsed_delete = TimeDelete(pq, d_elements, num_keys); - - std::cout << "Descend\t\t" << time_elapsed_insert << "\t\t" - << time_elapsed_delete << std::endl; - - // Random - std::vector> random(num_keys); - - for (uint32_t i = 0; i < num_keys; i++) { - random[i] = {(uint32_t)rand(), i}; +template +static void BM_delete(::benchmark::State& state) +{ + srand(0); + for (auto _ : state) { + state.PauseTiming(); + priority_queue, pair_less>> pq(NUM_KEYS); + std::vector> h_pairs(NUM_KEYS); + for (auto &p : h_pairs) { + p = {rand(), rand()}; + } + thrust::device_vector> d_pairs(h_pairs); + pq.push(d_pairs.begin(), d_pairs.end()); + cudaDeviceSynchronize(); + state.ResumeTiming(); + pq.pop(d_pairs.begin(), d_pairs.end()); + cudaDeviceSynchronize(); } + +} - CUCO_CUDA_TRY(cudaMemcpy(d_elements, &random[0], - num_bytes, cudaMemcpyHostToDevice)); - - time_elapsed_insert = TimeInsert(pq, d_elements, num_keys); - time_elapsed_delete = TimeDelete(pq, d_elements, num_keys); - - std::cout << "Random\t\t" << time_elapsed_insert << "\t\t" - << time_elapsed_delete << std::endl; - - CUCO_CUDA_TRY(cudaMemcpy(d_elements, &random[0], - num_bytes, cudaMemcpyHostToDevice)); - - time_elapsed_insert = TimeInsertDeviceAPI(pq, d_elements, num_keys); - time_elapsed_delete = TimeDeleteDeviceAPI(pq, d_elements, num_keys); - - std::cout << "Random Dev. 
API\t\t" << time_elapsed_insert << "\t\t" - << time_elapsed_delete << std::endl; +BENCHMARK_TEMPLATE(BM_insert, int, int) + ->Unit(benchmark::kMillisecond); - CUCO_CUDA_TRY(cudaFree(d_elements)); -} +BENCHMARK_TEMPLATE(BM_delete, int, int) + ->Unit(benchmark::kMillisecond); +/* int main() { InsertThenDelete(); return 0; -} +}*/ diff --git a/include/cuco/detail/pq_pair.cuh b/include/cuco/detail/pq_pair.cuh deleted file mode 100644 index 8208b87b5..000000000 --- a/include/cuco/detail/pq_pair.cuh +++ /dev/null @@ -1,34 +0,0 @@ -#pragma once - - -namespace cuco { - -template -struct Pair { - Key key; - Value value; -}; - -/* -* Check if two Pairs have the same key and value -* @param a The first pair -* @param b The second pair -*/ -template -bool operator==(const Pair &a, const Pair &b) { - return a.key == b.key && a.value == b.value; -} - - -template -__device__ __host__ bool operator>(const Pair &a, const Pair &b) { - return a.key > b.key; -} - -template -__device__ __host__ bool operator<(const Pair &a, const Pair &b) { - return a.key < b.key; -} - -} - From 8a11b7f39d0b7f7f4fabebfa368183836d673506 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Wed, 3 Nov 2021 00:12:27 +0000 Subject: [PATCH 13/55] Finish porting priority queue benchmark to gbenchmark --- .../priority_queue/priority_queue_bench.cu | 62 ++++++++++--------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/benchmarks/priority_queue/priority_queue_bench.cu b/benchmarks/priority_queue/priority_queue_bench.cu index 1e41031cd..f21b9b31a 100644 --- a/benchmarks/priority_queue/priority_queue_bench.cu +++ b/benchmarks/priority_queue/priority_queue_bench.cu @@ -1,14 +1,10 @@ -#include #include #include -#include +#include #include -#include - #include -#include #include #include @@ -22,20 +18,30 @@ struct pair_less { } }; -constexpr int NUM_KEYS = 128e6; +template +static void generate_keys_uniform(OutputIt output_begin, OutputIt output_end) { + std::random_device rd; + std::mt19937 gen{rd()}; + + auto num_keys = std::distance(output_begin, output_end); + + for (auto i = 0; i < num_keys; ++i) { + output_begin[i] = {static_cast(gen()), static_cast(gen())}; + } +} -template +template static void BM_insert(::benchmark::State& state) { - srand(0); for (auto _ : state) { state.PauseTiming(); - priority_queue, pair_less>> pq(NUM_KEYS); - std::vector> h_pairs(NUM_KEYS); - for (auto &p : h_pairs) { - p = {rand(), rand()}; - } + + priority_queue, pair_less>> pq(NumKeys); + + std::vector> h_pairs(NumKeys); + generate_keys_uniform(h_pairs.begin(), h_pairs.end()); thrust::device_vector> d_pairs(h_pairs); + state.ResumeTiming(); pq.push(d_pairs.begin(), d_pairs.end()); cudaDeviceSynchronize(); @@ -43,20 +49,21 @@ static void BM_insert(::benchmark::State& state) } -template +template static void BM_delete(::benchmark::State& state) { - srand(0); for (auto _ : state) { state.PauseTiming(); - priority_queue, pair_less>> pq(NUM_KEYS); - std::vector> h_pairs(NUM_KEYS); - for (auto &p : h_pairs) { - p = {rand(), rand()}; - } + + priority_queue, pair_less>> pq(NumKeys); + + std::vector> h_pairs(NumKeys); + generate_keys_uniform(h_pairs.begin(), h_pairs.end()); thrust::device_vector> d_pairs(h_pairs); + pq.push(d_pairs.begin(), d_pairs.end()); cudaDeviceSynchronize(); + state.ResumeTiming(); pq.pop(d_pairs.begin(), d_pairs.end()); cudaDeviceSynchronize(); @@ -64,17 +71,14 @@ static void BM_delete(::benchmark::State& state) } -BENCHMARK_TEMPLATE(BM_insert, int, int) +BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000) 
->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_delete, int, int) +BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000) ->Unit(benchmark::kMillisecond); +BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000) + ->Unit(benchmark::kMillisecond); -/* -int main() { - - InsertThenDelete(); - - return 0; -}*/ +BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000) + ->Unit(benchmark::kMillisecond); From d1392b97395b493095ced2bde94c68afb2832dc3 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Sat, 18 Dec 2021 04:43:35 +0000 Subject: [PATCH 14/55] Add multiple node sizes to benchmark --- .../priority_queue/priority_queue_bench.cu | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/benchmarks/priority_queue/priority_queue_bench.cu b/benchmarks/priority_queue/priority_queue_bench.cu index f21b9b31a..75ac1a8c5 100644 --- a/benchmarks/priority_queue/priority_queue_bench.cu +++ b/benchmarks/priority_queue/priority_queue_bench.cu @@ -30,13 +30,13 @@ static void generate_keys_uniform(OutputIt output_begin, OutputIt output_end) { } } -template +template static void BM_insert(::benchmark::State& state) { for (auto _ : state) { state.PauseTiming(); - priority_queue, pair_less>> pq(NumKeys); + priority_queue, pair_less>> pq(NumKeys, NodeSize); std::vector> h_pairs(NumKeys); generate_keys_uniform(h_pairs.begin(), h_pairs.end()); @@ -49,13 +49,13 @@ static void BM_insert(::benchmark::State& state) } -template +template static void BM_delete(::benchmark::State& state) { for (auto _ : state) { state.PauseTiming(); - priority_queue, pair_less>> pq(NumKeys); + priority_queue, pair_less>> pq(NumKeys, NodeSize); std::vector> h_pairs(NumKeys); generate_keys_uniform(h_pairs.begin(), h_pairs.end()); @@ -71,14 +71,26 @@ static void BM_delete(::benchmark::State& state) } -BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000) +BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, 1024) ->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000) +BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, 1024) ->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000) +BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, 1024) ->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000) +BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, 1024) + ->Unit(benchmark::kMillisecond); + +BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, 64) + ->Unit(benchmark::kMillisecond); + +BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, 64) + ->Unit(benchmark::kMillisecond); + +BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, 64) + ->Unit(benchmark::kMillisecond); + +BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, 64) ->Unit(benchmark::kMillisecond); From 9ee6c8b4e3359b46bbbc6b96d924c87ce67b33c4 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Sat, 18 Dec 2021 04:45:39 +0000 Subject: [PATCH 15/55] Start porting tests to Catch2 --- tests/CMakeLists.txt | 7 +- tests/priority_queue/priority_queue_test.cu | 1244 +------------------ 2 files changed, 48 insertions(+), 1203 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b54b38d00..19c3f30cd 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -45,7 +45,6 @@ set(DYNAMIC_MAP_TEST_SRC ConfigureTest(DYNAMIC_MAP_TEST "${DYNAMIC_MAP_TEST_SRC}") #################################################################################################### -# TODO: Port priority_queue tests to Catch2 -add_executable(PRIORITY_QUEUE_TEST - 
"${CMAKE_CURRENT_SOURCE_DIR}/priority_queue/priority_queue_test.cu") -target_link_libraries(PRIORITY_QUEUE_TEST cuco) +set(PRIORITY_QUEUE_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/priority_queue/priority_queue_test.cu") +ConfigureTest(PRIORITY_QUEUE_TEST "${PRIORITY_QUEUE_TEST_SRC}") diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 9ce393735..77e080acb 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -14,1234 +15,79 @@ #include #include +#include +#include -template -struct Pair { - Key key; - Value value; -}; - -/* -* Check if two Pairs have the same key and value -* @param a The first Pair -* @param b The second Pair -*/ -template -bool operator==(const Pair &a, const Pair &b) { - return a.key == b.key && a.value == b.value; -} - - -template -__device__ __host__ bool operator>(const Pair &a, const Pair &b) { - return a.key > b.key; -} - -template -__device__ __host__ bool operator<(const Pair &a, const Pair &b) { - return a.key < b.key; -} - -using namespace cooperative_groups; using namespace cuco; -// Inserts elements into pq, managing memory allocation -// and copying to the device -template -void Insert(priority_queue, Compare> &pq, - const std::vector> &elements, - bool warp_level = false) { - Pair *d_elements; - - size_t num_bytes = sizeof(Pair) * elements.size(); - - CUCO_CUDA_TRY(cudaMalloc((void**)&d_elements, num_bytes)); +template +bool test_insertion_and_deletion(priority_queue &pq, + std::vector &elements) { + thrust::device_vector d_elements(elements); - CUCO_CUDA_TRY(cudaMemcpy(d_elements, &elements[0], num_bytes, - cudaMemcpyHostToDevice)); + pq.push(d_elements.begin(), d_elements.end()); - pq.push(d_elements, d_elements + elements.size(), 512, 32000, warp_level); + cudaDeviceSynchronize(); - CUCO_CUDA_TRY(cudaFree(d_elements)); -} - -// Deletes num_elements elements from pq and returns them, -// managing device memory -template -std::vector> Delete(priority_queue, Compare> &pq, - size_t num_elements, - bool warp_level = false) { - Pair *d_elements; - - size_t num_bytes = sizeof(Pair) * num_elements; + pq.pop(d_elements.begin(), d_elements.end()); - CUCO_CUDA_TRY(cudaMalloc((void**)&d_elements, num_bytes)); + cudaDeviceSynchronize(); - pq.pop(d_elements, d_elements + num_elements, 512, 32, warp_level); + thrust::host_vector popped_elements(d_elements); - std::vector> result(num_elements); + std::unordered_set popped_element_set(popped_elements.begin(), + popped_elements.end()); - CUCO_CUDA_TRY(cudaMemcpy(&result[0], d_elements, num_bytes, - cudaMemcpyDeviceToHost)); - - CUCO_CUDA_TRY(cudaFree(d_elements)); + bool result = true; + for (auto &e : elements) { + result = result && (popped_element_set.find(e) + != popped_element_set.end()); + } return result; } -template -__global__ void DeviceAPIInsert( - typename priority_queue>::device_mutable_view view, - Pair *elements, - size_t num_elements) { - extern __shared__ int shmem[]; - thread_block g = this_thread_block(); - for (size_t i = blockIdx.x * view.get_node_size(); - i < num_elements; i += gridDim.x * view.get_node_size()) { - view.push(g, elements + i, elements + i - + min(view.get_node_size(), num_elements - i), - shmem); - } -} - -template -__global__ void DeviceAPIDelete( - typename priority_queue>::device_mutable_view view, - Pair *out, - size_t num_elements) { - - extern __shared__ int shmem[]; - thread_block g = this_thread_block(); - 
for (size_t i = blockIdx.x * view.get_node_size(); - i < num_elements; i += gridDim.x * view.get_node_size()) { - view.pop(g, out + i, out + - i + min(view.get_node_size(), num_elements - i), shmem); - } -} +template +static void generate_elements(OutputIt output_begin, OutputIt output_end) { + auto num_keys = std::distance(output_begin, output_end); -template -__global__ void DeviceAPIInsertWarp( - typename priority_queue>::device_mutable_view view, - Pair *elements, - size_t num_elements) { - extern __shared__ int shmem[]; - const int kWarpSize = 32; - thread_block g = this_thread_block(); - thread_block_tile warp = tiled_partition(g); - for (size_t i = blockIdx.x * view.get_node_size() * (blockDim.x / kWarpSize) - + warp.meta_group_rank() * view.get_node_size(); - i < num_elements; - i += gridDim.x * view.get_node_size() * blockDim.x / kWarpSize) { - view.push(warp, elements + i, elements + i + min(view.get_node_size(), - num_elements - i), (char*)shmem + warp.meta_group_rank() - * view.get_shmem_size(kWarpSize)); - } -} + std::random_device rd; + std::mt19937 gen{rd()}; -template -__global__ void DeviceAPIDeleteWarp( - typename priority_queue>::device_mutable_view view, - Pair *out, - size_t num_elements) { - extern __shared__ int shmem[]; - thread_block g = this_thread_block(); - for (size_t i = blockIdx.x * view.get_node_size(); - i < num_elements; i += gridDim.x * view.get_node_size()) { - view.pop(g, out + i, out + i + min(view.get_node_size(), - num_elements - i), shmem); + for (auto i = 0; i < num_keys; i++) { + output_begin[i] = static_cast(gen()); } } -// Each test case is composed of a name -// and a function that returns true when the test -// passes and false when it fails -struct TestCase { - std::string name; - bool (*func)(); -}; - -using IntIntVector = std::vector>; - -using IntLongVector = std::vector>; - -using FloatIntVector = std::vector>; - -TestCase cases[] = { - - {"test_insert_1", []() { - priority_queue> pq(1000); - IntIntVector result = {{1, 1}}; - Insert(pq, {{1, 1}}); - return Delete(pq, 1) == result; - } - }, - - {"test_insert_descending_seq", []() { - const int kNodeSize = 8; - - srand(0); - - // Choose some reasonably large number of elements - int count = rand() % 1000000 + 10000; - //int count = 9; - - priority_queue> pq(count, kNodeSize); - - IntIntVector input; - - for (int i = count - 1; i >= 0; i--) { - input.push_back({i, i}); - } - - IntIntVector result; - - for (int i = 0; i < count; i++) { - result.push_back({i, i}); - } - - for (auto e : input) { - Insert(pq, {e}); - } - - auto out = Delete(pq, count); - //for (int i = 0; i < out.size(); i++) { - // std::cout << out[i].key << " " << result[i].key << std::endl; - //} - return out == result; - } - }, - - {"test_delete_from_p_buffer", []() { - const int kNodeSize = 1024; - - // Choose some number of elements less than the node size - int count = rand() % kNodeSize; - - priority_queue> pq(count, kNodeSize); - - IntIntVector input; - - for (int i = count - 1; i >= 0; i--) { - input.push_back({i, i}); - } - - IntIntVector result; - - for (int i = 0; i < count; i++) { - result.push_back({i, i}); - } - - for (auto e : input) { - Insert(pq, {e}); - } - - bool pass = true; - for (int i = 0; i < count; i++) { - auto next_el = Delete(pq, 1)[0]; - bool next = next_el == result[i]; - if (!pass || !next) { - std::cout << "i=" << i << ": expected " << result[i].key - << " " << result[i].value << " got " << next_el.key - << " " << next_el.value << std::endl; - } - pass = pass && next; - } - - return pass; - } - 
}, - - {"test_partial_insert_new_node", []() { - const int kNodeSize = 1024; - - // We choose count = 600 so that two partial insertions - // of size count will cause a new node to be created - // (600 + 600) = 1200 > 1024 so not all elements can fit in the partial - // buffer - int count = 600; - - priority_queue> pq(count * 2, kNodeSize); - - IntIntVector input; - IntIntVector input2; - for (int i = 0; i < count; i++) { - input.push_back({i, i}); - input2.push_back({i + count, i + count}); - } - - Insert(pq, input); - Insert(pq, input2); - - auto delete1 = Delete(pq, kNodeSize); - for (int i = 0; i < kNodeSize; i++) { - if (delete1[i].key != i) { - std::cout << "Error at i = " << i + kNodeSize << std::endl; - return false; - } - } - - auto delete2 = Delete(pq, count * 2 - kNodeSize); - for (int i = 0; i < count * 2 - kNodeSize; i++) { - if (delete2[i].key != i + kNodeSize) { - std::cout << "Error at i = " << i + kNodeSize << std::endl; - return false; - } - } - - return true; - } - }, - - {"test_insert_descending_bulk", []() { - const int kNodeSize = 1024; - - srand(0); - - // Choose some reasonably large number of keys, - // less than node size to test partial insertion of - // individual elements - int count = rand() % kNodeSize; - - priority_queue> pq(count, kNodeSize); - - IntIntVector input; - - for (int i = count - 1; i >= 0; i--) { - input.push_back({i, i}); - } - - IntIntVector result; - - for (int i = 0; i < count; i++) { - result.push_back({i, i}); - } - - Insert(pq, input); - - return Delete(pq, count) == result; - } - }, - - {"test_insert_random_seq", []() { - const int kNodeSize = 1024; - srand(0); - - // Choose some reasonably large number of keys - int count = rand() % 1000000 + 10000; - - priority_queue> pq(count, kNodeSize); - - IntIntVector input; - - for (int i = 0; i < count; i++) { - input.push_back({rand(), i}); - } - - IntIntVector result = input; - - std::sort(result.begin(), result.end(), - [](const Pair &a, - const Pair &b) { - return a.key < b.key; - } - ); - - for (auto e : input) { - Insert(pq, {e}); - } - - auto output = Delete(pq, count); - bool pass = true; - for (int i = 0; i < count; i++) { - if (output[i].key != result[i].key) { - std::cout << "Expected " << result[i].key << " " << result[i].value - << " got " << output[i].key << " " << output[i].value - << std::endl; - pass = false; - } - } - return pass; - } - }, - - {"test_insert_random_bulk", []() { - const int kNodeSize = 1024; - srand(0); - - // Choose some reasonably large number of keys, - // A multiple of node size so that this test only - // tests full node insertion - int count = rand() % kNodeSize * 100 + 10 * kNodeSize; - - priority_queue> pq(count, kNodeSize); - - IntIntVector input; - - for (int i = 0; i < count; i++) { - input.push_back({rand(), i}); - } - - IntIntVector result = input; - - std::sort(result.begin(), result.end(), - [](const Pair &a, - const Pair &b) { - return a.key < b.key; - } - ); - - Insert(pq, input); - - auto output = Delete(pq, count); - bool pass = true; - for (int i = 0; i < count; i++) { - if (output[i].key != result[i].key) { - std::cout << "Expected " << result[i].key << " " << result[i].value - << " got " << output[i].key << " " << output[i].value - << std::endl; - pass = false; - } - } - return pass; - } - }, - - {"test_insert_descending_bulk_2", []() { - srand(0); - - const int kNodeSize = 1024; - - // Choose some reasonably large number of nodes - const int kNodes = rand() % 1000 + 50; - - priority_queue> pq(kNodeSize * kNodes, kNodeSize); - - for (int i 
= kNodes - 1; i >= 0; i--) { - - IntIntVector input; - for (int j = kNodeSize - 1; j >= 0; j--) { - input.push_back({i * kNodeSize + j, 1}); - } - Insert(pq, input); - } - - IntIntVector deletion = Delete(pq, kNodeSize); - - bool result = true; - - for (int i = 0; i < kNodeSize; i++) { - result = result && (deletion[i].key == i); - } - - deletion = Delete(pq, kNodeSize * (kNodes - 1)); - - for (int i = kNodeSize; i < kNodes * kNodeSize; i++) { - result = result && (deletion[i - kNodeSize].key == i); - } - - return result; - } - }, - - {"test_insert_shuffled_bulk_2", []() { - srand(0); - const int kNodeSize = 1024; - // Choose some reasonably large number of nodes - const int kNodes = rand() % 1000 + 50; - priority_queue> pq(kNodeSize * kNodes, kNodeSize); - - for (int i = kNodes - 1; i >= 0; i--) { - - IntIntVector input(kNodeSize); - for (int j = kNodeSize - 1; j >= 0; j--) { - // Shuffle each input vector by putting even numbers - // in the first half and odd numbers in the second half - if (j % 2 == 0) { - input[j / 2] = {i * kNodeSize + j, 1}; - } else { - input[kNodeSize / 2 + (j / 2)] = {i * kNodeSize + j, 1}; - } - } - Insert(pq, input); - } - - IntIntVector deletion = Delete(pq, kNodeSize); - - bool result = true; - - for (int i = 0; i < kNodeSize; i++) { - result = result && (deletion[i].key == i); - } - - deletion = Delete(pq, kNodeSize * (kNodes - 1)); - - for (int i = kNodeSize; i < kNodes * kNodeSize; i++) { - result = result && (deletion[i - kNodeSize].key == i); - } - - return result; - } - }, - - {"test_insert_random_seq_long_val", []() { - srand(0); - - // Choose some reasonably large number of keys - int count = rand() % 100000 + 10000; - - priority_queue> pq(count); - - IntLongVector input; - - for (int i = 0; i < count; i++) { - input.push_back({rand(), i}); - } - - IntLongVector result = input; - - std::sort(result.begin(), result.end(), - [](const Pair &a, - const Pair &b) { - return a.key < b.key; - } - ); - - for (auto e : input) { - Insert(pq, {e}); - } - - auto output = Delete(pq, count); - bool pass = true; - for (int i = 0; i < count; i++) { - if (output[i].key != result[i].key) { - std::cout << "Expected " << result[i].key << " " << result[i].value - << " got " << output[i].key << " " << output[i].value - << std::endl; - pass = false; - } - } - return pass; - } - }, - - {"test_insert_random_seq_float", []() { - srand(0); - - // Choose some reasonably large number of keys - int count = rand() % 100000 + 10000; - - priority_queue> pq(count); - - FloatIntVector input; - - for (int i = 0; i < count; i++) { - input.push_back({(float)rand() / RAND_MAX, i}); - } - - FloatIntVector result = input; - - std::sort(result.begin(), result.end(), - [](const Pair &a, - const Pair &b) { - return a.key < b.key; - } - ); - - for (auto e : input) { - Insert(pq, {e}); - } - - auto output = Delete(pq, count); - bool pass = true; - for (int i = 0; i < count; i++) { - if (output[i].key != result[i].key) { - std::cout << "Expected " << result[i].key << " " << result[i].value - << " got " << output[i].key << " " << output[i].value - << std::endl; - pass = false; - } - } - return pass; - } - }, - - {"test_insert_all_same_key", []() { - srand(0); - // Choose some reasonably large number of keys - int count = rand() % 100000 + 10000; - - priority_queue> pq(count); - - IntIntVector input(count); - for (int i = 0; i < count; i++) { - input[i] = {1, i}; - } - - Insert(pq, input); - - IntIntVector result = Delete(pq, count); - - // Check if all the values were retained - std::vector 
values(count, false); - - for (auto r : result) { - values[r.value] = true; - } - - bool pass = true; - for (bool b : values) { - pass = pass && b; - } - - return pass; - } - }, - - {"test_insert_negatives_and_limits", []() { - - srand(0); - - // Choose some reasonably large number of keys - int count = rand() % 100000 + 10000; - - priority_queue> pq(count); - - // Create some elements with negative and very large - // and very small keys - IntIntVector elements = {{INT32_MAX, 1}, {-100, 1}, {100, 1}, {0, 1}, - {INT32_MIN, 1}, {-1000000, 1}}; - - IntIntVector input; - - for (int i = 0; i < count; i++) { - input.push_back(elements[rand() % elements.size()]); - } - - IntIntVector result = input; - - std::sort(result.begin(), result.end(), - [](const Pair &a, - const Pair &b) { - return a.key < b.key; - } - ); - - Insert(pq, input); - - auto output = Delete(pq, count); - bool pass = true; - for (int i = 0; i < count; i++) { - if (output[i].key != result[i].key) { - std::cout << "Expected " << result[i].key << " " << result[i].value - << " got " << output[i].key << " " << output[i].value - << std::endl; - pass = false; - } - } - return pass; - } - }, - - {"test_insert_2000_keys", []() { - int num_keys = 2000; - - srand(0); - priority_queue> pq(num_keys); - std::vector std_vec; +TEST_CASE("Single uint32_t elements", "") +{ + priority_queue pq(1); - IntIntVector input(num_keys); - for (int i = 0; i < num_keys; i++) { - int32_t next = rand(); - input[i] = {next, 1}; - std_vec.push_back(next); - } + std::vector els = {1}; - Insert(pq, input); + REQUIRE(test_insertion_and_deletion(pq, els)); - std::sort(std_vec.begin(), std_vec.end()); - - auto result_vec = Delete(pq, num_keys); - - bool result = true; - for (int i = 0; i < num_keys; i++) { - bool next = result_vec[i].key == std_vec[i]; - if (result && !next) { - std::cout << i << ": " << " expected " << std_vec[i] << " got " - << result_vec[i].key << std::endl; - } - result = result && next; - } - - return result; - } - }, - - {"test_insert_10M_keys", []() { - int num_keys = 10e6; - - srand(0); - - priority_queue> pq(num_keys); - - std::vector std_vec; - - IntIntVector input(num_keys); - for (int i = 0; i < num_keys; i++) { - int32_t next = rand(); - input[i] = {next, 1}; - std_vec.push_back(next); - } - - Insert(pq, input); - - std::sort(std_vec.begin(), std_vec.end()); - - auto result_vec = Delete(pq, num_keys); - - bool result = true; - for (int i = 0; i < num_keys; i++) { - bool next = result_vec[i].key == std_vec[i]; - if (result && !next) { - std::cout << i << ": " << " expected " << std_vec[i] << " got " - << result_vec[i].key << std::endl; - } - result = result && next; - } - - return result; - } - }, - - {"test_insert_10M_keys_warp_level", []() { - int num_keys = 10e6; - const int kNodeSize = 32; - - srand(0); - - priority_queue> pq(num_keys, kNodeSize); - - std::vector std_vec; - - IntIntVector input(num_keys); - for (int i = 0; i < num_keys; i++) { - int32_t next = rand(); - input[i] = {next, 1}; - std_vec.push_back(next); - } - - Insert(pq, input); - - std::sort(std_vec.begin(), std_vec.end()); - - auto result_vec = Delete(pq, num_keys, true); - - bool result = true; - for (int i = 0; i < num_keys; i++) { - bool next = result_vec[i].key == std_vec[i]; - if (result && !next) { - std::cout << i << ": " << " expected " << std_vec[i] << " got " - << result_vec[i].key << std::endl; - } - result = result && next; - } - - return result; - } - }, - - {"test_insert_10M_keys_max", []() { - int num_keys = 10e6; - - srand(0); - - priority_queue, - 
thrust::greater>> pq(num_keys); - - std::vector std_vec; - - IntIntVector input(num_keys); - for (int i = 0; i < num_keys; i++) { - int32_t next = rand(); - input[i] = {next, 1}; - std_vec.push_back(next); - } - - Insert(pq, input); - - std::sort(std_vec.begin(), std_vec.end(), std::greater()); - - auto result_vec = Delete(pq, num_keys); - - bool result = true; - for (int i = 0; i < num_keys; i++) { - bool next = result_vec[i].key == std_vec[i]; - if (result && !next) { - std::cout << i << ": " << " expected " << std_vec[i] << " got " - << result_vec[i].key << std::endl; - } - result = result && next; - } - - return result; - } - }, - - {"test_insert_10M_keys_unbatched", []() { - int num_keys = 10e6; - const int kNodeSize = 1024; - - srand(0); - - priority_queue> pq(num_keys, kNodeSize); - - std::vector std_vec; - - for (int j = 0; j < num_keys; j += kNodeSize) { - IntIntVector input(min(num_keys - j, kNodeSize)); - for (size_t i = 0; i < input.size(); i++) { - int32_t next = rand(); - input[i] = {next, 1}; - std_vec.push_back(next); - } - Insert(pq, input); - } - - std::sort(std_vec.begin(), std_vec.end()); - - auto result_vec = Delete(pq, num_keys); - - bool result = true; - for (int i = 0; i < num_keys; i++) { - bool next = result_vec[i].key == std_vec[i]; - if (result && !next) { - std::cout << i << ": " << " expected " << std_vec[i] << " got " - << result_vec[i].key << std::endl; - } - result = result && next; - } - - return result; - } - }, - - {"test_insert_1024e4_keys", []() { - int num_keys = 1024e4; - - srand(0); - - priority_queue> pq(num_keys); - - std::vector std_vec; - - IntIntVector input(num_keys); - for (int i = 0; i < num_keys; i++) { - int32_t next = rand(); - input[i] = {next, 1}; - std_vec.push_back(next); - } - - Insert(pq, input); - - std::sort(std_vec.begin(), std_vec.end()); - - auto result_vec = Delete(pq, num_keys); - - bool result = true; - for (int i = 0; i < num_keys; i++) { - bool next = result_vec[i].key == std_vec[i]; - if (result && !next) { - std::cout << i << ": expected " << std_vec[i] << " got " - << result_vec[i].key << std::endl; - } - result = result && next; - } - - return result; - } - }, - - {"test_insert_1024", []() { - int node_size = 1024; - int num_keys = node_size * 2; - - srand(0); - - priority_queue> pq(num_keys); - - std::vector std_vec; - - for (int j = 0; j < num_keys / node_size; j++) { - IntIntVector input(node_size); - for (int i = 0; i < node_size; i++) { - int32_t next = rand(); - input[i] = {next, 1}; - std_vec.push_back(next); - } - - Insert(pq, input); - } - - std::sort(std_vec.begin(), std_vec.end()); - - auto result_vec = Delete(pq, num_keys); - - bool result = true; - for (int i = 0; i < num_keys; i++) { - bool next = result_vec[i].key == std_vec[i]; - if (result && !next) { - std::cout << i << ": expected " << std_vec[i] << " got " - << result_vec[i].key << std::endl; - } - result = result && next; - } - - return result; - } - }, - - {"test_partial_deletion_1", []() { - int node_size = 1024; - int num_nodes_before = 1; - int num_nodes_after = 1; - int num_keys = node_size * num_nodes_before + - node_size * num_nodes_after + 1; - - srand(0); - - priority_queue> pq(num_keys); - - std::vector std_vec; - IntIntVector input; - IntIntVector result_vec; - - for (int i = 0; i < num_nodes_before * node_size; i++) { - int32_t next = rand(); - std_vec.push_back(next); - input.push_back({next, i}); - } - - int32_t partial = rand(); - std_vec.push_back(partial); - input.push_back({partial, 1}); - - for (int i = 0; i < num_nodes_after * 
node_size; i++) { - int32_t next = rand(); - std_vec.push_back(next); - input.push_back({next, i}); - } - - Insert(pq, input); - - for (auto i : Delete(pq, node_size * num_nodes_before)) { - result_vec.push_back(i); - } - - result_vec.push_back(Delete(pq, 1)[0]); - - for (auto i : Delete(pq, node_size * num_nodes_after)) { - result_vec.push_back(i); - } - - std::sort(std_vec.begin(), std_vec.end()); - - bool result = true; - for (int i = 0; i < num_keys; i++) { - //std::sort(std_vec.begin(), std_vec.end()); - bool next = result_vec[i].key == std_vec[i]; - if (result && !next) { - std::cout << i << ": expected " << std_vec[i] << " got " - << result_vec[i].key << std::endl; - } - result = result && next; - } - - return result; - } - }, - - {"test_insert_1024e4_keys_device_API", []() { - int num_keys = 1024e4; - - srand(0); - - priority_queue> pq(num_keys); - - IntIntVector std_vec; - - IntIntVector input(num_keys); - for (int i = 0; i < num_keys; i++) { - int32_t next = rand(); - input[i] = {next, i}; - std_vec.push_back({next, i}); - } - - Pair *elements; - cudaMalloc(&elements, sizeof(Pair) * num_keys); - - cudaMemcpy(elements, &std_vec[0], - sizeof(Pair) * num_keys, - cudaMemcpyHostToDevice); - - const int kBlockSize = 512; - const int kNumBlocks = 512; - - std::sort(std_vec.begin(), std_vec.end(), [](auto a, auto b) { - return a.key < b.key; - }); - - DeviceAPIInsert<<>> - (pq.get_mutable_device_view(), elements, num_keys); - - DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize)>>> - (pq.get_mutable_device_view(), elements, num_keys); - - IntIntVector result_vec(num_keys); - - cudaMemcpy(&result_vec[0], elements, - sizeof(Pair) * num_keys, - cudaMemcpyDeviceToHost); - - bool result = true; - for (int i = 0; i < num_keys; i++) { - bool next = result_vec[i].key == std_vec[i].key; - if (result && !next) { - std::cout << i << ": expected " << std_vec[i].key - << " " << std_vec[i].value - << " got " - << result_vec[i].key << " " << result_vec[i].value - << std::endl; - } - result = result && next; - } - - return result; - } - }, - - {"test_insert_1000e4_keys_device_API_warp", []() { - int num_keys = 1000e4 + 1; - const int kNodeSize = 64; - - srand(0); - - priority_queue> pq(num_keys, kNodeSize); - - IntIntVector std_vec; - - IntIntVector input(num_keys); - for (int i = 0; i < num_keys; i++) { - int32_t next = rand(); - input[i] = {next, i}; - std_vec.push_back({next, i}); - } - - Pair *elements; - cudaMalloc(&elements, sizeof(Pair) * num_keys); - - cudaMemcpy(elements, &std_vec[0], - sizeof(Pair) * num_keys, - cudaMemcpyHostToDevice); - - const int kBlockSize = 512; - const int kNumBlocks = 512; - - std::sort(std_vec.begin(), std_vec.end(), [](auto a, auto b) { - return a.key < b.key; - }); - - DeviceAPIInsertWarp<<>> - (pq.get_mutable_device_view(), elements, num_keys); - - DeviceAPIDeleteWarp<<<1, 32, pq.get_shmem_size(32)>>> - (pq.get_mutable_device_view(), elements, num_keys); - - IntIntVector result_vec(num_keys); - - cudaMemcpy(&result_vec[0], elements, - sizeof(Pair) * num_keys, - cudaMemcpyDeviceToHost); - - bool result = true; - for (int i = 0; i < num_keys; i++) { - bool next = result_vec[i].key == std_vec[i].key; - if (result && !next) { - std::cout << i << ": expected " << std_vec[i].key - << " " << std_vec[i].value - << " got " - << result_vec[i].key << " " << - result_vec[i].value << std::endl; - } - result = result && next; - } - - return result; - } - }, - - {"test_insert_1000e4_keys_device_API", []() { - int num_keys = 1000e4 + 1; - - srand(0); - - priority_queue> 
pq(num_keys); - - IntIntVector std_vec; - - IntIntVector input(num_keys); - for (int i = 0; i < num_keys; i++) { - int32_t next = rand(); - input[i] = {next, i}; - std_vec.push_back({next, i}); - } - - Pair *elements; - cudaMalloc(&elements, sizeof(Pair) * num_keys); - - cudaMemcpy(elements, &std_vec[0], - sizeof(Pair) * num_keys, - cudaMemcpyHostToDevice); - - const int kBlockSize = 512; - const int kNumBlocks = 512; - - std::sort(std_vec.begin(), std_vec.end(), [](auto a, auto b) { - return a.key < b.key; - }); - - DeviceAPIInsert<<>> - (pq.get_mutable_device_view(), elements, num_keys); - - DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize)>>> - (pq.get_mutable_device_view(), elements, num_keys); - - IntIntVector result_vec(num_keys); - - cudaMemcpy(&result_vec[0], elements, - sizeof(Pair) * num_keys, - cudaMemcpyDeviceToHost); - - bool result = true; - for (int i = 0; i < num_keys; i++) { - bool next = result_vec[i].key == std_vec[i].key; - if (result && !next) { - std::cout << i << ": expected " << std_vec[i].key - << " " << std_vec[i].value - << " got " - << result_vec[i].key << " " << - result_vec[i].value << std::endl; - } - result = result && next; - } - - return result; - } - }, - - {"test_insert_1024e4_keys_device_API_warp", []() { - int num_keys = 1024e4; - const int kNodeSize = 64; - - srand(0); - - priority_queue> pq(num_keys, kNodeSize); - - IntIntVector std_vec; - - IntIntVector input(num_keys); - for (int i = 0; i < num_keys; i++) { - int32_t next = rand(); - input[i] = {next, i}; - std_vec.push_back({next, i}); - } - - Pair *elements; - cudaMalloc(&elements, sizeof(Pair) * num_keys); - - cudaMemcpy(elements, &std_vec[0], - sizeof(Pair) * num_keys, - cudaMemcpyHostToDevice); - - const int kBlockSize = 512; - const int kNumBlocks = 512; - - std::sort(std_vec.begin(), std_vec.end(), [](auto a, auto b) { - return a.key < b.key; - }); - - DeviceAPIInsertWarp<<>> - (pq.get_mutable_device_view(), elements, num_keys); - - DeviceAPIDeleteWarp<<<1, 32, pq.get_shmem_size(32)>>> - (pq.get_mutable_device_view(), elements, num_keys); - - IntIntVector result_vec(num_keys); - - cudaMemcpy(&result_vec[0], elements, - sizeof(Pair) * num_keys, - cudaMemcpyDeviceToHost); - - bool result = true; - for (int i = 0; i < num_keys; i++) { - bool next = result_vec[i].key == std_vec[i].key; - if (result && !next) { - std::cout << i << ": expected " << std_vec[i].key - << " " << std_vec[i].value - << " got " - << result_vec[i].key << " " << - result_vec[i].value << std::endl; - } - result = result && next; - } - - return result; - } - }, - - {"test_insert_10M_thrust_device_vec_iterator", []() { - int num_keys = 10e6; - - srand(0); - - priority_queue> pq(num_keys); - - std::vector std_vec; - - IntIntVector input(num_keys); - for (int i = 0; i < num_keys; i++) { - int32_t next = rand(); - input[i] = {next, 1}; - std_vec.push_back(next); - } +} - thrust::device_vector> d_input(input); +TEMPLATE_TEST_CASE_SIG("10M elements", "", + ((typename T, typename Compare), T, Compare), + (uint32_t, thrust::less), + (uint64_t, thrust::less)) +{ + auto num_keys = 10'000'000; - pq.push(d_input.begin(), d_input.end()); + priority_queue pq(num_keys); - std::sort(std_vec.begin(), std_vec.end()); + std::vector els(num_keys); - auto result_vec = Delete(pq, num_keys); + generate_elements(els.begin(), els.end()); - bool result = true; - for (int i = 0; i < num_keys; i++) { - bool next = result_vec[i].key == std_vec[i]; - if (result && !next) { - std::cout << i << ": " << " expected " << std_vec[i] << " got " - 
<< result_vec[i].key << std::endl; - } - result = result && next; - } + REQUIRE(test_insertion_and_deletion(pq, els)); - return result; - } - }, -}; +} -int main() { +/*int main() { int failures = 0; @@ -1258,4 +104,4 @@ int main() { std::cout << "Failures: " << failures << std::endl; return 0; -} +}*/ From e2235986e101d187277e8853bd07d6ce6a24af1b Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Sat, 18 Dec 2021 04:46:33 +0000 Subject: [PATCH 16/55] Prevent block size from being larger than node size --- include/cuco/detail/priority_queue.inl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index 0092c9c91..b3b923379 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -79,7 +79,7 @@ void priority_queue::push(InputIt first, bool warp_level, cudaStream_t stream) { - const int kBlockSize = block_size; + const int kBlockSize = min(block_size, (int)node_size_); const int kNumBlocks = grid_size; if (!warp_level) { @@ -108,7 +108,7 @@ void priority_queue::pop(OutputIt first, bool warp_level, cudaStream_t stream) { - const int kBlockSize = block_size; + const int kBlockSize = min(block_size, (int)node_size_); const int kNumBlocks = grid_size; cudaMemset(d_pop_tracker_, 0, sizeof(int)); From dd8c6b74a2e14a64f1399958048416a7bd91a8cf Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Sun, 19 Dec 2021 08:58:59 +0000 Subject: [PATCH 17/55] Continue porting tests to Catch2 --- tests/priority_queue/priority_queue_test.cu | 150 +++++++++++++++----- 1 file changed, 112 insertions(+), 38 deletions(-) diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 77e080acb..c64e5ecac 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -5,12 +5,14 @@ #include #include #include -#include +#include +#include #include #include #include +#include #include @@ -20,33 +22,113 @@ using namespace cuco; +template +struct KVPair { + K first; + V second; +}; + +template +bool __host__ __device__ operator==(const KVPair &a, const KVPair &b) { + return a.first == b.first && a.second == b.second; +} + +template +bool __host__ __device__ operator<(const KVPair &a, const KVPair &b) { + if (a.first == b.first) { + return a.second < b.second; + } else { + return a.first < b.first; + } +} + +template +struct KVLess { + __host__ __device__ bool operator()(const T& a, const T& b) const { + return a.first < b.first; + } +}; + +// Insert elements into the queue and check that they are +// all returned when removed from the queue template bool test_insertion_and_deletion(priority_queue &pq, - std::vector &elements) { + std::vector &elements, + size_t n) { + + // Create a device vector containing the input elements + // to put into the queue thrust::device_vector d_elements(elements); pq.push(d_elements.begin(), d_elements.end()); cudaDeviceSynchronize(); - pq.pop(d_elements.begin(), d_elements.end()); + thrust::device_vector d_popped_elements(n); + + pq.pop(d_popped_elements.begin(), d_popped_elements.end()); cudaDeviceSynchronize(); - thrust::host_vector popped_elements(d_elements); + // Create a host vector of the removed elements + thrust::host_vector popped_elements(d_popped_elements); - std::unordered_set popped_element_set(popped_elements.begin(), - popped_elements.end()); + std::sort(elements.begin(), elements.end(), Compare{}); + + // Construct a map with the counts of each 
element inserted into the queue + std::map inserted_counts; + for (int i = 0; i < n; i++) { + T &e = elements[i]; + if (inserted_counts.find(e) == inserted_counts.end()) { + inserted_counts.emplace(e, 0); + } + + inserted_counts[e]++; + } + + + // Construct a map with the counts of each element removed from the queue + std::map removed_counts; + for (T &e : popped_elements) { + if (removed_counts.find(e) == removed_counts.end()) { + removed_counts.emplace(e, 0); + } + + removed_counts[e]++; + } bool result = true; - for (auto &e : elements) { - result = result && (popped_element_set.find(e) - != popped_element_set.end()); + for (auto &pair : inserted_counts) { + if (removed_counts.find(pair.first) != removed_counts.end()) { + result = result && (removed_counts[pair.first] + == pair.second); + } else { + result = false; + } } return result; } +template +bool test_insertion_and_deletion(priority_queue &pq, + std::vector &elements) { + return test_insertion_and_deletion(pq, elements, elements.size()); +} + + +template +static void generate_element(T &e, std::mt19937 &gen) { + e = static_cast(gen()); +} + +template <> +void generate_element> + (KVPair &e, std::mt19937 &gen) { + generate_element(e.first, gen); + generate_element(e.second, gen); +} + template static void generate_elements(OutputIt output_begin, OutputIt output_end) { auto num_keys = std::distance(output_begin, output_end); @@ -55,13 +137,13 @@ static void generate_elements(OutputIt output_begin, OutputIt output_end) { std::mt19937 gen{rd()}; for (auto i = 0; i < num_keys; i++) { - output_begin[i] = static_cast(gen()); + generate_element(output_begin[i], gen); } } - -TEST_CASE("Single uint32_t elements", "") +TEST_CASE("Single uint32_t element", "") { + priority_queue pq(1); std::vector els = {1}; @@ -70,38 +152,30 @@ TEST_CASE("Single uint32_t elements", "") } -TEMPLATE_TEST_CASE_SIG("10M elements", "", - ((typename T, typename Compare), T, Compare), - (uint32_t, thrust::less), - (uint64_t, thrust::less)) +TEMPLATE_TEST_CASE_SIG("N deletions are correct", "", + ((typename T, typename Compare, size_t N, size_t NumKeys), + T, Compare, N, NumKeys), + (uint32_t, thrust::less, 100, 10'000'000), + (uint64_t, thrust::less, 100, 10'000'000), + (KVPair, KVLess>, + 100, 10'000'000), + (uint32_t, thrust::less, 10'000, 10'000'000), + (uint64_t, thrust::less, 10'000, 10'000'000), + (KVPair, KVLess>, + 10'000, 10'000'000), + (uint32_t, thrust::less, 10'000'000, 10'000'000), + (uint64_t, thrust::less, 10'000'000, 10'000'000), + (KVPair, KVLess>, + 10'000'000, 10'000'000)) { - auto num_keys = 10'000'000; - priority_queue pq(num_keys); + priority_queue pq(NumKeys); - std::vector els(num_keys); + std::vector els(NumKeys); generate_elements(els.begin(), els.end()); - REQUIRE(test_insertion_and_deletion(pq, els)); + REQUIRE(test_insertion_and_deletion(pq, els, N)); } -/*int main() { - - int failures = 0; - - for (auto c : cases) { - std::cout << c.name << "....."; - if (c.func()) { - std::cout << "PASS" << std::endl; - } else { - std::cout << "FAIL" << std::endl; - failures++; - } - } - - std::cout << "Failures: " << failures << std::endl; - - return 0; -}*/ From d03151975deaa098dc5df68eb6e0720c3b667316 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Sun, 19 Dec 2021 20:58:51 +0000 Subject: [PATCH 18/55] Make generate_element for KVPair generic --- tests/priority_queue/priority_queue_test.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/priority_queue/priority_queue_test.cu 
b/tests/priority_queue/priority_queue_test.cu index c64e5ecac..39d9a389d 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -122,9 +122,9 @@ static void generate_element(T &e, std::mt19937 &gen) { e = static_cast(gen()); } -template <> -void generate_element> - (KVPair &e, std::mt19937 &gen) { +template +void generate_element + (KVPair &e, std::mt19937 &gen) { generate_element(e.first, gen); generate_element(e.second, gen); } From ba3a6fda6640140d3ed41868e412ac8de3f4b6bf Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Sun, 26 Dec 2021 03:52:32 +0000 Subject: [PATCH 19/55] Finish Catch2 tests --- include/cuco/detail/priority_queue.inl | 74 ++-- .../cuco/detail/priority_queue_kernels.cuh | 318 ++++++++------ include/cuco/priority_queue.cuh | 16 +- tests/priority_queue/priority_queue_test.cu | 397 +++++++++++++++--- 4 files changed, 577 insertions(+), 228 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index b3b923379..ec737d928 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -46,9 +46,6 @@ priority_queue::priority_queue CUCO_CUDA_TRY(cudaMemset(d_locks_, 0, sizeof(int) * (node_capacity_ + 1))); - d_pop_tracker_ = std::allocator_traits::allocate( - int_allocator_, - 1); } @@ -64,9 +61,6 @@ priority_queue::~priority_queue() { std::allocator_traits::deallocate(int_allocator_, d_locks_, node_capacity_ + 1); - std::allocator_traits::deallocate(int_allocator_, - d_pop_tracker_, - 1); } @@ -74,27 +68,27 @@ template template void priority_queue::push(InputIt first, InputIt last, + cudaStream_t stream, int block_size, int grid_size, - bool warp_level, - cudaStream_t stream) { + bool warp_level) { const int kBlockSize = min(block_size, (int)node_size_); const int kNumBlocks = grid_size; - if (!warp_level) { + //if (!warp_level) { PushKernel<<>> (first, last - first, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_, compare_); - } else { - PushKernelWarp<<>> - (first, last - first, d_heap_, d_size_, - node_size_, d_locks_, d_p_buffer_size_, - lowest_level_start_, get_shmem_size(32), compare_); - } + //} else { + // PushKernelWarp<<>> + // (first, last - first, d_heap_, d_size_, + // node_size_, d_locks_, d_p_buffer_size_, + // lowest_level_start_, get_shmem_size(32), compare_); + //} CUCO_CUDA_TRY(cudaGetLastError()); } @@ -103,30 +97,44 @@ template template void priority_queue::pop(OutputIt first, OutputIt last, + cudaStream_t stream, int block_size, int grid_size, - bool warp_level, - cudaStream_t stream) { + bool warp_level) { const int kBlockSize = min(block_size, (int)node_size_); const int kNumBlocks = grid_size; - cudaMemset(d_pop_tracker_, 0, sizeof(int)); - if (!warp_level) { - PopKernel<<>> + (first, partial, d_heap_, d_size_, + node_size_, d_locks_, d_p_buffer_size_, + lowest_level_start_, node_capacity_, compare_); + } + + pop_size -= partial; + first += partial; + + + //if (!warp_level) { + PopKernel<<>> - (first, last - first, d_heap_, d_size_, - node_size_, d_locks_, d_p_buffer_size_, - d_pop_tracker_, lowest_level_start_, node_capacity_, compare_); - } else { - PopKernelWarp<<>> - (first, last - first, d_heap_, d_size_, + (first, pop_size, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, - d_pop_tracker_, lowest_level_start_, - node_capacity_, get_shmem_size(32), compare_); + lowest_level_start_, node_capacity_, compare_); + //} else { + // PopKernelWarp<<>> + // (first, last - 
first, d_heap_, d_size_, + // node_size_, d_locks_, d_p_buffer_size_, + // lowest_level_start_, + // node_capacity_, get_shmem_size(32), compare_); - } + //} CUCO_CUDA_TRY(cudaGetLastError()); } @@ -162,15 +170,13 @@ __device__ void priority_queue OutputIt first, OutputIt last, void *temp_storage) { - int pop_tracker = 0; - SharedMemoryLayout shmem = GetSharedMemoryLayout((int*)temp_storage, g.size(), node_size_); if (last - first == node_size_) { PopSingleNode(g, first, d_heap_, d_size_, node_size_, d_locks_, - d_p_buffer_size_, &pop_tracker, lowest_level_start_, + d_p_buffer_size_, lowest_level_start_, node_capacity_, shmem, compare_); } else { PopPartialNode(g, first, last - first, d_heap_, d_size_, node_size_, diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index aa2bbc26f..971917be1 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -741,8 +741,6 @@ __device__ void PushSingleNode(CG const& g, * @param node_size Size of the nodes in the heap * @param locks Array of locks, one for each node in the heap * @param p_buffer_size Number of pairs in the heap's partial buffer -* @param pop_tracker The pop tracker for this concurrent pop operation -* (see PopKernel) * @param lowest_level_start Index of the first node in the last level of the * heap * @param node_capacity Maximum capacity of the heap in nodes @@ -756,7 +754,6 @@ __device__ void PopSingleNode(CG const& g, size_t node_size, int *locks, size_t *p_buffer_size, - int *pop_tracker, int lowest_level_start, int node_capacity, SharedMemoryLayout shmem, @@ -766,6 +763,15 @@ __device__ void PopSingleNode(CG const& g, int dim = g.size(); AcquireLock(g, &locks[kRootIdx]); + if (*size == 0) { + CopyPairs(g, elements, heap, node_size); + + if (lane == 0) { + *p_buffer_size = 0; + } + g.sync(); + return; + } // Find the target node (the last one inserted) and // decrement the size @@ -776,21 +782,16 @@ __device__ void PopSingleNode(CG const& g, AcquireLock(g, &locks[tar]); } - // pop_tracker determines our location in the output array, - // since it tells us how many other nodes have been previously been - // extracted by this block or by other blocks - int out_idx = *pop_tracker; g.sync(); if (lane == 0) { *size -= 1; - *pop_tracker += 1; } g.sync(); // Copy the root to the output array - CopyPairs(g, elements + out_idx * node_size, &heap[node_size], + CopyPairs(g, elements, &heap[node_size], &heap[node_size] + node_size); g.sync(); @@ -864,6 +865,10 @@ __device__ void PopPartialNode(CG const& g, int dim = g.size(); AcquireLock(g, &locks[kRootIdx]); + //if (lane == 0) { + // printf("PopPartialNode lock acquired\n", *size); + // printf("Size: %d\n", *size); + //} if (*size == 0) { CopyPairs(g, elements, heap, num_elements); @@ -906,8 +911,15 @@ __device__ void PopPartialNode(CG const& g, shmem, compare); + g.sync(); + if (lane == 0) { *p_buffer_size = *p_buffer_size - num_elements; + //printf("size: %d\n", *size); + //for (int i = 0; i < node_size; i++) { + // printf("%d ", heap[kPBufferIdx + i]); + //} + //printf("\n"); } g.sync(); @@ -939,8 +951,18 @@ __device__ void PopPartialNode(CG const& g, int tar = InsertionOrderIndex(*size, lowest_level_start); g.sync(); + *p_buffer_size += node_size; + *p_buffer_size -= num_elements; + + g.sync(); + if (lane == 0) { *size -= 1; + //printf("size: %d\n", *size); + //for (int i = 0; i < node_size; i++) { + // printf("%d ", heap[kPBufferIdx + i]); + //} + //printf("\n"); } if (tar 
!= kRootIdx) { @@ -962,8 +984,22 @@ __device__ void PopPartialNode(CG const& g, node_size, shmem, compare); - g.sync(); + //if (lane == 0) { + // printf("shmem A:\n"); + // for (int i = 0; i < node_size; i++) { + // printf("%lu ", shmem.A[i]); + // } + // printf("\n"); + // printf("shmem B:\n"); + // for (int i = 0; i < node_size; i++) { + // printf("%lu ", shmem.B[i]); + // } + // printf("\n"); + // printf("p_buffer_size: %d\n", *p_buffer_size); + //} + + //g.sync(); CopyPairs(g, &heap[node_size], shmem.A, node_size); @@ -971,6 +1007,20 @@ __device__ void PopPartialNode(CG const& g, g.sync(); + //if (lane == 0) { + // printf("size: %d\n", *size); + // for (int i = 0; i < node_size; i++) { + // printf("%lu ", heap[kPBufferIdx + i]); + // } + // printf("\n"); + // for (int i = 0; i < node_size; i++) { + // printf("%lu ", heap[node_size + i]); + // } + // printf("\n"); + //} + + //g.sync(); + Sink(g, heap, size, node_size, locks, p_buffer_size, lowest_level_start, node_capacity, shmem, compare); @@ -1182,55 +1232,55 @@ __global__ void PushKernel(OutputIt elements, * @param temp_node A temporary array large enough to store sizeof(T) * node_size bytes */ -template -__global__ void PushKernelWarp(InputIt elements, - size_t num_elements, - T *heap, - int *size, - size_t node_size, - int *locks, - size_t *p_buffer_size, - int lowest_level_start, - int bytes_shmem_per_warp, - Compare compare) { - - extern __shared__ char sh[]; - - // We push as many elements as possible as full nodes, - // then deal with the remaining elements as a partial insertion - // below - thread_block block = this_thread_block(); - thread_block_tile<32> warp = tiled_partition<32>(block); - - SharedMemoryLayout shmem = GetSharedMemoryLayout( - (int*)(sh + bytes_shmem_per_warp * warp.meta_group_rank()), - 32, node_size); - - for (size_t i = warp.meta_group_rank() * node_size - + blockIdx.x * node_size * (blockDim.x / 32); - i + node_size <= num_elements; - i += (blockDim.x / 32) * node_size * gridDim.x) { - PushSingleNode(warp, elements + i, heap, size, node_size, locks, - lowest_level_start, shmem, compare); - } - - // We only need one block for partial insertion - if (blockIdx.x != 0 || warp.meta_group_rank() != 0) { - return; - } - - // If node_size does not divide num_elements, there are some leftover - // elements for which we must perform a partial insertion - size_t first_not_inserted = (num_elements / node_size) - * node_size; - - if (first_not_inserted < num_elements) { - size_t p_ins_size = num_elements - first_not_inserted; - PushPartialNode(warp, elements + first_not_inserted, p_ins_size, - heap, size, node_size, locks, p_buffer_size, - lowest_level_start, shmem, compare); - } -} +//template +//__global__ void PushKernelWarp(InputIt elements, +// size_t num_elements, +// T *heap, +// int *size, +// size_t node_size, +// int *locks, +// size_t *p_buffer_size, +// int lowest_level_start, +// int bytes_shmem_per_warp, +// Compare compare) { +// +// extern __shared__ char sh[]; +// +// // We push as many elements as possible as full nodes, +// // then deal with the remaining elements as a partial insertion +// // below +// thread_block block = this_thread_block(); +// thread_block_tile<32> warp = tiled_partition<32>(block); +// +// SharedMemoryLayout shmem = GetSharedMemoryLayout( +// (int*)(sh + bytes_shmem_per_warp * warp.meta_group_rank()), +// 32, node_size); +// +// for (size_t i = warp.meta_group_rank() * node_size +// + blockIdx.x * node_size * (blockDim.x / 32); +// i + node_size <= num_elements; +// i += 
(blockDim.x / 32) * node_size * gridDim.x) { +// PushSingleNode(warp, elements + i, heap, size, node_size, locks, +// lowest_level_start, shmem, compare); +// } +// +// // We only need one block for partial insertion +// if (blockIdx.x != 0 || warp.meta_group_rank() != 0) { +// return; +// } +// +// // If node_size does not divide num_elements, there are some leftover +// // elements for which we must perform a partial insertion +// size_t first_not_inserted = (num_elements / node_size) +// * node_size; +// +// if (first_not_inserted < num_elements) { +// size_t p_ins_size = num_elements - first_not_inserted; +// PushPartialNode(warp, elements + first_not_inserted, p_ins_size, +// heap, size, node_size, locks, p_buffer_size, +// lowest_level_start, shmem, compare); +// } +//} /** * Remove exactly node_size elements from the heap and place them @@ -1242,68 +1292,86 @@ __global__ void PushKernelWarp(InputIt elements, * @param node_size Size of the nodes in the heap * @param locks Array of locks, one for each node in the heap * @param p_buffer_size Number of pairs in the heap's partial buffer -* @param pop_tracker Pointer to an integer in global memory initialized to 0 */ +//template +//__global__ void PopKernelWarp(OutputIt elements, +// size_t num_elements, +// T *heap, +// int *size, +// size_t node_size, +// int *locks, +// size_t *p_buffer_size, +// int lowest_level_start, +// int node_capacity, +// int bytes_shmem_per_warp, +// Compare compare) { +// +// extern __shared__ char sh[]; +// +// thread_block block = this_thread_block(); +// thread_block_tile<32> warp = tiled_partition<32>(block); +// +// SharedMemoryLayout shmem = GetSharedMemoryLayout( +// (int*)(sh + bytes_shmem_per_warp * warp.meta_group_rank()), +// 32, node_size); +// +// for (size_t i = warp.meta_group_rank() + (blockDim.x / 32) * blockIdx.x; +// i < num_elements / node_size; +// i += gridDim.x * blockDim.x / 32) { +// PopSingleNode(warp, elements, heap, size, node_size, locks, +// p_buffer_size, lowest_level_start, +// node_capacity, shmem, compare); +// } +// +// AcquireLock(warp, &locks[kRootIdx]); +// // Remove from the partial buffer if there are no nodes +// // Only one thread will attempt this deletion because we have acquired +// // the root and will increment pop_tracker once we begin the deletion +// if (*pop_tracker == num_elements / node_size +// && num_elements % node_size != 0) { +// +// if (warp.thread_rank() == 0) { +// *pop_tracker += 1; +// } +// +// size_t p_del_size = num_elements % node_size; +// +// ReleaseLock(warp, &locks[kRootIdx]); +// +// PopPartialNode(warp, +// elements + (num_elements / node_size) * node_size, +// p_del_size, heap, size, node_size, locks, p_buffer_size, +// lowest_level_start, node_capacity, shmem, compare); +// +// } else { +// ReleaseLock(warp, &locks[kRootIdx]); +// } +//} + template -__global__ void PopKernelWarp(OutputIt elements, +__global__ void PopPartialNodeKernel(OutputIt elements, size_t num_elements, T *heap, int *size, size_t node_size, int *locks, size_t *p_buffer_size, - int *pop_tracker, int lowest_level_start, int node_capacity, - int bytes_shmem_per_warp, Compare compare) { + extern __shared__ int s[]; - // We use pop_tracker to ensure that each thread block inserts its node - // at the correct location in the output array - // Since we do not know which block will extract which node - - extern __shared__ char sh[]; - - thread_block block = this_thread_block(); - thread_block_tile<32> warp = tiled_partition<32>(block); - - SharedMemoryLayout shmem = 
GetSharedMemoryLayout( - (int*)(sh + bytes_shmem_per_warp * warp.meta_group_rank()), - 32, node_size); - - for (size_t i = warp.meta_group_rank() + (blockDim.x / 32) * blockIdx.x; - i < num_elements / node_size; - i += gridDim.x * blockDim.x / 32) { - PopSingleNode(warp, elements, heap, size, node_size, locks, - p_buffer_size, pop_tracker, lowest_level_start, - node_capacity, shmem, compare); - } - - AcquireLock(warp, &locks[kRootIdx]); - // Remove from the partial buffer if there are no nodes - // Only one thread will attempt this deletion because we have acquired - // the root and will increment pop_tracker once we begin the deletion - if (*pop_tracker == num_elements / node_size - && num_elements % node_size != 0) { - - if (warp.thread_rank() == 0) { - *pop_tracker += 1; - } - - size_t p_del_size = num_elements % node_size; - - ReleaseLock(warp, &locks[kRootIdx]); + SharedMemoryLayout shmem = GetSharedMemoryLayout(s, + blockDim.x, node_size); - PopPartialNode(warp, - elements + (num_elements / node_size) * node_size, - p_del_size, heap, size, node_size, locks, p_buffer_size, + thread_block g = this_thread_block(); + PopPartialNode(g, elements, + num_elements, heap, size, node_size, locks, p_buffer_size, lowest_level_start, node_capacity, shmem, compare); - - } else { - ReleaseLock(warp, &locks[kRootIdx]); - } + } + /** * Remove exactly node_size elements from the heap and place them * in elements @@ -1314,7 +1382,6 @@ __global__ void PopKernelWarp(OutputIt elements, * @param node_size Size of the nodes in the heap * @param locks Array of locks, one for each node in the heap * @param p_buffer_size Number of pairs in the heap's partial buffer -* @param pop_tracker Pointer to an integer in global memory initialized to 0 */ template __global__ void PopKernel(OutputIt elements, @@ -1324,15 +1391,10 @@ __global__ void PopKernel(OutputIt elements, size_t node_size, int *locks, size_t *p_buffer_size, - int *pop_tracker, int lowest_level_start, int node_capacity, Compare compare) { - // We use pop_tracker to ensure that each thread block inserts its node - // at the correct location in the output array - // Since we do not know which block will extract which node - extern __shared__ int s[]; SharedMemoryLayout shmem = GetSharedMemoryLayout(s, @@ -1340,32 +1402,32 @@ __global__ void PopKernel(OutputIt elements, thread_block g = this_thread_block(); for (size_t i = blockIdx.x; i < num_elements / node_size; i += gridDim.x) { - PopSingleNode(g, elements, heap, size, node_size, locks, - p_buffer_size, pop_tracker, lowest_level_start, + PopSingleNode(g, elements + i * node_size, heap, size, node_size, locks, + p_buffer_size, lowest_level_start, node_capacity, shmem, compare); } - AcquireLock(g, &locks[kRootIdx]); + //AcquireLock(g, &locks[kRootIdx]); // Remove from the partial buffer if there are no nodes // Only one thread will attempt this deletion because we have acquired // the root and will increment pop_tracker once we begin the deletion - if (*pop_tracker == num_elements / node_size - && num_elements % node_size != 0) { + //if (*pop_tracker == num_elements / node_size + // && num_elements % node_size != 0) { - if (g.thread_rank() == 0) { - *pop_tracker += 1; - } + // if (g.thread_rank() == 0) { + // *pop_tracker += 1; + // } - size_t p_del_size = num_elements % node_size; + // size_t p_del_size = num_elements % node_size; - ReleaseLock(g, &locks[kRootIdx]); + // ReleaseLock(g, &locks[kRootIdx]); - PopPartialNode(g, elements + (num_elements / node_size) * node_size, - p_del_size, heap, size, 
node_size, locks, p_buffer_size, - lowest_level_start, node_capacity, shmem, compare); - - } else { - ReleaseLock(g, &locks[kRootIdx]); - } + // PopPartialNode(g, elements + (num_elements / node_size) * node_size, + // p_del_size, heap, size, node_size, locks, p_buffer_size, + // lowest_level_start, node_capacity, shmem, compare); + // + //} else { + // ReleaseLock(g, &locks[kRootIdx]); + //} } } diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 3f4a3cb5a..a9a8346ba 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -87,10 +87,9 @@ class priority_queue { * run */ template - void push(InputIt first, InputIt last, + void push(InputIt first, InputIt last, cudaStream_t stream = 0, int block_size = 256, int grid_size = 64000, - bool warp_level = false, - cudaStream_t stream = 0); + bool warp_level = false); /** * @brief Remove a sequence of the lowest (when Max == false) or the @@ -109,10 +108,9 @@ class priority_queue { * run */ template - void pop(OutputIt first, OutputIt last, + void pop(OutputIt first, OutputIt last, cudaStream_t stream = 0, int block_size = 512, int grid_size = 32000, - bool warp_level = false, - cudaStream_t stream = 0); + bool warp_level = false); /* * @brief Return the amount of shared memory required for operations on the queue @@ -224,7 +222,7 @@ class priority_queue { }; /* - * @brief Returns a trivailly-copyable class that can be used to perform + * @brief Returns a trivially-copyable class that can be used to perform * insertion and deletion of single nodes in device code with * cooperative groups * @@ -252,9 +250,7 @@ class priority_queue { int *d_locks_; ///< Array of locks where `d_locks_[i]` is the /// lock for the node starting at /// 1d_heap_[node_size * i]` - int *d_pop_tracker_; ///< Variable used to track where in its output - /// array a pop operation should place a given - /// popped node + Allocator allocator_; int_allocator_type int_allocator_; t_allocator_type t_allocator_; diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 39d9a389d..54b40cc11 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -49,73 +49,46 @@ struct KVLess { } }; -// Insert elements into the queue and check that they are -// all returned when removed from the queue -template -bool test_insertion_and_deletion(priority_queue &pq, - std::vector &elements, - size_t n) { - - // Create a device vector containing the input elements - // to put into the queue - thrust::device_vector d_elements(elements); - - pq.push(d_elements.begin(), d_elements.end()); - - cudaDeviceSynchronize(); - - thrust::device_vector d_popped_elements(n); - - pq.pop(d_popped_elements.begin(), d_popped_elements.end()); - - cudaDeviceSynchronize(); - - // Create a host vector of the removed elements - thrust::host_vector popped_elements(d_popped_elements); +template +std::map construct_count_map(std::vector &a) { - std::sort(elements.begin(), elements.end(), Compare{}); + std::map result; - // Construct a map with the counts of each element inserted into the queue - std::map inserted_counts; - for (int i = 0; i < n; i++) { - T &e = elements[i]; - if (inserted_counts.find(e) == inserted_counts.end()) { - inserted_counts.emplace(e, 0); + for (T &e : a) { + if (result.find(e) == result.end()) { + result.emplace(e, 0); } - inserted_counts[e]++; + result[e]++; } + return result; +} - // Construct a map with the counts of each element 
removed from the queue - std::map removed_counts; - for (T &e : popped_elements) { - if (removed_counts.find(e) == removed_counts.end()) { - removed_counts.emplace(e, 0); - } - - removed_counts[e]++; - } +template +bool count_maps_are_equal(std::map &a, std::map &b) { bool result = true; - for (auto &pair : inserted_counts) { - if (removed_counts.find(pair.first) != removed_counts.end()) { - result = result && (removed_counts[pair.first] + for (auto &pair : a) { + if (b.find(pair.first) != b.end()) { + result = result && (b[pair.first] == pair.second); } else { - result = false; + return false; } } return result; -} -template -bool test_insertion_and_deletion(priority_queue &pq, - std::vector &elements) { - return test_insertion_and_deletion(pq, elements, elements.size()); } +template +bool has_same_elements(std::vector &a, std::vector &b) { + auto map_a = construct_count_map(a); + auto map_b = construct_count_map(b); + + return count_maps_are_equal(map_a, map_b); +} template static void generate_element(T &e, std::mt19937 &gen) { @@ -129,18 +102,73 @@ void generate_element generate_element(e.second, gen); } -template -static void generate_elements(OutputIt output_begin, OutputIt output_end) { - auto num_keys = std::distance(output_begin, output_end); +template +static std::vector generate_elements(size_t num_keys) { std::random_device rd; std::mt19937 gen{rd()}; + std::vector result(num_keys); + for (auto i = 0; i < num_keys; i++) { - generate_element(output_begin[i], gen); + generate_element(result[i], gen); } + + return result; } +template +static void insert_to_queue(priority_queue &pq, std::vector &v) { + thrust::device_vector d_v(v); + + pq.push(d_v.begin(), d_v.end()); + + cudaDeviceSynchronize(); +} + +template +static std::vector pop_from_queue(priority_queue &pq, size_t n) { + + thrust::device_vector d_popped(n); + + pq.pop(d_popped.begin(), d_popped.end()); + + cudaDeviceSynchronize(); + + thrust::host_vector h_popped(d_popped); + + return std::vector(h_popped.begin(), h_popped.end()); + +} + +// Insert elements into the queue and check that they are +// all returned when removed from the queue +template +bool test_insertion_and_deletion(priority_queue &pq, + std::vector &elements, + size_t n) { + + insert_to_queue(pq, elements); + + auto popped_elements = pop_from_queue(pq, n); + + std::sort(elements.begin(), elements.end(), Compare{}); + + std::vector correct_popped_elements(elements.begin(), elements.begin() + n); + + return has_same_elements(correct_popped_elements, popped_elements); + +} + +template +bool test_insertion_and_deletion(priority_queue &pq, + std::vector &elements) { + return test_insertion_and_deletion(pq, elements, elements.size()); +} + + + + TEST_CASE("Single uint32_t element", "") { @@ -152,6 +180,262 @@ TEST_CASE("Single uint32_t element", "") } +TEST_CASE("New node created on partial insertion") +{ + + const size_t kInsertionSize = 600; + const size_t kNumElements = kInsertionSize * 2; + + priority_queue pq(kNumElements); + + std::vector els = generate_elements(kNumElements); + + std::vector first_insertion(els.begin(), + els.begin() + kInsertionSize); + + std::vector second_insertion(els.begin() + kInsertionSize, + els.end()); + + insert_to_queue(pq, first_insertion); + + insert_to_queue(pq, second_insertion); + + auto popped_elements = pop_from_queue(pq, kInsertionSize); + + std::sort(els.begin(), els.end()); + + std::vector correct_popped_elements(els.begin(), + els.begin() + kInsertionSize); + + REQUIRE(has_same_elements(popped_elements, 
correct_popped_elements)); + +} + +TEST_CASE("Insert, delete, insert, delete", "") { + const size_t kFirstInsertionSize = 100'000; + const size_t kFirstDeletionSize = 10'000; + const size_t kSecondInsertionSize = 20'000; + const size_t kSecondDeletionSize = 50'000; + using T = uint32_t; + using Compare = thrust::less; + + priority_queue pq(kFirstInsertionSize + kSecondInsertionSize); + + auto first_insertion_els = generate_elements(kFirstInsertionSize); + + auto second_insertion_els = generate_elements(kSecondInsertionSize); + + insert_to_queue(pq, first_insertion_els); + + auto first_popped_elements = pop_from_queue(pq, kFirstDeletionSize); + + insert_to_queue(pq, second_insertion_els); + + auto second_popped_elements = pop_from_queue(pq, kSecondDeletionSize); + + std::vector correct_first_deletion; + + std::sort(first_insertion_els.begin(), first_insertion_els.end(), Compare{}); + + correct_first_deletion.insert(correct_first_deletion.end(), + first_insertion_els.begin(), + first_insertion_els.begin() + kFirstDeletionSize); + + std::vector remaining_elements; + + remaining_elements.insert(remaining_elements.end(), + first_insertion_els.begin() + kFirstDeletionSize, + first_insertion_els.end()); + + remaining_elements.insert(remaining_elements.end(), + second_insertion_els.begin(), + second_insertion_els.end()); + + std::sort(remaining_elements.begin(), remaining_elements.end(), Compare{}); + + std::vector correct_second_deletion; + + correct_second_deletion.insert(correct_second_deletion.end(), + remaining_elements.begin(), + remaining_elements.begin() + kSecondDeletionSize); + + REQUIRE((has_same_elements(correct_first_deletion, first_popped_elements) && + has_same_elements(correct_second_deletion, second_popped_elements))); + + +} + +TEST_CASE("Insertion and deletion on different streams", "") +{ + const size_t kInsertionSize = 100'000; + const size_t kDeletionSize = 10'000; + using T = uint32_t; + using Compare = thrust::less; + + auto elements = generate_elements(kInsertionSize * 2); + thrust::device_vector insertion1(elements.begin(), + elements.begin() + kInsertionSize); + thrust::device_vector insertion2(elements.begin() + kInsertionSize, + elements.end()); + + priority_queue pq(kInsertionSize * 2); + + cudaStream_t stream1, stream2; + + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + + pq.push(insertion1.begin(), insertion1.end(), stream1); + pq.push(insertion2.begin(), insertion2.end(), stream2); + + cudaStreamSynchronize(stream1); + cudaStreamSynchronize(stream2); + + thrust::device_vector deletion1(kDeletionSize); + thrust::device_vector deletion2(kDeletionSize); + + pq.pop(deletion1.begin(), deletion1.end(), stream1); + pq.pop(deletion2.begin(), deletion2.end(), stream2); + + cudaStreamSynchronize(stream1); + cudaStreamSynchronize(stream2); + + thrust::host_vector h_deletion1(deletion1); + thrust::host_vector h_deletion2(deletion2); + + std::vector popped_elements(h_deletion1.begin(), h_deletion1.end()); + + popped_elements.insert(popped_elements.end(), h_deletion2.begin(), + h_deletion2.end()); + + std::sort(elements.begin(), elements.end(), Compare{}); + + std::vector expected_popped_elements(elements.begin(), + elements.begin() + kDeletionSize * 2); + + REQUIRE(has_same_elements(popped_elements, expected_popped_elements)); + + cudaStreamDestroy(stream1); + cudaStreamDestroy(stream2); +} + +template +__global__ void DeviceAPIInsert( + View view, + InputIt begin, + InputIt end) { + extern __shared__ int shmem[]; + thread_block g = this_thread_block(); + 
view.push(g, begin, end, shmem); +} + +template +__global__ void DeviceAPIDelete( + View view, + OutputIt begin, + OutputIt end) { + + extern __shared__ int shmem[]; + thread_block g = this_thread_block(); + view.pop(g, begin, end, shmem); +} + +TEST_CASE("Insertion and deletion with Device API", "") +{ + const size_t kInsertionSize = 1000; + const size_t kDeletionSize = 500; + using T = uint32_t; + using Compare = thrust::less; + + auto els = generate_elements(kInsertionSize); + + thrust::device_vector d_els(els); + + priority_queue pq(kInsertionSize); + + const int kBlockSize = 32; + DeviceAPIInsert<<<1, kBlockSize, pq.get_shmem_size(kBlockSize)>>> + (pq.get_mutable_device_view(), d_els.begin(), d_els.end()); + + cudaDeviceSynchronize(); + + thrust::device_vector d_pop_result(kDeletionSize); + + DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize)>>> + (pq.get_mutable_device_view(), d_pop_result.begin(), + d_pop_result.end()); + + cudaDeviceSynchronize(); + + thrust::host_vector h_pop_result(d_pop_result); + std::vector pop_result(h_pop_result.begin(), + h_pop_result.end()); + + std::sort(els.begin(), els.end(), Compare{}); + + std::vector expected_pop_result(els.begin(), els.begin() + kDeletionSize); + + REQUIRE(has_same_elements(pop_result, expected_pop_result)); +} + +TEST_CASE("Concurrent insertion and deletion with Device API", "") +{ + const size_t kInsertionSize = 1000; + const size_t kDeletionSize = 500; + const int kBlockSize = 32; + using T = uint32_t; + using Compare = thrust::less; + + auto els = generate_elements(kInsertionSize * 2); + + thrust::device_vector insertion1(els.begin(), els.begin() + kInsertionSize); + thrust::device_vector insertion2(els.begin() + kInsertionSize, els.end()); + + priority_queue pq(kInsertionSize * 2); + + cudaStream_t stream1, stream2; + + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + + DeviceAPIInsert<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream1>>> + (pq.get_mutable_device_view(), insertion1.begin(), insertion1.end()); + + DeviceAPIInsert<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream2>>> + (pq.get_mutable_device_view(), insertion2.begin(), insertion2.end()); + + cudaStreamSynchronize(stream1); + cudaStreamSynchronize(stream2); + + thrust::device_vector d_deletion1(kDeletionSize); + thrust::device_vector d_deletion2(kDeletionSize); + + DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream1>>> + (pq.get_mutable_device_view(), d_deletion1.begin(), d_deletion1.end()); + + DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream2>>> + (pq.get_mutable_device_view(), d_deletion2.begin(), d_deletion2.end()); + + cudaStreamSynchronize(stream1); + cudaStreamSynchronize(stream2); + + thrust::host_vector h_deletion1(d_deletion1); + thrust::host_vector h_deletion2(d_deletion2); + + std::vector result(h_deletion1.begin(), h_deletion1.end()); + result.insert(result.end(), h_deletion2.begin(), h_deletion2.end()); + + std::sort(els.begin(), els.end(), Compare{}); + + std::vector expected(els.begin(), els.begin() + kDeletionSize*2); + + REQUIRE(has_same_elements(result, expected)); + + cudaStreamDestroy(stream1); + cudaStreamDestroy(stream2); + +} + TEMPLATE_TEST_CASE_SIG("N deletions are correct", "", ((typename T, typename Compare, size_t N, size_t NumKeys), T, Compare, N, NumKeys), @@ -161,8 +445,11 @@ TEMPLATE_TEST_CASE_SIG("N deletions are correct", "", 100, 10'000'000), (uint32_t, thrust::less, 10'000, 10'000'000), (uint64_t, thrust::less, 10'000, 10'000'000), + (uint64_t, 
thrust::greater, 10'000, 10'000'000), (KVPair, KVLess>, 10'000, 10'000'000), + (KVPair, KVLess>, + 10'000, 10'000'000), (uint32_t, thrust::less, 10'000'000, 10'000'000), (uint64_t, thrust::less, 10'000'000, 10'000'000), (KVPair, KVLess>, @@ -171,9 +458,7 @@ TEMPLATE_TEST_CASE_SIG("N deletions are correct", "", priority_queue pq(NumKeys); - std::vector els(NumKeys); - - generate_elements(els.begin(), els.end()); + auto els = generate_elements(NumKeys); REQUIRE(test_insertion_and_deletion(pq, els, N)); From 16db085c2c9eaebce67ac7a9e514f6703c0122ea Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Sun, 26 Dec 2021 06:09:02 +0000 Subject: [PATCH 20/55] Hide kernel launch details --- .../priority_queue/priority_queue_bench.cu | 28 +++--- include/cuco/detail/priority_queue.inl | 87 ++++++++----------- include/cuco/priority_queue.cuh | 14 ++- 3 files changed, 60 insertions(+), 69 deletions(-) diff --git a/benchmarks/priority_queue/priority_queue_bench.cu b/benchmarks/priority_queue/priority_queue_bench.cu index 75ac1a8c5..a7459a26e 100644 --- a/benchmarks/priority_queue/priority_queue_bench.cu +++ b/benchmarks/priority_queue/priority_queue_bench.cu @@ -30,13 +30,15 @@ static void generate_keys_uniform(OutputIt output_begin, OutputIt output_end) { } } -template +template static void BM_insert(::benchmark::State& state) { for (auto _ : state) { state.PauseTiming(); - priority_queue, pair_less>> pq(NumKeys, NodeSize); + priority_queue, pair_less>, + FavorInsertionPerformance> pq(NumKeys); std::vector> h_pairs(NumKeys); generate_keys_uniform(h_pairs.begin(), h_pairs.end()); @@ -49,13 +51,15 @@ static void BM_insert(::benchmark::State& state) } -template +template static void BM_delete(::benchmark::State& state) { for (auto _ : state) { state.PauseTiming(); - priority_queue, pair_less>> pq(NumKeys, NodeSize); + priority_queue, pair_less>, + FavorInsertionPerformance> pq(NumKeys); std::vector> h_pairs(NumKeys); generate_keys_uniform(h_pairs.begin(), h_pairs.end()); @@ -71,26 +75,26 @@ static void BM_delete(::benchmark::State& state) } -BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, 1024) +BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, false) ->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, 1024) +BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, false) ->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, 1024) +BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, false) ->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, 1024) +BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, false) ->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, 64) +BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, true) ->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, 64) +BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, true) ->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, 64) +BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, true) ->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, 64) +BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, true) ->Unit(benchmark::kMillisecond); diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index ec737d928..873560c34 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -6,17 +6,18 @@ namespace cuco { -template 
-priority_queue::priority_queue +template +priority_queue::priority_queue (size_t initial_capacity, - size_t node_size, Allocator const& allocator) : allocator_{allocator}, int_allocator_{allocator}, t_allocator_{allocator}, size_t_allocator_{allocator} { - node_size_ = node_size; + node_size_ = NodeSize; // Round up to the nearest multiple of node size int nodes = ((initial_capacity + node_size_ - 1) / node_size_); @@ -49,8 +50,10 @@ priority_queue::priority_queue } -template -priority_queue::~priority_queue() { +template +priority_queue::~priority_queue() { std::allocator_traits::deallocate(int_allocator_, d_size_, 1); std::allocator_traits::deallocate(size_t_allocator_, @@ -64,49 +67,40 @@ priority_queue::~priority_queue() { } -template +template template -void priority_queue::push(InputIt first, +void priority_queue::push(InputIt first, InputIt last, - cudaStream_t stream, - int block_size, - int grid_size, - bool warp_level) { + cudaStream_t stream) { - const int kBlockSize = min(block_size, (int)node_size_); - const int kNumBlocks = grid_size; + const int kBlockSize = min(256, (int)node_size_); + const int kNumBlocks = min(64000, + max(1, (int)((last - first) / node_size_))); - //if (!warp_level) { - PushKernel<<>> (first, last - first, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_, compare_); - //} else { - // PushKernelWarp<<>> - // (first, last - first, d_heap_, d_size_, - // node_size_, d_locks_, d_p_buffer_size_, - // lowest_level_start_, get_shmem_size(32), compare_); - //} CUCO_CUDA_TRY(cudaGetLastError()); } -template +template template -void priority_queue::pop(OutputIt first, +void priority_queue::pop(OutputIt first, OutputIt last, - cudaStream_t stream, - int block_size, - int grid_size, - bool warp_level) { + cudaStream_t stream) { - const int kBlockSize = min(block_size, (int)node_size_); - const int kNumBlocks = grid_size; + int pop_size = last - first; + const int partial = pop_size % node_size_; - auto pop_size = last - first; - const auto partial = pop_size % node_size_; + const int kBlockSize = min(256, (int)node_size_); + const int kNumBlocks = min(64000, (int)((pop_size - partial) / node_size_)); if (partial != 0) { PopPartialNodeKernel<<<1, kBlockSize, get_shmem_size(kBlockSize), @@ -118,30 +112,23 @@ void priority_queue::pop(OutputIt first, pop_size -= partial; first += partial; - - //if (!warp_level) { - PopKernel<< 0) { + PopKernel<<>> (first, pop_size, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_, node_capacity_, compare_); - //} else { - // PopKernelWarp<<>> - // (first, last - first, d_heap_, d_size_, - // node_size_, d_locks_, d_p_buffer_size_, - // lowest_level_start_, - // node_capacity_, get_shmem_size(32), compare_); - - //} + } CUCO_CUDA_TRY(cudaGetLastError()); } -template +template template -__device__ void priority_queue +__device__ void priority_queue ::device_mutable_view::push( CG const& g, InputIt first, @@ -162,9 +149,11 @@ __device__ void priority_queue } } -template +template template -__device__ void priority_queue +__device__ void priority_queue ::device_mutable_view::pop( CG const& g, OutputIt first, diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index a9a8346ba..9fac3276f 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -48,6 +48,7 @@ namespace cuco { * with the largest keys */ template , + bool FavorInsertionPerformance = false, typename Allocator = cuco::cuda_allocator> class priority_queue { @@ -60,6 
+61,8 @@ class priority_queue { using size_t_allocator_type = typename std::allocator_traits ::rebind_alloc; + const int NodeSize = FavorInsertionPerformance ? 64 : 1024; + public: /** * @brief Construct a priority queue @@ -68,8 +71,7 @@ class priority_queue { * @param node_size The size of the nodes in the underlying heap data * structure */ - priority_queue(size_t initial_capacity, size_t node_size = 1024, - Allocator const& alloc = Allocator{}); + priority_queue(size_t initial_capacity, Allocator const& alloc = Allocator{}); /** * @brief Push elements into the priority queue @@ -87,9 +89,7 @@ class priority_queue { * run */ template - void push(InputIt first, InputIt last, cudaStream_t stream = 0, - int block_size = 256, int grid_size = 64000, - bool warp_level = false); + void push(InputIt first, InputIt last, cudaStream_t stream = 0); /** * @brief Remove a sequence of the lowest (when Max == false) or the @@ -108,9 +108,7 @@ class priority_queue { * run */ template - void pop(OutputIt first, OutputIt last, cudaStream_t stream = 0, - int block_size = 512, int grid_size = 32000, - bool warp_level = false); + void pop(OutputIt first, OutputIt last, cudaStream_t stream = 0); /* * @brief Return the amount of shared memory required for operations on the queue From 052cec096afb04535705bbb15fe38b64a4c124ce Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Mon, 27 Dec 2021 05:49:06 +0000 Subject: [PATCH 21/55] Clean up partial deletion code --- include/cuco/detail/priority_queue_kernels.cuh | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index 971917be1..194b1eeaa 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -876,22 +876,16 @@ __device__ void PopPartialNode(CG const& g, size_t n_p_buffer_size = *p_buffer_size - num_elements; - if (n_p_buffer_size > 0) { - size_t remaining = n_p_buffer_size; - size_t index = 0; - while (remaining > 0) { - size_t this_round = min(remaining, num_elements); - CopyPairs(g, heap + index, heap + index + num_elements, - this_round); - remaining -= this_round; - index += this_round; - g.sync(); - } - } + CopyPairs(g, shmem.A, heap + num_elements, n_p_buffer_size); + + g.sync(); + + CopyPairs(g, heap, shmem.A, n_p_buffer_size); if (lane == 0) { *p_buffer_size = n_p_buffer_size; } + ReleaseLock(g, &locks[kRootIdx]); } else { From a11bea508b3e7b1344b85b38adf255a0d48b6f16 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Mon, 27 Dec 2021 08:05:30 +0000 Subject: [PATCH 22/55] Correct test comparisons --- tests/priority_queue/priority_queue_test.cu | 103 +++++++------------- 1 file changed, 34 insertions(+), 69 deletions(-) diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 54b40cc11..3cf5a5946 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -65,29 +65,36 @@ std::map construct_count_map(std::vector &a) { return result; } -template -bool count_maps_are_equal(std::map &a, std::map &b) { - - bool result = true; - for (auto &pair : a) { - if (b.find(pair.first) != b.end()) { - result = result && (b[pair.first] - == pair.second); - } else { +template +bool is_valid_top_n(std::vector &top_n, std::vector &elements) { + auto top_n_map = construct_count_map(top_n); + auto elements_map = construct_count_map(elements); + + size_t n = top_n.size(); + + // 1. 
Check that the count of each element in the top n is less than or + // equal to the count of that element overall in the queue + for (auto &pair : top_n_map) { + if (elements_map.find(pair.first) == elements_map.end() + || elements_map[pair.first] < pair.second) { return false; } } - return result; + // 2. Check that each element in the top N is not ordered + // after the (n - 1)th element of the sorted list of elements + std::sort(elements.begin(), elements.end(), Compare{}); -} + T max = elements[n - 1]; -template -bool has_same_elements(std::vector &a, std::vector &b) { - auto map_a = construct_count_map(a); - auto map_b = construct_count_map(b); + for (T &e : top_n) { + if (Compare{}(max, e)) { + return false; + } + } + + return true; - return count_maps_are_equal(map_a, map_b); } template @@ -152,23 +159,10 @@ bool test_insertion_and_deletion(priority_queue &pq, auto popped_elements = pop_from_queue(pq, n); - std::sort(elements.begin(), elements.end(), Compare{}); - - std::vector correct_popped_elements(elements.begin(), elements.begin() + n); - - return has_same_elements(correct_popped_elements, popped_elements); + return is_valid_top_n(popped_elements, elements); } -template -bool test_insertion_and_deletion(priority_queue &pq, - std::vector &elements) { - return test_insertion_and_deletion(pq, elements, elements.size()); -} - - - - TEST_CASE("Single uint32_t element", "") { @@ -176,7 +170,7 @@ TEST_CASE("Single uint32_t element", "") std::vector els = {1}; - REQUIRE(test_insertion_and_deletion(pq, els)); + REQUIRE(test_insertion_and_deletion(pq, els, 1)); } @@ -202,12 +196,8 @@ TEST_CASE("New node created on partial insertion") auto popped_elements = pop_from_queue(pq, kInsertionSize); - std::sort(els.begin(), els.end()); - - std::vector correct_popped_elements(els.begin(), - els.begin() + kInsertionSize); - - REQUIRE(has_same_elements(popped_elements, correct_popped_elements)); + REQUIRE(is_valid_top_n>(popped_elements, els)); } @@ -233,16 +223,10 @@ TEST_CASE("Insert, delete, insert, delete", "") { auto second_popped_elements = pop_from_queue(pq, kSecondDeletionSize); - std::vector correct_first_deletion; + std::vector remaining_elements; std::sort(first_insertion_els.begin(), first_insertion_els.end(), Compare{}); - correct_first_deletion.insert(correct_first_deletion.end(), - first_insertion_els.begin(), - first_insertion_els.begin() + kFirstDeletionSize); - - std::vector remaining_elements; - remaining_elements.insert(remaining_elements.end(), first_insertion_els.begin() + kFirstDeletionSize, first_insertion_els.end()); @@ -251,16 +235,9 @@ TEST_CASE("Insert, delete, insert, delete", "") { second_insertion_els.begin(), second_insertion_els.end()); - std::sort(remaining_elements.begin(), remaining_elements.end(), Compare{}); - - std::vector correct_second_deletion; - - correct_second_deletion.insert(correct_second_deletion.end(), - remaining_elements.begin(), - remaining_elements.begin() + kSecondDeletionSize); - - REQUIRE((has_same_elements(correct_first_deletion, first_popped_elements) && - has_same_elements(correct_second_deletion, second_popped_elements))); + REQUIRE((is_valid_top_n(first_popped_elements, + first_insertion_els) && + is_valid_top_n(second_popped_elements, remaining_elements))); } @@ -308,12 +285,7 @@ TEST_CASE("Insertion and deletion on different streams", "") popped_elements.insert(popped_elements.end(), h_deletion2.begin(), h_deletion2.end()); - std::sort(elements.begin(), elements.end(), Compare{}); - - std::vector 
expected_popped_elements(elements.begin(), - elements.begin() + kDeletionSize * 2); - - REQUIRE(has_same_elements(popped_elements, expected_popped_elements)); + REQUIRE(is_valid_top_n(popped_elements, elements)); cudaStreamDestroy(stream1); cudaStreamDestroy(stream2); @@ -371,11 +343,8 @@ TEST_CASE("Insertion and deletion with Device API", "") std::vector pop_result(h_pop_result.begin(), h_pop_result.end()); - std::sort(els.begin(), els.end(), Compare{}); - - std::vector expected_pop_result(els.begin(), els.begin() + kDeletionSize); - REQUIRE(has_same_elements(pop_result, expected_pop_result)); + REQUIRE(is_valid_top_n(pop_result, els)); } TEST_CASE("Concurrent insertion and deletion with Device API", "") @@ -425,11 +394,7 @@ TEST_CASE("Concurrent insertion and deletion with Device API", "") std::vector result(h_deletion1.begin(), h_deletion1.end()); result.insert(result.end(), h_deletion2.begin(), h_deletion2.end()); - std::sort(els.begin(), els.end(), Compare{}); - - std::vector expected(els.begin(), els.begin() + kDeletionSize*2); - - REQUIRE(has_same_elements(result, expected)); + REQUIRE(is_valid_top_n(result, els)); cudaStreamDestroy(stream1); cudaStreamDestroy(stream2); From e3c4a277bd69a0a2afd9ffad1cdd20d817c8f5a3 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Mon, 27 Dec 2021 21:44:17 +0000 Subject: [PATCH 23/55] Commenting and cleanup --- include/cuco/detail/priority_queue.inl | 3 + .../cuco/detail/priority_queue_kernels.cuh | 144 ------------------ include/cuco/priority_queue.cuh | 69 +++------ tests/priority_queue/priority_queue_test.cu | 15 +- 4 files changed, 28 insertions(+), 203 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index 873560c34..11c951425 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -138,6 +138,9 @@ __device__ void priority_queue shmem = GetSharedMemoryLayout((int*)temp_storage, g.size(), node_size_); + + auto push_size = last - first; + if (last - first == node_size_) { PushSingleNode(g, first, d_heap_, d_size_, node_size_, d_locks_, lowest_level_start_, shmem, compare_); diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index 194b1eeaa..ff4db8568 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -641,18 +641,6 @@ __device__ void Sink(CG const& g, lo = left; } - // If the heap property is already satisfied between the current - // node and the lower child, we are done return - // - // TODO: can this ever even occur? 
In the paper this is done because - // a max placeholder value is used to indicate unused nodes in the heap - if (!compare(heap[lo * node_size], - heap[(cur + 1) * node_size - 1])) { - ReleaseLock(g, &locks[lo]); - ReleaseLock(g, &locks[cur]); - return; - } - MergeAndSort(g, &heap[lo * node_size], &heap[cur * node_size], shmem.A, @@ -865,10 +853,6 @@ __device__ void PopPartialNode(CG const& g, int dim = g.size(); AcquireLock(g, &locks[kRootIdx]); - //if (lane == 0) { - // printf("PopPartialNode lock acquired\n", *size); - // printf("Size: %d\n", *size); - //} if (*size == 0) { CopyPairs(g, elements, heap, num_elements); @@ -909,11 +893,6 @@ __device__ void PopPartialNode(CG const& g, if (lane == 0) { *p_buffer_size = *p_buffer_size - num_elements; - //printf("size: %d\n", *size); - //for (int i = 0; i < node_size; i++) { - // printf("%d ", heap[kPBufferIdx + i]); - //} - //printf("\n"); } g.sync(); @@ -952,11 +931,6 @@ __device__ void PopPartialNode(CG const& g, if (lane == 0) { *size -= 1; - //printf("size: %d\n", *size); - //for (int i = 0; i < node_size; i++) { - // printf("%d ", heap[kPBufferIdx + i]); - //} - //printf("\n"); } if (tar != kRootIdx) { @@ -979,21 +953,6 @@ __device__ void PopPartialNode(CG const& g, shmem, compare); g.sync(); - //if (lane == 0) { - // printf("shmem A:\n"); - // for (int i = 0; i < node_size; i++) { - // printf("%lu ", shmem.A[i]); - // } - // printf("\n"); - // printf("shmem B:\n"); - // for (int i = 0; i < node_size; i++) { - // printf("%lu ", shmem.B[i]); - // } - // printf("\n"); - // printf("p_buffer_size: %d\n", *p_buffer_size); - //} - - //g.sync(); CopyPairs(g, &heap[node_size], shmem.A, node_size); @@ -1001,20 +960,6 @@ __device__ void PopPartialNode(CG const& g, g.sync(); - //if (lane == 0) { - // printf("size: %d\n", *size); - // for (int i = 0; i < node_size; i++) { - // printf("%lu ", heap[kPBufferIdx + i]); - // } - // printf("\n"); - // for (int i = 0; i < node_size; i++) { - // printf("%lu ", heap[node_size + i]); - // } - // printf("\n"); - //} - - //g.sync(); - Sink(g, heap, size, node_size, locks, p_buffer_size, lowest_level_start, node_capacity, shmem, compare); @@ -1276,72 +1221,6 @@ __global__ void PushKernel(OutputIt elements, // } //} -/** -* Remove exactly node_size elements from the heap and place them -* in elements, using a warp to handle each node rather than a block -* @param elements The array of elements to insert into -* @param num_elements The number of elements to remove -* @param heap The array of pairs that stores the heap itself -* @param size Pointer to the number of pairs currently in the heap -* @param node_size Size of the nodes in the heap -* @param locks Array of locks, one for each node in the heap -* @param p_buffer_size Number of pairs in the heap's partial buffer -*/ -//template -//__global__ void PopKernelWarp(OutputIt elements, -// size_t num_elements, -// T *heap, -// int *size, -// size_t node_size, -// int *locks, -// size_t *p_buffer_size, -// int lowest_level_start, -// int node_capacity, -// int bytes_shmem_per_warp, -// Compare compare) { -// -// extern __shared__ char sh[]; -// -// thread_block block = this_thread_block(); -// thread_block_tile<32> warp = tiled_partition<32>(block); -// -// SharedMemoryLayout shmem = GetSharedMemoryLayout( -// (int*)(sh + bytes_shmem_per_warp * warp.meta_group_rank()), -// 32, node_size); -// -// for (size_t i = warp.meta_group_rank() + (blockDim.x / 32) * blockIdx.x; -// i < num_elements / node_size; -// i += gridDim.x * blockDim.x / 32) { -// PopSingleNode(warp, 
elements, heap, size, node_size, locks, -// p_buffer_size, lowest_level_start, -// node_capacity, shmem, compare); -// } -// -// AcquireLock(warp, &locks[kRootIdx]); -// // Remove from the partial buffer if there are no nodes -// // Only one thread will attempt this deletion because we have acquired -// // the root and will increment pop_tracker once we begin the deletion -// if (*pop_tracker == num_elements / node_size -// && num_elements % node_size != 0) { -// -// if (warp.thread_rank() == 0) { -// *pop_tracker += 1; -// } -// -// size_t p_del_size = num_elements % node_size; -// -// ReleaseLock(warp, &locks[kRootIdx]); -// -// PopPartialNode(warp, -// elements + (num_elements / node_size) * node_size, -// p_del_size, heap, size, node_size, locks, p_buffer_size, -// lowest_level_start, node_capacity, shmem, compare); -// -// } else { -// ReleaseLock(warp, &locks[kRootIdx]); -// } -//} - template __global__ void PopPartialNodeKernel(OutputIt elements, size_t num_elements, @@ -1400,28 +1279,5 @@ __global__ void PopKernel(OutputIt elements, p_buffer_size, lowest_level_start, node_capacity, shmem, compare); } - - //AcquireLock(g, &locks[kRootIdx]); - // Remove from the partial buffer if there are no nodes - // Only one thread will attempt this deletion because we have acquired - // the root and will increment pop_tracker once we begin the deletion - //if (*pop_tracker == num_elements / node_size - // && num_elements % node_size != 0) { - - // if (g.thread_rank() == 0) { - // *pop_tracker += 1; - // } - - // size_t p_del_size = num_elements % node_size; - - // ReleaseLock(g, &locks[kRootIdx]); - - // PopPartialNode(g, elements + (num_elements / node_size) * node_size, - // p_del_size, heap, size, node_size, locks, p_buffer_size, - // lowest_level_start, node_capacity, shmem, compare); - // - //} else { - // ReleaseLock(g, &locks[kRootIdx]); - //} } } diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 9fac3276f..b8b50b636 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -15,7 +15,6 @@ namespace cuco { * deletions * * Current limitations: -* - Only supports trivially comparable key types * - Does not support insertion and deletion at the same time * - The implementation of the priority queue is based on * https://arxiv.org/pdf/1906.06504.pdf, which provides a way to allow @@ -35,17 +34,17 @@ namespace cuco { * The host-side bulk operations `push` and `pop` allow an arbitrary number of * elements to be pushed to or popped from the queue. * -* The device-side operations allow a cooperative group to push or pop -* some number of elements less than or equal to node_size. These device side +* The device-side operations allow a cooperative group to push or pop from +* device code. These device side * operations are invoked with a trivially-copyable device view, * `device_mutable_view` which can be obtained with the host function * `get_mutable_device_view` and passed to the device. * -* @tparam Key Trivially comparable type used for keys -* @tparam Value Type of the value to be stored -* @tparam Max When false, pop operations yield the elements with the smallest -* keys in the queue, otherwise, pop operations yeild the elements -* with the largest keys +* @tparam T Type of the elements stored in the queue +* @tparam Compare Comparison operator used to order the elements in the queue +* @tparam FavorInsertionPerformance When true, insertion performance is increased at the expense of + deletion performance. 
+* @tparam Allocator Allocator defining how memory is allocated internally */ template , bool FavorInsertionPerformance = false, @@ -68,8 +67,7 @@ class priority_queue { * @brief Construct a priority queue * * @param initial_capacity The number of elements the priority queue can hold - * @param node_size The size of the nodes in the underlying heap data - * structure + * @param alloc Allocator used for allocating device storage */ priority_queue(size_t initial_capacity, Allocator const& alloc = Allocator{}); @@ -80,30 +78,19 @@ class priority_queue { * can be converted to T * @param first Beginning of the sequence of elements * @param last End of the sequence of elements - * @param num_elements Number of elements to add to the queue - * @param block_size Block size to use for the internal kernel launch - * @param grid_size Grid size for the internal kernel launch - * @param warp_size If true, each node is handled by a single warp, otherwise - * by a single block - * @param stream The stream in which the underlying GPU operations will be - * run + * @param stream The stream in which the underlying device operations will be + * executed */ template void push(InputIt first, InputIt last, cudaStream_t stream = 0); /** - * @brief Remove a sequence of the lowest (when Max == false) or the - * highest (when Max == true) elements + * @brief Remove a sequence of the lowest elements ordered by Compare * * @tparam OutputIt Device accessible output iterator whose `value_type` * can be converted to T * @param first Beginning of the sequence of output elements * @param last End of the sequence of output elements - * @param num_elements The number of elements to be removed - * @param block_size Block size to use for the internal kernel launch - * @param grid_size Grid size for the internal kernel launch - * @param warp_size If true, each node is handled by a single warp, otherwise - * by a single block * @param stream The stream in which the underlying GPU operations will be * run */ @@ -132,49 +119,37 @@ class priority_queue { public: /** - * @brief Push a single node or less elements into the priority queue + * @brief Push elements into the priority queue * * @tparam CG Cooperative Group type - * @tparam Device accessible iterator whose `value_type` is convertible - * to T + * @tparam InputIt Device accessible iterator whose `value_type` + * is convertible to T * @param g The cooperative group that will perform the operation * @param first The beginning of the sequence of elements to insert * @param last The end of the sequence of elements to insert - * @param Pointer to a contiguous section of memory large enough - * to hold get_shmem_size(g.size()) bytes + * @param temp_storage Pointer to a contiguous section of memory + * large enough to hold get_shmem_size(g.size()) bytes */ template __device__ void push(CG const& g, InputIt first, InputIt last, void *temp_storage); /** - * @brief Pop a single node or less elements from the priority queue + * @brief Pop elements from the priority queue * * @tparam CG Cooperative Group type - * @tparam Device accessible iterator whose `value_type` is convertible to - T + * @tparam OutputIt Device accessible iterator whose `value_type` + * is convertible to T * @param g The cooperative group that will perform the operation * @param first The beginning of the sequence of elements to output into * @param last The end of the sequence of elements to output into - * @param Pointer to a contiguous section of memory large enough - * to hold get_shmem_size(g.size()) bytes + 
* @param temp_storage Pointer to a contiguous section of memory + * large enough to hold get_shmem_size(g.size()) bytes */ template __device__ void pop(CG const& g, OutputIt first, OutputIt last, void *temp_storage); - /** - * @brief Returns the node size of the queue's underlying heap - * representation, i.e. the maximum number of elements - * pushable or poppable with a call to the device push - * and pop functions - * - * @return The underlying node size - */ - __device__ size_t get_node_size() { - return node_size_; - } - /* * @brief Return the amount of temporary storage required for operations * on the queue with a cooperative group size of block_size @@ -238,7 +213,7 @@ class priority_queue { /// heap's lowest level int node_capacity_; ///< Capacity of the heap in nodes - T *d_heap_; ///< Pointer to an array of nodes, the 0th node + T *d_heap_; ///< Pointer to an array of nodes, the 0th node /// being the heap's partial buffer, and nodes /// 1..(node_capacity_) being the heap, where the /// 1st node is the root diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 3cf5a5946..99a600fd5 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -1,25 +1,16 @@ -#include #include -#include -#include -#include -#include -#include -#include #include #include - -#include -#include -#include - #include #include #include + #include +#include + using namespace cuco; template From f6fa4840e5dc826d49e308bfdaf1a52c783289fa Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Mon, 27 Dec 2021 21:58:18 +0000 Subject: [PATCH 24/55] Commenting for Compare --- .../cuco/detail/priority_queue_kernels.cuh | 78 +++++-------------- 1 file changed, 21 insertions(+), 57 deletions(-) diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index ff4db8568..0343ae07a 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -118,6 +118,7 @@ __device__ void CopyPairs(CG const& g, InputIt1 dst_start, * will be placed when the merge is completed * @param node_size The size of arrays a, b, lo, and hi * @param shmem The shared memory layout for this cooperative group +* @param compare Comparison operator ordering the elements to be merged */ template __device__ void MergeAndSort(CG const& g, @@ -153,6 +154,7 @@ __device__ void MergeAndSort(CG const& g, * elements to insert into lo before starting insertion into * hi * @param shmem The shared memory layout for this cooperative group +* @param compare Comparison operator ordering the elements to be merged */ template __device__ void MergeAndSort(CG const& g, @@ -306,6 +308,7 @@ __device__ void MergeAndSort(CG const& g, * temp can contain * @param temp A temporary array containing space for at least the nearest * power of two greater than len pairs +* @param compare Comparison operator ordering the elements to be sorted */ template __device__ void PBSort(CG const& g, T *start, size_t len, @@ -490,6 +493,7 @@ __device__ int RightChild(int x, int lowest_level_start) { * @param lowest_level_start Index of the first node in the last level of the * heap * @param shmem The shared memory layout for this cooperative group +* @param compare Comparison operator ordering the elements in the heap */ template __device__ void Swim(CG const& g, @@ -554,6 +558,7 @@ __device__ void Swim(CG const& g, * heap * @param node_capacity Max capacity of the heap in nodes * @param 
shmem The shared memory layout for this cooperative group +* @param compare Comparison operator ordering the elements in the heap */ template __device__ void Sink(CG const& g, @@ -678,6 +683,7 @@ __device__ void Sink(CG const& g, * @param lowest_level_start Index of the first node in the last level of the * heap * @param shmem The shared memory layout for this cooperative group +* @param compare Comparison operator ordering the elements in the heap */ template __device__ void PushSingleNode(CG const& g, @@ -733,6 +739,7 @@ __device__ void PushSingleNode(CG const& g, * heap * @param node_capacity Maximum capacity of the heap in nodes * @param shmem The shared memory layout for this cooperative group +* @param compare Comparison operator ordering the elements in the heap */ template __device__ void PopSingleNode(CG const& g, @@ -835,6 +842,7 @@ __device__ void PopSingleNode(CG const& g, * heap * @param node_capacity Maximum capacity of the heap in nodes * @param shmem The shared memory layout for this cooperative group +* @param compare Comparison operator ordering the elements in the heap */ template __device__ void PopPartialNode(CG const& g, @@ -985,6 +993,7 @@ __device__ void PopPartialNode(CG const& g, * @param lowest_level_start Index of the first node in the last level of the * heap * @param shmem The shared memory layout for this cooperative group +* @param compare Comparison operator ordering the elements in the heap */ template __device__ void PushPartialNode(CG const& g, @@ -1112,6 +1121,8 @@ __device__ void PushPartialNode(CG const& g, * @param p_buffer_size Number of pairs in the heap's partial buffer * @param temp_node A temporary array large enough to store sizeof(T) * node_size bytes +* @param lowest_level_start The first index of the heaps lowest layer +* @param compare Comparison operator ordering the elements in the heap */ template __global__ void PushKernel(OutputIt elements, @@ -1159,68 +1170,18 @@ __global__ void PushKernel(OutputIt elements, } /** -* Add num_elements elements into the heap from -* elements, using a warp to handle each node rather than a block -* @param elements The array of elements to add -* @param num_elements The number of elements to be inserted +* Remove num_elements < node_size elements from the queue +* @param elements The array of elements to remove +* @param num_elements The number of elements to be removed * @param heap The array of pairs that stores the heap itself * @param size Pointer to the number of pairs currently in the heap * @param node_size Size of the nodes in the heap * @param locks Array of locks, one for each node in the heap * @param p_buffer_size Number of pairs in the heap's partial buffer -* @param temp_node A temporary array large enough to store - sizeof(T) * node_size bytes +* @param lowest_level_start The first index of the heaps lowest layer +* @param node_capacity The capacity of the heap in nodes +* @param compare Comparison operator ordering the elements in the heap */ -//template -//__global__ void PushKernelWarp(InputIt elements, -// size_t num_elements, -// T *heap, -// int *size, -// size_t node_size, -// int *locks, -// size_t *p_buffer_size, -// int lowest_level_start, -// int bytes_shmem_per_warp, -// Compare compare) { -// -// extern __shared__ char sh[]; -// -// // We push as many elements as possible as full nodes, -// // then deal with the remaining elements as a partial insertion -// // below -// thread_block block = this_thread_block(); -// thread_block_tile<32> warp = tiled_partition<32>(block); -// 
-// SharedMemoryLayout shmem = GetSharedMemoryLayout( -// (int*)(sh + bytes_shmem_per_warp * warp.meta_group_rank()), -// 32, node_size); -// -// for (size_t i = warp.meta_group_rank() * node_size -// + blockIdx.x * node_size * (blockDim.x / 32); -// i + node_size <= num_elements; -// i += (blockDim.x / 32) * node_size * gridDim.x) { -// PushSingleNode(warp, elements + i, heap, size, node_size, locks, -// lowest_level_start, shmem, compare); -// } -// -// // We only need one block for partial insertion -// if (blockIdx.x != 0 || warp.meta_group_rank() != 0) { -// return; -// } -// -// // If node_size does not divide num_elements, there are some leftover -// // elements for which we must perform a partial insertion -// size_t first_not_inserted = (num_elements / node_size) -// * node_size; -// -// if (first_not_inserted < num_elements) { -// size_t p_ins_size = num_elements - first_not_inserted; -// PushPartialNode(warp, elements + first_not_inserted, p_ins_size, -// heap, size, node_size, locks, p_buffer_size, -// lowest_level_start, shmem, compare); -// } -//} - template __global__ void PopPartialNodeKernel(OutputIt elements, size_t num_elements, @@ -1232,6 +1193,7 @@ __global__ void PopPartialNodeKernel(OutputIt elements, int lowest_level_start, int node_capacity, Compare compare) { + extern __shared__ int s[]; SharedMemoryLayout shmem = GetSharedMemoryLayout(s, @@ -1244,7 +1206,6 @@ __global__ void PopPartialNodeKernel(OutputIt elements, } - /** * Remove exactly node_size elements from the heap and place them * in elements @@ -1255,6 +1216,9 @@ __global__ void PopPartialNodeKernel(OutputIt elements, * @param node_size Size of the nodes in the heap * @param locks Array of locks, one for each node in the heap * @param p_buffer_size Number of pairs in the heap's partial buffer +* @param lowest_level_start The first index of the heaps lowest layer +* @param node_capacity The capacity of the heap in nodes +* @param compare Comparison operator ordering the elements in the heap */ template __global__ void PopKernel(OutputIt elements, From 599067fac6199948ae6f323ba7c4365ec64c95d3 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Mon, 27 Dec 2021 23:29:35 +0000 Subject: [PATCH 25/55] Cleanup, arbitrary number of elements for device API functions --- include/cuco/detail/priority_queue.inl | 43 +++++++-------- .../cuco/detail/priority_queue_kernels.cuh | 55 ++++++------------- tests/priority_queue/priority_queue_test.cu | 4 +- 3 files changed, 39 insertions(+), 63 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index 11c951425..cc9118061 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -100,26 +100,14 @@ void priority_queue>> - (first, partial, d_heap_, d_size_, - node_size_, d_locks_, d_p_buffer_size_, - lowest_level_start_, node_capacity_, compare_); - } - - pop_size -= partial; - first += partial; + const int kNumBlocks = min(64000, + max(1, (int)((pop_size - partial) / node_size_))); - if (pop_size > 0) { - PopKernel<<>> (first, pop_size, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_, node_capacity_, compare_); - } CUCO_CUDA_TRY(cudaGetLastError()); } @@ -140,12 +128,14 @@ __device__ void priority_queue((int*)temp_storage, g.size(), node_size_); - if (last - first == node_size_) { - PopSingleNode(g, first, d_heap_, d_size_, node_size_, d_locks_, + auto pop_size = last - first; + for (size_t i = 0; i < pop_size / node_size_; i++) { + PopSingleNode(g, 
first + i * node_size_, + d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_, node_capacity_, shmem, compare_); - } else { - PopPartialNode(g, first, last - first, d_heap_, d_size_, node_size_, + } + + if (pop_size % node_size_ != 0) { + PopPartialNode(g, first + (pop_size / node_size_) * node_size_, + last - first, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_, node_capacity_, shmem, compare_); } diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index 0343ae07a..3896f49d5 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -1169,43 +1169,6 @@ __global__ void PushKernel(OutputIt elements, } } -/** -* Remove num_elements < node_size elements from the queue -* @param elements The array of elements to remove -* @param num_elements The number of elements to be removed -* @param heap The array of pairs that stores the heap itself -* @param size Pointer to the number of pairs currently in the heap -* @param node_size Size of the nodes in the heap -* @param locks Array of locks, one for each node in the heap -* @param p_buffer_size Number of pairs in the heap's partial buffer -* @param lowest_level_start The first index of the heaps lowest layer -* @param node_capacity The capacity of the heap in nodes -* @param compare Comparison operator ordering the elements in the heap -*/ -template -__global__ void PopPartialNodeKernel(OutputIt elements, - size_t num_elements, - T *heap, - int *size, - size_t node_size, - int *locks, - size_t *p_buffer_size, - int lowest_level_start, - int node_capacity, - Compare compare) { - - extern __shared__ int s[]; - - SharedMemoryLayout shmem = GetSharedMemoryLayout(s, - blockDim.x, node_size); - - thread_block g = this_thread_block(); - PopPartialNode(g, elements, - num_elements, heap, size, node_size, locks, p_buffer_size, - lowest_level_start, node_capacity, shmem, compare); - -} - /** * Remove exactly node_size elements from the heap and place them * in elements @@ -1243,5 +1206,23 @@ __global__ void PopKernel(OutputIt elements, p_buffer_size, lowest_level_start, node_capacity, shmem, compare); } + + // We only need one block for partial deletion + if (blockIdx.x != 0) { + return; + } + + // If node_size does not divide num_elements, there are some leftover + // elements for which we must perform a partial deletion + size_t first_not_inserted = (num_elements / node_size) + * node_size; + + if (first_not_inserted < num_elements) { + size_t p_del_size = num_elements - first_not_inserted; + PopPartialNode(g, elements + first_not_inserted, + p_del_size, heap, size, node_size, locks, p_buffer_size, + lowest_level_start, node_capacity, shmem, compare); + } } + } diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 99a600fd5..549cdd02b 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -305,8 +305,8 @@ __global__ void DeviceAPIDelete( TEST_CASE("Insertion and deletion with Device API", "") { - const size_t kInsertionSize = 1000; - const size_t kDeletionSize = 500; + const size_t kInsertionSize = 2000; + const size_t kDeletionSize = 1000; using T = uint32_t; using Compare = thrust::less; From 44db3405a62ccac6e5b366ac895b48d93d4d2bbe Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Mon, 27 Dec 2021 23:49:08 +0000 Subject: [PATCH 26/55] Formatting --- 
include/cuco/detail/priority_queue.inl | 78 +++++++++---------- .../cuco/detail/priority_queue_kernels.cuh | 56 ++++++------- include/cuco/priority_queue.cuh | 39 +++++----- 3 files changed, 87 insertions(+), 86 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index cc9118061..720c626c0 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -7,18 +7,18 @@ namespace cuco { template + typename Allocator> priority_queue::priority_queue (size_t initial_capacity, - Allocator const& allocator) : - allocator_{allocator}, - int_allocator_{allocator}, - t_allocator_{allocator}, - size_t_allocator_{allocator} { + Allocator const& allocator) : + allocator_{allocator}, + int_allocator_{allocator}, + t_allocator_{allocator}, + size_t_allocator_{allocator} { node_size_ = NodeSize; - + // Round up to the nearest multiple of node size int nodes = ((initial_capacity + node_size_ - 1) / node_size_); @@ -28,21 +28,21 @@ priority_queue::allocate(int_allocator_, - 1); + 1); CUCO_CUDA_TRY(cudaMemset(d_size_, 0, sizeof(int))); - d_p_buffer_size_ = std::allocator_traits::allocate( - size_t_allocator_, - 1); + d_p_buffer_size_ = std::allocator_traits + ::allocate(size_t_allocator_, 1); CUCO_CUDA_TRY(cudaMemset(d_p_buffer_size_, 0, sizeof(size_t))); - d_heap_ = std::allocator_traits::allocate(t_allocator_, - node_capacity_ * node_size_ + node_size_); + d_heap_ = std::allocator_traits + ::allocate(t_allocator_, + node_capacity_ * node_size_ + node_size_); - d_locks_ = std::allocator_traits::allocate(int_allocator_, - node_capacity_ + 1); + d_locks_ = std::allocator_traits + ::allocate(int_allocator_, node_capacity_ + 1); CUCO_CUDA_TRY(cudaMemset(d_locks_, 0, sizeof(int) * (node_capacity_ + 1))); @@ -51,57 +51,57 @@ priority_queue + typename Allocator> priority_queue::~priority_queue() { + Allocator>::~priority_queue() { std::allocator_traits::deallocate(int_allocator_, - d_size_, 1); + d_size_, 1); std::allocator_traits::deallocate(size_t_allocator_, - d_p_buffer_size_, 1); + d_p_buffer_size_, 1); std::allocator_traits::deallocate(t_allocator_, - d_heap_, - node_capacity_ * node_size_ + node_size_); + d_heap_, + node_capacity_ * node_size_ + node_size_); std::allocator_traits::deallocate(int_allocator_, - d_locks_, - node_capacity_ + 1); + d_locks_, + node_capacity_ + 1); } template + typename Allocator> template void priority_queue::push(InputIt first, InputIt last, - cudaStream_t stream) { + cudaStream_t stream) { const int kBlockSize = min(256, (int)node_size_); const int kNumBlocks = min(64000, - max(1, (int)((last - first) / node_size_))); + max(1, (int)((last - first) / node_size_))); PushKernel<<>> (first, last - first, d_heap_, d_size_, node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_, - compare_); + compare_); CUCO_CUDA_TRY(cudaGetLastError()); } template + typename Allocator> template void priority_queue::pop(OutputIt first, OutputIt last, - cudaStream_t stream) { - + cudaStream_t stream) { + int pop_size = last - first; const int partial = pop_size % node_size_; const int kBlockSize = min(256, (int)node_size_); const int kNumBlocks = min(64000, - max(1, (int)((pop_size - partial) / node_size_))); + max(1, (int)((pop_size - partial) / node_size_))); PopKernel<<>> @@ -113,10 +113,10 @@ void priority_queue + typename Allocator> template __device__ void priority_queue + FavorInsertionPerformance, Allocator> ::device_mutable_view::push( CG const& g, InputIt first, @@ -135,18 +135,18 @@ __device__ 
void priority_queue + typename Allocator> template __device__ void priority_queue + FavorInsertionPerformance, Allocator> ::device_mutable_view::pop( CG const& g, OutputIt first, @@ -159,14 +159,14 @@ __device__ void priority_queue GetSharedMemoryLayout( result.intersections = s; result.A = (T*)(s + 2 * (dim + 1)); result.B = result.A + node_size; - return result; + return result; } /** @@ -128,9 +128,9 @@ __device__ void MergeAndSort(CG const& g, T *hi, size_t node_size, SharedMemoryLayout shmem, - Compare const& compare) { + Compare const& compare) { MergeAndSort(g, a, b, lo, hi, node_size, - node_size, node_size, shmem, compare); + node_size, node_size, shmem, compare); } /** @@ -166,7 +166,7 @@ __device__ void MergeAndSort(CG const& g, size_t num_elements_b, size_t node_size, SharedMemoryLayout shmem, - Compare const& compare) { + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); @@ -314,7 +314,7 @@ template __device__ void PBSort(CG const& g, T *start, size_t len, size_t node_size, T *temp, - Compare const& compare) { + Compare const& compare) { int lane = g.thread_rank(); @@ -504,7 +504,7 @@ __device__ void Swim(CG const& g, int *locks, int lowest_level_start, SharedMemoryLayout shmem, - Compare const& compare) { + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); @@ -529,7 +529,7 @@ __device__ void Swim(CG const& g, shmem.B, node_size, shmem, - compare); + compare); g.sync(); @@ -542,11 +542,11 @@ __device__ void Swim(CG const& g, } ReleaseLock(g, &(locks[cur_node])); - + } /** -* Sink the root down the heap +* Sink the root down the heap * Pre: g must hold the root's lock * * @param g The cooperative group that will perform the operation @@ -570,7 +570,7 @@ __device__ void Sink(CG const& g, int lowest_level_start, int node_capacity, SharedMemoryLayout shmem, - Compare const& compare) { + Compare const& compare) { size_t cur = kRootIdx; @@ -628,7 +628,7 @@ __device__ void Sink(CG const& g, shmem.B, node_size, shmem, - compare); + compare); g.sync(); @@ -652,7 +652,7 @@ __device__ void Sink(CG const& g, shmem.B, node_size, shmem, - compare); + compare); g.sync(); @@ -667,7 +667,7 @@ __device__ void Sink(CG const& g, } ReleaseLock(g, &locks[cur]); - + } /** @@ -694,7 +694,7 @@ __device__ void PushSingleNode(CG const& g, int *locks, int lowest_level_start, SharedMemoryLayout shmem, - Compare const& compare) { + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); @@ -752,7 +752,7 @@ __device__ void PopSingleNode(CG const& g, int lowest_level_start, int node_capacity, SharedMemoryLayout shmem, - Compare const& compare) { + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); @@ -812,7 +812,7 @@ __device__ void PopSingleNode(CG const& g, *p_buffer_size, node_size, shmem, - compare); + compare); g.sync(); @@ -856,7 +856,7 @@ __device__ void PopPartialNode(CG const& g, int lowest_level_start, int node_capacity, SharedMemoryLayout shmem, - Compare const& compare) { + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); @@ -895,7 +895,7 @@ __device__ void PopPartialNode(CG const& g, node_size - num_elements, node_size, shmem, - compare); + compare); g.sync(); @@ -922,7 +922,7 @@ __device__ void PopPartialNode(CG const& g, node_size - num_elements, node_size, shmem, - compare); + compare); g.sync(); @@ -959,7 +959,7 @@ __device__ void PopPartialNode(CG const& g, *p_buffer_size, node_size, shmem, - compare); + compare); g.sync(); CopyPairs(g, &heap[node_size], shmem.A, node_size); @@ 
-970,7 +970,7 @@ __device__ void PopPartialNode(CG const& g, Sink(g, heap, size, node_size, locks, p_buffer_size, lowest_level_start, node_capacity, shmem, - compare); + compare); } else { ReleaseLock(g, &locks[kRootIdx]); } @@ -1003,10 +1003,10 @@ __device__ void PushPartialNode(CG const& g, int *size, size_t node_size, int *locks, - size_t *p_buffer_size, + size_t *p_buffer_size, int lowest_level_start, SharedMemoryLayout shmem, - Compare const& compare) { + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); @@ -1043,7 +1043,7 @@ __device__ void PushPartialNode(CG const& g, *p_buffer_size, node_size, shmem, - compare); + compare); if (lane == 0) { *p_buffer_size = (*p_buffer_size + p_ins_size) - node_size; @@ -1074,7 +1074,7 @@ __device__ void PushPartialNode(CG const& g, *p_buffer_size, node_size, shmem, - compare); + compare); g.sync(); @@ -1097,7 +1097,7 @@ __device__ void PushPartialNode(CG const& g, *p_buffer_size, node_size, shmem, - compare); + compare); g.sync(); CopyPairs(g, heap, shmem.B, *p_buffer_size); @@ -1133,7 +1133,7 @@ __global__ void PushKernel(OutputIt elements, int *locks, size_t *p_buffer_size, int lowest_level_start, - Compare compare) { + Compare compare) { extern __shared__ int s[]; @@ -1193,7 +1193,7 @@ __global__ void PopKernel(OutputIt elements, size_t *p_buffer_size, int lowest_level_start, int node_capacity, - Compare compare) { + Compare compare) { extern __shared__ int s[]; diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index b8b50b636..42e15d82d 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -16,13 +16,13 @@ namespace cuco { * * Current limitations: * - Does not support insertion and deletion at the same time -* - The implementation of the priority queue is based on +* - The implementation of the priority queue is based on * https://arxiv.org/pdf/1906.06504.pdf, which provides a way to allow * concurrent insertion and deletion, so this could be added later if useful * - Capacity is fixed and the queue does not automatically resize * - Deletion from the queue is much slower than insertion into the queue * due to congestion at the underlying heap's root node -* +* * The queue supports two operations: * `push`: Add elements into the queue * `pop`: Remove the element(s) with the lowest (when Max == false) or highest @@ -34,31 +34,31 @@ namespace cuco { * The host-side bulk operations `push` and `pop` allow an arbitrary number of * elements to be pushed to or popped from the queue. * -* The device-side operations allow a cooperative group to push or pop from +* The device-side operations allow a cooperative group to push or pop from * device code. These device side * operations are invoked with a trivially-copyable device view, -* `device_mutable_view` which can be obtained with the host function +* `device_mutable_view` which can be obtained with the host function * `get_mutable_device_view` and passed to the device. * * @tparam T Type of the elements stored in the queue * @tparam Compare Comparison operator used to order the elements in the queue -* @tparam FavorInsertionPerformance When true, insertion performance is increased at the expense of - deletion performance. +* @tparam FavorInsertionPerformance When true, insertion performance is +* increased at the expense of deletion performance. 
* @tparam Allocator Allocator defining how memory is allocated internally */ template , - bool FavorInsertionPerformance = false, - typename Allocator = cuco::cuda_allocator> + bool FavorInsertionPerformance = false, + typename Allocator = cuco::cuda_allocator> class priority_queue { using int_allocator_type = typename std::allocator_traits - ::rebind_alloc; + ::rebind_alloc; using t_allocator_type = typename std::allocator_traits - ::rebind_alloc; - + ::rebind_alloc; + using size_t_allocator_type = typename std::allocator_traits - ::rebind_alloc; + ::rebind_alloc; const int NodeSize = FavorInsertionPerformance ? 64 : 1024; @@ -69,7 +69,8 @@ class priority_queue { * @param initial_capacity The number of elements the priority queue can hold * @param alloc Allocator used for allocating device storage */ - priority_queue(size_t initial_capacity, Allocator const& alloc = Allocator{}); + priority_queue(size_t initial_capacity, + Allocator const& alloc = Allocator{}); /** * @brief Push elements into the priority queue @@ -98,8 +99,8 @@ class priority_queue { void pop(OutputIt first, OutputIt last, cudaStream_t stream = 0); /* - * @brief Return the amount of shared memory required for operations on the queue - * with a thread block size of block_size + * @brief Return the amount of shared memory required for operations + * on the queue with a thread block size of block_size * * @param block_size Size of the blocks to calculate storage for * @return The amount of temporary storage required in bytes @@ -170,7 +171,7 @@ class priority_queue { int *d_locks, int lowest_level_start, int node_capacity, - Compare const& compare) + Compare const& compare) : node_size_(node_size), d_heap_(d_heap), d_size_(d_size), @@ -178,7 +179,7 @@ class priority_queue { d_locks_(d_locks), lowest_level_start_(lowest_level_start), node_capacity_(node_capacity), - compare_(compare) + compare_(compare) { } @@ -195,7 +196,7 @@ class priority_queue { }; /* - * @brief Returns a trivially-copyable class that can be used to perform + * @brief Returns a trivially-copyable class that can be used to perform * insertion and deletion of single nodes in device code with * cooperative groups * @@ -204,7 +205,7 @@ class priority_queue { device_mutable_view get_mutable_device_view() { return device_mutable_view(node_size_, d_heap_, d_size_, d_p_buffer_size_, d_locks_, lowest_level_start_, node_capacity_, - compare_); + compare_); } private: From acfdf7effa9becb6f8d0325d543d62c175fc1a17 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Tue, 12 Apr 2022 06:58:29 +0000 Subject: [PATCH 27/55] Add missing syncs --- include/cuco/detail/priority_queue_kernels.cuh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index a71c609cf..10afeadb1 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -536,6 +536,8 @@ __device__ void Swim(CG const& g, CopyPairs(g, &heap[parent * node_size], shmem.A, node_size); CopyPairs(g, &heap[cur_node * node_size], shmem.B, node_size); + g.sync(); + ReleaseLock(g, &(locks[cur_node])); cur_node = parent; parent = Parent(cur_node, lowest_level_start); @@ -1103,6 +1105,8 @@ __device__ void PushPartialNode(CG const& g, CopyPairs(g, heap, shmem.B, *p_buffer_size); CopyPairs(g, &heap[node_size], shmem.A, node_size); + + g.sync(); } ReleaseLock(g, &locks[kRootIdx]); } From 71775b6176aa6d18dd439f931c8fb7b96e3da473 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 14 Apr 2022 03:57:42 +0000 Subject: [PATCH 28/55] [pre-commit.ci] auto code formatting --- .../priority_queue/priority_queue_bench.cu | 53 +- include/cuco/detail/priority_queue.inl | 229 ++-- .../cuco/detail/priority_queue_kernels.cuh | 1154 ++++++++--------- include/cuco/priority_queue.cuh | 195 +-- tests/priority_queue/priority_queue_test.cu | 227 ++-- 5 files changed, 898 insertions(+), 960 deletions(-) diff --git a/benchmarks/priority_queue/priority_queue_bench.cu b/benchmarks/priority_queue/priority_queue_bench.cu index a7459a26e..bd4288203 100644 --- a/benchmarks/priority_queue/priority_queue_bench.cu +++ b/benchmarks/priority_queue/priority_queue_bench.cu @@ -1,11 +1,11 @@ -#include #include #include +#include #include -#include #include +#include #include @@ -13,13 +13,12 @@ using namespace cuco; template struct pair_less { - __host__ __device__ bool operator()(const T& a, const T& b) const { - return a.first < b.first; - } + __host__ __device__ bool operator()(const T& a, const T& b) const { return a.first < b.first; } }; -template -static void generate_keys_uniform(OutputIt output_begin, OutputIt output_end) { +template +static void generate_keys_uniform(OutputIt output_begin, OutputIt output_end) +{ std::random_device rd; std::mt19937 gen{rd()}; @@ -30,15 +29,14 @@ static void generate_keys_uniform(OutputIt output_begin, OutputIt output_end) { } } -template +template static void BM_insert(::benchmark::State& state) { for (auto _ : state) { state.PauseTiming(); - priority_queue, pair_less>, - FavorInsertionPerformance> pq(NumKeys); + priority_queue, pair_less>, FavorInsertionPerformance> pq( + NumKeys); std::vector> h_pairs(NumKeys); generate_keys_uniform(h_pairs.begin(), h_pairs.end()); @@ -48,18 +46,16 @@ static void BM_insert(::benchmark::State& state) pq.push(d_pairs.begin(), d_pairs.end()); cudaDeviceSynchronize(); } - } -template +template static void BM_delete(::benchmark::State& state) { for (auto _ : state) { state.PauseTiming(); - priority_queue, pair_less>, - FavorInsertionPerformance> pq(NumKeys); + priority_queue, pair_less>, FavorInsertionPerformance> pq( + NumKeys); std::vector> h_pairs(NumKeys); generate_keys_uniform(h_pairs.begin(), h_pairs.end()); @@ -72,29 +68,20 @@ static void BM_delete(::benchmark::State& state) pq.pop(d_pairs.begin(), d_pairs.end()); cudaDeviceSynchronize(); } - } -BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, false) - ->Unit(benchmark::kMillisecond); +BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, false)->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, false) - ->Unit(benchmark::kMillisecond); +BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, false)->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, false) - ->Unit(benchmark::kMillisecond); +BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, false)->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, false) - ->Unit(benchmark::kMillisecond); +BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, false)->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, true) - ->Unit(benchmark::kMillisecond); +BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, true)->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, true) - ->Unit(benchmark::kMillisecond); +BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, true)->Unit(benchmark::kMillisecond); 
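Stripped of the timing loop, the host-side bulk API exercised by this benchmark comes down to the sketch below (illustrative only: pair_less is the comparator defined earlier in the benchmark, the pair type, element count, and include paths are placeholder assumptions):

#include <cuco/priority_queue.cuh>  // path assumed from this repository layout
#include <thrust/device_vector.h>
#include <thrust/pair.h>

#include <cstddef>
#include <vector>

// Usage sketch only: bulk-insert a batch of pairs, then bulk-remove half of them.
void push_pop_example()
{
  using pair_type = thrust::pair<int, int>;
  std::size_t const num_keys = 1'000'000;  // arbitrary

  cuco::priority_queue<pair_type, pair_less<pair_type>> pq(num_keys);

  std::vector<pair_type> h_pairs(num_keys);
  // ... fill h_pairs, e.g. with generate_keys_uniform() as above ...
  thrust::device_vector<pair_type> d_pairs(h_pairs);

  pq.push(d_pairs.begin(), d_pairs.end());  // bulk insert
  cudaDeviceSynchronize();

  // Remove the num_keys / 2 highest-priority pairs under pair_less.
  pq.pop(d_pairs.begin(), d_pairs.begin() + num_keys / 2);
  cudaDeviceSynchronize();
}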
-BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, true) - ->Unit(benchmark::kMillisecond); +BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, true)->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, true) - ->Unit(benchmark::kMillisecond); +BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, true)->Unit(benchmark::kMillisecond); diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index 720c626c0..78e6352b2 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -1,175 +1,178 @@ #pragma once #include -#include #include +#include namespace cuco { -template -priority_queue::priority_queue - (size_t initial_capacity, - Allocator const& allocator) : - allocator_{allocator}, - int_allocator_{allocator}, - t_allocator_{allocator}, - size_t_allocator_{allocator} { - +template +priority_queue::priority_queue( + size_t initial_capacity, Allocator const& allocator) + : allocator_{allocator}, + int_allocator_{allocator}, + t_allocator_{allocator}, + size_t_allocator_{allocator} +{ node_size_ = NodeSize; // Round up to the nearest multiple of node size int nodes = ((initial_capacity + node_size_ - 1) / node_size_); - node_capacity_ = nodes; + node_capacity_ = nodes; lowest_level_start_ = 1 << (int)log2(nodes); // Allocate device variables - d_size_ = std::allocator_traits::allocate(int_allocator_, - 1); + d_size_ = std::allocator_traits::allocate(int_allocator_, 1); CUCO_CUDA_TRY(cudaMemset(d_size_, 0, sizeof(int))); - d_p_buffer_size_ = std::allocator_traits - ::allocate(size_t_allocator_, 1); + d_p_buffer_size_ = std::allocator_traits::allocate(size_t_allocator_, 1); CUCO_CUDA_TRY(cudaMemset(d_p_buffer_size_, 0, sizeof(size_t))); - d_heap_ = std::allocator_traits - ::allocate(t_allocator_, - node_capacity_ * node_size_ + node_size_); - - d_locks_ = std::allocator_traits - ::allocate(int_allocator_, node_capacity_ + 1); - - CUCO_CUDA_TRY(cudaMemset(d_locks_, 0, - sizeof(int) * (node_capacity_ + 1))); + d_heap_ = std::allocator_traits::allocate( + t_allocator_, node_capacity_ * node_size_ + node_size_); + d_locks_ = + std::allocator_traits::allocate(int_allocator_, node_capacity_ + 1); + CUCO_CUDA_TRY(cudaMemset(d_locks_, 0, sizeof(int) * (node_capacity_ + 1))); } -template -priority_queue::~priority_queue() { - std::allocator_traits::deallocate(int_allocator_, - d_size_, 1); - std::allocator_traits::deallocate(size_t_allocator_, - d_p_buffer_size_, 1); - std::allocator_traits::deallocate(t_allocator_, - d_heap_, - node_capacity_ * node_size_ + node_size_); - std::allocator_traits::deallocate(int_allocator_, - d_locks_, - node_capacity_ + 1); +template +priority_queue::~priority_queue() +{ + std::allocator_traits::deallocate(int_allocator_, d_size_, 1); + std::allocator_traits::deallocate(size_t_allocator_, d_p_buffer_size_, 1); + std::allocator_traits::deallocate( + t_allocator_, d_heap_, node_capacity_ * node_size_ + node_size_); + std::allocator_traits::deallocate( + int_allocator_, d_locks_, node_capacity_ + 1); } - -template +template template -void priority_queue::push(InputIt first, - InputIt last, - cudaStream_t stream) { - +void priority_queue::push(InputIt first, + InputIt last, + cudaStream_t stream) +{ const int kBlockSize = min(256, (int)node_size_); - const int kNumBlocks = min(64000, - max(1, (int)((last - first) / node_size_))); - - PushKernel<<>> - (first, last - first, d_heap_, d_size_, - node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_, - 
compare_); + const int kNumBlocks = min(64000, max(1, (int)((last - first) / node_size_))); + + PushKernel<<>>(first, + last - first, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + compare_); CUCO_CUDA_TRY(cudaGetLastError()); } -template +template template -void priority_queue::pop(OutputIt first, - OutputIt last, - cudaStream_t stream) { - - int pop_size = last - first; +void priority_queue::pop(OutputIt first, + OutputIt last, + cudaStream_t stream) +{ + int pop_size = last - first; const int partial = pop_size % node_size_; const int kBlockSize = min(256, (int)node_size_); - const int kNumBlocks = min(64000, - max(1, (int)((pop_size - partial) / node_size_))); - - PopKernel<<>> - (first, pop_size, d_heap_, d_size_, - node_size_, d_locks_, d_p_buffer_size_, - lowest_level_start_, node_capacity_, compare_); + const int kNumBlocks = min(64000, max(1, (int)((pop_size - partial) / node_size_))); + + PopKernel<<>>(first, + pop_size, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + node_capacity_, + compare_); CUCO_CUDA_TRY(cudaGetLastError()); } -template +template template -__device__ void priority_queue - ::device_mutable_view::push( - CG const& g, - InputIt first, - InputIt last, - void *temp_storage) { - - SharedMemoryLayout shmem = - GetSharedMemoryLayout((int*)temp_storage, - g.size(), node_size_); +__device__ void +priority_queue::device_mutable_view::push( + CG const& g, InputIt first, InputIt last, void* temp_storage) +{ + SharedMemoryLayout shmem = GetSharedMemoryLayout((int*)temp_storage, g.size(), node_size_); auto push_size = last - first; for (size_t i = 0; i < push_size / node_size_; i++) { - PushSingleNode(g, first + i * node_size_, d_heap_, d_size_, node_size_, - d_locks_, lowest_level_start_, shmem, compare_); + PushSingleNode(g, + first + i * node_size_, + d_heap_, + d_size_, + node_size_, + d_locks_, + lowest_level_start_, + shmem, + compare_); } if (push_size % node_size_ != 0) { - PushPartialNode(g, first + (push_size / node_size_) * node_size_, - push_size % node_size_, d_heap_, - d_size_, node_size_, d_locks_, - d_p_buffer_size_, lowest_level_start_, shmem, - compare_); + PushPartialNode(g, + first + (push_size / node_size_) * node_size_, + push_size % node_size_, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + shmem, + compare_); } } -template +template template -__device__ void priority_queue - ::device_mutable_view::pop( - CG const& g, - OutputIt first, - OutputIt last, - void *temp_storage) { - SharedMemoryLayout shmem = - GetSharedMemoryLayout((int*)temp_storage, - g.size(), node_size_); +__device__ void +priority_queue::device_mutable_view::pop( + CG const& g, OutputIt first, OutputIt last, void* temp_storage) +{ + SharedMemoryLayout shmem = GetSharedMemoryLayout((int*)temp_storage, g.size(), node_size_); auto pop_size = last - first; for (size_t i = 0; i < pop_size / node_size_; i++) { - PopSingleNode(g, first + i * node_size_, - d_heap_, d_size_, node_size_, d_locks_, - d_p_buffer_size_, lowest_level_start_, - node_capacity_, shmem, compare_); + PopSingleNode(g, + first + i * node_size_, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + node_capacity_, + shmem, + compare_); } if (pop_size % node_size_ != 0) { - PopPartialNode(g, first + (pop_size / node_size_) * node_size_, - last - first, d_heap_, d_size_, node_size_, - d_locks_, d_p_buffer_size_, lowest_level_start_, - 
node_capacity_, shmem, compare_); + PopPartialNode(g, + first + (pop_size / node_size_) * node_size_, + last - first, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + node_capacity_, + shmem, + compare_); } } -} +} // namespace cuco diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index 10afeadb1..e67a9a01f 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -1,188 +1,183 @@ #pragma once -#include #include +#include using namespace cooperative_groups; namespace cuco { constexpr int kPBufferIdx = 0; -constexpr int kRootIdx = 1; +constexpr int kRootIdx = 1; /* -* Struct to hold pointers to the temp storage used by the priority -* queue's kernels and functions. -* Ideally, this temp storage is in shared memory -*/ + * Struct to hold pointers to the temp storage used by the priority + * queue's kernels and functions. + * Ideally, this temp storage is in shared memory + */ template struct SharedMemoryLayout { - int *intersections; - T *A; - T *B; + int* intersections; + T* A; + T* B; }; /* -* Get the shared memory layout for a given group dimension -* and node size. -* -* @param s Pointer to the beginning of the section of shared memory to -* partition -* @param dim Size of the cooperative group the memory will be used by -* @param node_size Size of the nodes in this priority queue -* @returns The memory layout for the given group dimension and node size -*/ + * Get the shared memory layout for a given group dimension + * and node size. + * + * @param s Pointer to the beginning of the section of shared memory to + * partition + * @param dim Size of the cooperative group the memory will be used by + * @param node_size Size of the nodes in this priority queue + * @returns The memory layout for the given group dimension and node size + */ template -__device__ SharedMemoryLayout GetSharedMemoryLayout( - int *s, int dim, size_t node_size) { - +__device__ SharedMemoryLayout GetSharedMemoryLayout(int* s, int dim, size_t node_size) +{ SharedMemoryLayout result; result.intersections = s; - result.A = (T*)(s + 2 * (dim + 1)); - result.B = result.A + node_size; + result.A = (T*)(s + 2 * (dim + 1)); + result.B = result.A + node_size; return result; } /** -* Acquires lock l for the current thread block -* The entire thread block must call the function -* -* @param g The cooperative group that will acquire the lock -* @param l Pointer to the lock to be acquired -*/ + * Acquires lock l for the current thread block + * The entire thread block must call the function + * + * @param g The cooperative group that will acquire the lock + * @param l Pointer to the lock to be acquired + */ template -__device__ void AcquireLock(CG const& g, int *l) { +__device__ void AcquireLock(CG const& g, int* l) +{ if (g.thread_rank() == 0) { - while (atomicCAS(l, 0, 1) != 0); + while (atomicCAS(l, 0, 1) != 0) + ; } __threadfence(); g.sync(); } /** -* Releases lock l for the current thread block -* -* @param g The cooperative group that will release the lock -* @param l Pointer to the lock to be released -*/ + * Releases lock l for the current thread block + * + * @param g The cooperative group that will release the lock + * @param l Pointer to the lock to be released + */ template -__device__ void ReleaseLock(CG const& g, int *l) { - if (g.thread_rank() == 0) { - atomicExch(l, 0); - } +__device__ void ReleaseLock(CG const& g, int* l) +{ + if (g.thread_rank() == 0) { 
atomicExch(l, 0); } } /** -* Copy pairs from src to dst -* -* @param g The cooperative group that will perform the copy -* @param dst_start Iterator to the beginning of the destination array -* @param src_start Iterator to the beginning of the source array -* @param src_end Iterator to the end of the source array -*/ + * Copy pairs from src to dst + * + * @param g The cooperative group that will perform the copy + * @param dst_start Iterator to the beginning of the destination array + * @param src_start Iterator to the beginning of the source array + * @param src_end Iterator to the end of the source array + */ template -__device__ void CopyPairs(CG const& g, InputIt1 dst_start, - InputIt2 src_start, InputIt2 src_end) { +__device__ void CopyPairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, InputIt2 src_end) +{ auto dst = dst_start + g.thread_rank(); - for (auto src = src_start + g.thread_rank(); - src < src_end; dst += g.size(), src += g.size()) { + for (auto src = src_start + g.thread_rank(); src < src_end; dst += g.size(), src += g.size()) { *dst = *src; } } /** -* Copy node_size pairs from src to dst -* -* @param g The cooperative group that will perform the copy -* @param dst_start Iterator to the beginning of the destination array -* @param src_start Iterator to the beginning of the source array -* @param num_pairs Number of pairs to copy -*/ + * Copy node_size pairs from src to dst + * + * @param g The cooperative group that will perform the copy + * @param dst_start Iterator to the beginning of the destination array + * @param src_start Iterator to the beginning of the source array + * @param num_pairs Number of pairs to copy + */ template -__device__ void CopyPairs(CG const& g, InputIt1 dst_start, - InputIt2 src_start, size_t num_pairs) { +__device__ void CopyPairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, size_t num_pairs) +{ CopyPairs(g, dst_start, src_start, src_start + num_pairs); } /** -* Merge arrays a and b of size node_size by key, putting the -* node_size elements with the lowest keys in lo, sorted by key, and the -* node_size elements with the highest keys in hi, sorted by key -* -* @param g The cooperative group that will perform the merge and sort -* @param a The first array of pairs to be merged, sorted by key -* @param b The second array of pairs to be merged, sorted by key -* @param lo The array in which the node_size elements with the lowest keys -* will be placed when the merge is completed -* @param hi The array in which the node_size elements with the highest keys -* will be placed when the merge is completed -* @param node_size The size of arrays a, b, lo, and hi -* @param shmem The shared memory layout for this cooperative group -* @param compare Comparison operator ordering the elements to be merged -*/ + * Merge arrays a and b of size node_size by key, putting the + * node_size elements with the lowest keys in lo, sorted by key, and the + * node_size elements with the highest keys in hi, sorted by key + * + * @param g The cooperative group that will perform the merge and sort + * @param a The first array of pairs to be merged, sorted by key + * @param b The second array of pairs to be merged, sorted by key + * @param lo The array in which the node_size elements with the lowest keys + * will be placed when the merge is completed + * @param hi The array in which the node_size elements with the highest keys + * will be placed when the merge is completed + * @param node_size The size of arrays a, b, lo, and hi + * @param shmem The shared 
memory layout for this cooperative group + * @param compare Comparison operator ordering the elements to be merged + */ template __device__ void MergeAndSort(CG const& g, - T *a, - T *b, - T *lo, - T *hi, - size_t node_size, - SharedMemoryLayout shmem, - Compare const& compare) { - MergeAndSort(g, a, b, lo, hi, node_size, - node_size, node_size, shmem, compare); + T* a, + T* b, + T* lo, + T* hi, + size_t node_size, + SharedMemoryLayout shmem, + Compare const& compare) +{ + MergeAndSort(g, a, b, lo, hi, node_size, node_size, node_size, shmem, compare); } /** -* Merge array a of size num_elements_a and array b of size num_elements_b -* by key. If num_elements_a + num_elements_b <= node_size, all merged elements -* will be placed in lo. Otherwise, the node_size lowest merged elements will -* be placed in lo, and the rest of the elements will be placed in hi. -* -* @param g The cooperative group that will perform the merge and sort -* @param a The first array of pairs to be merged, sorted by key -* @param b The second array of pairs to be merged, sorted by key -* @param lo The array in which the node_size elements with the lowest keys -* will be placed when the merge is completed -* @param hi The array in which the node_size elements with the highest keys -* will be placed when the merge is completed, -* if num_elements_a + num_elements_b > node_size. May be nullptr in -* the case that num_elements_a + num_elements_b < node_size. -* @param num_elements_a The number of pairs in array a -* @param num_elements_b The number of pairs in array b -* @param node_size The size of arrays hi and lo, in other words how many -* elements to insert into lo before starting insertion into -* hi -* @param shmem The shared memory layout for this cooperative group -* @param compare Comparison operator ordering the elements to be merged -*/ + * Merge array a of size num_elements_a and array b of size num_elements_b + * by key. If num_elements_a + num_elements_b <= node_size, all merged elements + * will be placed in lo. Otherwise, the node_size lowest merged elements will + * be placed in lo, and the rest of the elements will be placed in hi. + * + * @param g The cooperative group that will perform the merge and sort + * @param a The first array of pairs to be merged, sorted by key + * @param b The second array of pairs to be merged, sorted by key + * @param lo The array in which the node_size elements with the lowest keys + * will be placed when the merge is completed + * @param hi The array in which the node_size elements with the highest keys + * will be placed when the merge is completed, + * if num_elements_a + num_elements_b > node_size. May be nullptr in + * the case that num_elements_a + num_elements_b < node_size. 
+ * @param num_elements_a The number of pairs in array a + * @param num_elements_b The number of pairs in array b + * @param node_size The size of arrays hi and lo, in other words how many + * elements to insert into lo before starting insertion into + * hi + * @param shmem The shared memory layout for this cooperative group + * @param compare Comparison operator ordering the elements to be merged + */ template __device__ void MergeAndSort(CG const& g, - T *a, - T *b, - T *lo, - T *hi, - size_t num_elements_a, - size_t num_elements_b, - size_t node_size, - SharedMemoryLayout shmem, - Compare const& compare) { - + T* a, + T* b, + T* lo, + T* hi, + size_t num_elements_a, + size_t num_elements_b, + size_t node_size, + SharedMemoryLayout shmem, + Compare const& compare) +{ int lane = g.thread_rank(); - int dim = g.size(); - - if (num_elements_a == node_size && - compare(a[node_size - 1], b[0])) { + int dim = g.size(); + if (num_elements_a == node_size && compare(a[node_size - 1], b[0])) { CopyPairs(g, lo, a, num_elements_a); CopyPairs(g, hi, b, num_elements_b); return; } - if (num_elements_b == node_size && - compare(b[node_size - 1], a[0])) { - + if (num_elements_b == node_size && compare(b[node_size - 1], a[0])) { CopyPairs(g, hi, a, num_elements_a); CopyPairs(g, lo, b, num_elements_b); @@ -190,14 +185,13 @@ __device__ void MergeAndSort(CG const& g, } // Array of size 2 * (blockDim.x + 1) - int *intersections = shmem.intersections; - + int* intersections = shmem.intersections; if (lane == 0) { intersections[0] = 0; intersections[1] = 0; - intersections[2 * dim] = node_size; + intersections[2 * dim] = node_size; intersections[2 * dim + 1] = node_size; } @@ -221,7 +215,6 @@ __device__ void MergeAndSort(CG const& g, // Binary search along the diagonal while (leftmost_zero - rightmost_one > 1) { - int i = (rightmost_one + leftmost_zero) / 2; int j = (p * lane - 1) - i; @@ -232,13 +225,10 @@ __device__ void MergeAndSort(CG const& g, } else { leftmost_zero = i; } - } - intersections[2 * lane] = leftmost_zero; - intersections[2 * lane + 1] = (p * lane - 1) - - leftmost_zero + 1; - + intersections[2 * lane] = leftmost_zero; + intersections[2 * lane + 1] = (p * lane - 1) - leftmost_zero + 1; } g.sync(); @@ -249,8 +239,7 @@ __device__ void MergeAndSort(CG const& g, // Get the intersection that ends this partition int i_max = min(intersections[2 * (lane + 1)], (int)num_elements_a); - int j_max = min(intersections[2 * (lane + 1) + 1], - (int)num_elements_b); + int j_max = min(intersections[2 * (lane + 1) + 1], (int)num_elements_b); // Insert location into the output array int ins_loc = lane * p; @@ -299,28 +288,25 @@ __device__ void MergeAndSort(CG const& g, } /** -* Sorts the len pairs at start by key -* -* @param g The cooperative group that will perform the sort -* @param start Pointer to the array to be sorted -* @param len Number of pairs to be sorted -* @param node_size A power of two corresponding to the number of pairs -* temp can contain -* @param temp A temporary array containing space for at least the nearest -* power of two greater than len pairs -* @param compare Comparison operator ordering the elements to be sorted -*/ + * Sorts the len pairs at start by key + * + * @param g The cooperative group that will perform the sort + * @param start Pointer to the array to be sorted + * @param len Number of pairs to be sorted + * @param node_size A power of two corresponding to the number of pairs + * temp can contain + * @param temp A temporary array containing space for at least the nearest + * 
power of two greater than len pairs + * @param compare Comparison operator ordering the elements to be sorted + */ template -__device__ void PBSort(CG const& g, T *start, size_t len, - size_t node_size, - T *temp, - Compare const& compare) { - - +__device__ void PBSort( + CG const& g, T* start, size_t len, size_t node_size, T* temp, Compare const& compare) +{ int lane = g.thread_rank(); - int dim = g.size(); + int dim = g.size(); - char *mask = (char*)temp; + char* mask = (char*)temp; for (int i = lane; i < node_size; i += dim) { mask[i] = i < len; @@ -332,29 +318,27 @@ __device__ void PBSort(CG const& g, T *start, size_t len, for (int jump = width / 2; jump >= 1; jump /= 2) { for (int i = lane; i < node_size / 2; i += dim) { int start_jump = width / 2; - int left = (i / jump) * jump * 2 + i % jump; - int right = left + jump; + int left = (i / jump) * jump * 2 + i % jump; + int right = left + jump; if ((i / start_jump) % 2 == 0) { - if (!mask[left] || (mask[right] && - !compare(start[left], start[right]))) { - auto temp = start[left]; - start[left] = start[right]; + if (!mask[left] || (mask[right] && !compare(start[left], start[right]))) { + auto temp = start[left]; + start[left] = start[right]; start[right] = temp; auto temp_mask = mask[left]; - mask[left] = mask[right]; - mask[right] = temp_mask; + mask[left] = mask[right]; + mask[right] = temp_mask; } } else { - if (!mask[right] || (mask[left] - && compare(start[left], start[right]))) { - auto temp = start[left]; - start[left] = start[right]; + if (!mask[right] || (mask[left] && compare(start[left], start[right]))) { + auto temp = start[left]; + start[left] = start[right]; start[right] = temp; auto temp_mask = mask[left]; - mask[left] = mask[right]; - mask[right] = temp_mask; + mask[left] = mask[right]; + mask[right] = temp_mask; } } } @@ -365,17 +349,16 @@ __device__ void PBSort(CG const& g, T *start, size_t len, // Merge to get the sorted result for (int jump = node_size / 2; jump >= 1; jump /= 2) { for (int i = lane; i < node_size / 2; i += dim) { - int left = (i / jump) * jump * 2 + i % jump; + int left = (i / jump) * jump * 2 + i % jump; int right = left + jump; - if (!mask[left] || (mask[right] - && !compare(start[left], start[right]))) { - auto temp = start[left]; - start[left] = start[right]; + if (!mask[left] || (mask[right] && !compare(start[left], start[right]))) { + auto temp = start[left]; + start[left] = start[right]; start[right] = temp; auto temp_mask = mask[left]; - mask[left] = mask[right]; - mask[right] = temp_mask; + mask[left] = mask[right]; + mask[right] = temp_mask; } } g.sync(); @@ -383,131 +366,126 @@ __device__ void PBSort(CG const& g, T *start, size_t len, } /** -* Reverses the bits after the most significant set bit in x -* i.e. if x is 1abc..xyz in binary returns 1zyx...cba -* -* @param x The number whose lower bits will be reversed -* @return The number with all bits after the most significant -* set bit reversed -*/ -__device__ int BitReversePerm(int x) { + * Reverses the bits after the most significant set bit in x + * i.e. 
if x is 1abc..xyz in binary returns 1zyx...cba + * + * @param x The number whose lower bits will be reversed + * @return The number with all bits after the most significant + * set bit reversed + */ +__device__ int BitReversePerm(int x) +{ int clz = __clz(x); - int bits = sizeof(int) * 8; + int bits = sizeof(int) * 8; int high_bit = 1 << ((bits - 1) - clz); - int mask = high_bit - 1; + int mask = high_bit - 1; int masked = x & mask; - int rev = __brev(masked) >> (clz + 1); + int rev = __brev(masked) >> (clz + 1); return high_bit | rev; } /** -* Given x, the idx of a node, return when that node is inserted, -* i.e. if x is 6 and lowest_level_start > 6, return 5 since the node -* at element 6 will be the 5th to be inserted with the bit reversal -* permutation. This operation is its own inverse. -* -* @param x The index to operate on -* @param lowest_level_start Index of the first node in the last level of the -* heap -*/ -__device__ int InsertionOrderIndex(int x, int lowest_level_start) { + * Given x, the idx of a node, return when that node is inserted, + * i.e. if x is 6 and lowest_level_start > 6, return 5 since the node + * at element 6 will be the 5th to be inserted with the bit reversal + * permutation. This operation is its own inverse. + * + * @param x The index to operate on + * @param lowest_level_start Index of the first node in the last level of the + * heap + */ +__device__ int InsertionOrderIndex(int x, int lowest_level_start) +{ assert(x > 0); - if (x >= lowest_level_start) { - return x; - } + if (x >= lowest_level_start) { return x; } return BitReversePerm(x); } /** -* Find the index of the parent of the node at index x -* -* @param x The index to operate on -* @param lowest_level_start Index of the first node in the last level of the -* heap -* @return The index of the parent of x -*/ -__device__ int Parent(int x, int lowest_level_start) { - + * Find the index of the parent of the node at index x + * + * @param x The index to operate on + * @param lowest_level_start Index of the first node in the last level of the + * heap + * @return The index of the parent of x + */ +__device__ int Parent(int x, int lowest_level_start) +{ assert(x > 0); - if (x >= lowest_level_start) { - return BitReversePerm(x) / 2; - } + if (x >= lowest_level_start) { return BitReversePerm(x) / 2; } return x / 2; } /** -* Find the index of the left child of the node at index x -* -* @param x The index to operate on -* @param lowest_level_start Index of the first node in the last level of the -* heap -* @return The index of the left child of x -*/ -__device__ int LeftChild(int x, int lowest_level_start) { + * Find the index of the left child of the node at index x + * + * @param x The index to operate on + * @param lowest_level_start Index of the first node in the last level of the + * heap + * @return The index of the left child of x + */ +__device__ int LeftChild(int x, int lowest_level_start) +{ assert(x > 0); int result = x * 2; - if (result >= lowest_level_start) { - result = BitReversePerm(result); - } + if (result >= lowest_level_start) { result = BitReversePerm(result); } return result; } /** -* Find the index of the right child of the node at index x -* -* @param x The index to operate on -* @param lowest_level_start Index of the first node in the last level of the -* heap -* @return The index of the right child of x -*/ -__device__ int RightChild(int x, int lowest_level_start) { - + * Find the index of the right child of the node at index x + * + * @param x The index to operate on + * @param 
lowest_level_start Index of the first node in the last level of the + * heap + * @return The index of the right child of x + */ +__device__ int RightChild(int x, int lowest_level_start) +{ assert(x > 0); int result = x * 2 + 1; - if (result >= lowest_level_start) { - result = BitReversePerm(result); - } + if (result >= lowest_level_start) { result = BitReversePerm(result); } return result; } /** -* Swim node cur_node up the heap -* Pre: g must hold the lock corresponding to cur_node -* -* @param g The cooperative group that will perform the operation -* @param cur_node Index of the node to swim -* @param heap The array of pairs that stores the heap itself -* @param size Pointer to the number of pairs currently in the heap -* @param node_size Size of the nodes in the heap -* @param locks Array of locks, one for each node in the heap -* @param lowest_level_start Index of the first node in the last level of the -* heap -* @param shmem The shared memory layout for this cooperative group -* @param compare Comparison operator ordering the elements in the heap -*/ + * Swim node cur_node up the heap + * Pre: g must hold the lock corresponding to cur_node + * + * @param g The cooperative group that will perform the operation + * @param cur_node Index of the node to swim + * @param heap The array of pairs that stores the heap itself + * @param size Pointer to the number of pairs currently in the heap + * @param node_size Size of the nodes in the heap + * @param locks Array of locks, one for each node in the heap + * @param lowest_level_start Index of the first node in the last level of the + * heap + * @param shmem The shared memory layout for this cooperative group + * @param compare Comparison operator ordering the elements in the heap + */ template __device__ void Swim(CG const& g, int cur_node, - T *heap, - int *size, + T* heap, + int* size, size_t node_size, - int *locks, + int* locks, int lowest_level_start, SharedMemoryLayout shmem, - Compare const& compare) { - + Compare const& compare) +{ int lane = g.thread_rank(); - int dim = g.size(); + int dim = g.size(); int parent = Parent(cur_node, lowest_level_start); @@ -517,19 +495,19 @@ __device__ void Swim(CG const& g, // If the heap property is already satisfied for this node and its // parent we are done - if (!compare(heap[cur_node * node_size], - heap[parent * node_size + node_size - 1])) { + if (!compare(heap[cur_node * node_size], heap[parent * node_size + node_size - 1])) { ReleaseLock(g, &(locks[parent])); break; } - MergeAndSort(g, &heap[parent * node_size], - &heap[cur_node * node_size], - shmem.A, - shmem.B, - node_size, - shmem, - compare); + MergeAndSort(g, + &heap[parent * node_size], + &heap[cur_node * node_size], + shmem.A, + shmem.B, + node_size, + shmem, + compare); g.sync(); @@ -540,49 +518,47 @@ __device__ void Swim(CG const& g, ReleaseLock(g, &(locks[cur_node])); cur_node = parent; - parent = Parent(cur_node, lowest_level_start); + parent = Parent(cur_node, lowest_level_start); } ReleaseLock(g, &(locks[cur_node])); - } /** -* Sink the root down the heap -* Pre: g must hold the root's lock -* -* @param g The cooperative group that will perform the operation -* @param heap The array of pairs that stores the heap itself -* @param size Pointer to the number of pairs currently in the heap -* @param node_size Size of the nodes in the heap -* @param locks Array of locks, one for each node in the heap -* @param lowest_level_start Index of the first node in the last level of the -* heap -* @param node_capacity Max capacity of the 
heap in nodes -* @param shmem The shared memory layout for this cooperative group -* @param compare Comparison operator ordering the elements in the heap -*/ + * Sink the root down the heap + * Pre: g must hold the root's lock + * + * @param g The cooperative group that will perform the operation + * @param heap The array of pairs that stores the heap itself + * @param size Pointer to the number of pairs currently in the heap + * @param node_size Size of the nodes in the heap + * @param locks Array of locks, one for each node in the heap + * @param lowest_level_start Index of the first node in the last level of the + * heap + * @param node_capacity Max capacity of the heap in nodes + * @param shmem The shared memory layout for this cooperative group + * @param compare Comparison operator ordering the elements in the heap + */ template __device__ void Sink(CG const& g, - T *heap, - int *size, + T* heap, + int* size, size_t node_size, - int *locks, - size_t *p_buffer_size, + int* locks, + size_t* p_buffer_size, int lowest_level_start, int node_capacity, SharedMemoryLayout shmem, - Compare const& compare) { - + Compare const& compare) +{ size_t cur = kRootIdx; int dim = g.size(); // Sink the node - while (InsertionOrderIndex(LeftChild(cur, lowest_level_start), - lowest_level_start) <= node_capacity) { - - size_t left = LeftChild(cur, lowest_level_start); + while (InsertionOrderIndex(LeftChild(cur, lowest_level_start), lowest_level_start) <= + node_capacity) { + size_t left = LeftChild(cur, lowest_level_start); size_t right = RightChild(cur, lowest_level_start); AcquireLock(g, &locks[left]); @@ -605,14 +581,12 @@ __device__ void Sink(CG const& g, // // If we have both children, merge and sort them if (InsertionOrderIndex(right, lowest_level_start) <= *size) { - size_t hi; // In order to ensure we preserve the heap property, // we put the largest node_size elements in the child // that previously contained the largest element - if (!compare(heap[(left+1) * node_size - 1], - heap[(right+1) * node_size - 1])) { + if (!compare(heap[(left + 1) * node_size - 1], heap[(right + 1) * node_size - 1])) { hi = left; lo = right; } else { @@ -622,15 +596,15 @@ __device__ void Sink(CG const& g, // Skip the merge and sort if the nodes are already correctly // sorted - if (!compare(heap[(lo+1) * node_size - 1], - heap[hi * node_size])) { - MergeAndSort(g, &heap[left * node_size], - &heap[right * node_size], - shmem.A, - shmem.B, - node_size, - shmem, - compare); + if (!compare(heap[(lo + 1) * node_size - 1], heap[hi * node_size])) { + MergeAndSort(g, + &heap[left * node_size], + &heap[right * node_size], + shmem.A, + shmem.B, + node_size, + shmem, + compare); g.sync(); @@ -648,13 +622,14 @@ __device__ void Sink(CG const& g, lo = left; } - MergeAndSort(g, &heap[lo * node_size], - &heap[cur * node_size], - shmem.A, - shmem.B, - node_size, - shmem, - compare); + MergeAndSort(g, + &heap[lo * node_size], + &heap[cur * node_size], + shmem.A, + shmem.B, + node_size, + shmem, + compare); g.sync(); @@ -666,40 +641,38 @@ __device__ void Sink(CG const& g, ReleaseLock(g, &locks[cur]); cur = lo; - } ReleaseLock(g, &locks[cur]); - } /** -* Add exactly node_size elements into the heap from -* elements -* -* @param g The cooperative group that will perform the push -* @param elements Iterator for the elements to be inserted -* @param heap The array of pairs that stores the heap itself -* @param size Pointer to the number of pairs currently in the heap -* @param node_size Size of the nodes in the heap -* @param locks Array of 
locks, one for each node in the heap -* @param lowest_level_start Index of the first node in the last level of the -* heap -* @param shmem The shared memory layout for this cooperative group -* @param compare Comparison operator ordering the elements in the heap -*/ + * Add exactly node_size elements into the heap from + * elements + * + * @param g The cooperative group that will perform the push + * @param elements Iterator for the elements to be inserted + * @param heap The array of pairs that stores the heap itself + * @param size Pointer to the number of pairs currently in the heap + * @param node_size Size of the nodes in the heap + * @param locks Array of locks, one for each node in the heap + * @param lowest_level_start Index of the first node in the last level of the + * heap + * @param shmem The shared memory layout for this cooperative group + * @param compare Comparison operator ordering the elements in the heap + */ template __device__ void PushSingleNode(CG const& g, - InputIt elements, - T *heap, - int *size, - size_t node_size, - int *locks, - int lowest_level_start, - SharedMemoryLayout shmem, - Compare const& compare) { - + InputIt elements, + T* heap, + int* size, + size_t node_size, + int* locks, + int lowest_level_start, + SharedMemoryLayout shmem, + Compare const& compare) +{ int lane = g.thread_rank(); - int dim = g.size(); + int dim = g.size(); CopyPairs(g, shmem.A, elements, elements + node_size); @@ -707,10 +680,8 @@ __device__ void PushSingleNode(CG const& g, PBSort(g, shmem.A, node_size, node_size, shmem.B, compare); - int *cur_node_temp = (int*)shmem.intersections; - if (lane == 0) { - *cur_node_temp = atomicAdd(size, 1) + 1; - } + int* cur_node_temp = (int*)shmem.intersections; + if (lane == 0) { *cur_node_temp = atomicAdd(size, 1) + 1; } g.sync(); int cur_node = InsertionOrderIndex(*cur_node_temp, lowest_level_start); @@ -721,51 +692,47 @@ __device__ void PushSingleNode(CG const& g, g.sync(); - Swim(g, cur_node, heap, size, node_size, locks, - lowest_level_start, shmem, compare); - + Swim(g, cur_node, heap, size, node_size, locks, lowest_level_start, shmem, compare); } /** -* Remove exactly node_size elements from the heap and place them -* in elements -* -* @param g The cooperative group that will perform the pop -* @param elements Iterator to the elements to write to -* @param heap The array of pairs that stores the heap itself -* @param size Pointer to the number of pairs currently in the heap -* @param node_size Size of the nodes in the heap -* @param locks Array of locks, one for each node in the heap -* @param p_buffer_size Number of pairs in the heap's partial buffer -* @param lowest_level_start Index of the first node in the last level of the -* heap -* @param node_capacity Maximum capacity of the heap in nodes -* @param shmem The shared memory layout for this cooperative group -* @param compare Comparison operator ordering the elements in the heap -*/ + * Remove exactly node_size elements from the heap and place them + * in elements + * + * @param g The cooperative group that will perform the pop + * @param elements Iterator to the elements to write to + * @param heap The array of pairs that stores the heap itself + * @param size Pointer to the number of pairs currently in the heap + * @param node_size Size of the nodes in the heap + * @param locks Array of locks, one for each node in the heap + * @param p_buffer_size Number of pairs in the heap's partial buffer + * @param lowest_level_start Index of the first node in the last level of the + * heap + 
* @param node_capacity Maximum capacity of the heap in nodes + * @param shmem The shared memory layout for this cooperative group + * @param compare Comparison operator ordering the elements in the heap + */ template __device__ void PopSingleNode(CG const& g, OutputIt elements, - T *heap, - int *size, + T* heap, + int* size, size_t node_size, - int *locks, - size_t *p_buffer_size, + int* locks, + size_t* p_buffer_size, int lowest_level_start, int node_capacity, SharedMemoryLayout shmem, - Compare const& compare) { - + Compare const& compare) +{ int lane = g.thread_rank(); - int dim = g.size(); + int dim = g.size(); AcquireLock(g, &locks[kRootIdx]); if (*size == 0) { CopyPairs(g, elements, heap, node_size); - if (lane == 0) { - *p_buffer_size = 0; - } + if (lane == 0) { *p_buffer_size = 0; } g.sync(); return; } @@ -775,29 +742,23 @@ __device__ void PopSingleNode(CG const& g, size_t tar = InsertionOrderIndex(*size, lowest_level_start); - if (tar != 1) { - AcquireLock(g, &locks[tar]); - } + if (tar != 1) { AcquireLock(g, &locks[tar]); } g.sync(); - if (lane == 0) { - *size -= 1; - } + if (lane == 0) { *size -= 1; } g.sync(); // Copy the root to the output array - CopyPairs(g, elements, &heap[node_size], - &heap[node_size] + node_size); + CopyPairs(g, elements, &heap[node_size], &heap[node_size] + node_size); g.sync(); // Copy the target node to the root if (tar != kRootIdx) { - CopyPairs(g, &heap[node_size], &heap[tar * node_size], - node_size); + CopyPairs(g, &heap[node_size], &heap[tar * node_size], node_size); ReleaseLock(g, &locks[tar]); @@ -806,15 +767,16 @@ __device__ void PopSingleNode(CG const& g, // Merge and sort the root and the partial buffer - MergeAndSort(g, &heap[node_size], - &heap[kPBufferIdx], - shmem.A, - shmem.B, - node_size, - *p_buffer_size, - node_size, - shmem, - compare); + MergeAndSort(g, + &heap[node_size], + &heap[kPBufferIdx], + shmem.A, + shmem.B, + node_size, + *p_buffer_size, + node_size, + shmem, + compare); g.sync(); @@ -824,43 +786,51 @@ __device__ void PopSingleNode(CG const& g, g.sync(); - Sink(g, heap, size, node_size, locks, p_buffer_size, - lowest_level_start, node_capacity, shmem, compare); - + Sink(g, + heap, + size, + node_size, + locks, + p_buffer_size, + lowest_level_start, + node_capacity, + shmem, + compare); } /** -* Remove num_elements < node_size elements from the heap and place them -* in elements -* -* @param elements The array of elements to insert into -* @param num_elements The number of elements to remove -* @param heap The array of pairs that stores the heap itself -* @param size Pointer to the number of pairs currently in the heap -* @param node_size Size of the nodes in the heap -* @param locks Array of locks, one for each node in the heap -* @param p_buffer_size Number of pairs in the heap's partial buffer -* @param lowest_level_start Index of the first node in the last level of the -* heap -* @param node_capacity Maximum capacity of the heap in nodes -* @param shmem The shared memory layout for this cooperative group -* @param compare Comparison operator ordering the elements in the heap -*/ + * Remove num_elements < node_size elements from the heap and place them + * in elements + * + * @param elements The array of elements to insert into + * @param num_elements The number of elements to remove + * @param heap The array of pairs that stores the heap itself + * @param size Pointer to the number of pairs currently in the heap + * @param node_size Size of the nodes in the heap + * @param locks Array of locks, one for each node in the 
heap + * @param p_buffer_size Number of pairs in the heap's partial buffer + * @param lowest_level_start Index of the first node in the last level of the + * heap + * @param node_capacity Maximum capacity of the heap in nodes + * @param shmem The shared memory layout for this cooperative group + * @param compare Comparison operator ordering the elements in the heap + */ template __device__ void PopPartialNode(CG const& g, InputIt elements, size_t num_elements, - T *heap, - int *size, + T* heap, + int* size, size_t node_size, - int *locks, - size_t *p_buffer_size, + int* locks, + size_t* p_buffer_size, int lowest_level_start, int node_capacity, SharedMemoryLayout shmem, - Compare const& compare) { + Compare const& compare) +{ int lane = g.thread_rank(); - int dim = g.size(); + int dim = g.size(); AcquireLock(g, &locks[kRootIdx]); @@ -876,34 +846,28 @@ __device__ void PopPartialNode(CG const& g, CopyPairs(g, heap, shmem.A, n_p_buffer_size); - if (lane == 0) { - *p_buffer_size = n_p_buffer_size; - } + if (lane == 0) { *p_buffer_size = n_p_buffer_size; } ReleaseLock(g, &locks[kRootIdx]); } else { - CopyPairs(g, elements, &heap[kRootIdx * node_size], num_elements); g.sync(); if (*p_buffer_size >= num_elements) { - - - MergeAndSort(g, &heap[kPBufferIdx], - &heap[kRootIdx * node_size] + num_elements, - shmem.A, - shmem.B, - *p_buffer_size, - node_size - num_elements, - node_size, - shmem, - compare); + MergeAndSort(g, + &heap[kPBufferIdx], + &heap[kRootIdx * node_size] + num_elements, + shmem.A, + shmem.B, + *p_buffer_size, + node_size - num_elements, + node_size, + shmem, + compare); g.sync(); - if (lane == 0) { - *p_buffer_size = *p_buffer_size - num_elements; - } + if (lane == 0) { *p_buffer_size = *p_buffer_size - num_elements; } g.sync(); @@ -912,24 +876,31 @@ __device__ void PopPartialNode(CG const& g, g.sync(); - Sink(g, heap, size, node_size, locks, p_buffer_size, - lowest_level_start, node_capacity, shmem, compare); + Sink(g, + heap, + size, + node_size, + locks, + p_buffer_size, + lowest_level_start, + node_capacity, + shmem, + compare); } else { - - MergeAndSort(g, &heap[kPBufferIdx], - &heap[kRootIdx * node_size] + num_elements, - shmem.A, - (T*)nullptr, - *p_buffer_size, - node_size - num_elements, - node_size, - shmem, - compare); + MergeAndSort(g, + &heap[kPBufferIdx], + &heap[kRootIdx * node_size] + num_elements, + shmem.A, + (T*)nullptr, + *p_buffer_size, + node_size - num_elements, + node_size, + shmem, + compare); g.sync(); - CopyPairs(g, &heap[kPBufferIdx], shmem.A, - *p_buffer_size + node_size - num_elements); + CopyPairs(g, &heap[kPBufferIdx], shmem.A, *p_buffer_size + node_size - num_elements); int tar = InsertionOrderIndex(*size, lowest_level_start); g.sync(); @@ -939,29 +910,27 @@ __device__ void PopPartialNode(CG const& g, g.sync(); - if (lane == 0) { - *size -= 1; - } + if (lane == 0) { *size -= 1; } if (tar != kRootIdx) { AcquireLock(g, &locks[tar]); - CopyPairs(g, &heap[kRootIdx * node_size], - &heap[tar * node_size], node_size); + CopyPairs(g, &heap[kRootIdx * node_size], &heap[tar * node_size], node_size); g.sync(); ReleaseLock(g, &locks[tar]); - MergeAndSort(g, &heap[node_size], - &heap[kPBufferIdx], - shmem.A, - shmem.B, - node_size, - *p_buffer_size, - node_size, - shmem, - compare); + MergeAndSort(g, + &heap[node_size], + &heap[kPBufferIdx], + shmem.A, + shmem.B, + node_size, + *p_buffer_size, + node_size, + shmem, + compare); g.sync(); CopyPairs(g, &heap[node_size], shmem.A, node_size); @@ -970,9 +939,16 @@ __device__ void PopPartialNode(CG const& g, g.sync(); 
- Sink(g, heap, size, node_size, locks, - p_buffer_size, lowest_level_start, node_capacity, shmem, - compare); + Sink(g, + heap, + size, + node_size, + locks, + p_buffer_size, + lowest_level_start, + node_capacity, + shmem, + compare); } else { ReleaseLock(g, &locks[kRootIdx]); } @@ -981,37 +957,37 @@ __device__ void PopPartialNode(CG const& g, } /** -* Add p_ins_size < node_size elements into the heap from -* elements -* -* @param g The cooperative group that will perform the push -* @param elements The array of elements to add -* @param p_ins_size The number of elements to be inserted -* @param heap The array of pairs that stores the heap itself -* @param size Pointer to the number of pairs currently in the heap -* @param node_size Size of the nodes in the heap -* @param locks Array of locks, one for each node in the heap -* @param p_buffer_size The size of the partial buffer -* @param lowest_level_start Index of the first node in the last level of the -* heap -* @param shmem The shared memory layout for this cooperative group -* @param compare Comparison operator ordering the elements in the heap -*/ + * Add p_ins_size < node_size elements into the heap from + * elements + * + * @param g The cooperative group that will perform the push + * @param elements The array of elements to add + * @param p_ins_size The number of elements to be inserted + * @param heap The array of pairs that stores the heap itself + * @param size Pointer to the number of pairs currently in the heap + * @param node_size Size of the nodes in the heap + * @param locks Array of locks, one for each node in the heap + * @param p_buffer_size The size of the partial buffer + * @param lowest_level_start Index of the first node in the last level of the + * heap + * @param shmem The shared memory layout for this cooperative group + * @param compare Comparison operator ordering the elements in the heap + */ template __device__ void PushPartialNode(CG const& g, InputIt elements, size_t p_ins_size, - T *heap, - int *size, + T* heap, + int* size, size_t node_size, - int *locks, - size_t *p_buffer_size, + int* locks, + size_t* p_buffer_size, int lowest_level_start, SharedMemoryLayout shmem, - Compare const& compare) { - + Compare const& compare) +{ int lane = g.thread_rank(); - int dim = g.size(); + int dim = g.size(); AcquireLock(g, &locks[kRootIdx]); @@ -1022,45 +998,36 @@ __device__ void PushPartialNode(CG const& g, // There is enough data for a new node, in which case we // construct a new node and insert it if (*p_buffer_size + p_ins_size >= node_size) { - - int *cur_node_temp = shmem.intersections; - if (lane == 0) { - *cur_node_temp = atomicAdd(size, 1) + 1; - } + int* cur_node_temp = shmem.intersections; + if (lane == 0) { *cur_node_temp = atomicAdd(size, 1) + 1; } g.sync(); int cur_node = InsertionOrderIndex(*cur_node_temp, lowest_level_start); - if (cur_node != kRootIdx) { - AcquireLock(g, &(locks[cur_node])); - } + if (cur_node != kRootIdx) { AcquireLock(g, &(locks[cur_node])); } g.sync(); - MergeAndSort(g, shmem.B, - &heap[kPBufferIdx], - &heap[cur_node * node_size], - shmem.A, - p_ins_size, - *p_buffer_size, - node_size, - shmem, - compare); + MergeAndSort(g, + shmem.B, + &heap[kPBufferIdx], + &heap[cur_node * node_size], + shmem.A, + p_ins_size, + *p_buffer_size, + node_size, + shmem, + compare); - if (lane == 0) { - *p_buffer_size = (*p_buffer_size + p_ins_size) - node_size; - } + if (lane == 0) { *p_buffer_size = (*p_buffer_size + p_ins_size) - node_size; } g.sync(); CopyPairs(g, heap, shmem.A, *p_buffer_size); - 
if (cur_node != kRootIdx) { - ReleaseLock(g, &locks[kRootIdx]); - } + if (cur_node != kRootIdx) { ReleaseLock(g, &locks[kRootIdx]); } - Swim(g, cur_node, heap, size, node_size, - locks, lowest_level_start, shmem, compare); + Swim(g, cur_node, heap, size, node_size, locks, lowest_level_start, shmem, compare); } else { // There are not enough elements for a new node, @@ -1068,21 +1035,20 @@ __device__ void PushPartialNode(CG const& g, // the elements to be inserted and then the root // and the partial buffer - MergeAndSort(g, shmem.B, - &heap[kPBufferIdx], - shmem.A, - (T*)nullptr, - p_ins_size, - *p_buffer_size, - node_size, - shmem, - compare); + MergeAndSort(g, + shmem.B, + &heap[kPBufferIdx], + shmem.A, + (T*)nullptr, + p_ins_size, + *p_buffer_size, + node_size, + shmem, + compare); g.sync(); - if (lane == 0) { - *p_buffer_size += p_ins_size; - } + if (lane == 0) { *p_buffer_size += p_ins_size; } g.sync(); @@ -1091,15 +1057,16 @@ __device__ void PushPartialNode(CG const& g, g.sync(); if (*size > 0) { - MergeAndSort(g, &heap[node_size], - &heap[kPBufferIdx], - shmem.A, - shmem.B, - node_size, - *p_buffer_size, - node_size, - shmem, - compare); + MergeAndSort(g, + &heap[node_size], + &heap[kPBufferIdx], + shmem.A, + shmem.B, + node_size, + *p_buffer_size, + node_size, + shmem, + compare); g.sync(); CopyPairs(g, heap, shmem.B, *p_buffer_size); @@ -1110,7 +1077,6 @@ __device__ void PushPartialNode(CG const& g, } ReleaseLock(g, &locks[kRootIdx]); } - } /** @@ -1131,102 +1097,118 @@ __device__ void PushPartialNode(CG const& g, template __global__ void PushKernel(OutputIt elements, size_t num_elements, - T *heap, - int *size, + T* heap, + int* size, size_t node_size, - int *locks, - size_t *p_buffer_size, + int* locks, + size_t* p_buffer_size, int lowest_level_start, - Compare compare) { - + Compare compare) +{ extern __shared__ int s[]; - SharedMemoryLayout shmem = GetSharedMemoryLayout(s, - blockDim.x, node_size); + SharedMemoryLayout shmem = GetSharedMemoryLayout(s, blockDim.x, node_size); // We push as many elements as possible as full nodes, // then deal with the remaining elements as a partial insertion // below thread_block g = this_thread_block(); - for (size_t i = blockIdx.x * node_size; - i + node_size <= num_elements; + for (size_t i = blockIdx.x * node_size; i + node_size <= num_elements; i += gridDim.x * node_size) { - PushSingleNode(g, elements + i, heap, size, node_size, locks, - lowest_level_start, shmem, compare); + PushSingleNode( + g, elements + i, heap, size, node_size, locks, lowest_level_start, shmem, compare); } // We only need one block for partial insertion - if (blockIdx.x != 0) { - return; - } + if (blockIdx.x != 0) { return; } // If node_size does not divide num_elements, there are some leftover // elements for which we must perform a partial insertion - size_t first_not_inserted = (num_elements / node_size) - * node_size; + size_t first_not_inserted = (num_elements / node_size) * node_size; if (first_not_inserted < num_elements) { size_t p_ins_size = num_elements - first_not_inserted; - PushPartialNode(g, elements + first_not_inserted, p_ins_size, - heap, size, node_size, locks, p_buffer_size, - lowest_level_start, shmem, compare); + PushPartialNode(g, + elements + first_not_inserted, + p_ins_size, + heap, + size, + node_size, + locks, + p_buffer_size, + lowest_level_start, + shmem, + compare); } } /** -* Remove exactly node_size elements from the heap and place them -* in elements -* @param elements The array of elements to insert into -* @param num_elements The 
number of elements to remove -* @param heap The array of pairs that stores the heap itself -* @param size Pointer to the number of pairs currently in the heap -* @param node_size Size of the nodes in the heap -* @param locks Array of locks, one for each node in the heap -* @param p_buffer_size Number of pairs in the heap's partial buffer -* @param lowest_level_start The first index of the heaps lowest layer -* @param node_capacity The capacity of the heap in nodes -* @param compare Comparison operator ordering the elements in the heap -*/ + * Remove exactly node_size elements from the heap and place them + * in elements + * @param elements The array of elements to insert into + * @param num_elements The number of elements to remove + * @param heap The array of pairs that stores the heap itself + * @param size Pointer to the number of pairs currently in the heap + * @param node_size Size of the nodes in the heap + * @param locks Array of locks, one for each node in the heap + * @param p_buffer_size Number of pairs in the heap's partial buffer + * @param lowest_level_start The first index of the heaps lowest layer + * @param node_capacity The capacity of the heap in nodes + * @param compare Comparison operator ordering the elements in the heap + */ template __global__ void PopKernel(OutputIt elements, - size_t num_elements, - T *heap, - int *size, - size_t node_size, - int *locks, - size_t *p_buffer_size, - int lowest_level_start, - int node_capacity, - Compare compare) { - + size_t num_elements, + T* heap, + int* size, + size_t node_size, + int* locks, + size_t* p_buffer_size, + int lowest_level_start, + int node_capacity, + Compare compare) +{ extern __shared__ int s[]; - SharedMemoryLayout shmem = GetSharedMemoryLayout(s, - blockDim.x, node_size); + SharedMemoryLayout shmem = GetSharedMemoryLayout(s, blockDim.x, node_size); thread_block g = this_thread_block(); for (size_t i = blockIdx.x; i < num_elements / node_size; i += gridDim.x) { - PopSingleNode(g, elements + i * node_size, heap, size, node_size, locks, - p_buffer_size, lowest_level_start, - node_capacity, shmem, compare); + PopSingleNode(g, + elements + i * node_size, + heap, + size, + node_size, + locks, + p_buffer_size, + lowest_level_start, + node_capacity, + shmem, + compare); } // We only need one block for partial deletion - if (blockIdx.x != 0) { - return; - } + if (blockIdx.x != 0) { return; } // If node_size does not divide num_elements, there are some leftover // elements for which we must perform a partial deletion - size_t first_not_inserted = (num_elements / node_size) - * node_size; + size_t first_not_inserted = (num_elements / node_size) * node_size; if (first_not_inserted < num_elements) { size_t p_del_size = num_elements - first_not_inserted; - PopPartialNode(g, elements + first_not_inserted, - p_del_size, heap, size, node_size, locks, p_buffer_size, - lowest_level_start, node_capacity, shmem, compare); + PopPartialNode(g, + elements + first_not_inserted, + p_del_size, + heap, + size, + node_size, + locks, + p_buffer_size, + lowest_level_start, + node_capacity, + shmem, + compare); } } -} +} // namespace cuco diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 42e15d82d..4bcbfab89 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -1,64 +1,61 @@ #pragma once -#include -#include #include +#include +#include #include namespace cuco { /* -* @brief A GPU-accelerated priority queue of key-value pairs -* -* Allows for multiple concurrent insertions as 
well as multiple concurrent -* deletions -* -* Current limitations: -* - Does not support insertion and deletion at the same time -* - The implementation of the priority queue is based on -* https://arxiv.org/pdf/1906.06504.pdf, which provides a way to allow -* concurrent insertion and deletion, so this could be added later if useful -* - Capacity is fixed and the queue does not automatically resize -* - Deletion from the queue is much slower than insertion into the queue -* due to congestion at the underlying heap's root node -* -* The queue supports two operations: -* `push`: Add elements into the queue -* `pop`: Remove the element(s) with the lowest (when Max == false) or highest -* (when Max == true) keys -* -* The priority queue supports bulk host-side operations and more fine-grained -* device-side operations. -* -* The host-side bulk operations `push` and `pop` allow an arbitrary number of -* elements to be pushed to or popped from the queue. -* -* The device-side operations allow a cooperative group to push or pop from -* device code. These device side -* operations are invoked with a trivially-copyable device view, -* `device_mutable_view` which can be obtained with the host function -* `get_mutable_device_view` and passed to the device. -* -* @tparam T Type of the elements stored in the queue -* @tparam Compare Comparison operator used to order the elements in the queue -* @tparam FavorInsertionPerformance When true, insertion performance is -* increased at the expense of deletion performance. -* @tparam Allocator Allocator defining how memory is allocated internally -*/ -template , + * @brief A GPU-accelerated priority queue of key-value pairs + * + * Allows for multiple concurrent insertions as well as multiple concurrent + * deletions + * + * Current limitations: + * - Does not support insertion and deletion at the same time + * - The implementation of the priority queue is based on + * https://arxiv.org/pdf/1906.06504.pdf, which provides a way to allow + * concurrent insertion and deletion, so this could be added later if useful + * - Capacity is fixed and the queue does not automatically resize + * - Deletion from the queue is much slower than insertion into the queue + * due to congestion at the underlying heap's root node + * + * The queue supports two operations: + * `push`: Add elements into the queue + * `pop`: Remove the element(s) with the lowest (when Max == false) or highest + * (when Max == true) keys + * + * The priority queue supports bulk host-side operations and more fine-grained + * device-side operations. + * + * The host-side bulk operations `push` and `pop` allow an arbitrary number of + * elements to be pushed to or popped from the queue. + * + * The device-side operations allow a cooperative group to push or pop from + * device code. These device side + * operations are invoked with a trivially-copyable device view, + * `device_mutable_view` which can be obtained with the host function + * `get_mutable_device_view` and passed to the device. + * + * @tparam T Type of the elements stored in the queue + * @tparam Compare Comparison operator used to order the elements in the queue + * @tparam FavorInsertionPerformance When true, insertion performance is + * increased at the expense of deletion performance. 
+ * @tparam Allocator Allocator defining how memory is allocated internally + */ +template , bool FavorInsertionPerformance = false, - typename Allocator = cuco::cuda_allocator> + typename Allocator = cuco::cuda_allocator> class priority_queue { + using int_allocator_type = typename std::allocator_traits::rebind_alloc; - using int_allocator_type = typename std::allocator_traits - ::rebind_alloc; + using t_allocator_type = typename std::allocator_traits::rebind_alloc; - using t_allocator_type = typename std::allocator_traits - ::rebind_alloc; - - using size_t_allocator_type = typename std::allocator_traits - ::rebind_alloc; + using size_t_allocator_type = typename std::allocator_traits::rebind_alloc; const int NodeSize = FavorInsertionPerformance ? 64 : 1024; @@ -69,8 +66,7 @@ class priority_queue { * @param initial_capacity The number of elements the priority queue can hold * @param alloc Allocator used for allocating device storage */ - priority_queue(size_t initial_capacity, - Allocator const& alloc = Allocator{}); + priority_queue(size_t initial_capacity, Allocator const& alloc = Allocator{}); /** * @brief Push elements into the priority queue @@ -99,15 +95,16 @@ class priority_queue { void pop(OutputIt first, OutputIt last, cudaStream_t stream = 0); /* - * @brief Return the amount of shared memory required for operations - * on the queue with a thread block size of block_size - * - * @param block_size Size of the blocks to calculate storage for - * @return The amount of temporary storage required in bytes - */ - int get_shmem_size(int block_size) { + * @brief Return the amount of shared memory required for operations + * on the queue with a thread block size of block_size + * + * @param block_size Size of the blocks to calculate storage for + * @return The amount of temporary storage required in bytes + */ + int get_shmem_size(int block_size) + { int intersection_bytes = 2 * (block_size + 1) * sizeof(int); - int node_bytes = node_size_ * sizeof(T); + int node_bytes = node_size_ * sizeof(T); return intersection_bytes + 2 * node_bytes; } @@ -118,7 +115,6 @@ class priority_queue { class device_mutable_view { public: - /** * @brief Push elements into the priority queue * @@ -132,8 +128,7 @@ class priority_queue { * large enough to hold get_shmem_size(g.size()) bytes */ template - __device__ void push(CG const& g, InputIt first, - InputIt last, void *temp_storage); + __device__ void push(CG const& g, InputIt first, InputIt last, void* temp_storage); /** * @brief Pop elements from the priority queue @@ -148,27 +143,27 @@ class priority_queue { * large enough to hold get_shmem_size(g.size()) bytes */ template - __device__ void pop(CG const& g, OutputIt first, - OutputIt last, void *temp_storage); + __device__ void pop(CG const& g, OutputIt first, OutputIt last, void* temp_storage); /* - * @brief Return the amount of temporary storage required for operations - * on the queue with a cooperative group size of block_size - * - * @param block_size Size of the cooperative groups to calculate storage for - * @return The amount of temporary storage required in bytes - */ - __device__ int get_shmem_size(int block_size) { + * @brief Return the amount of temporary storage required for operations + * on the queue with a cooperative group size of block_size + * + * @param block_size Size of the cooperative groups to calculate storage for + * @return The amount of temporary storage required in bytes + */ + __device__ int get_shmem_size(int block_size) + { int intersection_bytes = 2 * (block_size + 1) * 
sizeof(int); - int node_bytes = node_size_ * sizeof(T); + int node_bytes = node_size_ * sizeof(T); return intersection_bytes + 2 * node_bytes; } __host__ __device__ device_mutable_view(size_t node_size, - T *d_heap, - int *d_size, - size_t *d_p_buffer_size, - int *d_locks, + T* d_heap, + int* d_size, + size_t* d_p_buffer_size, + int* d_locks, int lowest_level_start, int node_capacity, Compare const& compare) @@ -188,40 +183,46 @@ class priority_queue { int lowest_level_start_; int node_capacity_; - T *d_heap_; - int *d_size_; - size_t *d_p_buffer_size_; - int *d_locks_; + T* d_heap_; + int* d_size_; + size_t* d_p_buffer_size_; + int* d_locks_; Compare compare_; }; /* - * @brief Returns a trivially-copyable class that can be used to perform - * insertion and deletion of single nodes in device code with - * cooperative groups - * - * @return A device view - */ - device_mutable_view get_mutable_device_view() { - return device_mutable_view(node_size_, d_heap_, d_size_, d_p_buffer_size_, - d_locks_, lowest_level_start_, node_capacity_, + * @brief Returns a trivially-copyable class that can be used to perform + * insertion and deletion of single nodes in device code with + * cooperative groups + * + * @return A device view + */ + device_mutable_view get_mutable_device_view() + { + return device_mutable_view(node_size_, + d_heap_, + d_size_, + d_p_buffer_size_, + d_locks_, + lowest_level_start_, + node_capacity_, compare_); } private: - size_t node_size_; ///< Size of the heap's nodes - int lowest_level_start_; ///< Index in `d_heap_` of the first node in the - /// heap's lowest level - int node_capacity_; ///< Capacity of the heap in nodes + size_t node_size_; ///< Size of the heap's nodes + int lowest_level_start_; ///< Index in `d_heap_` of the first node in the + /// heap's lowest level + int node_capacity_; ///< Capacity of the heap in nodes - T *d_heap_; ///< Pointer to an array of nodes, the 0th node + T* d_heap_; ///< Pointer to an array of nodes, the 0th node /// being the heap's partial buffer, and nodes /// 1..(node_capacity_) being the heap, where the /// 1st node is the root - int *d_size_; ///< Number of nodes currently in the heap - size_t *d_p_buffer_size_; ///< Number of elements currently in the partial + int* d_size_; ///< Number of nodes currently in the heap + size_t* d_p_buffer_size_; ///< Number of elements currently in the partial /// buffer - int *d_locks_; ///< Array of locks where `d_locks_[i]` is the + int* d_locks_; ///< Array of locks where `d_locks_[i]` is the /// lock for the node starting at /// 1d_heap_[node_size * i]` @@ -233,6 +234,6 @@ class priority_queue { Compare compare_{}; }; -} +} // namespace cuco #include diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 11fcfbfcc..e05e47585 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -1,8 +1,8 @@ -#include #include +#include -#include #include +#include #include #include @@ -20,12 +20,14 @@ struct KVPair { }; template -bool __host__ __device__ operator==(const KVPair &a, const KVPair &b) { +bool __host__ __device__ operator==(const KVPair& a, const KVPair& b) +{ return a.first == b.first && a.second == b.second; } template -bool __host__ __device__ operator<(const KVPair &a, const KVPair &b) { +bool __host__ __device__ operator<(const KVPair& a, const KVPair& b) +{ if (a.first == b.first) { return a.second < b.second; } else { @@ -35,20 +37,16 @@ bool __host__ __device__ operator<(const KVPair 
&a, const KVPair &b) template struct KVLess { - __host__ __device__ bool operator()(const T& a, const T& b) const { - return a.first < b.first; - } + __host__ __device__ bool operator()(const T& a, const T& b) const { return a.first < b.first; } }; template -std::map construct_count_map(std::vector &a) { - +std::map construct_count_map(std::vector& a) +{ std::map result; - for (T &e : a) { - if (result.find(e) == result.end()) { - result.emplace(e, 0); - } + for (T& e : a) { + if (result.find(e) == result.end()) { result.emplace(e, 0); } result[e]++; } @@ -57,17 +55,18 @@ std::map construct_count_map(std::vector &a) { } template -bool is_valid_top_n(std::vector &top_n, std::vector &elements) { - auto top_n_map = construct_count_map(top_n); +bool is_valid_top_n(std::vector& top_n, std::vector& elements) +{ + auto top_n_map = construct_count_map(top_n); auto elements_map = construct_count_map(elements); size_t n = top_n.size(); // 1. Check that the count of each element in the top n is less than or // equal to the count of that element overall in the queue - for (auto &pair : top_n_map) { - if (elements_map.find(pair.first) == elements_map.end() - || elements_map[pair.first] < pair.second) { + for (auto& pair : top_n_map) { + if (elements_map.find(pair.first) == elements_map.end() || + elements_map[pair.first] < pair.second) { return false; } } @@ -78,31 +77,29 @@ bool is_valid_top_n(std::vector &top_n, std::vector &elements) { T max = elements[n - 1]; - for (T &e : top_n) { - if (Compare{}(max, e)) { - return false; - } + for (T& e : top_n) { + if (Compare{}(max, e)) { return false; } } return true; - } template -static void generate_element(T &e, std::mt19937 &gen) { +static void generate_element(T& e, std::mt19937& gen) +{ e = static_cast(gen()); } template -void generate_element - (KVPair &e, std::mt19937 &gen) { +void generate_element(KVPair& e, std::mt19937& gen) +{ generate_element(e.first, gen); generate_element(e.second, gen); } template -static std::vector generate_elements(size_t num_keys) { - +static std::vector generate_elements(size_t num_keys) +{ std::random_device rd; std::mt19937 gen{rd()}; @@ -116,7 +113,8 @@ static std::vector generate_elements(size_t num_keys) { } template -static void insert_to_queue(priority_queue &pq, std::vector &v) { +static void insert_to_queue(priority_queue& pq, std::vector& v) +{ thrust::device_vector d_v(v); pq.push(d_v.begin(), d_v.end()); @@ -125,8 +123,8 @@ static void insert_to_queue(priority_queue &pq, std::vector &v) { } template -static std::vector pop_from_queue(priority_queue &pq, size_t n) { - +static std::vector pop_from_queue(priority_queue& pq, size_t n) +{ thrust::device_vector d_popped(n); pq.pop(d_popped.begin(), d_popped.end()); @@ -137,54 +135,44 @@ static std::vector pop_from_queue(priority_queue &pq, size_t n) { std::vector result(h_popped.size()); - thrust::copy(thrust::host, h_popped.begin(), h_popped.end(), - result.begin()); + thrust::copy(thrust::host, h_popped.begin(), h_popped.end(), result.begin()); return result; - } // Insert elements into the queue and check that they are // all returned when removed from the queue template -bool test_insertion_and_deletion(priority_queue &pq, - std::vector &elements, - size_t n) { - +bool test_insertion_and_deletion(priority_queue& pq, std::vector& elements, size_t n) +{ insert_to_queue(pq, elements); auto popped_elements = pop_from_queue(pq, n); return is_valid_top_n(popped_elements, elements); - } TEST_CASE("Single uint32_t element", "") { - priority_queue pq(1); std::vector els 
= {1}; REQUIRE(test_insertion_and_deletion(pq, els, 1)); - } TEST_CASE("New node created on partial insertion") { - const size_t kInsertionSize = 600; - const size_t kNumElements = kInsertionSize * 2; + const size_t kNumElements = kInsertionSize * 2; priority_queue pq(kNumElements); std::vector els = generate_elements(kNumElements); - std::vector first_insertion(els.begin(), - els.begin() + kInsertionSize); + std::vector first_insertion(els.begin(), els.begin() + kInsertionSize); - std::vector second_insertion(els.begin() + kInsertionSize, - els.end()); + std::vector second_insertion(els.begin() + kInsertionSize, els.end()); insert_to_queue(pq, first_insertion); @@ -192,18 +180,17 @@ TEST_CASE("New node created on partial insertion") auto popped_elements = pop_from_queue(pq, kInsertionSize); - REQUIRE(is_valid_top_n>(popped_elements, els)); - + REQUIRE(is_valid_top_n>(popped_elements, els)); } -TEST_CASE("Insert, delete, insert, delete", "") { - const size_t kFirstInsertionSize = 100'000; - const size_t kFirstDeletionSize = 10'000; +TEST_CASE("Insert, delete, insert, delete", "") +{ + const size_t kFirstInsertionSize = 100'000; + const size_t kFirstDeletionSize = 10'000; const size_t kSecondInsertionSize = 20'000; - const size_t kSecondDeletionSize = 50'000; - using T = uint32_t; - using Compare = thrust::less; + const size_t kSecondDeletionSize = 50'000; + using T = uint32_t; + using Compare = thrust::less; priority_queue pq(kFirstInsertionSize + kSecondInsertionSize); @@ -224,32 +211,26 @@ TEST_CASE("Insert, delete, insert, delete", "") { std::sort(first_insertion_els.begin(), first_insertion_els.end(), Compare{}); remaining_elements.insert(remaining_elements.end(), - first_insertion_els.begin() + kFirstDeletionSize, - first_insertion_els.end()); - - remaining_elements.insert(remaining_elements.end(), - second_insertion_els.begin(), - second_insertion_els.end()); - - REQUIRE((is_valid_top_n(first_popped_elements, - first_insertion_els) && - is_valid_top_n(second_popped_elements, remaining_elements))); + first_insertion_els.begin() + kFirstDeletionSize, + first_insertion_els.end()); + remaining_elements.insert( + remaining_elements.end(), second_insertion_els.begin(), second_insertion_els.end()); + REQUIRE((is_valid_top_n(first_popped_elements, first_insertion_els) && + is_valid_top_n(second_popped_elements, remaining_elements))); } TEST_CASE("Insertion and deletion on different streams", "") { const size_t kInsertionSize = 100'000; - const size_t kDeletionSize = 10'000; - using T = uint32_t; - using Compare = thrust::less; + const size_t kDeletionSize = 10'000; + using T = uint32_t; + using Compare = thrust::less; auto elements = generate_elements(kInsertionSize * 2); - thrust::device_vector insertion1(elements.begin(), - elements.begin() + kInsertionSize); - thrust::device_vector insertion2(elements.begin() + kInsertionSize, - elements.end()); + thrust::device_vector insertion1(elements.begin(), elements.begin() + kInsertionSize); + thrust::device_vector insertion2(elements.begin() + kInsertionSize, elements.end()); priority_queue pq(kInsertionSize * 2); @@ -278,8 +259,7 @@ TEST_CASE("Insertion and deletion on different streams", "") std::vector popped_elements(h_deletion1.begin(), h_deletion1.end()); - popped_elements.insert(popped_elements.end(), h_deletion2.begin(), - h_deletion2.end()); + popped_elements.insert(popped_elements.end(), h_deletion2.begin(), h_deletion2.end()); REQUIRE(is_valid_top_n(popped_elements, elements)); @@ -288,32 +268,27 @@ TEST_CASE("Insertion and deletion on 
different streams", "") } template -__global__ void DeviceAPIInsert( - View view, - InputIt begin, - InputIt end) { +__global__ void DeviceAPIInsert(View view, InputIt begin, InputIt end) +{ extern __shared__ int shmem[]; - thread_block g = this_thread_block(); + thread_block g = this_thread_block(); view.push(g, begin, end, shmem); } template -__global__ void DeviceAPIDelete( - View view, - OutputIt begin, - OutputIt end) { - +__global__ void DeviceAPIDelete(View view, OutputIt begin, OutputIt end) +{ extern __shared__ int shmem[]; - thread_block g = this_thread_block(); + thread_block g = this_thread_block(); view.pop(g, begin, end, shmem); } TEST_CASE("Insertion and deletion with Device API", "") { const size_t kInsertionSize = 2000; - const size_t kDeletionSize = 1000; - using T = uint32_t; - using Compare = thrust::less; + const size_t kDeletionSize = 1000; + using T = uint32_t; + using Compare = thrust::less; auto els = generate_elements(kInsertionSize); @@ -322,23 +297,20 @@ TEST_CASE("Insertion and deletion with Device API", "") priority_queue pq(kInsertionSize); const int kBlockSize = 32; - DeviceAPIInsert<<<1, kBlockSize, pq.get_shmem_size(kBlockSize)>>> - (pq.get_mutable_device_view(), d_els.begin(), d_els.end()); + DeviceAPIInsert<<<1, kBlockSize, pq.get_shmem_size(kBlockSize)>>>( + pq.get_mutable_device_view(), d_els.begin(), d_els.end()); cudaDeviceSynchronize(); thrust::device_vector d_pop_result(kDeletionSize); - DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize)>>> - (pq.get_mutable_device_view(), d_pop_result.begin(), - d_pop_result.end()); + DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize)>>>( + pq.get_mutable_device_view(), d_pop_result.begin(), d_pop_result.end()); cudaDeviceSynchronize(); thrust::host_vector h_pop_result(d_pop_result); - std::vector pop_result(h_pop_result.begin(), - h_pop_result.end()); - + std::vector pop_result(h_pop_result.begin(), h_pop_result.end()); REQUIRE(is_valid_top_n(pop_result, els)); } @@ -346,10 +318,10 @@ TEST_CASE("Insertion and deletion with Device API", "") TEST_CASE("Concurrent insertion and deletion with Device API", "") { const size_t kInsertionSize = 1000; - const size_t kDeletionSize = 500; - const int kBlockSize = 32; - using T = uint32_t; - using Compare = thrust::less; + const size_t kDeletionSize = 500; + const int kBlockSize = 32; + using T = uint32_t; + using Compare = thrust::less; auto els = generate_elements(kInsertionSize * 2); @@ -363,11 +335,11 @@ TEST_CASE("Concurrent insertion and deletion with Device API", "") cudaStreamCreate(&stream1); cudaStreamCreate(&stream2); - DeviceAPIInsert<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream1>>> - (pq.get_mutable_device_view(), insertion1.begin(), insertion1.end()); + DeviceAPIInsert<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream1>>>( + pq.get_mutable_device_view(), insertion1.begin(), insertion1.end()); - DeviceAPIInsert<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream2>>> - (pq.get_mutable_device_view(), insertion2.begin(), insertion2.end()); + DeviceAPIInsert<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream2>>>( + pq.get_mutable_device_view(), insertion2.begin(), insertion2.end()); cudaStreamSynchronize(stream1); cudaStreamSynchronize(stream2); @@ -375,11 +347,11 @@ TEST_CASE("Concurrent insertion and deletion with Device API", "") thrust::device_vector d_deletion1(kDeletionSize); thrust::device_vector d_deletion2(kDeletionSize); - DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream1>>> - 
(pq.get_mutable_device_view(), d_deletion1.begin(), d_deletion1.end()); + DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream1>>>( + pq.get_mutable_device_view(), d_deletion1.begin(), d_deletion1.end()); - DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream2>>> - (pq.get_mutable_device_view(), d_deletion2.begin(), d_deletion2.end()); + DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream2>>>( + pq.get_mutable_device_view(), d_deletion2.begin(), d_deletion2.end()); cudaStreamSynchronize(stream1); cudaStreamSynchronize(stream2); @@ -394,34 +366,27 @@ TEST_CASE("Concurrent insertion and deletion with Device API", "") cudaStreamDestroy(stream1); cudaStreamDestroy(stream2); - } -TEMPLATE_TEST_CASE_SIG("N deletions are correct", "", - ((typename T, typename Compare, size_t N, size_t NumKeys), - T, Compare, N, NumKeys), - (uint32_t, thrust::less, 100, 10'000'000), - (uint64_t, thrust::less, 100, 10'000'000), - (KVPair, KVLess>, - 100, 10'000'000), - (uint32_t, thrust::less, 10'000, 10'000'000), - (uint64_t, thrust::less, 10'000, 10'000'000), - (uint64_t, thrust::greater, 10'000, 10'000'000), - (KVPair, KVLess>, - 10'000, 10'000'000), - (KVPair, KVLess>, - 10'000, 10'000'000), - (uint32_t, thrust::less, 10'000'000, 10'000'000), - (uint64_t, thrust::less, 10'000'000, 10'000'000), - (KVPair, KVLess>, - 10'000'000, 10'000'000)) +TEMPLATE_TEST_CASE_SIG( + "N deletions are correct", + "", + ((typename T, typename Compare, size_t N, size_t NumKeys), T, Compare, N, NumKeys), + (uint32_t, thrust::less, 100, 10'000'000), + (uint64_t, thrust::less, 100, 10'000'000), + (KVPair, KVLess>, 100, 10'000'000), + (uint32_t, thrust::less, 10'000, 10'000'000), + (uint64_t, thrust::less, 10'000, 10'000'000), + (uint64_t, thrust::greater, 10'000, 10'000'000), + (KVPair, KVLess>, 10'000, 10'000'000), + (KVPair, KVLess>, 10'000, 10'000'000), + (uint32_t, thrust::less, 10'000'000, 10'000'000), + (uint64_t, thrust::less, 10'000'000, 10'000'000), + (KVPair, KVLess>, 10'000'000, 10'000'000)) { - priority_queue pq(NumKeys); auto els = generate_elements(NumKeys); REQUIRE(test_insertion_and_deletion(pq, els, N)); - } - From 9838569d49dea52b200fe51dbe262268e48e5186 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Mon, 30 May 2022 19:41:56 -0700 Subject: [PATCH 29/55] Add copyright to priority_queue_bench.cu Co-authored-by: Yunsong Wang --- .../priority_queue/priority_queue_bench.cu | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/benchmarks/priority_queue/priority_queue_bench.cu b/benchmarks/priority_queue/priority_queue_bench.cu index bd4288203..075453116 100644 --- a/benchmarks/priority_queue/priority_queue_bench.cu +++ b/benchmarks/priority_queue/priority_queue_bench.cu @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + #include #include #include From aab4ba0a1d4fbccd7b4eaf1cf48d70b029525c19 Mon Sep 17 00:00:00 2001 From: andrewbriand Date: Mon, 30 May 2022 19:50:51 -0700 Subject: [PATCH 30/55] Add copyright to priority queue files --- include/cuco/detail/priority_queue.inl | 16 ++++++++++++++++ include/cuco/detail/priority_queue_kernels.cuh | 16 ++++++++++++++++ include/cuco/priority_queue.cuh | 16 ++++++++++++++++ tests/priority_queue/priority_queue_test.cu | 16 ++++++++++++++++ 4 files changed, 64 insertions(+) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index 78e6352b2..b8d6cedf4 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #pragma once #include diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index e67a9a01f..8e09fa7e4 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #pragma once #include diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 4bcbfab89..49c8e2661 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #pragma once #include diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index e05e47585..5d3945e08 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -1,3 +1,19 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + #include #include From 0196bde1dd3fc2bb589c100c770f266dfc3d8d3f Mon Sep 17 00:00:00 2001 From: andrewbriand Date: Mon, 30 May 2022 20:26:31 -0700 Subject: [PATCH 31/55] Order headers from near to far in priority queue files --- benchmarks/priority_queue/priority_queue_bench.cu | 12 ++++++------ include/cuco/detail/priority_queue.inl | 3 ++- include/cuco/detail/priority_queue_kernels.cuh | 3 ++- include/cuco/priority_queue.cuh | 5 +++-- tests/priority_queue/priority_queue_test.cu | 12 ++++++------ 5 files changed, 19 insertions(+), 16 deletions(-) diff --git a/benchmarks/priority_queue/priority_queue_bench.cu b/benchmarks/priority_queue/priority_queue_bench.cu index 075453116..64e0679ac 100644 --- a/benchmarks/priority_queue/priority_queue_bench.cu +++ b/benchmarks/priority_queue/priority_queue_bench.cu @@ -14,17 +14,17 @@ * limitations under the License. */ -#include -#include -#include - -#include - #include #include #include +#include + +#include +#include +#include + using namespace cuco; template diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index b8d6cedf4..2028e68a8 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -15,11 +15,12 @@ */ #pragma once -#include #include #include +#include + namespace cuco { template diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index 8e09fa7e4..3fbae946a 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -16,9 +16,10 @@ #pragma once -#include #include +#include + using namespace cooperative_groups; namespace cuco { diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 49c8e2661..6fbe717bc 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -17,11 +17,12 @@ #pragma once #include -#include -#include #include +#include +#include + namespace cuco { /* diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 5d3945e08..7f92111d8 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -14,18 +14,18 @@ * limitations under the License. 
*/ -#include -#include - -#include -#include +#include #include #include #include -#include +#include +#include + +#include +#include using namespace cuco; From 4af61ca070d15664fa5c9df44c11f7f2661207f7 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Tue, 31 May 2022 04:20:22 +0000 Subject: [PATCH 32/55] Bug fix in priority queue test code --- tests/priority_queue/priority_queue_test.cu | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 7f92111d8..4bf0242d0 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -88,12 +88,14 @@ bool is_valid_top_n(std::vector& top_n, std::vector& elements) } // 2. Check that each element in the top N is not ordered - // after the (n - 1)th element of the sorted list of elements + // after the ith element of the sorted list of elements std::sort(elements.begin(), elements.end(), Compare{}); - T max = elements[n - 1]; + std::sort(top_n.begin(), top_n.end(), Compare{}); - for (T& e : top_n) { + for (int i = 0; i < top_n.size(); i++) { + T max = elements[i]; + T e = top_n[i]; if (Compare{}(max, e)) { return false; } } From a1d074ad5bc7d3c8b08faceb9c83708f6d6704e8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 31 May 2022 04:20:43 +0000 Subject: [PATCH 33/55] [pre-commit.ci] auto code formatting --- tests/priority_queue/priority_queue_test.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 4bf0242d0..340f077b3 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -95,7 +95,7 @@ bool is_valid_top_n(std::vector& top_n, std::vector& elements) for (int i = 0; i < top_n.size(); i++) { T max = elements[i]; - T e = top_n[i]; + T e = top_n[i]; if (Compare{}(max, e)) { return false; } } From bf930dd87a2715db06bd643a62853de07b82509e Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Tue, 31 May 2022 04:51:56 +0000 Subject: [PATCH 34/55] Remove unnecessary allocator --- include/cuco/detail/priority_queue.inl | 3 +-- include/cuco/priority_queue.cuh | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index 2028e68a8..ea395a3c3 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -26,8 +26,7 @@ namespace cuco { template priority_queue::priority_queue( size_t initial_capacity, Allocator const& allocator) - : allocator_{allocator}, - int_allocator_{allocator}, + : int_allocator_{allocator}, t_allocator_{allocator}, size_t_allocator_{allocator} { diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 6fbe717bc..b08477504 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -243,7 +243,6 @@ class priority_queue { /// lock for the node starting at /// 1d_heap_[node_size * i]` - Allocator allocator_; int_allocator_type int_allocator_; t_allocator_type t_allocator_; size_t_allocator_type size_t_allocator_; From 2d9bda958c6bb7643ca312bc2f11c736430e3089 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 31 May 2022 04:52:06 +0000 Subject: [PATCH 35/55] [pre-commit.ci] auto code formatting --- 
include/cuco/detail/priority_queue.inl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index ea395a3c3..de2d1ef8b 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -26,9 +26,7 @@ namespace cuco { template priority_queue::priority_queue( size_t initial_capacity, Allocator const& allocator) - : int_allocator_{allocator}, - t_allocator_{allocator}, - size_t_allocator_{allocator} + : int_allocator_{allocator}, t_allocator_{allocator}, size_t_allocator_{allocator} { node_size_ = NodeSize; From 54dc9f3c47e1b81bb221e7ab03dc1f43a57d5199 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Sat, 11 Jun 2022 02:52:58 +0000 Subject: [PATCH 36/55] Add missing member docs in priority_queue.cuh --- include/cuco/priority_queue.cuh | 64 +++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index b08477504..1967059cf 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -196,15 +196,23 @@ class priority_queue { } private: - size_t node_size_; - int lowest_level_start_; - int node_capacity_; - - T* d_heap_; - int* d_size_; - size_t* d_p_buffer_size_; - int* d_locks_; - Compare compare_; + size_t node_size_; ///< Size of the heap's nodes (i.e. number of T's + /// in each node) + int lowest_level_start_; ///< Index in `d_heap_` of the first node in the + /// heap's lowest level + int node_capacity_; ///< Capacity of the heap in nodes + + T* d_heap_; ///< Pointer to an array of nodes, the 0th node + /// being the heap's partial buffer, and nodes + /// 1..(node_capacity_) being the heap, where + /// the 1st node is the root + int* d_size_; ///< Number of nodes currently in the heap + size_t* d_p_buffer_size_; ///< Number of elements currently in the partial + /// buffer + int* d_locks_; ///< Array of locks where `d_locks_[i]` is the + /// lock for the node starting at + /// d_heap_[node_size * i]` + Compare compare_{}; ///< Comparator used to order the elements in the queue }; /* @@ -227,27 +235,31 @@ class priority_queue { } private: - size_t node_size_; ///< Size of the heap's nodes + size_t node_size_; ///< Size of the heap's nodes (i.e. 
number of T's + /// in each node) int lowest_level_start_; ///< Index in `d_heap_` of the first node in the /// heap's lowest level int node_capacity_; ///< Capacity of the heap in nodes - T* d_heap_; ///< Pointer to an array of nodes, the 0th node - /// being the heap's partial buffer, and nodes - /// 1..(node_capacity_) being the heap, where the - /// 1st node is the root - int* d_size_; ///< Number of nodes currently in the heap - size_t* d_p_buffer_size_; ///< Number of elements currently in the partial - /// buffer - int* d_locks_; ///< Array of locks where `d_locks_[i]` is the - /// lock for the node starting at - /// 1d_heap_[node_size * i]` - - int_allocator_type int_allocator_; - t_allocator_type t_allocator_; - size_t_allocator_type size_t_allocator_; - - Compare compare_{}; + T* d_heap_; ///< Pointer to an array of nodes, the 0th node + /// being the heap's partial buffer, and nodes + /// 1..(node_capacity_) being the heap, where the + /// 1st node is the root + int* d_size_; ///< Number of nodes currently in the heap + size_t* d_p_buffer_size_; ///< Number of elements currently in the partial + /// buffer + int* d_locks_; ///< Array of locks where `d_locks_[i]` is the + /// lock for the node starting at + /// d_heap_[node_size * i]` + + int_allocator_type int_allocator_; ///< Allocator used to allocated ints + /// for example, the lock array + t_allocator_type t_allocator_; ///< Allocator used to allocate T's + /// and therefore nodes + size_t_allocator_type size_t_allocator_; ///< Allocator used to allocate + /// size_t's, e.g. d_p_buffer_size_ + + Compare compare_{}; ///< Comparator used to order the elements in the queue }; } // namespace cuco From a5c169d4820d2226a2c1cfb67e65046de8886984 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 11 Jun 2022 02:53:11 +0000 Subject: [PATCH 37/55] [pre-commit.ci] auto code formatting --- include/cuco/priority_queue.cuh | 56 ++++++++++++++++----------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 1967059cf..5b0750d33 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -202,17 +202,17 @@ class priority_queue { /// heap's lowest level int node_capacity_; ///< Capacity of the heap in nodes - T* d_heap_; ///< Pointer to an array of nodes, the 0th node - /// being the heap's partial buffer, and nodes - /// 1..(node_capacity_) being the heap, where - /// the 1st node is the root - int* d_size_; ///< Number of nodes currently in the heap - size_t* d_p_buffer_size_; ///< Number of elements currently in the partial - /// buffer - int* d_locks_; ///< Array of locks where `d_locks_[i]` is the - /// lock for the node starting at - /// d_heap_[node_size * i]` - Compare compare_{}; ///< Comparator used to order the elements in the queue + T* d_heap_; ///< Pointer to an array of nodes, the 0th node + /// being the heap's partial buffer, and nodes + /// 1..(node_capacity_) being the heap, where + /// the 1st node is the root + int* d_size_; ///< Number of nodes currently in the heap + size_t* d_p_buffer_size_; ///< Number of elements currently in the partial + /// buffer + int* d_locks_; ///< Array of locks where `d_locks_[i]` is the + /// lock for the node starting at + /// d_heap_[node_size * i]` + Compare compare_{}; ///< Comparator used to order the elements in the queue }; /* @@ -241,25 +241,25 @@ class priority_queue { /// heap's lowest level int 
node_capacity_; ///< Capacity of the heap in nodes - T* d_heap_; ///< Pointer to an array of nodes, the 0th node - /// being the heap's partial buffer, and nodes - /// 1..(node_capacity_) being the heap, where the - /// 1st node is the root - int* d_size_; ///< Number of nodes currently in the heap - size_t* d_p_buffer_size_; ///< Number of elements currently in the partial - /// buffer - int* d_locks_; ///< Array of locks where `d_locks_[i]` is the - /// lock for the node starting at - /// d_heap_[node_size * i]` + T* d_heap_; ///< Pointer to an array of nodes, the 0th node + /// being the heap's partial buffer, and nodes + /// 1..(node_capacity_) being the heap, where the + /// 1st node is the root + int* d_size_; ///< Number of nodes currently in the heap + size_t* d_p_buffer_size_; ///< Number of elements currently in the partial + /// buffer + int* d_locks_; ///< Array of locks where `d_locks_[i]` is the + /// lock for the node starting at + /// d_heap_[node_size * i]` - int_allocator_type int_allocator_; ///< Allocator used to allocated ints - /// for example, the lock array - t_allocator_type t_allocator_; ///< Allocator used to allocate T's - /// and therefore nodes - size_t_allocator_type size_t_allocator_; ///< Allocator used to allocate - /// size_t's, e.g. d_p_buffer_size_ + int_allocator_type int_allocator_; ///< Allocator used to allocated ints + /// for example, the lock array + t_allocator_type t_allocator_; ///< Allocator used to allocate T's + /// and therefore nodes + size_t_allocator_type size_t_allocator_; ///< Allocator used to allocate + /// size_t's, e.g. d_p_buffer_size_ - Compare compare_{}; ///< Comparator used to order the elements in the queue + Compare compare_{}; ///< Comparator used to order the elements in the queue }; } // namespace cuco From 4269e9c4dce144ba95ed95dd98cb2713d77bc1ea Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Sat, 11 Jun 2022 03:28:51 +0000 Subject: [PATCH 38/55] Add stream parameter to priority queue ctor --- include/cuco/detail/priority_queue.inl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index de2d1ef8b..410927162 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -25,7 +25,7 @@ namespace cuco { template priority_queue::priority_queue( - size_t initial_capacity, Allocator const& allocator) + size_t initial_capacity, Allocator const& allocator, cudaStream_t stream) : int_allocator_{allocator}, t_allocator_{allocator}, size_t_allocator_{allocator} { node_size_ = NodeSize; @@ -40,11 +40,11 @@ priority_queue::priority_queue d_size_ = std::allocator_traits::allocate(int_allocator_, 1); - CUCO_CUDA_TRY(cudaMemset(d_size_, 0, sizeof(int))); + CUCO_CUDA_TRY(cudaMemsetAsync(d_size_, 0, sizeof(int), stream)); d_p_buffer_size_ = std::allocator_traits::allocate(size_t_allocator_, 1); - CUCO_CUDA_TRY(cudaMemset(d_p_buffer_size_, 0, sizeof(size_t))); + CUCO_CUDA_TRY(cudaMemsetAsync(d_p_buffer_size_, 0, sizeof(size_t), stream)); d_heap_ = std::allocator_traits::allocate( t_allocator_, node_capacity_ * node_size_ + node_size_); @@ -52,7 +52,8 @@ priority_queue::priority_queue d_locks_ = std::allocator_traits::allocate(int_allocator_, node_capacity_ + 1); - CUCO_CUDA_TRY(cudaMemset(d_locks_, 0, sizeof(int) * (node_capacity_ + 1))); + CUCO_CUDA_TRY(cudaMemsetAsync(d_locks_, 0, sizeof(int) * (node_capacity_ + 1), + stream)); } template From 30cbf836b3178c966749dab6694afb1073674f1c Mon Sep 
17 00:00:00 2001 From: Andrew Briand Date: Sun, 12 Jun 2022 05:47:36 +0000 Subject: [PATCH 39/55] Snake case in priority queue files --- include/cuco/detail/priority_queue.inl | 130 ++-- .../cuco/detail/priority_queue_kernels.cuh | 608 +++++++++--------- include/cuco/priority_queue.cuh | 59 +- 3 files changed, 405 insertions(+), 392 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index 410927162..58ebdc8aa 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -76,15 +76,15 @@ void priority_queue::push(Inpu const int kBlockSize = min(256, (int)node_size_); const int kNumBlocks = min(64000, max(1, (int)((last - first) / node_size_))); - PushKernel<<>>(first, - last - first, - d_heap_, - d_size_, - node_size_, - d_locks_, - d_p_buffer_size_, - lowest_level_start_, - compare_); + push_kernel<<>>(first, + last - first, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + compare_); CUCO_CUDA_TRY(cudaGetLastError()); } @@ -101,16 +101,16 @@ void priority_queue::pop(Outpu const int kBlockSize = min(256, (int)node_size_); const int kNumBlocks = min(64000, max(1, (int)((pop_size - partial) / node_size_))); - PopKernel<<>>(first, - pop_size, - d_heap_, - d_size_, - node_size_, - d_locks_, - d_p_buffer_size_, - lowest_level_start_, - node_capacity_, - compare_); + pop_kernel<<>>(first, + pop_size, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + node_capacity_, + compare_); CUCO_CUDA_TRY(cudaGetLastError()); } @@ -121,33 +121,34 @@ __device__ void priority_queue::device_mutable_view::push( CG const& g, InputIt first, InputIt last, void* temp_storage) { - SharedMemoryLayout shmem = GetSharedMemoryLayout((int*)temp_storage, g.size(), node_size_); + shared_memory_layout shmem = + get_shared_memory_layout((int*)temp_storage, g.size(), node_size_); auto push_size = last - first; for (size_t i = 0; i < push_size / node_size_; i++) { - PushSingleNode(g, - first + i * node_size_, - d_heap_, - d_size_, - node_size_, - d_locks_, - lowest_level_start_, - shmem, - compare_); + push_single_node(g, + first + i * node_size_, + d_heap_, + d_size_, + node_size_, + d_locks_, + lowest_level_start_, + shmem, + compare_); } if (push_size % node_size_ != 0) { - PushPartialNode(g, - first + (push_size / node_size_) * node_size_, - push_size % node_size_, - d_heap_, - d_size_, - node_size_, - d_locks_, - d_p_buffer_size_, - lowest_level_start_, - shmem, - compare_); + push_partial_node(g, + first + (push_size / node_size_) * node_size_, + push_size % node_size_, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + shmem, + compare_); } } @@ -157,36 +158,37 @@ __device__ void priority_queue::device_mutable_view::pop( CG const& g, OutputIt first, OutputIt last, void* temp_storage) { - SharedMemoryLayout shmem = GetSharedMemoryLayout((int*)temp_storage, g.size(), node_size_); + shared_memory_layout shmem = + get_shared_memory_layout((int*)temp_storage, g.size(), node_size_); auto pop_size = last - first; for (size_t i = 0; i < pop_size / node_size_; i++) { - PopSingleNode(g, - first + i * node_size_, - d_heap_, - d_size_, - node_size_, - d_locks_, - d_p_buffer_size_, - lowest_level_start_, - node_capacity_, - shmem, - compare_); + pop_single_node(g, + first + i * node_size_, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + node_capacity_, + shmem, + compare_); 
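For orientation, a minimal host-side sketch of the bulk push/pop API whose kernels are being renamed here; the element type and the reliance on the default comparator are illustrative assumptions, not taken from this patch:

#include <cuco/priority_queue.cuh>
#include <thrust/device_vector.h>

// Sketch only: bulk-push a device array, then bulk-pop it back out in
// priority order on the default stream (assuming the default comparator,
// which returns the smallest elements first).
void bulk_example(thrust::device_vector<int>& d_items)
{
  cuco::priority_queue<int> pq{d_items.size()};
  pq.push(d_items.begin(), d_items.end());
  pq.pop(d_items.begin(), d_items.end());
}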
} if (pop_size % node_size_ != 0) { - PopPartialNode(g, - first + (pop_size / node_size_) * node_size_, - last - first, - d_heap_, - d_size_, - node_size_, - d_locks_, - d_p_buffer_size_, - lowest_level_start_, - node_capacity_, - shmem, - compare_); + pop_partial_node(g, + first + (pop_size / node_size_) * node_size_, + last - first, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + node_capacity_, + shmem, + compare_); } } diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index 3fbae946a..612c91be4 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -33,10 +33,10 @@ constexpr int kRootIdx = 1; * Ideally, this temp storage is in shared memory */ template -struct SharedMemoryLayout { +struct shared_memory_layout { int* intersections; - T* A; - T* B; + T* a; + T* b; }; /* @@ -50,12 +50,13 @@ struct SharedMemoryLayout { * @returns The memory layout for the given group dimension and node size */ template -__device__ SharedMemoryLayout GetSharedMemoryLayout(int* s, int dim, size_t node_size) +__device__ shared_memory_layout get_shared_memory_layout(int* s, int dim, + size_t node_size) { - SharedMemoryLayout result; + shared_memory_layout result; result.intersections = s; - result.A = (T*)(s + 2 * (dim + 1)); - result.B = result.A + node_size; + result.a = (T*)(s + 2 * (dim + 1)); + result.b = result.a + node_size; return result; } @@ -67,7 +68,7 @@ __device__ SharedMemoryLayout GetSharedMemoryLayout(int* s, int dim, size_t n * @param l Pointer to the lock to be acquired */ template -__device__ void AcquireLock(CG const& g, int* l) +__device__ void acquire_lock(CG const& g, int* l) { if (g.thread_rank() == 0) { while (atomicCAS(l, 0, 1) != 0) @@ -84,7 +85,7 @@ __device__ void AcquireLock(CG const& g, int* l) * @param l Pointer to the lock to be released */ template -__device__ void ReleaseLock(CG const& g, int* l) +__device__ void release_lock(CG const& g, int* l) { if (g.thread_rank() == 0) { atomicExch(l, 0); } } @@ -98,10 +99,12 @@ __device__ void ReleaseLock(CG const& g, int* l) * @param src_end Iterator to the end of the source array */ template -__device__ void CopyPairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, InputIt2 src_end) +__device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, + InputIt2 src_end) { auto dst = dst_start + g.thread_rank(); - for (auto src = src_start + g.thread_rank(); src < src_end; dst += g.size(), src += g.size()) { + for (auto src = src_start + g.thread_rank(); src < src_end; + dst += g.size(), src += g.size()) { *dst = *src; } } @@ -115,9 +118,10 @@ __device__ void CopyPairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, I * @param num_pairs Number of pairs to copy */ template -__device__ void CopyPairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, size_t num_pairs) +__device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, + size_t num_pairs) { - CopyPairs(g, dst_start, src_start, src_start + num_pairs); + copy_pairs(g, dst_start, src_start, src_start + num_pairs); } /** @@ -137,16 +141,16 @@ __device__ void CopyPairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, s * @param compare Comparison operator ordering the elements to be merged */ template -__device__ void MergeAndSort(CG const& g, - T* a, - T* b, - T* lo, - T* hi, - size_t node_size, - SharedMemoryLayout shmem, - Compare const& compare) +__device__ void 
merge_and_sort(CG const& g, + T* a, + T* b, + T* lo, + T* hi, + size_t node_size, + shared_memory_layout shmem, + Compare const& compare) { - MergeAndSort(g, a, b, lo, hi, node_size, node_size, node_size, shmem, compare); + merge_and_sort(g, a, b, lo, hi, node_size, node_size, node_size, shmem, compare); } /** @@ -173,31 +177,31 @@ __device__ void MergeAndSort(CG const& g, * @param compare Comparison operator ordering the elements to be merged */ template -__device__ void MergeAndSort(CG const& g, - T* a, - T* b, - T* lo, - T* hi, - size_t num_elements_a, - size_t num_elements_b, - size_t node_size, - SharedMemoryLayout shmem, - Compare const& compare) +__device__ void merge_and_sort(CG const& g, + T* a, + T* b, + T* lo, + T* hi, + size_t num_elements_a, + size_t num_elements_b, + size_t node_size, + shared_memory_layout shmem, + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); if (num_elements_a == node_size && compare(a[node_size - 1], b[0])) { - CopyPairs(g, lo, a, num_elements_a); + copy_pairs(g, lo, a, num_elements_a); - CopyPairs(g, hi, b, num_elements_b); + copy_pairs(g, hi, b, num_elements_b); return; } if (num_elements_b == node_size && compare(b[node_size - 1], a[0])) { - CopyPairs(g, hi, a, num_elements_a); + copy_pairs(g, hi, a, num_elements_a); - CopyPairs(g, lo, b, num_elements_b); + copy_pairs(g, lo, b, num_elements_b); return; } @@ -317,7 +321,7 @@ __device__ void MergeAndSort(CG const& g, * @param compare Comparison operator ordering the elements to be sorted */ template -__device__ void PBSort( +__device__ void pb_sort( CG const& g, T* start, size_t len, size_t node_size, T* temp, Compare const& compare) { int lane = g.thread_rank(); @@ -338,7 +342,8 @@ __device__ void PBSort( int left = (i / jump) * jump * 2 + i % jump; int right = left + jump; if ((i / start_jump) % 2 == 0) { - if (!mask[left] || (mask[right] && !compare(start[left], start[right]))) { + if (!mask[left] || + (mask[right] && !compare(start[left], start[right]))) { auto temp = start[left]; start[left] = start[right]; start[right] = temp; @@ -348,7 +353,8 @@ __device__ void PBSort( mask[right] = temp_mask; } } else { - if (!mask[right] || (mask[left] && compare(start[left], start[right]))) { + if (!mask[right] || + (mask[left] && compare(start[left], start[right]))) { auto temp = start[left]; start[left] = start[right]; start[right] = temp; @@ -390,7 +396,7 @@ __device__ void PBSort( * @return The number with all bits after the most significant * set bit reversed */ -__device__ int BitReversePerm(int x) +__device__ int bit_reverse_perm(int x) { int clz = __clz(x); @@ -414,13 +420,13 @@ __device__ int BitReversePerm(int x) * @param lowest_level_start Index of the first node in the last level of the * heap */ -__device__ int InsertionOrderIndex(int x, int lowest_level_start) +__device__ int insertion_order_index(int x, int lowest_level_start) { assert(x > 0); if (x >= lowest_level_start) { return x; } - return BitReversePerm(x); + return bit_reverse_perm(x); } /** @@ -431,10 +437,10 @@ __device__ int InsertionOrderIndex(int x, int lowest_level_start) * heap * @return The index of the parent of x */ -__device__ int Parent(int x, int lowest_level_start) +__device__ int parent(int x, int lowest_level_start) { assert(x > 0); - if (x >= lowest_level_start) { return BitReversePerm(x) / 2; } + if (x >= lowest_level_start) { return bit_reverse_perm(x) / 2; } return x / 2; } @@ -447,12 +453,12 @@ __device__ int Parent(int x, int lowest_level_start) * heap * @return The index of the left child 
of x */ -__device__ int LeftChild(int x, int lowest_level_start) +__device__ int left_child(int x, int lowest_level_start) { assert(x > 0); int result = x * 2; - if (result >= lowest_level_start) { result = BitReversePerm(result); } + if (result >= lowest_level_start) { result = bit_reverse_perm(result); } return result; } @@ -465,18 +471,18 @@ __device__ int LeftChild(int x, int lowest_level_start) * heap * @return The index of the right child of x */ -__device__ int RightChild(int x, int lowest_level_start) +__device__ int right_child(int x, int lowest_level_start) { assert(x > 0); int result = x * 2 + 1; - if (result >= lowest_level_start) { result = BitReversePerm(result); } + if (result >= lowest_level_start) { result = bit_reverse_perm(result); } return result; } /** - * Swim node cur_node up the heap + * swim node cur_node up the heap * Pre: g must hold the lock corresponding to cur_node * * @param g The cooperative group that will perform the operation @@ -491,58 +497,59 @@ __device__ int RightChild(int x, int lowest_level_start) * @param compare Comparison operator ordering the elements in the heap */ template -__device__ void Swim(CG const& g, +__device__ void swim(CG const& g, int cur_node, T* heap, int* size, size_t node_size, int* locks, int lowest_level_start, - SharedMemoryLayout shmem, + shared_memory_layout shmem, Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); - int parent = Parent(cur_node, lowest_level_start); + int cur_parent = parent(cur_node, lowest_level_start); - // Swim the new node up the tree + // swim the new node up the tree while (cur_node != 1) { - AcquireLock(g, &(locks[parent])); + acquire_lock(g, &(locks[cur_parent])); // If the heap property is already satisfied for this node and its // parent we are done - if (!compare(heap[cur_node * node_size], heap[parent * node_size + node_size - 1])) { - ReleaseLock(g, &(locks[parent])); + if (!compare(heap[cur_node * node_size], + heap[cur_parent * node_size + node_size - 1])) { + release_lock(g, &(locks[cur_parent])); break; } - MergeAndSort(g, - &heap[parent * node_size], - &heap[cur_node * node_size], - shmem.A, - shmem.B, - node_size, - shmem, - compare); + merge_and_sort(g, + &heap[cur_parent * node_size], + &heap[cur_node * node_size], + shmem.a, + shmem.b, + node_size, + shmem, + compare); g.sync(); - CopyPairs(g, &heap[parent * node_size], shmem.A, node_size); - CopyPairs(g, &heap[cur_node * node_size], shmem.B, node_size); + copy_pairs(g, &heap[cur_parent * node_size], shmem.a, node_size); + copy_pairs(g, &heap[cur_node * node_size], shmem.b, node_size); g.sync(); - ReleaseLock(g, &(locks[cur_node])); - cur_node = parent; - parent = Parent(cur_node, lowest_level_start); + release_lock(g, &(locks[cur_node])); + cur_node = cur_parent; + cur_parent = parent(cur_node, lowest_level_start); } - ReleaseLock(g, &(locks[cur_node])); + release_lock(g, &(locks[cur_node])); } /** - * Sink the root down the heap + * sink the root down the heap * Pre: g must hold the root's lock * * @param g The cooperative group that will perform the operation @@ -557,7 +564,7 @@ __device__ void Swim(CG const& g, * @param compare Comparison operator ordering the elements in the heap */ template -__device__ void Sink(CG const& g, +__device__ void sink(CG const& g, T* heap, int* size, size_t node_size, @@ -565,45 +572,46 @@ __device__ void Sink(CG const& g, size_t* p_buffer_size, int lowest_level_start, int node_capacity, - SharedMemoryLayout shmem, + shared_memory_layout shmem, Compare const& compare) { size_t 
cur = kRootIdx; int dim = g.size(); - // Sink the node - while (InsertionOrderIndex(LeftChild(cur, lowest_level_start), lowest_level_start) <= - node_capacity) { - size_t left = LeftChild(cur, lowest_level_start); - size_t right = RightChild(cur, lowest_level_start); + // sink the node + while (insertion_order_index(left_child(cur, lowest_level_start), + lowest_level_start) <= node_capacity) { + size_t left = left_child(cur, lowest_level_start); + size_t right = right_child(cur, lowest_level_start); - AcquireLock(g, &locks[left]); + acquire_lock(g, &locks[left]); // The left node might have been removed // since the while loop condition, in which // case we are already at the bottom of the heap - if (InsertionOrderIndex(left, lowest_level_start) > *size) { - ReleaseLock(g, &locks[left]); + if (insertion_order_index(left, lowest_level_start) > *size) { + release_lock(g, &locks[left]); break; } size_t lo; - if (InsertionOrderIndex(right, lowest_level_start) <= node_capacity) { - AcquireLock(g, &locks[right]); + if (insertion_order_index(right, lowest_level_start) <= node_capacity) { + acquire_lock(g, &locks[right]); // Note that even with the bit reversal permutation, // we can never have a right child without a left child // // If we have both children, merge and sort them - if (InsertionOrderIndex(right, lowest_level_start) <= *size) { + if (insertion_order_index(right, lowest_level_start) <= *size) { size_t hi; // In order to ensure we preserve the heap property, // we put the largest node_size elements in the child // that previously contained the largest element - if (!compare(heap[(left + 1) * node_size - 1], heap[(right + 1) * node_size - 1])) { + if (!compare(heap[(left + 1) * node_size - 1], + heap[(right + 1) * node_size - 1])) { hi = left; lo = right; } else { @@ -614,52 +622,52 @@ __device__ void Sink(CG const& g, // Skip the merge and sort if the nodes are already correctly // sorted if (!compare(heap[(lo + 1) * node_size - 1], heap[hi * node_size])) { - MergeAndSort(g, - &heap[left * node_size], - &heap[right * node_size], - shmem.A, - shmem.B, - node_size, - shmem, - compare); + merge_and_sort(g, + &heap[left * node_size], + &heap[right * node_size], + shmem.a, + shmem.b, + node_size, + shmem, + compare); g.sync(); - CopyPairs(g, &heap[hi * node_size], shmem.B, node_size); - CopyPairs(g, &heap[lo * node_size], shmem.A, node_size); + copy_pairs(g, &heap[hi * node_size], shmem.b, node_size); + copy_pairs(g, &heap[lo * node_size], shmem.a, node_size); g.sync(); } - ReleaseLock(g, &locks[hi]); + release_lock(g, &locks[hi]); } else { lo = left; - ReleaseLock(g, &locks[right]); + release_lock(g, &locks[right]); } } else { lo = left; } - MergeAndSort(g, - &heap[lo * node_size], - &heap[cur * node_size], - shmem.A, - shmem.B, - node_size, - shmem, - compare); + merge_and_sort(g, + &heap[lo * node_size], + &heap[cur * node_size], + shmem.a, + shmem.b, + node_size, + shmem, + compare); g.sync(); - CopyPairs(g, &heap[lo * node_size], shmem.B, node_size); - CopyPairs(g, &heap[cur * node_size], shmem.A, node_size); + copy_pairs(g, &heap[lo * node_size], shmem.b, node_size); + copy_pairs(g, &heap[cur * node_size], shmem.a, node_size); g.sync(); - ReleaseLock(g, &locks[cur]); + release_lock(g, &locks[cur]); cur = lo; } - ReleaseLock(g, &locks[cur]); + release_lock(g, &locks[cur]); } /** @@ -678,38 +686,39 @@ __device__ void Sink(CG const& g, * @param compare Comparison operator ordering the elements in the heap */ template -__device__ void PushSingleNode(CG const& g, - InputIt elements, - T* 
heap, - int* size, - size_t node_size, - int* locks, - int lowest_level_start, - SharedMemoryLayout shmem, - Compare const& compare) +__device__ void push_single_node(CG const& g, + InputIt elements, + T* heap, + int* size, + size_t node_size, + int* locks, + int lowest_level_start, + shared_memory_layout shmem, + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); - CopyPairs(g, shmem.A, elements, elements + node_size); + copy_pairs(g, shmem.a, elements, elements + node_size); g.sync(); - PBSort(g, shmem.A, node_size, node_size, shmem.B, compare); + pb_sort(g, shmem.a, node_size, node_size, shmem.b, compare); int* cur_node_temp = (int*)shmem.intersections; if (lane == 0) { *cur_node_temp = atomicAdd(size, 1) + 1; } g.sync(); - int cur_node = InsertionOrderIndex(*cur_node_temp, lowest_level_start); + int cur_node = insertion_order_index(*cur_node_temp, lowest_level_start); - AcquireLock(g, &(locks[cur_node])); + acquire_lock(g, &(locks[cur_node])); - CopyPairs(g, &heap[cur_node * node_size], shmem.A, node_size); + copy_pairs(g, &heap[cur_node * node_size], shmem.a, node_size); g.sync(); - Swim(g, cur_node, heap, size, node_size, locks, lowest_level_start, shmem, compare); + swim(g, cur_node, heap, size, node_size, locks, lowest_level_start, + shmem, compare); } /** @@ -730,24 +739,24 @@ __device__ void PushSingleNode(CG const& g, * @param compare Comparison operator ordering the elements in the heap */ template -__device__ void PopSingleNode(CG const& g, - OutputIt elements, - T* heap, - int* size, - size_t node_size, - int* locks, - size_t* p_buffer_size, - int lowest_level_start, - int node_capacity, - SharedMemoryLayout shmem, - Compare const& compare) +__device__ void pop_single_node(CG const& g, + OutputIt elements, + T* heap, + int* size, + size_t node_size, + int* locks, + size_t* p_buffer_size, + int lowest_level_start, + int node_capacity, + shared_memory_layout shmem, + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); - AcquireLock(g, &locks[kRootIdx]); + acquire_lock(g, &locks[kRootIdx]); if (*size == 0) { - CopyPairs(g, elements, heap, node_size); + copy_pairs(g, elements, heap, node_size); if (lane == 0) { *p_buffer_size = 0; } g.sync(); @@ -757,9 +766,9 @@ __device__ void PopSingleNode(CG const& g, // Find the target node (the last one inserted) and // decrement the size - size_t tar = InsertionOrderIndex(*size, lowest_level_start); + size_t tar = insertion_order_index(*size, lowest_level_start); - if (tar != 1) { AcquireLock(g, &locks[tar]); } + if (tar != 1) { acquire_lock(g, &locks[tar]); } g.sync(); @@ -768,42 +777,42 @@ __device__ void PopSingleNode(CG const& g, // Copy the root to the output array - CopyPairs(g, elements, &heap[node_size], &heap[node_size] + node_size); + copy_pairs(g, elements, &heap[node_size], &heap[node_size] + node_size); g.sync(); // Copy the target node to the root if (tar != kRootIdx) { - CopyPairs(g, &heap[node_size], &heap[tar * node_size], node_size); + copy_pairs(g, &heap[node_size], &heap[tar * node_size], node_size); - ReleaseLock(g, &locks[tar]); + release_lock(g, &locks[tar]); g.sync(); } // Merge and sort the root and the partial buffer - MergeAndSort(g, - &heap[node_size], - &heap[kPBufferIdx], - shmem.A, - shmem.B, - node_size, - *p_buffer_size, - node_size, - shmem, - compare); + merge_and_sort(g, + &heap[node_size], + &heap[kPBufferIdx], + shmem.a, + shmem.b, + node_size, + *p_buffer_size, + node_size, + shmem, + compare); g.sync(); - CopyPairs(g, &heap[node_size], shmem.A, node_size); 
+ copy_pairs(g, &heap[node_size], shmem.a, node_size); - CopyPairs(g, heap, shmem.B, *p_buffer_size); + copy_pairs(g, heap, shmem.b, *p_buffer_size); g.sync(); - Sink(g, + sink(g, heap, size, node_size, @@ -833,49 +842,49 @@ __device__ void PopSingleNode(CG const& g, * @param compare Comparison operator ordering the elements in the heap */ template -__device__ void PopPartialNode(CG const& g, - InputIt elements, - size_t num_elements, - T* heap, - int* size, - size_t node_size, - int* locks, - size_t* p_buffer_size, - int lowest_level_start, - int node_capacity, - SharedMemoryLayout shmem, - Compare const& compare) +__device__ void pop_partial_node(CG const& g, + InputIt elements, + size_t num_elements, + T* heap, + int* size, + size_t node_size, + int* locks, + size_t* p_buffer_size, + int lowest_level_start, + int node_capacity, + shared_memory_layout shmem, + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); - AcquireLock(g, &locks[kRootIdx]); + acquire_lock(g, &locks[kRootIdx]); if (*size == 0) { - CopyPairs(g, elements, heap, num_elements); + copy_pairs(g, elements, heap, num_elements); g.sync(); size_t n_p_buffer_size = *p_buffer_size - num_elements; - CopyPairs(g, shmem.A, heap + num_elements, n_p_buffer_size); + copy_pairs(g, shmem.a, heap + num_elements, n_p_buffer_size); g.sync(); - CopyPairs(g, heap, shmem.A, n_p_buffer_size); + copy_pairs(g, heap, shmem.a, n_p_buffer_size); if (lane == 0) { *p_buffer_size = n_p_buffer_size; } - ReleaseLock(g, &locks[kRootIdx]); + release_lock(g, &locks[kRootIdx]); } else { - CopyPairs(g, elements, &heap[kRootIdx * node_size], num_elements); + copy_pairs(g, elements, &heap[kRootIdx * node_size], num_elements); g.sync(); if (*p_buffer_size >= num_elements) { - MergeAndSort(g, + merge_and_sort(g, &heap[kPBufferIdx], &heap[kRootIdx * node_size] + num_elements, - shmem.A, - shmem.B, + shmem.a, + shmem.b, *p_buffer_size, node_size - num_elements, node_size, @@ -888,12 +897,12 @@ __device__ void PopPartialNode(CG const& g, g.sync(); - CopyPairs(g, &heap[kRootIdx * node_size], shmem.A, node_size); - CopyPairs(g, &heap[kPBufferIdx], shmem.B, *p_buffer_size); + copy_pairs(g, &heap[kRootIdx * node_size], shmem.a, node_size); + copy_pairs(g, &heap[kPBufferIdx], shmem.b, *p_buffer_size); g.sync(); - Sink(g, + sink(g, heap, size, node_size, @@ -904,10 +913,10 @@ __device__ void PopPartialNode(CG const& g, shmem, compare); } else { - MergeAndSort(g, + merge_and_sort(g, &heap[kPBufferIdx], &heap[kRootIdx * node_size] + num_elements, - shmem.A, + shmem.a, (T*)nullptr, *p_buffer_size, node_size - num_elements, @@ -917,9 +926,9 @@ __device__ void PopPartialNode(CG const& g, g.sync(); - CopyPairs(g, &heap[kPBufferIdx], shmem.A, *p_buffer_size + node_size - num_elements); + copy_pairs(g, &heap[kPBufferIdx], shmem.a, *p_buffer_size + node_size - num_elements); - int tar = InsertionOrderIndex(*size, lowest_level_start); + int tar = insertion_order_index(*size, lowest_level_start); g.sync(); *p_buffer_size += node_size; @@ -930,33 +939,33 @@ __device__ void PopPartialNode(CG const& g, if (lane == 0) { *size -= 1; } if (tar != kRootIdx) { - AcquireLock(g, &locks[tar]); + acquire_lock(g, &locks[tar]); - CopyPairs(g, &heap[kRootIdx * node_size], &heap[tar * node_size], node_size); + copy_pairs(g, &heap[kRootIdx * node_size], &heap[tar * node_size], node_size); g.sync(); - ReleaseLock(g, &locks[tar]); + release_lock(g, &locks[tar]); - MergeAndSort(g, - &heap[node_size], - &heap[kPBufferIdx], - shmem.A, - shmem.B, - node_size, - *p_buffer_size, - 
node_size, - shmem, - compare); + merge_and_sort(g, + &heap[node_size], + &heap[kPBufferIdx], + shmem.a, + shmem.b, + node_size, + *p_buffer_size, + node_size, + shmem, + compare); g.sync(); - CopyPairs(g, &heap[node_size], shmem.A, node_size); + copy_pairs(g, &heap[node_size], shmem.a, node_size); - CopyPairs(g, heap, shmem.B, *p_buffer_size); + copy_pairs(g, heap, shmem.b, *p_buffer_size); g.sync(); - Sink(g, + sink(g, heap, size, node_size, @@ -967,7 +976,7 @@ __device__ void PopPartialNode(CG const& g, shmem, compare); } else { - ReleaseLock(g, &locks[kRootIdx]); + release_lock(g, &locks[kRootIdx]); } } } @@ -991,26 +1000,26 @@ __device__ void PopPartialNode(CG const& g, * @param compare Comparison operator ordering the elements in the heap */ template -__device__ void PushPartialNode(CG const& g, - InputIt elements, - size_t p_ins_size, - T* heap, - int* size, - size_t node_size, - int* locks, - size_t* p_buffer_size, - int lowest_level_start, - SharedMemoryLayout shmem, - Compare const& compare) +__device__ void push_partial_node(CG const& g, + InputIt elements, + size_t p_ins_size, + T* heap, + int* size, + size_t node_size, + int* locks, + size_t* p_buffer_size, + int lowest_level_start, + shared_memory_layout shmem, + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); - AcquireLock(g, &locks[kRootIdx]); + acquire_lock(g, &locks[kRootIdx]); - CopyPairs(g, shmem.B, elements, p_ins_size); + copy_pairs(g, shmem.b, elements, p_ins_size); - PBSort(g, shmem.B, p_ins_size, node_size, shmem.A, compare); + pb_sort(g, shmem.b, p_ins_size, node_size, shmem.a, compare); // There is enough data for a new node, in which case we // construct a new node and insert it @@ -1019,32 +1028,32 @@ __device__ void PushPartialNode(CG const& g, if (lane == 0) { *cur_node_temp = atomicAdd(size, 1) + 1; } g.sync(); - int cur_node = InsertionOrderIndex(*cur_node_temp, lowest_level_start); + int cur_node = insertion_order_index(*cur_node_temp, lowest_level_start); - if (cur_node != kRootIdx) { AcquireLock(g, &(locks[cur_node])); } + if (cur_node != kRootIdx) { acquire_lock(g, &(locks[cur_node])); } g.sync(); - MergeAndSort(g, - shmem.B, - &heap[kPBufferIdx], - &heap[cur_node * node_size], - shmem.A, - p_ins_size, - *p_buffer_size, - node_size, - shmem, - compare); + merge_and_sort(g, + shmem.b, + &heap[kPBufferIdx], + &heap[cur_node * node_size], + shmem.a, + p_ins_size, + *p_buffer_size, + node_size, + shmem, + compare); if (lane == 0) { *p_buffer_size = (*p_buffer_size + p_ins_size) - node_size; } g.sync(); - CopyPairs(g, heap, shmem.A, *p_buffer_size); + copy_pairs(g, heap, shmem.a, *p_buffer_size); - if (cur_node != kRootIdx) { ReleaseLock(g, &locks[kRootIdx]); } + if (cur_node != kRootIdx) { release_lock(g, &locks[kRootIdx]); } - Swim(g, cur_node, heap, size, node_size, locks, lowest_level_start, shmem, compare); + swim(g, cur_node, heap, size, node_size, locks, lowest_level_start, shmem, compare); } else { // There are not enough elements for a new node, @@ -1052,16 +1061,16 @@ __device__ void PushPartialNode(CG const& g, // the elements to be inserted and then the root // and the partial buffer - MergeAndSort(g, - shmem.B, - &heap[kPBufferIdx], - shmem.A, - (T*)nullptr, - p_ins_size, - *p_buffer_size, - node_size, - shmem, - compare); + merge_and_sort(g, + shmem.b, + &heap[kPBufferIdx], + shmem.a, + (T*)nullptr, + p_ins_size, + *p_buffer_size, + node_size, + shmem, + compare); g.sync(); @@ -1069,30 +1078,30 @@ __device__ void PushPartialNode(CG const& g, g.sync(); - CopyPairs(g, 
heap, shmem.A, *p_buffer_size); + copy_pairs(g, heap, shmem.a, *p_buffer_size); g.sync(); if (*size > 0) { - MergeAndSort(g, - &heap[node_size], - &heap[kPBufferIdx], - shmem.A, - shmem.B, - node_size, - *p_buffer_size, - node_size, - shmem, - compare); + merge_and_sort(g, + &heap[node_size], + &heap[kPBufferIdx], + shmem.a, + shmem.b, + node_size, + *p_buffer_size, + node_size, + shmem, + compare); g.sync(); - CopyPairs(g, heap, shmem.B, *p_buffer_size); + copy_pairs(g, heap, shmem.b, *p_buffer_size); - CopyPairs(g, &heap[node_size], shmem.A, node_size); + copy_pairs(g, &heap[node_size], shmem.a, node_size); g.sync(); } - ReleaseLock(g, &locks[kRootIdx]); + release_lock(g, &locks[kRootIdx]); } } @@ -1112,19 +1121,19 @@ __device__ void PushPartialNode(CG const& g, * @param compare Comparison operator ordering the elements in the heap */ template -__global__ void PushKernel(OutputIt elements, - size_t num_elements, - T* heap, - int* size, - size_t node_size, - int* locks, - size_t* p_buffer_size, - int lowest_level_start, - Compare compare) +__global__ void push_kernel(OutputIt elements, + size_t num_elements, + T* heap, + int* size, + size_t node_size, + int* locks, + size_t* p_buffer_size, + int lowest_level_start, + Compare compare) { extern __shared__ int s[]; - SharedMemoryLayout shmem = GetSharedMemoryLayout(s, blockDim.x, node_size); + shared_memory_layout shmem = get_shared_memory_layout(s, blockDim.x, node_size); // We push as many elements as possible as full nodes, // then deal with the remaining elements as a partial insertion @@ -1132,7 +1141,7 @@ __global__ void PushKernel(OutputIt elements, thread_block g = this_thread_block(); for (size_t i = blockIdx.x * node_size; i + node_size <= num_elements; i += gridDim.x * node_size) { - PushSingleNode( + push_single_node( g, elements + i, heap, size, node_size, locks, lowest_level_start, shmem, compare); } @@ -1145,7 +1154,7 @@ __global__ void PushKernel(OutputIt elements, if (first_not_inserted < num_elements) { size_t p_ins_size = num_elements - first_not_inserted; - PushPartialNode(g, + push_partial_node(g, elements + first_not_inserted, p_ins_size, heap, @@ -1174,34 +1183,35 @@ __global__ void PushKernel(OutputIt elements, * @param compare Comparison operator ordering the elements in the heap */ template -__global__ void PopKernel(OutputIt elements, - size_t num_elements, - T* heap, - int* size, - size_t node_size, - int* locks, - size_t* p_buffer_size, - int lowest_level_start, - int node_capacity, - Compare compare) +__global__ void pop_kernel(OutputIt elements, + size_t num_elements, + T* heap, + int* size, + size_t node_size, + int* locks, + size_t* p_buffer_size, + int lowest_level_start, + int node_capacity, + Compare compare) { extern __shared__ int s[]; - SharedMemoryLayout shmem = GetSharedMemoryLayout(s, blockDim.x, node_size); + shared_memory_layout shmem = get_shared_memory_layout(s, blockDim.x, + node_size); thread_block g = this_thread_block(); for (size_t i = blockIdx.x; i < num_elements / node_size; i += gridDim.x) { - PopSingleNode(g, - elements + i * node_size, - heap, - size, - node_size, - locks, - p_buffer_size, - lowest_level_start, - node_capacity, - shmem, - compare); + pop_single_node(g, + elements + i * node_size, + heap, + size, + node_size, + locks, + p_buffer_size, + lowest_level_start, + node_capacity, + shmem, + compare); } // We only need one block for partial deletion @@ -1213,18 +1223,18 @@ __global__ void PopKernel(OutputIt elements, if (first_not_inserted < num_elements) { size_t p_del_size = 
num_elements - first_not_inserted; - PopPartialNode(g, - elements + first_not_inserted, - p_del_size, - heap, - size, - node_size, - locks, - p_buffer_size, - lowest_level_start, - node_capacity, - shmem, - compare); + pop_partial_node(g, + elements + first_not_inserted, + p_del_size, + heap, + size, + node_size, + locks, + p_buffer_size, + lowest_level_start, + node_capacity, + shmem, + compare); } } diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 5b0750d33..59a7cee03 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -83,7 +83,8 @@ class priority_queue { * @param initial_capacity The number of elements the priority queue can hold * @param alloc Allocator used for allocating device storage */ - priority_queue(size_t initial_capacity, Allocator const& alloc = Allocator{}); + priority_queue(size_t initial_capacity, Allocator const& alloc = Allocator{}, + cudaStream_t stream = 0); /** * @brief Push elements into the priority queue @@ -202,17 +203,17 @@ class priority_queue { /// heap's lowest level int node_capacity_; ///< Capacity of the heap in nodes - T* d_heap_; ///< Pointer to an array of nodes, the 0th node - /// being the heap's partial buffer, and nodes - /// 1..(node_capacity_) being the heap, where - /// the 1st node is the root - int* d_size_; ///< Number of nodes currently in the heap - size_t* d_p_buffer_size_; ///< Number of elements currently in the partial - /// buffer - int* d_locks_; ///< Array of locks where `d_locks_[i]` is the - /// lock for the node starting at - /// d_heap_[node_size * i]` - Compare compare_{}; ///< Comparator used to order the elements in the queue + T* d_heap_; ///< Pointer to an array of nodes, the 0th node + /// being the heap's partial buffer, and nodes + /// 1..(node_capacity_) being the heap, where + /// the 1st node is the root + int* d_size_; ///< Number of nodes currently in the heap + size_t* d_p_buffer_size_; ///< Number of elements currently in the partial + /// buffer + int* d_locks_; ///< Array of locks where `d_locks_[i]` is the + /// lock for the node starting at + /// d_heap_[node_size * i]` + Compare compare_{}; ///< Comparator used to order the elements in the queue }; /* @@ -241,25 +242,25 @@ class priority_queue { /// heap's lowest level int node_capacity_; ///< Capacity of the heap in nodes - T* d_heap_; ///< Pointer to an array of nodes, the 0th node - /// being the heap's partial buffer, and nodes - /// 1..(node_capacity_) being the heap, where the - /// 1st node is the root - int* d_size_; ///< Number of nodes currently in the heap - size_t* d_p_buffer_size_; ///< Number of elements currently in the partial - /// buffer - int* d_locks_; ///< Array of locks where `d_locks_[i]` is the - /// lock for the node starting at - /// d_heap_[node_size * i]` + T* d_heap_; ///< Pointer to an array of nodes, the 0th node + /// being the heap's partial buffer, and nodes + /// 1..(node_capacity_) being the heap, where the + /// 1st node is the root + int* d_size_; ///< Number of nodes currently in the heap + size_t* d_p_buffer_size_; ///< Number of elements currently in the partial + /// buffer + int* d_locks_; ///< Array of locks where `d_locks_[i]` is the + /// lock for the node starting at + /// d_heap_[node_size * i]` - int_allocator_type int_allocator_; ///< Allocator used to allocated ints - /// for example, the lock array - t_allocator_type t_allocator_; ///< Allocator used to allocate T's - /// and therefore nodes - size_t_allocator_type size_t_allocator_; ///< 
Allocator used to allocate - /// size_t's, e.g. d_p_buffer_size_ + int_allocator_type int_allocator_; ///< Allocator used to allocated ints + /// for example, the lock array + t_allocator_type t_allocator_; ///< Allocator used to allocate T's + /// and therefore nodes + size_t_allocator_type size_t_allocator_; ///< Allocator used to allocate + /// size_t's, e.g. d_p_buffer_size_ - Compare compare_{}; ///< Comparator used to order the elements in the queue + Compare compare_{}; ///< Comparator used to order the elements in the queue }; } // namespace cuco From bec63f32a489069ebbc4281704d8bda71136b3c7 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Sun, 12 Jun 2022 20:04:11 +0000 Subject: [PATCH 40/55] Put priority queue kernels in detail namespace --- include/cuco/detail/priority_queue.inl | 20 +++++++++---------- .../cuco/detail/priority_queue_kernels.cuh | 10 ++++++---- tests/priority_queue/priority_queue_test.cu | 5 +++-- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index 58ebdc8aa..2d5de37fb 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -76,7 +76,7 @@ void priority_queue::push(Inpu const int kBlockSize = min(256, (int)node_size_); const int kNumBlocks = min(64000, max(1, (int)((last - first) / node_size_))); - push_kernel<<>>(first, + detail::push_kernel<<>>(first, last - first, d_heap_, d_size_, @@ -101,7 +101,7 @@ void priority_queue::pop(Outpu const int kBlockSize = min(256, (int)node_size_); const int kNumBlocks = min(64000, max(1, (int)((pop_size - partial) / node_size_))); - pop_kernel<<>>(first, + detail::pop_kernel<<>>(first, pop_size, d_heap_, d_size_, @@ -121,12 +121,12 @@ __device__ void priority_queue::device_mutable_view::push( CG const& g, InputIt first, InputIt last, void* temp_storage) { - shared_memory_layout shmem = - get_shared_memory_layout((int*)temp_storage, g.size(), node_size_); + detail::shared_memory_layout shmem = + detail::get_shared_memory_layout((int*)temp_storage, g.size(), node_size_); auto push_size = last - first; for (size_t i = 0; i < push_size / node_size_; i++) { - push_single_node(g, + detail::push_single_node(g, first + i * node_size_, d_heap_, d_size_, @@ -138,7 +138,7 @@ priority_queue::device_mutable } if (push_size % node_size_ != 0) { - push_partial_node(g, + detail::push_partial_node(g, first + (push_size / node_size_) * node_size_, push_size % node_size_, d_heap_, @@ -158,12 +158,12 @@ __device__ void priority_queue::device_mutable_view::pop( CG const& g, OutputIt first, OutputIt last, void* temp_storage) { - shared_memory_layout shmem = - get_shared_memory_layout((int*)temp_storage, g.size(), node_size_); + detail::shared_memory_layout shmem = + detail::get_shared_memory_layout((int*)temp_storage, g.size(), node_size_); auto pop_size = last - first; for (size_t i = 0; i < pop_size / node_size_; i++) { - pop_single_node(g, + detail::pop_single_node(g, first + i * node_size_, d_heap_, d_size_, @@ -177,7 +177,7 @@ priority_queue::device_mutable } if (pop_size % node_size_ != 0) { - pop_partial_node(g, + detail::pop_partial_node(g, first + (pop_size / node_size_) * node_size_, last - first, d_heap_, diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index 612c91be4..4192d8033 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -20,9 +20,9 @@ #include -using namespace 
cooperative_groups; - namespace cuco { +namespace detail { +namespace cg = cooperative_groups; constexpr int kPBufferIdx = 0; constexpr int kRootIdx = 1; @@ -1138,7 +1138,7 @@ __global__ void push_kernel(OutputIt elements, // We push as many elements as possible as full nodes, // then deal with the remaining elements as a partial insertion // below - thread_block g = this_thread_block(); + cg::thread_block g = cg::this_thread_block(); for (size_t i = blockIdx.x * node_size; i + node_size <= num_elements; i += gridDim.x * node_size) { push_single_node( @@ -1199,7 +1199,7 @@ __global__ void pop_kernel(OutputIt elements, shared_memory_layout shmem = get_shared_memory_layout(s, blockDim.x, node_size); - thread_block g = this_thread_block(); + cg::thread_block g = cg::this_thread_block(); for (size_t i = blockIdx.x; i < num_elements / node_size; i += gridDim.x) { pop_single_node(g, elements + i * node_size, @@ -1238,4 +1238,6 @@ __global__ void pop_kernel(OutputIt elements, } } +} // namespace detail + } // namespace cuco diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 340f077b3..6b0430877 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -28,6 +28,7 @@ #include using namespace cuco; +namespace cg = cooperative_groups; template struct KVPair { @@ -289,7 +290,7 @@ template __global__ void DeviceAPIInsert(View view, InputIt begin, InputIt end) { extern __shared__ int shmem[]; - thread_block g = this_thread_block(); + cg::thread_block g = cg::this_thread_block(); view.push(g, begin, end, shmem); } @@ -297,7 +298,7 @@ template __global__ void DeviceAPIDelete(View view, OutputIt begin, OutputIt end) { extern __shared__ int shmem[]; - thread_block g = this_thread_block(); + cg::thread_block g = cg::this_thread_block(); view.pop(g, begin, end, shmem); } From aa124042d1136a715d18f2c3c1879c31c30fe3b3 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Mon, 13 Jun 2022 01:52:34 +0000 Subject: [PATCH 41/55] generate_keys_uniform -> generate_kv_pairs_uniform --- benchmarks/priority_queue/priority_queue_bench.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/benchmarks/priority_queue/priority_queue_bench.cu b/benchmarks/priority_queue/priority_queue_bench.cu index 64e0679ac..8af8280fc 100644 --- a/benchmarks/priority_queue/priority_queue_bench.cu +++ b/benchmarks/priority_queue/priority_queue_bench.cu @@ -33,7 +33,8 @@ struct pair_less { }; template -static void generate_keys_uniform(OutputIt output_begin, OutputIt output_end) +static void generate_kv_pairs_uniform(OutputIt output_begin, + OutputIt output_end) { std::random_device rd; std::mt19937 gen{rd()}; @@ -55,7 +56,7 @@ static void BM_insert(::benchmark::State& state) NumKeys); std::vector> h_pairs(NumKeys); - generate_keys_uniform(h_pairs.begin(), h_pairs.end()); + generate_kv_pairs_uniform(h_pairs.begin(), h_pairs.end()); thrust::device_vector> d_pairs(h_pairs); state.ResumeTiming(); @@ -74,7 +75,7 @@ static void BM_delete(::benchmark::State& state) NumKeys); std::vector> h_pairs(NumKeys); - generate_keys_uniform(h_pairs.begin(), h_pairs.end()); + generate_kv_pairs_uniform(h_pairs.begin(), h_pairs.end()); thrust::device_vector> d_pairs(h_pairs); pq.push(d_pairs.begin(), d_pairs.end()); From 55cf2e6bb21c5258de7504af1e61dba00ae7f07c Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Mon, 13 Jun 2022 02:21:05 +0000 Subject: [PATCH 42/55] Remove FavorInsertionPerformance template parameter --- 
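For context, a minimal sketch of how the device-side view exercised by the DeviceAPIInsert test kernel above can be driven from the host; the view accessor and temp-storage helper used here (get_mutable_device_view, get_shmem_size) are assumptions for illustration only, not something this patch series defines:

// Launch the DeviceAPIInsert test kernel with one cooperative thread block and
// the dynamic shared memory its view.push() call needs as temp storage.
template <typename Queue, typename InputIt>
void device_view_example(Queue& pq, InputIt first, InputIt last)
{
  auto view = pq.get_mutable_device_view();                 // assumed accessor name
  const int block_size  = 256;
  const int shmem_bytes = view.get_shmem_size(block_size);  // assumed temp-storage helper
  DeviceAPIInsert<<<1, block_size, shmem_bytes>>>(view, first, last);
  CUCO_CUDA_TRY(cudaDeviceSynchronize());
}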
include/cuco/detail/priority_queue.inl | 34 +++++++++++++------------- include/cuco/priority_queue.cuh | 5 ---- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index 2d5de37fb..3a966c8a3 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -23,12 +23,12 @@ namespace cuco { -template -priority_queue::priority_queue( +template +priority_queue::priority_queue( size_t initial_capacity, Allocator const& allocator, cudaStream_t stream) : int_allocator_{allocator}, t_allocator_{allocator}, size_t_allocator_{allocator} { - node_size_ = NodeSize; + node_size_ = 64; // Round up to the nearest multiple of node size int nodes = ((initial_capacity + node_size_ - 1) / node_size_); @@ -56,8 +56,8 @@ priority_queue::priority_queue stream)); } -template -priority_queue::~priority_queue() +template +priority_queue::~priority_queue() { std::allocator_traits::deallocate(int_allocator_, d_size_, 1); std::allocator_traits::deallocate(size_t_allocator_, d_p_buffer_size_, 1); @@ -67,11 +67,11 @@ priority_queue::~priority_queu int_allocator_, d_locks_, node_capacity_ + 1); } -template +template template -void priority_queue::push(InputIt first, - InputIt last, - cudaStream_t stream) +void priority_queue::push(InputIt first, + InputIt last, + cudaStream_t stream) { const int kBlockSize = min(256, (int)node_size_); const int kNumBlocks = min(64000, max(1, (int)((last - first) / node_size_))); @@ -89,11 +89,11 @@ void priority_queue::push(Inpu CUCO_CUDA_TRY(cudaGetLastError()); } -template +template template -void priority_queue::pop(OutputIt first, - OutputIt last, - cudaStream_t stream) +void priority_queue::pop(OutputIt first, + OutputIt last, + cudaStream_t stream) { int pop_size = last - first; const int partial = pop_size % node_size_; @@ -115,10 +115,10 @@ void priority_queue::pop(Outpu CUCO_CUDA_TRY(cudaGetLastError()); } -template +template template __device__ void -priority_queue::device_mutable_view::push( +priority_queue::device_mutable_view::push( CG const& g, InputIt first, InputIt last, void* temp_storage) { detail::shared_memory_layout shmem = @@ -152,10 +152,10 @@ priority_queue::device_mutable } } -template +template template __device__ void -priority_queue::device_mutable_view::pop( +priority_queue::device_mutable_view::pop( CG const& g, OutputIt first, OutputIt last, void* temp_storage) { detail::shared_memory_layout shmem = diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 59a7cee03..fd3d521fd 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -59,13 +59,10 @@ namespace cuco { * * @tparam T Type of the elements stored in the queue * @tparam Compare Comparison operator used to order the elements in the queue - * @tparam FavorInsertionPerformance When true, insertion performance is - * increased at the expense of deletion performance. * @tparam Allocator Allocator defining how memory is allocated internally */ template , - bool FavorInsertionPerformance = false, typename Allocator = cuco::cuda_allocator> class priority_queue { using int_allocator_type = typename std::allocator_traits::rebind_alloc; @@ -74,8 +71,6 @@ class priority_queue { using size_t_allocator_type = typename std::allocator_traits::rebind_alloc; - const int NodeSize = FavorInsertionPerformance ? 
64 : 1024; - public: /** * @brief Construct a priority queue From f4814dbe5df96c06798ec13a8cfa671673c8b6ba Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Wed, 15 Jun 2022 04:17:31 +0000 Subject: [PATCH 43/55] Default node size 64 -> 1024 --- include/cuco/detail/priority_queue.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index 3a966c8a3..6cbf3f3b0 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -28,7 +28,7 @@ priority_queue::priority_queue( size_t initial_capacity, Allocator const& allocator, cudaStream_t stream) : int_allocator_{allocator}, t_allocator_{allocator}, size_t_allocator_{allocator} { - node_size_ = 64; + node_size_ = 1024; // Round up to the nearest multiple of node size int nodes = ((initial_capacity + node_size_ - 1) / node_size_); From 89eea1899e37086e1b063824d2698574dde149be Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Wed, 15 Jun 2022 04:31:44 +0000 Subject: [PATCH 44/55] Avoid c-style expressions in priority queue files --- include/cuco/detail/priority_queue.inl | 72 +++++++++-------- .../cuco/detail/priority_queue_kernels.cuh | 78 ++++++++++--------- include/cuco/priority_queue.cuh | 18 ++--- 3 files changed, 88 insertions(+), 80 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index 6cbf3f3b0..c03ac02d1 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -25,7 +25,7 @@ namespace cuco { template priority_queue::priority_queue( - size_t initial_capacity, Allocator const& allocator, cudaStream_t stream) + std::size_t initial_capacity, Allocator const& allocator, cudaStream_t stream) : int_allocator_{allocator}, t_allocator_{allocator}, size_t_allocator_{allocator} { node_size_ = 1024; @@ -34,7 +34,7 @@ priority_queue::priority_queue( int nodes = ((initial_capacity + node_size_ - 1) / node_size_); node_capacity_ = nodes; - lowest_level_start_ = 1 << (int)log2(nodes); + lowest_level_start_ = 1 << static_cast(std::log2(nodes)); // Allocate device variables @@ -44,7 +44,7 @@ priority_queue::priority_queue( d_p_buffer_size_ = std::allocator_traits::allocate(size_t_allocator_, 1); - CUCO_CUDA_TRY(cudaMemsetAsync(d_p_buffer_size_, 0, sizeof(size_t), stream)); + CUCO_CUDA_TRY(cudaMemsetAsync(d_p_buffer_size_, 0, sizeof(std::size_t), stream)); d_heap_ = std::allocator_traits::allocate( t_allocator_, node_capacity_ * node_size_ + node_size_); @@ -73,18 +73,21 @@ void priority_queue::push(InputIt first, InputIt last, cudaStream_t stream) { - const int kBlockSize = min(256, (int)node_size_); - const int kNumBlocks = min(64000, max(1, (int)((last - first) / node_size_))); - - detail::push_kernel<<>>(first, - last - first, - d_heap_, - d_size_, - node_size_, - d_locks_, - d_p_buffer_size_, - lowest_level_start_, - compare_); + const int block_size = 256; + + const int num_nodes = static_cast((last - first) / node_size_) + 1; + const int num_blocks = std::min(64000, num_nodes); + + detail::push_kernel<<>>(first, + last - first, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + compare_); CUCO_CUDA_TRY(cudaGetLastError()); } @@ -95,22 +98,25 @@ void priority_queue::pop(OutputIt first, OutputIt last, cudaStream_t stream) { - int pop_size = last - first; - const int partial = pop_size % node_size_; - - const int kBlockSize = min(256, (int)node_size_); - const int kNumBlocks = 
min(64000, max(1, (int)((pop_size - partial) / node_size_))); - - detail::pop_kernel<<>>(first, - pop_size, - d_heap_, - d_size_, - node_size_, - d_locks_, - d_p_buffer_size_, - lowest_level_start_, - node_capacity_, - compare_); + + + const int block_size = 256; + const int pop_size = last - first; + + const int num_nodes = static_cast(pop_size / node_size_) + 1; + const int num_blocks = std::min(64000, num_nodes); + + detail::pop_kernel<<>>(first, + pop_size, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + node_capacity_, + compare_); CUCO_CUDA_TRY(cudaGetLastError()); } @@ -125,7 +131,7 @@ priority_queue::device_mutable_view::push( detail::get_shared_memory_layout((int*)temp_storage, g.size(), node_size_); auto push_size = last - first; - for (size_t i = 0; i < push_size / node_size_; i++) { + for (std::size_t i = 0; i < push_size / node_size_; i++) { detail::push_single_node(g, first + i * node_size_, d_heap_, @@ -162,7 +168,7 @@ priority_queue::device_mutable_view::pop( detail::get_shared_memory_layout((int*)temp_storage, g.size(), node_size_); auto pop_size = last - first; - for (size_t i = 0; i < pop_size / node_size_; i++) { + for (std::size_t i = 0; i < pop_size / node_size_; i++) { detail::pop_single_node(g, first + i * node_size_, d_heap_, diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index 4192d8033..3e139f68d 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -51,7 +51,7 @@ struct shared_memory_layout { */ template __device__ shared_memory_layout get_shared_memory_layout(int* s, int dim, - size_t node_size) + std::size_t node_size) { shared_memory_layout result; result.intersections = s; @@ -119,7 +119,7 @@ __device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, */ template __device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, - size_t num_pairs) + std::size_t num_pairs) { copy_pairs(g, dst_start, src_start, src_start + num_pairs); } @@ -146,7 +146,7 @@ __device__ void merge_and_sort(CG const& g, T* b, T* lo, T* hi, - size_t node_size, + std::size_t node_size, shared_memory_layout shmem, Compare const& compare) { @@ -182,9 +182,9 @@ __device__ void merge_and_sort(CG const& g, T* b, T* lo, T* hi, - size_t num_elements_a, - size_t num_elements_b, - size_t node_size, + std::size_t num_elements_a, + std::size_t num_elements_b, + std::size_t node_size, shared_memory_layout shmem, Compare const& compare) { @@ -322,7 +322,8 @@ __device__ void merge_and_sort(CG const& g, */ template __device__ void pb_sort( - CG const& g, T* start, size_t len, size_t node_size, T* temp, Compare const& compare) + CG const& g, T* start, std::size_t len, std::size_t node_size, T* temp, + Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); @@ -501,7 +502,7 @@ __device__ void swim(CG const& g, int cur_node, T* heap, int* size, - size_t node_size, + std::size_t node_size, int* locks, int lowest_level_start, shared_memory_layout shmem, @@ -567,23 +568,23 @@ template __device__ void sink(CG const& g, T* heap, int* size, - size_t node_size, + std::size_t node_size, int* locks, - size_t* p_buffer_size, + std::size_t* p_buffer_size, int lowest_level_start, int node_capacity, shared_memory_layout shmem, Compare const& compare) { - size_t cur = kRootIdx; + std::size_t cur = kRootIdx; int dim = g.size(); // sink the node while (insertion_order_index(left_child(cur, 
lowest_level_start), lowest_level_start) <= node_capacity) { - size_t left = left_child(cur, lowest_level_start); - size_t right = right_child(cur, lowest_level_start); + std::size_t left = left_child(cur, lowest_level_start); + std::size_t right = right_child(cur, lowest_level_start); acquire_lock(g, &locks[left]); @@ -595,7 +596,7 @@ __device__ void sink(CG const& g, break; } - size_t lo; + std::size_t lo; if (insertion_order_index(right, lowest_level_start) <= node_capacity) { acquire_lock(g, &locks[right]); @@ -605,7 +606,7 @@ __device__ void sink(CG const& g, // // If we have both children, merge and sort them if (insertion_order_index(right, lowest_level_start) <= *size) { - size_t hi; + std::size_t hi; // In order to ensure we preserve the heap property, // we put the largest node_size elements in the child @@ -690,7 +691,7 @@ __device__ void push_single_node(CG const& g, InputIt elements, T* heap, int* size, - size_t node_size, + std::size_t node_size, int* locks, int lowest_level_start, shared_memory_layout shmem, @@ -743,9 +744,9 @@ __device__ void pop_single_node(CG const& g, OutputIt elements, T* heap, int* size, - size_t node_size, + std::size_t node_size, int* locks, - size_t* p_buffer_size, + std::size_t* p_buffer_size, int lowest_level_start, int node_capacity, shared_memory_layout shmem, @@ -766,7 +767,7 @@ __device__ void pop_single_node(CG const& g, // Find the target node (the last one inserted) and // decrement the size - size_t tar = insertion_order_index(*size, lowest_level_start); + std::size_t tar = insertion_order_index(*size, lowest_level_start); if (tar != 1) { acquire_lock(g, &locks[tar]); } @@ -844,12 +845,12 @@ __device__ void pop_single_node(CG const& g, template __device__ void pop_partial_node(CG const& g, InputIt elements, - size_t num_elements, + std::size_t num_elements, T* heap, int* size, - size_t node_size, + std::size_t node_size, int* locks, - size_t* p_buffer_size, + std::size_t* p_buffer_size, int lowest_level_start, int node_capacity, shared_memory_layout shmem, @@ -864,7 +865,7 @@ __device__ void pop_partial_node(CG const& g, copy_pairs(g, elements, heap, num_elements); g.sync(); - size_t n_p_buffer_size = *p_buffer_size - num_elements; + std::size_t n_p_buffer_size = *p_buffer_size - num_elements; copy_pairs(g, shmem.a, heap + num_elements, n_p_buffer_size); @@ -1002,12 +1003,12 @@ __device__ void pop_partial_node(CG const& g, template __device__ void push_partial_node(CG const& g, InputIt elements, - size_t p_ins_size, + std::size_t p_ins_size, T* heap, int* size, - size_t node_size, + std::size_t node_size, int* locks, - size_t* p_buffer_size, + std::size_t* p_buffer_size, int lowest_level_start, shared_memory_layout shmem, Compare const& compare) @@ -1122,12 +1123,12 @@ __device__ void push_partial_node(CG const& g, */ template __global__ void push_kernel(OutputIt elements, - size_t num_elements, + std::size_t num_elements, T* heap, int* size, - size_t node_size, + std::size_t node_size, int* locks, - size_t* p_buffer_size, + std::size_t* p_buffer_size, int lowest_level_start, Compare compare) { @@ -1139,7 +1140,7 @@ __global__ void push_kernel(OutputIt elements, // then deal with the remaining elements as a partial insertion // below cg::thread_block g = cg::this_thread_block(); - for (size_t i = blockIdx.x * node_size; i + node_size <= num_elements; + for (std::size_t i = blockIdx.x * node_size; i + node_size <= num_elements; i += gridDim.x * node_size) { push_single_node( g, elements + i, heap, size, node_size, locks, lowest_level_start, 
shmem, compare); @@ -1150,10 +1151,10 @@ __global__ void push_kernel(OutputIt elements, // If node_size does not divide num_elements, there are some leftover // elements for which we must perform a partial insertion - size_t first_not_inserted = (num_elements / node_size) * node_size; + std::size_t first_not_inserted = (num_elements / node_size) * node_size; if (first_not_inserted < num_elements) { - size_t p_ins_size = num_elements - first_not_inserted; + std::size_t p_ins_size = num_elements - first_not_inserted; push_partial_node(g, elements + first_not_inserted, p_ins_size, @@ -1184,12 +1185,12 @@ __global__ void push_kernel(OutputIt elements, */ template __global__ void pop_kernel(OutputIt elements, - size_t num_elements, + std::size_t num_elements, T* heap, int* size, - size_t node_size, + std::size_t node_size, int* locks, - size_t* p_buffer_size, + std::size_t* p_buffer_size, int lowest_level_start, int node_capacity, Compare compare) @@ -1200,7 +1201,8 @@ __global__ void pop_kernel(OutputIt elements, node_size); cg::thread_block g = cg::this_thread_block(); - for (size_t i = blockIdx.x; i < num_elements / node_size; i += gridDim.x) { + for (std::size_t i = blockIdx.x; i < num_elements / node_size; + i += gridDim.x) { pop_single_node(g, elements + i * node_size, heap, @@ -1219,10 +1221,10 @@ __global__ void pop_kernel(OutputIt elements, // If node_size does not divide num_elements, there are some leftover // elements for which we must perform a partial deletion - size_t first_not_inserted = (num_elements / node_size) * node_size; + std::size_t first_not_inserted = (num_elements / node_size) * node_size; if (first_not_inserted < num_elements) { - size_t p_del_size = num_elements - first_not_inserted; + std::size_t p_del_size = num_elements - first_not_inserted; pop_partial_node(g, elements + first_not_inserted, p_del_size, diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index fd3d521fd..6066315e4 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -78,7 +78,7 @@ class priority_queue { * @param initial_capacity The number of elements the priority queue can hold * @param alloc Allocator used for allocating device storage */ - priority_queue(size_t initial_capacity, Allocator const& alloc = Allocator{}, + priority_queue(std::size_t initial_capacity, Allocator const& alloc = Allocator{}, cudaStream_t stream = 0); /** @@ -172,10 +172,10 @@ class priority_queue { return intersection_bytes + 2 * node_bytes; } - __host__ __device__ device_mutable_view(size_t node_size, + __host__ __device__ device_mutable_view(std::size_t node_size, T* d_heap, int* d_size, - size_t* d_p_buffer_size, + std::size_t* d_p_buffer_size, int* d_locks, int lowest_level_start, int node_capacity, @@ -192,7 +192,7 @@ class priority_queue { } private: - size_t node_size_; ///< Size of the heap's nodes (i.e. number of T's + std::size_t node_size_; ///< Size of the heap's nodes (i.e. 
number of T's /// in each node) int lowest_level_start_; ///< Index in `d_heap_` of the first node in the /// heap's lowest level @@ -203,8 +203,8 @@ class priority_queue { /// 1..(node_capacity_) being the heap, where /// the 1st node is the root int* d_size_; ///< Number of nodes currently in the heap - size_t* d_p_buffer_size_; ///< Number of elements currently in the partial - /// buffer + std::size_t* d_p_buffer_size_; ///< Number of elements currently in the + /// partial buffer int* d_locks_; ///< Array of locks where `d_locks_[i]` is the /// lock for the node starting at /// d_heap_[node_size * i]` @@ -231,7 +231,7 @@ class priority_queue { } private: - size_t node_size_; ///< Size of the heap's nodes (i.e. number of T's + std::size_t node_size_; ///< Size of the heap's nodes (i.e. number of T's /// in each node) int lowest_level_start_; ///< Index in `d_heap_` of the first node in the /// heap's lowest level @@ -242,8 +242,8 @@ class priority_queue { /// 1..(node_capacity_) being the heap, where the /// 1st node is the root int* d_size_; ///< Number of nodes currently in the heap - size_t* d_p_buffer_size_; ///< Number of elements currently in the partial - /// buffer + std::size_t* d_p_buffer_size_; ///< Number of elements currently in the + /// partial buffer int* d_locks_; ///< Array of locks where `d_locks_[i]` is the /// lock for the node starting at /// d_heap_[node_size * i]` From 7d4720016c33a00c473c0c0a17609e18ae1d0399 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Wed, 15 Jun 2022 04:32:27 +0000 Subject: [PATCH 45/55] Remove FavorInsertionPerformance in priority queue benchmark --- .../priority_queue/priority_queue_bench.cu | 26 ++++++------------- 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/benchmarks/priority_queue/priority_queue_bench.cu b/benchmarks/priority_queue/priority_queue_bench.cu index 8af8280fc..a4eda8a1e 100644 --- a/benchmarks/priority_queue/priority_queue_bench.cu +++ b/benchmarks/priority_queue/priority_queue_bench.cu @@ -46,14 +46,13 @@ static void generate_kv_pairs_uniform(OutputIt output_begin, } } -template +template static void BM_insert(::benchmark::State& state) { for (auto _ : state) { state.PauseTiming(); - priority_queue, pair_less>, FavorInsertionPerformance> pq( - NumKeys); + priority_queue, pair_less>> pq(NumKeys); std::vector> h_pairs(NumKeys); generate_kv_pairs_uniform(h_pairs.begin(), h_pairs.end()); @@ -65,14 +64,13 @@ static void BM_insert(::benchmark::State& state) } } -template +template static void BM_delete(::benchmark::State& state) { for (auto _ : state) { state.PauseTiming(); - priority_queue, pair_less>, FavorInsertionPerformance> pq( - NumKeys); + priority_queue, pair_less>> pq(NumKeys); std::vector> h_pairs(NumKeys); generate_kv_pairs_uniform(h_pairs.begin(), h_pairs.end()); @@ -87,18 +85,10 @@ static void BM_delete(::benchmark::State& state) } } -BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, false)->Unit(benchmark::kMillisecond); +BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000)->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, false)->Unit(benchmark::kMillisecond); +BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000)->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, false)->Unit(benchmark::kMillisecond); +BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000)->Unit(benchmark::kMillisecond); -BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, false)->Unit(benchmark::kMillisecond); - -BENCHMARK_TEMPLATE(BM_insert, int, 
int, 128'000'000, true)->Unit(benchmark::kMillisecond); - -BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, true)->Unit(benchmark::kMillisecond); - -BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, true)->Unit(benchmark::kMillisecond); - -BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, true)->Unit(benchmark::kMillisecond); +BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000)->Unit(benchmark::kMillisecond); From 007316a890f1c95fa8aaa2ff05e8503e784a9797 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Jun 2022 04:33:43 +0000 Subject: [PATCH 46/55] [pre-commit.ci] auto code formatting --- .../priority_queue/priority_queue_bench.cu | 3 +- include/cuco/detail/priority_queue.inl | 162 +++++++++--------- .../cuco/detail/priority_queue_kernels.cuh | 115 ++++++------- include/cuco/priority_queue.cuh | 65 +++---- 4 files changed, 166 insertions(+), 179 deletions(-) diff --git a/benchmarks/priority_queue/priority_queue_bench.cu b/benchmarks/priority_queue/priority_queue_bench.cu index a4eda8a1e..261a5a6ad 100644 --- a/benchmarks/priority_queue/priority_queue_bench.cu +++ b/benchmarks/priority_queue/priority_queue_bench.cu @@ -33,8 +33,7 @@ struct pair_less { }; template -static void generate_kv_pairs_uniform(OutputIt output_begin, - OutputIt output_end) +static void generate_kv_pairs_uniform(OutputIt output_begin, OutputIt output_end) { std::random_device rd; std::mt19937 gen{rd()}; diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index c03ac02d1..375dcc0de 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -24,8 +24,9 @@ namespace cuco { template -priority_queue::priority_queue( - std::size_t initial_capacity, Allocator const& allocator, cudaStream_t stream) +priority_queue::priority_queue(std::size_t initial_capacity, + Allocator const& allocator, + cudaStream_t stream) : int_allocator_{allocator}, t_allocator_{allocator}, size_t_allocator_{allocator} { node_size_ = 1024; @@ -52,8 +53,7 @@ priority_queue::priority_queue( d_locks_ = std::allocator_traits::allocate(int_allocator_, node_capacity_ + 1); - CUCO_CUDA_TRY(cudaMemsetAsync(d_locks_, 0, sizeof(int) * (node_capacity_ + 1), - stream)); + CUCO_CUDA_TRY(cudaMemsetAsync(d_locks_, 0, sizeof(int) * (node_capacity_ + 1), stream)); } template @@ -69,132 +69,128 @@ priority_queue::~priority_queue() template template -void priority_queue::push(InputIt first, - InputIt last, - cudaStream_t stream) +void priority_queue::push(InputIt first, InputIt last, cudaStream_t stream) { const int block_size = 256; - const int num_nodes = static_cast((last - first) / node_size_) + 1; + const int num_nodes = static_cast((last - first) / node_size_) + 1; const int num_blocks = std::min(64000, num_nodes); - detail::push_kernel<<>>(first, - last - first, - d_heap_, - d_size_, - node_size_, - d_locks_, - d_p_buffer_size_, - lowest_level_start_, - compare_); + detail::push_kernel<<>>( + first, + last - first, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + compare_); CUCO_CUDA_TRY(cudaGetLastError()); } template template -void priority_queue::pop(OutputIt first, - OutputIt last, - cudaStream_t stream) +void priority_queue::pop(OutputIt first, OutputIt last, cudaStream_t stream) { - - const int block_size = 256; - const int pop_size = last - first; + const int pop_size = last - first; - const int num_nodes = static_cast(pop_size / node_size_) + 1; + 
const int num_nodes = static_cast(pop_size / node_size_) + 1; const int num_blocks = std::min(64000, num_nodes); - detail::pop_kernel<<>>(first, - pop_size, - d_heap_, - d_size_, - node_size_, - d_locks_, - d_p_buffer_size_, - lowest_level_start_, - node_capacity_, - compare_); + detail::pop_kernel<<>>( + first, + pop_size, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + node_capacity_, + compare_); CUCO_CUDA_TRY(cudaGetLastError()); } template template -__device__ void -priority_queue::device_mutable_view::push( - CG const& g, InputIt first, InputIt last, void* temp_storage) +__device__ void priority_queue::device_mutable_view::push(CG const& g, + InputIt first, + InputIt last, + void* temp_storage) { detail::shared_memory_layout shmem = - detail::get_shared_memory_layout((int*)temp_storage, g.size(), node_size_); + detail::get_shared_memory_layout((int*)temp_storage, g.size(), node_size_); auto push_size = last - first; for (std::size_t i = 0; i < push_size / node_size_; i++) { detail::push_single_node(g, - first + i * node_size_, - d_heap_, - d_size_, - node_size_, - d_locks_, - lowest_level_start_, - shmem, - compare_); + first + i * node_size_, + d_heap_, + d_size_, + node_size_, + d_locks_, + lowest_level_start_, + shmem, + compare_); } if (push_size % node_size_ != 0) { detail::push_partial_node(g, - first + (push_size / node_size_) * node_size_, - push_size % node_size_, - d_heap_, - d_size_, - node_size_, - d_locks_, - d_p_buffer_size_, - lowest_level_start_, - shmem, - compare_); + first + (push_size / node_size_) * node_size_, + push_size % node_size_, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + shmem, + compare_); } } template template -__device__ void -priority_queue::device_mutable_view::pop( - CG const& g, OutputIt first, OutputIt last, void* temp_storage) +__device__ void priority_queue::device_mutable_view::pop(CG const& g, + OutputIt first, + OutputIt last, + void* temp_storage) { detail::shared_memory_layout shmem = - detail::get_shared_memory_layout((int*)temp_storage, g.size(), node_size_); + detail::get_shared_memory_layout((int*)temp_storage, g.size(), node_size_); auto pop_size = last - first; for (std::size_t i = 0; i < pop_size / node_size_; i++) { detail::pop_single_node(g, - first + i * node_size_, - d_heap_, - d_size_, - node_size_, - d_locks_, - d_p_buffer_size_, - lowest_level_start_, - node_capacity_, - shmem, - compare_); + first + i * node_size_, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + node_capacity_, + shmem, + compare_); } if (pop_size % node_size_ != 0) { detail::pop_partial_node(g, - first + (pop_size / node_size_) * node_size_, - last - first, - d_heap_, - d_size_, - node_size_, - d_locks_, - d_p_buffer_size_, - lowest_level_start_, - node_capacity_, - shmem, - compare_); + first + (pop_size / node_size_) * node_size_, + last - first, + d_heap_, + d_size_, + node_size_, + d_locks_, + d_p_buffer_size_, + lowest_level_start_, + node_capacity_, + shmem, + compare_); } } diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index 3e139f68d..977653a45 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -50,8 +50,7 @@ struct shared_memory_layout { * @returns The memory layout for the given group dimension and node size */ template -__device__ shared_memory_layout get_shared_memory_layout(int* s, 
int dim, - std::size_t node_size) +__device__ shared_memory_layout get_shared_memory_layout(int* s, int dim, std::size_t node_size) { shared_memory_layout result; result.intersections = s; @@ -99,12 +98,10 @@ __device__ void release_lock(CG const& g, int* l) * @param src_end Iterator to the end of the source array */ template -__device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, - InputIt2 src_end) +__device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, InputIt2 src_end) { auto dst = dst_start + g.thread_rank(); - for (auto src = src_start + g.thread_rank(); src < src_end; - dst += g.size(), src += g.size()) { + for (auto src = src_start + g.thread_rank(); src < src_end; dst += g.size(), src += g.size()) { *dst = *src; } } @@ -118,7 +115,9 @@ __device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, * @param num_pairs Number of pairs to copy */ template -__device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, +__device__ void copy_pairs(CG const& g, + InputIt1 dst_start, + InputIt2 src_start, std::size_t num_pairs) { copy_pairs(g, dst_start, src_start, src_start + num_pairs); @@ -142,13 +141,13 @@ __device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, */ template __device__ void merge_and_sort(CG const& g, - T* a, - T* b, - T* lo, - T* hi, - std::size_t node_size, - shared_memory_layout shmem, - Compare const& compare) + T* a, + T* b, + T* lo, + T* hi, + std::size_t node_size, + shared_memory_layout shmem, + Compare const& compare) { merge_and_sort(g, a, b, lo, hi, node_size, node_size, node_size, shmem, compare); } @@ -322,8 +321,7 @@ __device__ void merge_and_sort(CG const& g, */ template __device__ void pb_sort( - CG const& g, T* start, std::size_t len, std::size_t node_size, T* temp, - Compare const& compare) + CG const& g, T* start, std::size_t len, std::size_t node_size, T* temp, Compare const& compare) { int lane = g.thread_rank(); int dim = g.size(); @@ -343,8 +341,7 @@ __device__ void pb_sort( int left = (i / jump) * jump * 2 + i % jump; int right = left + jump; if ((i / start_jump) % 2 == 0) { - if (!mask[left] || - (mask[right] && !compare(start[left], start[right]))) { + if (!mask[left] || (mask[right] && !compare(start[left], start[right]))) { auto temp = start[left]; start[left] = start[right]; start[right] = temp; @@ -354,8 +351,7 @@ __device__ void pb_sort( mask[right] = temp_mask; } } else { - if (!mask[right] || - (mask[left] && compare(start[left], start[right]))) { + if (!mask[right] || (mask[left] && compare(start[left], start[right]))) { auto temp = start[left]; start[left] = start[right]; start[right] = temp; @@ -519,8 +515,7 @@ __device__ void swim(CG const& g, // If the heap property is already satisfied for this node and its // parent we are done - if (!compare(heap[cur_node * node_size], - heap[cur_parent * node_size + node_size - 1])) { + if (!compare(heap[cur_node * node_size], heap[cur_parent * node_size + node_size - 1])) { release_lock(g, &(locks[cur_parent])); break; } @@ -542,8 +537,8 @@ __device__ void swim(CG const& g, g.sync(); release_lock(g, &(locks[cur_node])); - cur_node = cur_parent; - cur_parent = parent(cur_node, lowest_level_start); + cur_node = cur_parent; + cur_parent = parent(cur_node, lowest_level_start); } release_lock(g, &(locks[cur_node])); @@ -581,8 +576,8 @@ __device__ void sink(CG const& g, int dim = g.size(); // sink the node - while (insertion_order_index(left_child(cur, lowest_level_start), - lowest_level_start) 
<= node_capacity) { + while (insertion_order_index(left_child(cur, lowest_level_start), lowest_level_start) <= + node_capacity) { std::size_t left = left_child(cur, lowest_level_start); std::size_t right = right_child(cur, lowest_level_start); @@ -611,8 +606,7 @@ __device__ void sink(CG const& g, // In order to ensure we preserve the heap property, // we put the largest node_size elements in the child // that previously contained the largest element - if (!compare(heap[(left + 1) * node_size - 1], - heap[(right + 1) * node_size - 1])) { + if (!compare(heap[(left + 1) * node_size - 1], heap[(right + 1) * node_size - 1])) { hi = left; lo = right; } else { @@ -718,8 +712,7 @@ __device__ void push_single_node(CG const& g, g.sync(); - swim(g, cur_node, heap, size, node_size, locks, lowest_level_start, - shmem, compare); + swim(g, cur_node, heap, size, node_size, locks, lowest_level_start, shmem, compare); } /** @@ -882,15 +875,15 @@ __device__ void pop_partial_node(CG const& g, if (*p_buffer_size >= num_elements) { merge_and_sort(g, - &heap[kPBufferIdx], - &heap[kRootIdx * node_size] + num_elements, - shmem.a, - shmem.b, - *p_buffer_size, - node_size - num_elements, - node_size, - shmem, - compare); + &heap[kPBufferIdx], + &heap[kRootIdx * node_size] + num_elements, + shmem.a, + shmem.b, + *p_buffer_size, + node_size - num_elements, + node_size, + shmem, + compare); g.sync(); @@ -915,15 +908,15 @@ __device__ void pop_partial_node(CG const& g, compare); } else { merge_and_sort(g, - &heap[kPBufferIdx], - &heap[kRootIdx * node_size] + num_elements, - shmem.a, - (T*)nullptr, - *p_buffer_size, - node_size - num_elements, - node_size, - shmem, - compare); + &heap[kPBufferIdx], + &heap[kRootIdx * node_size] + num_elements, + shmem.a, + (T*)nullptr, + *p_buffer_size, + node_size - num_elements, + node_size, + shmem, + compare); g.sync(); @@ -1156,16 +1149,16 @@ __global__ void push_kernel(OutputIt elements, if (first_not_inserted < num_elements) { std::size_t p_ins_size = num_elements - first_not_inserted; push_partial_node(g, - elements + first_not_inserted, - p_ins_size, - heap, - size, - node_size, - locks, - p_buffer_size, - lowest_level_start, - shmem, - compare); + elements + first_not_inserted, + p_ins_size, + heap, + size, + node_size, + locks, + p_buffer_size, + lowest_level_start, + shmem, + compare); } } @@ -1197,12 +1190,10 @@ __global__ void pop_kernel(OutputIt elements, { extern __shared__ int s[]; - shared_memory_layout shmem = get_shared_memory_layout(s, blockDim.x, - node_size); + shared_memory_layout shmem = get_shared_memory_layout(s, blockDim.x, node_size); cg::thread_block g = cg::this_thread_block(); - for (std::size_t i = blockIdx.x; i < num_elements / node_size; - i += gridDim.x) { + for (std::size_t i = blockIdx.x; i < num_elements / node_size; i += gridDim.x) { pop_single_node(g, elements + i * node_size, heap, diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 6066315e4..24fdfb4f6 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -62,8 +62,8 @@ namespace cuco { * @tparam Allocator Allocator defining how memory is allocated internally */ template , - typename Allocator = cuco::cuda_allocator> + typename Compare = thrust::less, + typename Allocator = cuco::cuda_allocator> class priority_queue { using int_allocator_type = typename std::allocator_traits::rebind_alloc; @@ -78,8 +78,9 @@ class priority_queue { * @param initial_capacity The number of elements the priority queue can hold * @param alloc Allocator 
used for allocating device storage */ - priority_queue(std::size_t initial_capacity, Allocator const& alloc = Allocator{}, - cudaStream_t stream = 0); + priority_queue(std::size_t initial_capacity, + Allocator const& alloc = Allocator{}, + cudaStream_t stream = 0); /** * @brief Push elements into the priority queue @@ -198,17 +199,17 @@ class priority_queue { /// heap's lowest level int node_capacity_; ///< Capacity of the heap in nodes - T* d_heap_; ///< Pointer to an array of nodes, the 0th node - /// being the heap's partial buffer, and nodes - /// 1..(node_capacity_) being the heap, where - /// the 1st node is the root - int* d_size_; ///< Number of nodes currently in the heap - std::size_t* d_p_buffer_size_; ///< Number of elements currently in the - /// partial buffer - int* d_locks_; ///< Array of locks where `d_locks_[i]` is the - /// lock for the node starting at - /// d_heap_[node_size * i]` - Compare compare_{}; ///< Comparator used to order the elements in the queue + T* d_heap_; ///< Pointer to an array of nodes, the 0th node + /// being the heap's partial buffer, and nodes + /// 1..(node_capacity_) being the heap, where + /// the 1st node is the root + int* d_size_; ///< Number of nodes currently in the heap + std::size_t* d_p_buffer_size_; ///< Number of elements currently in the + /// partial buffer + int* d_locks_; ///< Array of locks where `d_locks_[i]` is the + /// lock for the node starting at + /// d_heap_[node_size * i]` + Compare compare_{}; ///< Comparator used to order the elements in the queue }; /* @@ -237,25 +238,25 @@ class priority_queue { /// heap's lowest level int node_capacity_; ///< Capacity of the heap in nodes - T* d_heap_; ///< Pointer to an array of nodes, the 0th node - /// being the heap's partial buffer, and nodes - /// 1..(node_capacity_) being the heap, where the - /// 1st node is the root - int* d_size_; ///< Number of nodes currently in the heap - std::size_t* d_p_buffer_size_; ///< Number of elements currently in the - /// partial buffer - int* d_locks_; ///< Array of locks where `d_locks_[i]` is the - /// lock for the node starting at - /// d_heap_[node_size * i]` + T* d_heap_; ///< Pointer to an array of nodes, the 0th node + /// being the heap's partial buffer, and nodes + /// 1..(node_capacity_) being the heap, where the + /// 1st node is the root + int* d_size_; ///< Number of nodes currently in the heap + std::size_t* d_p_buffer_size_; ///< Number of elements currently in the + /// partial buffer + int* d_locks_; ///< Array of locks where `d_locks_[i]` is the + /// lock for the node starting at + /// d_heap_[node_size * i]` - int_allocator_type int_allocator_; ///< Allocator used to allocated ints - /// for example, the lock array - t_allocator_type t_allocator_; ///< Allocator used to allocate T's - /// and therefore nodes - size_t_allocator_type size_t_allocator_; ///< Allocator used to allocate - /// size_t's, e.g. d_p_buffer_size_ + int_allocator_type int_allocator_; ///< Allocator used to allocated ints + /// for example, the lock array + t_allocator_type t_allocator_; ///< Allocator used to allocate T's + /// and therefore nodes + size_t_allocator_type size_t_allocator_; ///< Allocator used to allocate + /// size_t's, e.g. 
d_p_buffer_size_ - Compare compare_{}; ///< Comparator used to order the elements in the queue + Compare compare_{}; ///< Comparator used to order the elements in the queue }; } // namespace cuco From 192e263a9e36311cabb893a29d9bb1ba1dfb4219 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Fri, 17 Jun 2022 05:03:01 +0000 Subject: [PATCH 47/55] Snake case in priority_queue_test.cu --- tests/priority_queue/priority_queue_test.cu | 114 ++++++++++---------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 6b0430877..4249f2a9f 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -31,19 +31,19 @@ using namespace cuco; namespace cg = cooperative_groups; template -struct KVPair { +struct kv_pair { K first; V second; }; template -bool __host__ __device__ operator==(const KVPair& a, const KVPair& b) +bool __host__ __device__ operator==(const kv_pair& a, const kv_pair& b) { return a.first == b.first && a.second == b.second; } template -bool __host__ __device__ operator<(const KVPair& a, const KVPair& b) +bool __host__ __device__ operator<(const kv_pair& a, const kv_pair& b) { if (a.first == b.first) { return a.second < b.second; @@ -53,7 +53,7 @@ bool __host__ __device__ operator<(const KVPair& a, const KVPair& b) } template -struct KVLess { +struct kv_less { __host__ __device__ bool operator()(const T& a, const T& b) const { return a.first < b.first; } }; @@ -110,7 +110,7 @@ static void generate_element(T& e, std::mt19937& gen) } template -void generate_element(KVPair& e, std::mt19937& gen) +void generate_element(kv_pair& e, std::mt19937& gen) { generate_element(e.first, gen); generate_element(e.second, gen); @@ -182,55 +182,55 @@ TEST_CASE("Single uint32_t element", "") TEST_CASE("New node created on partial insertion") { - const size_t kInsertionSize = 600; - const size_t kNumElements = kInsertionSize * 2; + const size_t insertion_size = 600; + const size_t num_elements = insertion_size * 2; - priority_queue pq(kNumElements); + priority_queue pq(num_elements); - std::vector els = generate_elements(kNumElements); + std::vector els = generate_elements(num_elements); - std::vector first_insertion(els.begin(), els.begin() + kInsertionSize); + std::vector first_insertion(els.begin(), els.begin() + insertion_size); - std::vector second_insertion(els.begin() + kInsertionSize, els.end()); + std::vector second_insertion(els.begin() + insertion_size, els.end()); insert_to_queue(pq, first_insertion); insert_to_queue(pq, second_insertion); - auto popped_elements = pop_from_queue(pq, kInsertionSize); + auto popped_elements = pop_from_queue(pq, insertion_size); REQUIRE(is_valid_top_n>(popped_elements, els)); } TEST_CASE("Insert, delete, insert, delete", "") { - const size_t kFirstInsertionSize = 100'000; - const size_t kFirstDeletionSize = 10'000; - const size_t kSecondInsertionSize = 20'000; - const size_t kSecondDeletionSize = 50'000; + const size_t first_insertion_size = 100'000; + const size_t first_deletion_size = 10'000; + const size_t second_insertion_size = 20'000; + const size_t second_deletion_size = 50'000; using T = uint32_t; using Compare = thrust::less; - priority_queue pq(kFirstInsertionSize + kSecondInsertionSize); + priority_queue pq(first_insertion_size + second_insertion_size); - auto first_insertion_els = generate_elements(kFirstInsertionSize); + auto first_insertion_els = generate_elements(first_insertion_size); - auto 
second_insertion_els = generate_elements(kSecondInsertionSize); + auto second_insertion_els = generate_elements(second_insertion_size); insert_to_queue(pq, first_insertion_els); - auto first_popped_elements = pop_from_queue(pq, kFirstDeletionSize); + auto first_popped_elements = pop_from_queue(pq, first_deletion_size); insert_to_queue(pq, second_insertion_els); - auto second_popped_elements = pop_from_queue(pq, kSecondDeletionSize); + auto second_popped_elements = pop_from_queue(pq, second_deletion_size); std::vector remaining_elements; std::sort(first_insertion_els.begin(), first_insertion_els.end(), Compare{}); remaining_elements.insert(remaining_elements.end(), - first_insertion_els.begin() + kFirstDeletionSize, + first_insertion_els.begin() + first_deletion_size, first_insertion_els.end()); remaining_elements.insert( @@ -242,16 +242,16 @@ TEST_CASE("Insert, delete, insert, delete", "") TEST_CASE("Insertion and deletion on different streams", "") { - const size_t kInsertionSize = 100'000; - const size_t kDeletionSize = 10'000; + const size_t insertion_size = 100'000; + const size_t deletion_size = 10'000; using T = uint32_t; using Compare = thrust::less; - auto elements = generate_elements(kInsertionSize * 2); - thrust::device_vector insertion1(elements.begin(), elements.begin() + kInsertionSize); - thrust::device_vector insertion2(elements.begin() + kInsertionSize, elements.end()); + auto elements = generate_elements(insertion_size * 2); + thrust::device_vector insertion1(elements.begin(), elements.begin() + insertion_size); + thrust::device_vector insertion2(elements.begin() + insertion_size, elements.end()); - priority_queue pq(kInsertionSize * 2); + priority_queue pq(insertion_size * 2); cudaStream_t stream1, stream2; @@ -264,8 +264,8 @@ TEST_CASE("Insertion and deletion on different streams", "") cudaStreamSynchronize(stream1); cudaStreamSynchronize(stream2); - thrust::device_vector deletion1(kDeletionSize); - thrust::device_vector deletion2(kDeletionSize); + thrust::device_vector deletion1(deletion_size); + thrust::device_vector deletion2(deletion_size); pq.pop(deletion1.begin(), deletion1.end(), stream1); pq.pop(deletion2.begin(), deletion2.end(), stream2); @@ -287,7 +287,7 @@ TEST_CASE("Insertion and deletion on different streams", "") } template -__global__ void DeviceAPIInsert(View view, InputIt begin, InputIt end) +__global__ void device_api_insert(View view, InputIt begin, InputIt end) { extern __shared__ int shmem[]; cg::thread_block g = cg::this_thread_block(); @@ -295,7 +295,7 @@ __global__ void DeviceAPIInsert(View view, InputIt begin, InputIt end) } template -__global__ void DeviceAPIDelete(View view, OutputIt begin, OutputIt end) +__global__ void device_api_delete(View view, OutputIt begin, OutputIt end) { extern __shared__ int shmem[]; cg::thread_block g = cg::this_thread_block(); @@ -304,26 +304,26 @@ __global__ void DeviceAPIDelete(View view, OutputIt begin, OutputIt end) TEST_CASE("Insertion and deletion with Device API", "") { - const size_t kInsertionSize = 2000; - const size_t kDeletionSize = 1000; + const size_t insertion_size = 2000; + const size_t deletion_size = 1000; using T = uint32_t; using Compare = thrust::less; - auto els = generate_elements(kInsertionSize); + auto els = generate_elements(insertion_size); thrust::device_vector d_els(els); - priority_queue pq(kInsertionSize); + priority_queue pq(insertion_size); - const int kBlockSize = 32; - DeviceAPIInsert<<<1, kBlockSize, pq.get_shmem_size(kBlockSize)>>>( + const int block_size = 32; + 
device_api_insert<<<1, block_size, pq.get_shmem_size(block_size)>>>( pq.get_mutable_device_view(), d_els.begin(), d_els.end()); cudaDeviceSynchronize(); - thrust::device_vector d_pop_result(kDeletionSize); + thrust::device_vector d_pop_result(deletion_size); - DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize)>>>( + device_api_delete<<<1, block_size, pq.get_shmem_size(block_size)>>>( pq.get_mutable_device_view(), d_pop_result.begin(), d_pop_result.end()); cudaDeviceSynchronize(); @@ -336,40 +336,40 @@ TEST_CASE("Insertion and deletion with Device API", "") TEST_CASE("Concurrent insertion and deletion with Device API", "") { - const size_t kInsertionSize = 1000; - const size_t kDeletionSize = 500; - const int kBlockSize = 32; + const size_t insertion_size = 1000; + const size_t deletion_size = 500; + const int block_size = 32; using T = uint32_t; using Compare = thrust::less; - auto els = generate_elements(kInsertionSize * 2); + auto els = generate_elements(insertion_size * 2); - thrust::device_vector insertion1(els.begin(), els.begin() + kInsertionSize); - thrust::device_vector insertion2(els.begin() + kInsertionSize, els.end()); + thrust::device_vector insertion1(els.begin(), els.begin() + insertion_size); + thrust::device_vector insertion2(els.begin() + insertion_size, els.end()); - priority_queue pq(kInsertionSize * 2); + priority_queue pq(insertion_size * 2); cudaStream_t stream1, stream2; cudaStreamCreate(&stream1); cudaStreamCreate(&stream2); - DeviceAPIInsert<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream1>>>( + device_api_insert<<<1, block_size, pq.get_shmem_size(block_size), stream1>>>( pq.get_mutable_device_view(), insertion1.begin(), insertion1.end()); - DeviceAPIInsert<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream2>>>( + device_api_insert<<<1, block_size, pq.get_shmem_size(block_size), stream2>>>( pq.get_mutable_device_view(), insertion2.begin(), insertion2.end()); cudaStreamSynchronize(stream1); cudaStreamSynchronize(stream2); - thrust::device_vector d_deletion1(kDeletionSize); - thrust::device_vector d_deletion2(kDeletionSize); + thrust::device_vector d_deletion1(deletion_size); + thrust::device_vector d_deletion2(deletion_size); - DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream1>>>( + device_api_delete<<<1, block_size, pq.get_shmem_size(block_size), stream1>>>( pq.get_mutable_device_view(), d_deletion1.begin(), d_deletion1.end()); - DeviceAPIDelete<<<1, kBlockSize, pq.get_shmem_size(kBlockSize), stream2>>>( + device_api_delete<<<1, block_size, pq.get_shmem_size(block_size), stream2>>>( pq.get_mutable_device_view(), d_deletion2.begin(), d_deletion2.end()); cudaStreamSynchronize(stream1); @@ -393,15 +393,15 @@ TEMPLATE_TEST_CASE_SIG( ((typename T, typename Compare, size_t N, size_t NumKeys), T, Compare, N, NumKeys), (uint32_t, thrust::less, 100, 10'000'000), (uint64_t, thrust::less, 100, 10'000'000), - (KVPair, KVLess>, 100, 10'000'000), + (kv_pair, kv_less>, 100, 10'000'000), (uint32_t, thrust::less, 10'000, 10'000'000), (uint64_t, thrust::less, 10'000, 10'000'000), (uint64_t, thrust::greater, 10'000, 10'000'000), - (KVPair, KVLess>, 10'000, 10'000'000), - (KVPair, KVLess>, 10'000, 10'000'000), + (kv_pair, kv_less>, 10'000, 10'000'000), + (kv_pair, kv_less>, 10'000, 10'000'000), (uint32_t, thrust::less, 10'000'000, 10'000'000), (uint64_t, thrust::less, 10'000'000, 10'000'000), - (KVPair, KVLess>, 10'000'000, 10'000'000)) + (kv_pair, kv_less>, 10'000'000, 10'000'000)) { priority_queue pq(NumKeys); From 
66dd359876d22d15351b1d4a07392b58417666ad Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Jun 2022 05:03:35 +0000 Subject: [PATCH 48/55] [pre-commit.ci] auto code formatting --- tests/priority_queue/priority_queue_test.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 4249f2a9f..dfd689f3b 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -208,8 +208,8 @@ TEST_CASE("Insert, delete, insert, delete", "") const size_t first_deletion_size = 10'000; const size_t second_insertion_size = 20'000; const size_t second_deletion_size = 50'000; - using T = uint32_t; - using Compare = thrust::less; + using T = uint32_t; + using Compare = thrust::less; priority_queue pq(first_insertion_size + second_insertion_size); From 9da822f6c082073de36e8e15f7659667a05b1fa0 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Fri, 17 Jun 2022 05:17:25 +0000 Subject: [PATCH 49/55] kPBufferIdx -> p_buffer_idx and kRootIdx -> root_idx --- .../cuco/detail/priority_queue_kernels.cuh | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index 977653a45..59b1c9671 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -24,8 +24,8 @@ namespace cuco { namespace detail { namespace cg = cooperative_groups; -constexpr int kPBufferIdx = 0; -constexpr int kRootIdx = 1; +constexpr int p_buffer_idx = 0; +constexpr int root_idx = 1; /* * Struct to hold pointers to the temp storage used by the priority @@ -571,7 +571,7 @@ __device__ void sink(CG const& g, shared_memory_layout shmem, Compare const& compare) { - std::size_t cur = kRootIdx; + std::size_t cur = root_idx; int dim = g.size(); @@ -748,7 +748,7 @@ __device__ void pop_single_node(CG const& g, int lane = g.thread_rank(); int dim = g.size(); - acquire_lock(g, &locks[kRootIdx]); + acquire_lock(g, &locks[root_idx]); if (*size == 0) { copy_pairs(g, elements, heap, node_size); @@ -777,7 +777,7 @@ __device__ void pop_single_node(CG const& g, // Copy the target node to the root - if (tar != kRootIdx) { + if (tar != root_idx) { copy_pairs(g, &heap[node_size], &heap[tar * node_size], node_size); release_lock(g, &locks[tar]); @@ -789,7 +789,7 @@ __device__ void pop_single_node(CG const& g, merge_and_sort(g, &heap[node_size], - &heap[kPBufferIdx], + &heap[p_buffer_idx], shmem.a, shmem.b, node_size, @@ -852,7 +852,7 @@ __device__ void pop_partial_node(CG const& g, int lane = g.thread_rank(); int dim = g.size(); - acquire_lock(g, &locks[kRootIdx]); + acquire_lock(g, &locks[root_idx]); if (*size == 0) { copy_pairs(g, elements, heap, num_elements); @@ -868,15 +868,15 @@ __device__ void pop_partial_node(CG const& g, if (lane == 0) { *p_buffer_size = n_p_buffer_size; } - release_lock(g, &locks[kRootIdx]); + release_lock(g, &locks[root_idx]); } else { - copy_pairs(g, elements, &heap[kRootIdx * node_size], num_elements); + copy_pairs(g, elements, &heap[root_idx * node_size], num_elements); g.sync(); if (*p_buffer_size >= num_elements) { merge_and_sort(g, - &heap[kPBufferIdx], - &heap[kRootIdx * node_size] + num_elements, + &heap[p_buffer_idx], + &heap[root_idx * node_size] + num_elements, shmem.a, shmem.b, *p_buffer_size, @@ -891,8 +891,8 @@ __device__ void 
pop_partial_node(CG const& g, g.sync(); - copy_pairs(g, &heap[kRootIdx * node_size], shmem.a, node_size); - copy_pairs(g, &heap[kPBufferIdx], shmem.b, *p_buffer_size); + copy_pairs(g, &heap[root_idx * node_size], shmem.a, node_size); + copy_pairs(g, &heap[p_buffer_idx], shmem.b, *p_buffer_size); g.sync(); @@ -908,8 +908,8 @@ __device__ void pop_partial_node(CG const& g, compare); } else { merge_and_sort(g, - &heap[kPBufferIdx], - &heap[kRootIdx * node_size] + num_elements, + &heap[p_buffer_idx], + &heap[root_idx * node_size] + num_elements, shmem.a, (T*)nullptr, *p_buffer_size, @@ -920,7 +920,7 @@ __device__ void pop_partial_node(CG const& g, g.sync(); - copy_pairs(g, &heap[kPBufferIdx], shmem.a, *p_buffer_size + node_size - num_elements); + copy_pairs(g, &heap[p_buffer_idx], shmem.a, *p_buffer_size + node_size - num_elements); int tar = insertion_order_index(*size, lowest_level_start); g.sync(); @@ -932,10 +932,10 @@ __device__ void pop_partial_node(CG const& g, if (lane == 0) { *size -= 1; } - if (tar != kRootIdx) { + if (tar != root_idx) { acquire_lock(g, &locks[tar]); - copy_pairs(g, &heap[kRootIdx * node_size], &heap[tar * node_size], node_size); + copy_pairs(g, &heap[root_idx * node_size], &heap[tar * node_size], node_size); g.sync(); @@ -943,7 +943,7 @@ __device__ void pop_partial_node(CG const& g, merge_and_sort(g, &heap[node_size], - &heap[kPBufferIdx], + &heap[p_buffer_idx], shmem.a, shmem.b, node_size, @@ -970,7 +970,7 @@ __device__ void pop_partial_node(CG const& g, shmem, compare); } else { - release_lock(g, &locks[kRootIdx]); + release_lock(g, &locks[root_idx]); } } } @@ -1009,7 +1009,7 @@ __device__ void push_partial_node(CG const& g, int lane = g.thread_rank(); int dim = g.size(); - acquire_lock(g, &locks[kRootIdx]); + acquire_lock(g, &locks[root_idx]); copy_pairs(g, shmem.b, elements, p_ins_size); @@ -1024,13 +1024,13 @@ __device__ void push_partial_node(CG const& g, int cur_node = insertion_order_index(*cur_node_temp, lowest_level_start); - if (cur_node != kRootIdx) { acquire_lock(g, &(locks[cur_node])); } + if (cur_node != root_idx) { acquire_lock(g, &(locks[cur_node])); } g.sync(); merge_and_sort(g, shmem.b, - &heap[kPBufferIdx], + &heap[p_buffer_idx], &heap[cur_node * node_size], shmem.a, p_ins_size, @@ -1045,7 +1045,7 @@ __device__ void push_partial_node(CG const& g, copy_pairs(g, heap, shmem.a, *p_buffer_size); - if (cur_node != kRootIdx) { release_lock(g, &locks[kRootIdx]); } + if (cur_node != root_idx) { release_lock(g, &locks[root_idx]); } swim(g, cur_node, heap, size, node_size, locks, lowest_level_start, shmem, compare); @@ -1057,7 +1057,7 @@ __device__ void push_partial_node(CG const& g, merge_and_sort(g, shmem.b, - &heap[kPBufferIdx], + &heap[p_buffer_idx], shmem.a, (T*)nullptr, p_ins_size, @@ -1079,7 +1079,7 @@ __device__ void push_partial_node(CG const& g, if (*size > 0) { merge_and_sort(g, &heap[node_size], - &heap[kPBufferIdx], + &heap[p_buffer_idx], shmem.a, shmem.b, node_size, @@ -1095,7 +1095,7 @@ __device__ void push_partial_node(CG const& g, g.sync(); } - release_lock(g, &locks[kRootIdx]); + release_lock(g, &locks[root_idx]); } } From 0cfdd945e16d5ad2ca9c34e0924d806dc414a6bd Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Sun, 19 Jun 2022 01:54:27 +0000 Subject: [PATCH 50/55] Use const and constexpr wherever possible in priority queue files --- .../priority_queue/priority_queue_bench.cu | 4 +- include/cuco/detail/priority_queue.inl | 14 +- .../cuco/detail/priority_queue_kernels.cuh | 267 +++++++++--------- include/cuco/priority_queue.cuh | 4 +- 
tests/priority_queue/priority_queue_test.cu | 70 ++--- 5 files changed, 188 insertions(+), 171 deletions(-) diff --git a/benchmarks/priority_queue/priority_queue_bench.cu b/benchmarks/priority_queue/priority_queue_bench.cu index 261a5a6ad..b40e142c5 100644 --- a/benchmarks/priority_queue/priority_queue_bench.cu +++ b/benchmarks/priority_queue/priority_queue_bench.cu @@ -38,7 +38,7 @@ static void generate_kv_pairs_uniform(OutputIt output_begin, OutputIt output_end std::random_device rd; std::mt19937 gen{rd()}; - auto num_keys = std::distance(output_begin, output_end); + const auto num_keys = std::distance(output_begin, output_end); for (auto i = 0; i < num_keys; ++i) { output_begin[i] = {static_cast(gen()), static_cast(gen())}; @@ -55,7 +55,7 @@ static void BM_insert(::benchmark::State& state) std::vector> h_pairs(NumKeys); generate_kv_pairs_uniform(h_pairs.begin(), h_pairs.end()); - thrust::device_vector> d_pairs(h_pairs); + const thrust::device_vector> d_pairs(h_pairs); state.ResumeTiming(); pq.push(d_pairs.begin(), d_pairs.end()); diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index 375dcc0de..6e0b3c5fc 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -32,7 +32,7 @@ priority_queue::priority_queue(std::size_t initial_capaci node_size_ = 1024; // Round up to the nearest multiple of node size - int nodes = ((initial_capacity + node_size_ - 1) / node_size_); + const int nodes = ((initial_capacity + node_size_ - 1) / node_size_); node_capacity_ = nodes; lowest_level_start_ = 1 << static_cast(std::log2(nodes)); @@ -71,7 +71,7 @@ template template void priority_queue::push(InputIt first, InputIt last, cudaStream_t stream) { - const int block_size = 256; + constexpr int block_size = 256; const int num_nodes = static_cast((last - first) / node_size_) + 1; const int num_blocks = std::min(64000, num_nodes); @@ -94,7 +94,7 @@ template template void priority_queue::pop(OutputIt first, OutputIt last, cudaStream_t stream) { - const int block_size = 256; + constexpr int block_size = 256; const int pop_size = last - first; const int num_nodes = static_cast(pop_size / node_size_) + 1; @@ -122,10 +122,10 @@ __device__ void priority_queue::device_mutable_view::push InputIt last, void* temp_storage) { - detail::shared_memory_layout shmem = + const detail::shared_memory_layout shmem = detail::get_shared_memory_layout((int*)temp_storage, g.size(), node_size_); - auto push_size = last - first; + const auto push_size = last - first; for (std::size_t i = 0; i < push_size / node_size_; i++) { detail::push_single_node(g, first + i * node_size_, @@ -160,10 +160,10 @@ __device__ void priority_queue::device_mutable_view::pop( OutputIt last, void* temp_storage) { - detail::shared_memory_layout shmem = + const detail::shared_memory_layout shmem = detail::get_shared_memory_layout((int*)temp_storage, g.size(), node_size_); - auto pop_size = last - first; + const auto pop_size = last - first; for (std::size_t i = 0; i < pop_size / node_size_; i++) { detail::pop_single_node(g, first + i * node_size_, diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index 59b1c9671..54e83f16e 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -24,8 +24,8 @@ namespace cuco { namespace detail { namespace cg = cooperative_groups; -constexpr int p_buffer_idx = 0; -constexpr int root_idx = 1; +constexpr int kPBufferIdx = 0; 
+constexpr int kRootIdx = 1; /* * Struct to hold pointers to the temp storage used by the priority @@ -50,7 +50,8 @@ struct shared_memory_layout { * @returns The memory layout for the given group dimension and node size */ template -__device__ shared_memory_layout get_shared_memory_layout(int* s, int dim, std::size_t node_size) +__device__ shared_memory_layout get_shared_memory_layout(int* s, int dim, + std::size_t node_size) { shared_memory_layout result; result.intersections = s; @@ -98,10 +99,12 @@ __device__ void release_lock(CG const& g, int* l) * @param src_end Iterator to the end of the source array */ template -__device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, InputIt2 src_end) +__device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, + InputIt2 src_end) { auto dst = dst_start + g.thread_rank(); - for (auto src = src_start + g.thread_rank(); src < src_end; dst += g.size(), src += g.size()) { + for (auto src = src_start + g.thread_rank(); src < src_end; + dst += g.size(), src += g.size()) { *dst = *src; } } @@ -115,9 +118,7 @@ __device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, * @param num_pairs Number of pairs to copy */ template -__device__ void copy_pairs(CG const& g, - InputIt1 dst_start, - InputIt2 src_start, +__device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, std::size_t num_pairs) { copy_pairs(g, dst_start, src_start, src_start + num_pairs); @@ -141,13 +142,13 @@ __device__ void copy_pairs(CG const& g, */ template __device__ void merge_and_sort(CG const& g, - T* a, - T* b, - T* lo, - T* hi, - std::size_t node_size, - shared_memory_layout shmem, - Compare const& compare) + T* a, + T* b, + T* lo, + T* hi, + std::size_t node_size, + shared_memory_layout shmem, + Compare const& compare) { merge_and_sort(g, a, b, lo, hi, node_size, node_size, node_size, shmem, compare); } @@ -187,8 +188,8 @@ __device__ void merge_and_sort(CG const& g, shared_memory_layout shmem, Compare const& compare) { - int lane = g.thread_rank(); - int dim = g.size(); + const int lane = g.thread_rank(); + const int dim = g.size(); if (num_elements_a == node_size && compare(a[node_size - 1], b[0])) { copy_pairs(g, lo, a, num_elements_a); @@ -205,7 +206,7 @@ __device__ void merge_and_sort(CG const& g, } // Array of size 2 * (blockDim.x + 1) - int* intersections = shmem.intersections; + int* const intersections = shmem.intersections; if (lane == 0) { intersections[0] = 0; @@ -216,15 +217,15 @@ __device__ void merge_and_sort(CG const& g, } // Calculate the diagonal spacing - int p = 2 * node_size / dim; + const int p = 2 * node_size / dim; // There will be one less diagonal than threads if (threadIdx.x != 0) { // i + j = (p * threadIdx.x - 1) - int j_bl = min((int)node_size - 1, p * lane - 1); - int i_bl = (p * lane - 1) - j_bl; + const int j_bl = min((int)node_size - 1, p * lane - 1); + const int i_bl = (p * lane - 1) - j_bl; - int diag_len = min(p * lane, (int)node_size - i_bl); + const int diag_len = min(p * lane, (int)node_size - i_bl); // Will be the location of the rightmost one // in the merge-path grid in terms of array a @@ -235,8 +236,8 @@ __device__ void merge_and_sort(CG const& g, // Binary search along the diagonal while (leftmost_zero - rightmost_one > 1) { - int i = (rightmost_one + leftmost_zero) / 2; - int j = (p * lane - 1) - i; + const int i = (rightmost_one + leftmost_zero) / 2; + const int j = (p * lane - 1) - i; if (i >= num_elements_a) { leftmost_zero = i; @@ -258,8 +259,8 @@ 
__device__ void merge_and_sort(CG const& g, int j = intersections[2 * lane + 1]; // Get the intersection that ends this partition - int i_max = min(intersections[2 * (lane + 1)], (int)num_elements_a); - int j_max = min(intersections[2 * (lane + 1) + 1], (int)num_elements_b); + const int i_max = min(intersections[2 * (lane + 1)], (int)num_elements_a); + const int j_max = min(intersections[2 * (lane + 1) + 1], (int)num_elements_b); // Insert location into the output array int ins_loc = lane * p; @@ -321,12 +322,13 @@ __device__ void merge_and_sort(CG const& g, */ template __device__ void pb_sort( - CG const& g, T* start, std::size_t len, std::size_t node_size, T* temp, Compare const& compare) + CG const& g, T* start, std::size_t len, std::size_t node_size, T* temp, + Compare const& compare) { - int lane = g.thread_rank(); - int dim = g.size(); + const int lane = g.thread_rank(); + const int dim = g.size(); - char* mask = (char*)temp; + char* const mask = (char*)temp; for (int i = lane; i < node_size; i += dim) { mask[i] = i < len; @@ -337,11 +339,12 @@ __device__ void pb_sort( for (int width = 2; width < node_size; width *= 2) { for (int jump = width / 2; jump >= 1; jump /= 2) { for (int i = lane; i < node_size / 2; i += dim) { - int start_jump = width / 2; - int left = (i / jump) * jump * 2 + i % jump; - int right = left + jump; + const int start_jump = width / 2; + const int left = (i / jump) * jump * 2 + i % jump; + const int right = left + jump; if ((i / start_jump) % 2 == 0) { - if (!mask[left] || (mask[right] && !compare(start[left], start[right]))) { + if (!mask[left] || + (mask[right] && !compare(start[left], start[right]))) { auto temp = start[left]; start[left] = start[right]; start[right] = temp; @@ -351,7 +354,8 @@ __device__ void pb_sort( mask[right] = temp_mask; } } else { - if (!mask[right] || (mask[left] && compare(start[left], start[right]))) { + if (!mask[right] || + (mask[left] && compare(start[left], start[right]))) { auto temp = start[left]; start[left] = start[right]; start[right] = temp; @@ -369,8 +373,8 @@ __device__ void pb_sort( // Merge to get the sorted result for (int jump = node_size / 2; jump >= 1; jump /= 2) { for (int i = lane; i < node_size / 2; i += dim) { - int left = (i / jump) * jump * 2 + i % jump; - int right = left + jump; + const int left = (i / jump) * jump * 2 + i % jump; + const int right = left + jump; if (!mask[left] || (mask[right] && !compare(start[left], start[right]))) { auto temp = start[left]; start[left] = start[right]; @@ -395,14 +399,14 @@ __device__ void pb_sort( */ __device__ int bit_reverse_perm(int x) { - int clz = __clz(x); + const int clz = __clz(x); - int bits = sizeof(int) * 8; - int high_bit = 1 << ((bits - 1) - clz); - int mask = high_bit - 1; + const int bits = sizeof(int) * 8; + const int high_bit = 1 << ((bits - 1) - clz); + const int mask = high_bit - 1; - int masked = x & mask; - int rev = __brev(masked) >> (clz + 1); + const int masked = x & mask; + const int rev = __brev(masked) >> (clz + 1); return high_bit | rev; } @@ -504,8 +508,8 @@ __device__ void swim(CG const& g, shared_memory_layout shmem, Compare const& compare) { - int lane = g.thread_rank(); - int dim = g.size(); + const int lane = g.thread_rank(); + const int dim = g.size(); int cur_parent = parent(cur_node, lowest_level_start); @@ -515,7 +519,8 @@ __device__ void swim(CG const& g, // If the heap property is already satisfied for this node and its // parent we are done - if (!compare(heap[cur_node * node_size], heap[cur_parent * node_size + node_size - 1])) 
{ + if (!compare(heap[cur_node * node_size], + heap[cur_parent * node_size + node_size - 1])) { release_lock(g, &(locks[cur_parent])); break; } @@ -537,8 +542,8 @@ __device__ void swim(CG const& g, g.sync(); release_lock(g, &(locks[cur_node])); - cur_node = cur_parent; - cur_parent = parent(cur_node, lowest_level_start); + cur_node = cur_parent; + cur_parent = parent(cur_node, lowest_level_start); } release_lock(g, &(locks[cur_node])); @@ -571,15 +576,15 @@ __device__ void sink(CG const& g, shared_memory_layout shmem, Compare const& compare) { - std::size_t cur = root_idx; + std::size_t cur = kRootIdx; - int dim = g.size(); + const int dim = g.size(); // sink the node - while (insertion_order_index(left_child(cur, lowest_level_start), lowest_level_start) <= - node_capacity) { - std::size_t left = left_child(cur, lowest_level_start); - std::size_t right = right_child(cur, lowest_level_start); + while (insertion_order_index(left_child(cur, lowest_level_start), + lowest_level_start) <= node_capacity) { + const std::size_t left = left_child(cur, lowest_level_start); + const std::size_t right = right_child(cur, lowest_level_start); acquire_lock(g, &locks[left]); @@ -606,7 +611,8 @@ __device__ void sink(CG const& g, // In order to ensure we preserve the heap property, // we put the largest node_size elements in the child // that previously contained the largest element - if (!compare(heap[(left + 1) * node_size - 1], heap[(right + 1) * node_size - 1])) { + if (!compare(heap[(left + 1) * node_size - 1], + heap[(right + 1) * node_size - 1])) { hi = left; lo = right; } else { @@ -691,8 +697,8 @@ __device__ void push_single_node(CG const& g, shared_memory_layout shmem, Compare const& compare) { - int lane = g.thread_rank(); - int dim = g.size(); + const int lane = g.thread_rank(); + const int dim = g.size(); copy_pairs(g, shmem.a, elements, elements + node_size); @@ -700,11 +706,11 @@ __device__ void push_single_node(CG const& g, pb_sort(g, shmem.a, node_size, node_size, shmem.b, compare); - int* cur_node_temp = (int*)shmem.intersections; + int* const cur_node_temp = (int*)shmem.intersections; if (lane == 0) { *cur_node_temp = atomicAdd(size, 1) + 1; } g.sync(); - int cur_node = insertion_order_index(*cur_node_temp, lowest_level_start); + const int cur_node = insertion_order_index(*cur_node_temp, lowest_level_start); acquire_lock(g, &(locks[cur_node])); @@ -712,7 +718,8 @@ __device__ void push_single_node(CG const& g, g.sync(); - swim(g, cur_node, heap, size, node_size, locks, lowest_level_start, shmem, compare); + swim(g, cur_node, heap, size, node_size, locks, lowest_level_start, + shmem, compare); } /** @@ -745,10 +752,10 @@ __device__ void pop_single_node(CG const& g, shared_memory_layout shmem, Compare const& compare) { - int lane = g.thread_rank(); - int dim = g.size(); + const int lane = g.thread_rank(); + const int dim = g.size(); - acquire_lock(g, &locks[root_idx]); + acquire_lock(g, &locks[kRootIdx]); if (*size == 0) { copy_pairs(g, elements, heap, node_size); @@ -760,7 +767,7 @@ __device__ void pop_single_node(CG const& g, // Find the target node (the last one inserted) and // decrement the size - std::size_t tar = insertion_order_index(*size, lowest_level_start); + const std::size_t tar = insertion_order_index(*size, lowest_level_start); if (tar != 1) { acquire_lock(g, &locks[tar]); } @@ -777,7 +784,7 @@ __device__ void pop_single_node(CG const& g, // Copy the target node to the root - if (tar != root_idx) { + if (tar != kRootIdx) { copy_pairs(g, &heap[node_size], &heap[tar * node_size], 
node_size); release_lock(g, &locks[tar]); @@ -789,7 +796,7 @@ __device__ void pop_single_node(CG const& g, merge_and_sort(g, &heap[node_size], - &heap[p_buffer_idx], + &heap[kPBufferIdx], shmem.a, shmem.b, node_size, @@ -849,16 +856,16 @@ __device__ void pop_partial_node(CG const& g, shared_memory_layout shmem, Compare const& compare) { - int lane = g.thread_rank(); - int dim = g.size(); + const int lane = g.thread_rank(); + const int dim = g.size(); - acquire_lock(g, &locks[root_idx]); + acquire_lock(g, &locks[kRootIdx]); if (*size == 0) { copy_pairs(g, elements, heap, num_elements); g.sync(); - std::size_t n_p_buffer_size = *p_buffer_size - num_elements; + const std::size_t n_p_buffer_size = *p_buffer_size - num_elements; copy_pairs(g, shmem.a, heap + num_elements, n_p_buffer_size); @@ -868,22 +875,22 @@ __device__ void pop_partial_node(CG const& g, if (lane == 0) { *p_buffer_size = n_p_buffer_size; } - release_lock(g, &locks[root_idx]); + release_lock(g, &locks[kRootIdx]); } else { - copy_pairs(g, elements, &heap[root_idx * node_size], num_elements); + copy_pairs(g, elements, &heap[kRootIdx * node_size], num_elements); g.sync(); if (*p_buffer_size >= num_elements) { merge_and_sort(g, - &heap[p_buffer_idx], - &heap[root_idx * node_size] + num_elements, - shmem.a, - shmem.b, - *p_buffer_size, - node_size - num_elements, - node_size, - shmem, - compare); + &heap[kPBufferIdx], + &heap[kRootIdx * node_size] + num_elements, + shmem.a, + shmem.b, + *p_buffer_size, + node_size - num_elements, + node_size, + shmem, + compare); g.sync(); @@ -891,8 +898,8 @@ __device__ void pop_partial_node(CG const& g, g.sync(); - copy_pairs(g, &heap[root_idx * node_size], shmem.a, node_size); - copy_pairs(g, &heap[p_buffer_idx], shmem.b, *p_buffer_size); + copy_pairs(g, &heap[kRootIdx * node_size], shmem.a, node_size); + copy_pairs(g, &heap[kPBufferIdx], shmem.b, *p_buffer_size); g.sync(); @@ -908,21 +915,22 @@ __device__ void pop_partial_node(CG const& g, compare); } else { merge_and_sort(g, - &heap[p_buffer_idx], - &heap[root_idx * node_size] + num_elements, - shmem.a, - (T*)nullptr, - *p_buffer_size, - node_size - num_elements, - node_size, - shmem, - compare); + &heap[kPBufferIdx], + &heap[kRootIdx * node_size] + num_elements, + shmem.a, + (T*)nullptr, + *p_buffer_size, + node_size - num_elements, + node_size, + shmem, + compare); g.sync(); - copy_pairs(g, &heap[p_buffer_idx], shmem.a, *p_buffer_size + node_size - num_elements); + copy_pairs(g, &heap[kPBufferIdx], shmem.a, + *p_buffer_size + node_size - num_elements); - int tar = insertion_order_index(*size, lowest_level_start); + const int tar = insertion_order_index(*size, lowest_level_start); g.sync(); *p_buffer_size += node_size; @@ -932,10 +940,11 @@ __device__ void pop_partial_node(CG const& g, if (lane == 0) { *size -= 1; } - if (tar != root_idx) { + if (tar != kRootIdx) { acquire_lock(g, &locks[tar]); - copy_pairs(g, &heap[root_idx * node_size], &heap[tar * node_size], node_size); + copy_pairs(g, &heap[kRootIdx * node_size], &heap[tar * node_size], + node_size); g.sync(); @@ -943,7 +952,7 @@ __device__ void pop_partial_node(CG const& g, merge_and_sort(g, &heap[node_size], - &heap[p_buffer_idx], + &heap[kPBufferIdx], shmem.a, shmem.b, node_size, @@ -970,7 +979,7 @@ __device__ void pop_partial_node(CG const& g, shmem, compare); } else { - release_lock(g, &locks[root_idx]); + release_lock(g, &locks[kRootIdx]); } } } @@ -1006,10 +1015,10 @@ __device__ void push_partial_node(CG const& g, shared_memory_layout shmem, Compare const& compare) { - int lane = 
g.thread_rank(); - int dim = g.size(); + const int lane = g.thread_rank(); + const int dim = g.size(); - acquire_lock(g, &locks[root_idx]); + acquire_lock(g, &locks[kRootIdx]); copy_pairs(g, shmem.b, elements, p_ins_size); @@ -1018,19 +1027,20 @@ __device__ void push_partial_node(CG const& g, // There is enough data for a new node, in which case we // construct a new node and insert it if (*p_buffer_size + p_ins_size >= node_size) { - int* cur_node_temp = shmem.intersections; + int* const cur_node_temp = shmem.intersections; if (lane == 0) { *cur_node_temp = atomicAdd(size, 1) + 1; } g.sync(); - int cur_node = insertion_order_index(*cur_node_temp, lowest_level_start); + const int cur_node = insertion_order_index(*cur_node_temp, + lowest_level_start); - if (cur_node != root_idx) { acquire_lock(g, &(locks[cur_node])); } + if (cur_node != kRootIdx) { acquire_lock(g, &(locks[cur_node])); } g.sync(); merge_and_sort(g, shmem.b, - &heap[p_buffer_idx], + &heap[kPBufferIdx], &heap[cur_node * node_size], shmem.a, p_ins_size, @@ -1045,7 +1055,7 @@ __device__ void push_partial_node(CG const& g, copy_pairs(g, heap, shmem.a, *p_buffer_size); - if (cur_node != root_idx) { release_lock(g, &locks[root_idx]); } + if (cur_node != kRootIdx) { release_lock(g, &locks[kRootIdx]); } swim(g, cur_node, heap, size, node_size, locks, lowest_level_start, shmem, compare); @@ -1057,7 +1067,7 @@ __device__ void push_partial_node(CG const& g, merge_and_sort(g, shmem.b, - &heap[p_buffer_idx], + &heap[kPBufferIdx], shmem.a, (T*)nullptr, p_ins_size, @@ -1079,7 +1089,7 @@ __device__ void push_partial_node(CG const& g, if (*size > 0) { merge_and_sort(g, &heap[node_size], - &heap[p_buffer_idx], + &heap[kPBufferIdx], shmem.a, shmem.b, node_size, @@ -1095,7 +1105,7 @@ __device__ void push_partial_node(CG const& g, g.sync(); } - release_lock(g, &locks[root_idx]); + release_lock(g, &locks[kRootIdx]); } } @@ -1127,7 +1137,9 @@ __global__ void push_kernel(OutputIt elements, { extern __shared__ int s[]; - shared_memory_layout shmem = get_shared_memory_layout(s, blockDim.x, node_size); + const shared_memory_layout shmem = get_shared_memory_layout(s, + blockDim.x, + node_size); // We push as many elements as possible as full nodes, // then deal with the remaining elements as a partial insertion @@ -1144,21 +1156,21 @@ __global__ void push_kernel(OutputIt elements, // If node_size does not divide num_elements, there are some leftover // elements for which we must perform a partial insertion - std::size_t first_not_inserted = (num_elements / node_size) * node_size; + const std::size_t first_not_inserted = (num_elements / node_size) * node_size; if (first_not_inserted < num_elements) { - std::size_t p_ins_size = num_elements - first_not_inserted; + const std::size_t p_ins_size = num_elements - first_not_inserted; push_partial_node(g, - elements + first_not_inserted, - p_ins_size, - heap, - size, - node_size, - locks, - p_buffer_size, - lowest_level_start, - shmem, - compare); + elements + first_not_inserted, + p_ins_size, + heap, + size, + node_size, + locks, + p_buffer_size, + lowest_level_start, + shmem, + compare); } } @@ -1190,10 +1202,13 @@ __global__ void pop_kernel(OutputIt elements, { extern __shared__ int s[]; - shared_memory_layout shmem = get_shared_memory_layout(s, blockDim.x, node_size); + const shared_memory_layout shmem = get_shared_memory_layout(s, + blockDim.x, + node_size); cg::thread_block g = cg::this_thread_block(); - for (std::size_t i = blockIdx.x; i < num_elements / node_size; i += gridDim.x) { + for (std::size_t i 
= blockIdx.x; i < num_elements / node_size; + i += gridDim.x) { pop_single_node(g, elements + i * node_size, heap, @@ -1212,10 +1227,10 @@ __global__ void pop_kernel(OutputIt elements, // If node_size does not divide num_elements, there are some leftover // elements for which we must perform a partial deletion - std::size_t first_not_inserted = (num_elements / node_size) * node_size; + const std::size_t first_not_inserted = (num_elements / node_size) * node_size; if (first_not_inserted < num_elements) { - std::size_t p_del_size = num_elements - first_not_inserted; + const std::size_t p_del_size = num_elements - first_not_inserted; pop_partial_node(g, elements + first_not_inserted, p_del_size, diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 24fdfb4f6..d08c2a43c 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -115,7 +115,7 @@ class priority_queue { * @param block_size Size of the blocks to calculate storage for * @return The amount of temporary storage required in bytes */ - int get_shmem_size(int block_size) + int get_shmem_size(int block_size) const { int intersection_bytes = 2 * (block_size + 1) * sizeof(int); int node_bytes = node_size_ * sizeof(T); @@ -166,7 +166,7 @@ class priority_queue { * @param block_size Size of the cooperative groups to calculate storage for * @return The amount of temporary storage required in bytes */ - __device__ int get_shmem_size(int block_size) + __device__ int get_shmem_size(int block_size) const { int intersection_bytes = 2 * (block_size + 1) * sizeof(int); int node_bytes = node_size_ * sizeof(T); diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index dfd689f3b..2636d13a6 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -72,18 +72,18 @@ std::map construct_count_map(std::vector& a) } template -bool is_valid_top_n(std::vector& top_n, std::vector& elements) +bool is_valid_top_n(std::vector top_n, std::vector elements) { - auto top_n_map = construct_count_map(top_n); - auto elements_map = construct_count_map(elements); + const auto top_n_map = construct_count_map(top_n); + const auto elements_map = construct_count_map(elements); - size_t n = top_n.size(); + const size_t n = top_n.size(); // 1. 
Check that the count of each element in the top n is less than or // equal to the count of that element overall in the queue for (auto& pair : top_n_map) { if (elements_map.find(pair.first) == elements_map.end() || - elements_map[pair.first] < pair.second) { + elements_map.at(pair.first) < pair.second) { return false; } } @@ -95,8 +95,8 @@ bool is_valid_top_n(std::vector& top_n, std::vector& elements) std::sort(top_n.begin(), top_n.end(), Compare{}); for (int i = 0; i < top_n.size(); i++) { - T max = elements[i]; - T e = top_n[i]; + const T max = elements[i]; + const T e = top_n[i]; if (Compare{}(max, e)) { return false; } } @@ -132,9 +132,10 @@ static std::vector generate_elements(size_t num_keys) } template -static void insert_to_queue(priority_queue& pq, std::vector& v) +static void insert_to_queue(priority_queue& pq, + const std::vector& v) { - thrust::device_vector d_v(v); + const thrust::device_vector d_v(v); pq.push(d_v.begin(), d_v.end()); @@ -150,7 +151,7 @@ static std::vector pop_from_queue(priority_queue& pq, size_t n) cudaDeviceSynchronize(); - thrust::host_vector h_popped(d_popped); + const thrust::host_vector h_popped(d_popped); std::vector result(h_popped.size()); @@ -162,11 +163,12 @@ static std::vector pop_from_queue(priority_queue& pq, size_t n) // Insert elements into the queue and check that they are // all returned when removed from the queue template -bool test_insertion_and_deletion(priority_queue& pq, std::vector& elements, size_t n) +bool test_insertion_and_deletion(priority_queue& pq, + const std::vector& elements, size_t n) { insert_to_queue(pq, elements); - auto popped_elements = pop_from_queue(pq, n); + const auto popped_elements = pop_from_queue(pq, n); return is_valid_top_n(popped_elements, elements); } @@ -175,7 +177,7 @@ TEST_CASE("Single uint32_t element", "") { priority_queue pq(1); - std::vector els = {1}; + const std::vector els = {1}; REQUIRE(test_insertion_and_deletion(pq, els, 1)); } @@ -197,7 +199,7 @@ TEST_CASE("New node created on partial insertion") insert_to_queue(pq, second_insertion); - auto popped_elements = pop_from_queue(pq, insertion_size); + const auto popped_elements = pop_from_queue(pq, insertion_size); REQUIRE(is_valid_top_n>(popped_elements, els)); } @@ -208,22 +210,22 @@ TEST_CASE("Insert, delete, insert, delete", "") const size_t first_deletion_size = 10'000; const size_t second_insertion_size = 20'000; const size_t second_deletion_size = 50'000; - using T = uint32_t; - using Compare = thrust::less; + using T = uint32_t; + using Compare = thrust::less; priority_queue pq(first_insertion_size + second_insertion_size); auto first_insertion_els = generate_elements(first_insertion_size); - auto second_insertion_els = generate_elements(second_insertion_size); + const auto second_insertion_els = generate_elements(second_insertion_size); insert_to_queue(pq, first_insertion_els); - auto first_popped_elements = pop_from_queue(pq, first_deletion_size); + const auto first_popped_elements = pop_from_queue(pq, first_deletion_size); insert_to_queue(pq, second_insertion_els); - auto second_popped_elements = pop_from_queue(pq, second_deletion_size); + const auto second_popped_elements = pop_from_queue(pq, second_deletion_size); std::vector remaining_elements; @@ -247,9 +249,9 @@ TEST_CASE("Insertion and deletion on different streams", "") using T = uint32_t; using Compare = thrust::less; - auto elements = generate_elements(insertion_size * 2); - thrust::device_vector insertion1(elements.begin(), elements.begin() + insertion_size); - 
thrust::device_vector insertion2(elements.begin() + insertion_size, elements.end()); + const auto elements = generate_elements(insertion_size * 2); + const thrust::device_vector insertion1(elements.begin(), elements.begin() + insertion_size); + const thrust::device_vector insertion2(elements.begin() + insertion_size, elements.end()); priority_queue pq(insertion_size * 2); @@ -273,8 +275,8 @@ TEST_CASE("Insertion and deletion on different streams", "") cudaStreamSynchronize(stream1); cudaStreamSynchronize(stream2); - thrust::host_vector h_deletion1(deletion1); - thrust::host_vector h_deletion2(deletion2); + const thrust::host_vector h_deletion1(deletion1); + const thrust::host_vector h_deletion2(deletion2); std::vector popped_elements(h_deletion1.begin(), h_deletion1.end()); @@ -309,9 +311,9 @@ TEST_CASE("Insertion and deletion with Device API", "") using T = uint32_t; using Compare = thrust::less; - auto els = generate_elements(insertion_size); + const auto els = generate_elements(insertion_size); - thrust::device_vector d_els(els); + const thrust::device_vector d_els(els); priority_queue pq(insertion_size); @@ -328,8 +330,8 @@ TEST_CASE("Insertion and deletion with Device API", "") cudaDeviceSynchronize(); - thrust::host_vector h_pop_result(d_pop_result); - std::vector pop_result(h_pop_result.begin(), h_pop_result.end()); + const thrust::host_vector h_pop_result(d_pop_result); + const std::vector pop_result(h_pop_result.begin(), h_pop_result.end()); REQUIRE(is_valid_top_n(pop_result, els)); } @@ -342,10 +344,10 @@ TEST_CASE("Concurrent insertion and deletion with Device API", "") using T = uint32_t; using Compare = thrust::less; - auto els = generate_elements(insertion_size * 2); + const auto els = generate_elements(insertion_size * 2); - thrust::device_vector insertion1(els.begin(), els.begin() + insertion_size); - thrust::device_vector insertion2(els.begin() + insertion_size, els.end()); + const thrust::device_vector insertion1(els.begin(), els.begin() + insertion_size); + const thrust::device_vector insertion2(els.begin() + insertion_size, els.end()); priority_queue pq(insertion_size * 2); @@ -375,8 +377,8 @@ TEST_CASE("Concurrent insertion and deletion with Device API", "") cudaStreamSynchronize(stream1); cudaStreamSynchronize(stream2); - thrust::host_vector h_deletion1(d_deletion1); - thrust::host_vector h_deletion2(d_deletion2); + const thrust::host_vector h_deletion1(d_deletion1); + const thrust::host_vector h_deletion2(d_deletion2); std::vector result(h_deletion1.begin(), h_deletion1.end()); result.insert(result.end(), h_deletion2.begin(), h_deletion2.end()); @@ -405,7 +407,7 @@ TEMPLATE_TEST_CASE_SIG( { priority_queue pq(NumKeys); - auto els = generate_elements(NumKeys); + const auto els = generate_elements(NumKeys); REQUIRE(test_insertion_and_deletion(pq, els, N)); } From 828b00b3a79adf8bfc75bfea1eadb74b1e0e4b96 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 19 Jun 2022 01:54:45 +0000 Subject: [PATCH 51/55] [pre-commit.ci] auto code formatting --- include/cuco/detail/priority_queue.inl | 2 +- .../cuco/detail/priority_queue_kernels.cuh | 129 ++++++++---------- tests/priority_queue/priority_queue_test.cu | 10 +- 3 files changed, 63 insertions(+), 78 deletions(-) diff --git a/include/cuco/detail/priority_queue.inl b/include/cuco/detail/priority_queue.inl index 6e0b3c5fc..189166c51 100644 --- a/include/cuco/detail/priority_queue.inl +++ b/include/cuco/detail/priority_queue.inl @@ -95,7 +95,7 @@ template void 
priority_queue::pop(OutputIt first, OutputIt last, cudaStream_t stream) { constexpr int block_size = 256; - const int pop_size = last - first; + const int pop_size = last - first; const int num_nodes = static_cast(pop_size / node_size_) + 1; const int num_blocks = std::min(64000, num_nodes); diff --git a/include/cuco/detail/priority_queue_kernels.cuh b/include/cuco/detail/priority_queue_kernels.cuh index 54e83f16e..6ec36233b 100644 --- a/include/cuco/detail/priority_queue_kernels.cuh +++ b/include/cuco/detail/priority_queue_kernels.cuh @@ -50,8 +50,7 @@ struct shared_memory_layout { * @returns The memory layout for the given group dimension and node size */ template -__device__ shared_memory_layout get_shared_memory_layout(int* s, int dim, - std::size_t node_size) +__device__ shared_memory_layout get_shared_memory_layout(int* s, int dim, std::size_t node_size) { shared_memory_layout result; result.intersections = s; @@ -99,12 +98,10 @@ __device__ void release_lock(CG const& g, int* l) * @param src_end Iterator to the end of the source array */ template -__device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, - InputIt2 src_end) +__device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, InputIt2 src_end) { auto dst = dst_start + g.thread_rank(); - for (auto src = src_start + g.thread_rank(); src < src_end; - dst += g.size(), src += g.size()) { + for (auto src = src_start + g.thread_rank(); src < src_end; dst += g.size(), src += g.size()) { *dst = *src; } } @@ -118,7 +115,9 @@ __device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, * @param num_pairs Number of pairs to copy */ template -__device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, +__device__ void copy_pairs(CG const& g, + InputIt1 dst_start, + InputIt2 src_start, std::size_t num_pairs) { copy_pairs(g, dst_start, src_start, src_start + num_pairs); @@ -142,13 +141,13 @@ __device__ void copy_pairs(CG const& g, InputIt1 dst_start, InputIt2 src_start, */ template __device__ void merge_and_sort(CG const& g, - T* a, - T* b, - T* lo, - T* hi, - std::size_t node_size, - shared_memory_layout shmem, - Compare const& compare) + T* a, + T* b, + T* lo, + T* hi, + std::size_t node_size, + shared_memory_layout shmem, + Compare const& compare) { merge_and_sort(g, a, b, lo, hi, node_size, node_size, node_size, shmem, compare); } @@ -322,8 +321,7 @@ __device__ void merge_and_sort(CG const& g, */ template __device__ void pb_sort( - CG const& g, T* start, std::size_t len, std::size_t node_size, T* temp, - Compare const& compare) + CG const& g, T* start, std::size_t len, std::size_t node_size, T* temp, Compare const& compare) { const int lane = g.thread_rank(); const int dim = g.size(); @@ -343,8 +341,7 @@ __device__ void pb_sort( const int left = (i / jump) * jump * 2 + i % jump; const int right = left + jump; if ((i / start_jump) % 2 == 0) { - if (!mask[left] || - (mask[right] && !compare(start[left], start[right]))) { + if (!mask[left] || (mask[right] && !compare(start[left], start[right]))) { auto temp = start[left]; start[left] = start[right]; start[right] = temp; @@ -354,8 +351,7 @@ __device__ void pb_sort( mask[right] = temp_mask; } } else { - if (!mask[right] || - (mask[left] && compare(start[left], start[right]))) { + if (!mask[right] || (mask[left] && compare(start[left], start[right]))) { auto temp = start[left]; start[left] = start[right]; start[right] = temp; @@ -519,8 +515,7 @@ __device__ void swim(CG const& g, // If the heap property is already 
satisfied for this node and its // parent we are done - if (!compare(heap[cur_node * node_size], - heap[cur_parent * node_size + node_size - 1])) { + if (!compare(heap[cur_node * node_size], heap[cur_parent * node_size + node_size - 1])) { release_lock(g, &(locks[cur_parent])); break; } @@ -542,8 +537,8 @@ __device__ void swim(CG const& g, g.sync(); release_lock(g, &(locks[cur_node])); - cur_node = cur_parent; - cur_parent = parent(cur_node, lowest_level_start); + cur_node = cur_parent; + cur_parent = parent(cur_node, lowest_level_start); } release_lock(g, &(locks[cur_node])); @@ -581,8 +576,8 @@ __device__ void sink(CG const& g, const int dim = g.size(); // sink the node - while (insertion_order_index(left_child(cur, lowest_level_start), - lowest_level_start) <= node_capacity) { + while (insertion_order_index(left_child(cur, lowest_level_start), lowest_level_start) <= + node_capacity) { const std::size_t left = left_child(cur, lowest_level_start); const std::size_t right = right_child(cur, lowest_level_start); @@ -611,8 +606,7 @@ __device__ void sink(CG const& g, // In order to ensure we preserve the heap property, // we put the largest node_size elements in the child // that previously contained the largest element - if (!compare(heap[(left + 1) * node_size - 1], - heap[(right + 1) * node_size - 1])) { + if (!compare(heap[(left + 1) * node_size - 1], heap[(right + 1) * node_size - 1])) { hi = left; lo = right; } else { @@ -718,8 +712,7 @@ __device__ void push_single_node(CG const& g, g.sync(); - swim(g, cur_node, heap, size, node_size, locks, lowest_level_start, - shmem, compare); + swim(g, cur_node, heap, size, node_size, locks, lowest_level_start, shmem, compare); } /** @@ -882,15 +875,15 @@ __device__ void pop_partial_node(CG const& g, if (*p_buffer_size >= num_elements) { merge_and_sort(g, - &heap[kPBufferIdx], - &heap[kRootIdx * node_size] + num_elements, - shmem.a, - shmem.b, - *p_buffer_size, - node_size - num_elements, - node_size, - shmem, - compare); + &heap[kPBufferIdx], + &heap[kRootIdx * node_size] + num_elements, + shmem.a, + shmem.b, + *p_buffer_size, + node_size - num_elements, + node_size, + shmem, + compare); g.sync(); @@ -915,20 +908,19 @@ __device__ void pop_partial_node(CG const& g, compare); } else { merge_and_sort(g, - &heap[kPBufferIdx], - &heap[kRootIdx * node_size] + num_elements, - shmem.a, - (T*)nullptr, - *p_buffer_size, - node_size - num_elements, - node_size, - shmem, - compare); + &heap[kPBufferIdx], + &heap[kRootIdx * node_size] + num_elements, + shmem.a, + (T*)nullptr, + *p_buffer_size, + node_size - num_elements, + node_size, + shmem, + compare); g.sync(); - copy_pairs(g, &heap[kPBufferIdx], shmem.a, - *p_buffer_size + node_size - num_elements); + copy_pairs(g, &heap[kPBufferIdx], shmem.a, *p_buffer_size + node_size - num_elements); const int tar = insertion_order_index(*size, lowest_level_start); g.sync(); @@ -943,8 +935,7 @@ __device__ void pop_partial_node(CG const& g, if (tar != kRootIdx) { acquire_lock(g, &locks[tar]); - copy_pairs(g, &heap[kRootIdx * node_size], &heap[tar * node_size], - node_size); + copy_pairs(g, &heap[kRootIdx * node_size], &heap[tar * node_size], node_size); g.sync(); @@ -1031,8 +1022,7 @@ __device__ void push_partial_node(CG const& g, if (lane == 0) { *cur_node_temp = atomicAdd(size, 1) + 1; } g.sync(); - const int cur_node = insertion_order_index(*cur_node_temp, - lowest_level_start); + const int cur_node = insertion_order_index(*cur_node_temp, lowest_level_start); if (cur_node != kRootIdx) { acquire_lock(g, 
&(locks[cur_node])); } @@ -1137,9 +1127,7 @@ __global__ void push_kernel(OutputIt elements, { extern __shared__ int s[]; - const shared_memory_layout shmem = get_shared_memory_layout(s, - blockDim.x, - node_size); + const shared_memory_layout shmem = get_shared_memory_layout(s, blockDim.x, node_size); // We push as many elements as possible as full nodes, // then deal with the remaining elements as a partial insertion @@ -1161,16 +1149,16 @@ __global__ void push_kernel(OutputIt elements, if (first_not_inserted < num_elements) { const std::size_t p_ins_size = num_elements - first_not_inserted; push_partial_node(g, - elements + first_not_inserted, - p_ins_size, - heap, - size, - node_size, - locks, - p_buffer_size, - lowest_level_start, - shmem, - compare); + elements + first_not_inserted, + p_ins_size, + heap, + size, + node_size, + locks, + p_buffer_size, + lowest_level_start, + shmem, + compare); } } @@ -1202,13 +1190,10 @@ __global__ void pop_kernel(OutputIt elements, { extern __shared__ int s[]; - const shared_memory_layout shmem = get_shared_memory_layout(s, - blockDim.x, - node_size); + const shared_memory_layout shmem = get_shared_memory_layout(s, blockDim.x, node_size); cg::thread_block g = cg::this_thread_block(); - for (std::size_t i = blockIdx.x; i < num_elements / node_size; - i += gridDim.x) { + for (std::size_t i = blockIdx.x; i < num_elements / node_size; i += gridDim.x) { pop_single_node(g, elements + i * node_size, heap, diff --git a/tests/priority_queue/priority_queue_test.cu b/tests/priority_queue/priority_queue_test.cu index 2636d13a6..84d3353bc 100644 --- a/tests/priority_queue/priority_queue_test.cu +++ b/tests/priority_queue/priority_queue_test.cu @@ -132,8 +132,7 @@ static std::vector generate_elements(size_t num_keys) } template -static void insert_to_queue(priority_queue& pq, - const std::vector& v) +static void insert_to_queue(priority_queue& pq, const std::vector& v) { const thrust::device_vector d_v(v); @@ -164,7 +163,8 @@ static std::vector pop_from_queue(priority_queue& pq, size_t n) // all returned when removed from the queue template bool test_insertion_and_deletion(priority_queue& pq, - const std::vector& elements, size_t n) + const std::vector& elements, + size_t n) { insert_to_queue(pq, elements); @@ -210,8 +210,8 @@ TEST_CASE("Insert, delete, insert, delete", "") const size_t first_deletion_size = 10'000; const size_t second_insertion_size = 20'000; const size_t second_deletion_size = 50'000; - using T = uint32_t; - using Compare = thrust::less; + using T = uint32_t; + using Compare = thrust::less; priority_queue pq(first_insertion_size + second_insertion_size); From 1932418e3106c1f7122b6b63a1e3de14102e5af4 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Sun, 19 Jun 2022 12:29:49 -0700 Subject: [PATCH 52/55] Add missing const in priority queue Co-authored-by: Yunsong Wang --- include/cuco/priority_queue.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index d08c2a43c..09f96332f 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -115,7 +115,7 @@ class priority_queue { * @param block_size Size of the blocks to calculate storage for * @return The amount of temporary storage required in bytes */ - int get_shmem_size(int block_size) const + int get_shmem_size(int const block_size) const { int intersection_bytes = 2 * (block_size + 1) * sizeof(int); int node_bytes = node_size_ * sizeof(T); @@ -219,7 +219,7 @@ class priority_queue { * 
* @return A device view */ - device_mutable_view get_mutable_device_view() + device_mutable_view get_mutable_device_view() const noexcept { return device_mutable_view(node_size_, d_heap_, From 7c4b1f61b1cdf7d3eb2e6d614b812fa2d4b28969 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Sun, 19 Jun 2022 12:44:55 -0700 Subject: [PATCH 53/55] Add docs for stream parameter to priority queue ctor Co-authored-by: Yunsong Wang --- include/cuco/priority_queue.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 09f96332f..80c7b40d4 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -77,6 +77,7 @@ class priority_queue { * * @param initial_capacity The number of elements the priority queue can hold * @param alloc Allocator used for allocating device storage + * @param stream Stream used for constructing the priority queue */ priority_queue(std::size_t initial_capacity, Allocator const& alloc = Allocator{}, From 838e4ea7affb3912edd238596330d8c1f947ea25 Mon Sep 17 00:00:00 2001 From: Andrew Briand Date: Sun, 19 Jun 2022 12:53:29 -0700 Subject: [PATCH 54/55] Add value_type to priority_queue::device_mutable_view Co-authored-by: Yunsong Wang --- include/cuco/priority_queue.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 80c7b40d4..0f9968557 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -130,6 +130,7 @@ class priority_queue { class device_mutable_view { public: + using value_type = T; /** * @brief Push elements into the priority queue * From d58dd9fedde721a264c8ae960f7393a3a3b08c58 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 19 Jun 2022 19:53:37 +0000 Subject: [PATCH 55/55] [pre-commit.ci] auto code formatting --- include/cuco/priority_queue.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/priority_queue.cuh b/include/cuco/priority_queue.cuh index 0f9968557..a7c0d3a1a 100644 --- a/include/cuco/priority_queue.cuh +++ b/include/cuco/priority_queue.cuh @@ -130,7 +130,7 @@ class priority_queue { class device_mutable_view { public: - using value_type = T; + using value_type = T; /** * @brief Push elements into the priority queue *
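

The tests in this series exercise the host-side bulk API: construct the queue with a capacity, push a range of device-accessible iterators, then pop the top elements into another device range and copy them back to the host. A minimal usage sketch follows; the cuco::priority_queue<T, Compare> template arguments and the defaulted stream argument are reconstructed from the surrounding diffs rather than quoted verbatim from any one patch, so treat it as an illustration only.

// Minimal host-side sketch, assuming cuco::priority_queue<T, Compare>
// and a defaulted CUDA stream on push()/pop() (both are assumptions).
#include <cuco/priority_queue.cuh>

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>

#include <cstdint>
#include <vector>

int main()
{
  using T       = std::uint32_t;
  using Compare = thrust::less<T>;  // min-queue, as in the tests

  // Capacity is fixed at construction time
  cuco::priority_queue<T, Compare> pq(4);

  // push() takes device-accessible iterators
  const std::vector<T> h_in{5, 1, 4, 2};
  const thrust::device_vector<T> d_in(h_in);
  pq.push(d_in.begin(), d_in.end());

  // pop() writes the top elements (smallest first for thrust::less)
  // into a device range; synchronize before reading the result back
  thrust::device_vector<T> d_out(2);
  pq.pop(d_out.begin(), d_out.end());
  cudaDeviceSynchronize();

  const thrust::host_vector<T> h_out(d_out);  // expected contents: {1, 2}
  return 0;
}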