Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactors thrust::unique_by_key to use cub::DeviceSelect::UniqueByKey #1245

Merged
merged 10 commits into from
Jan 5, 2024
7 changes: 6 additions & 1 deletion cub/cub/agent/agent_unique_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,9 @@ struct AgentUniqueByKey

CTA_SYNC();

// Preventing loop unrolling helps avoid perf degradation when switching from signed to unsigned 32-bit offset
// types
#pragma unroll(1)
for (int item = threadIdx.x;
item < num_tile_selections;
item += BLOCK_THREADS)
Expand Down Expand Up @@ -626,7 +629,9 @@ struct AgentUniqueByKey
{
// Blocks are launched in increasing order, so just assign one tile per block
int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index
OffsetT tile_offset = tile_idx * ITEMS_PER_TILE; // Global offset for the current tile

// Global offset for the current tile
OffsetT tile_offset = static_cast<OffsetT>(tile_idx) * static_cast<OffsetT>(ITEMS_PER_TILE);

if (tile_idx < num_tiles - 1)
{
Expand Down
268 changes: 213 additions & 55 deletions cub/cub/device/device_select.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -42,13 +42,14 @@
# pragma system_header
#endif // no system header

#include <iterator>
#include <stdio.h>

#include <cub/detail/choose_offset.cuh>
#include <cub/device/dispatch/dispatch_select_if.cuh>
#include <cub/device/dispatch/dispatch_unique_by_key.cuh>
#include <cub/util_deprecated.cuh>

#include <iterator>
#include <stdio.h>

CUB_NAMESPACE_BEGIN


Expand Down Expand Up @@ -837,6 +838,165 @@ struct DeviceSelect
stream);
}

//! @rst
//! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive
//! equal-valued keys, only the first key and its value from each run is selectively copied
//! to ``d_keys_out`` and ``d_values_out``.
//! The total number of items selected is written to ``d_num_selected_out``.
//!
//! - The user-provided equality operator, `equality_op`, is used to determine whether keys are equivalent
//! - Copies of the selected items are compacted into ``d_out`` and maintain
//! their original relative ordering.
//! - In-place operations are not supported. There must be no overlap between
//! any of the provided ranges:
//!
//! - ``[d_keys_in, d_keys_in + num_items)``
//! - ``[d_keys_out, d_keys_out + *d_num_selected_out)``
//! - ``[d_values_in, d_values_in + num_items)``
//! - ``[d_values_out, d_values_out + *d_num_selected_out)``
//! - ``[d_num_selected_out, d_num_selected_out + 1)``
//!
//! - @devicestorage
//!
//! Snippet
//! +++++++++++++++++++++++++++++++++++++++++++++
//!
//! The code snippet below illustrates the compaction of items selected from an ``int`` device vector.
//!
//! .. code-block:: c++
//!
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh>
//!
//! // Declare, allocate, and initialize device-accessible pointers
//! // for input and output
//! int num_items; // e.g., 8
//! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
//! int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
//! int *d_keys_out; // e.g., [ , , , , , , , ]
//! int *d_values_out; // e.g., [ , , , , , , , ]
//! int *d_num_selected_out; // e.g., [ ]
//! ...
//!
//! // Determine temporary device storage requirements
//! void *d_temp_storage = NULL;
//! size_t temp_storage_bytes = 0;
//! cub::DeviceSelect::UniqueByKey(
//! d_temp_storage, temp_storage_bytes,
//! d_keys_in, d_values_in,
//! d_keys_out, d_values_out, d_num_selected_out, num_items);
//!
//! // Allocate temporary storage
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
//!
//! // Run selection
//! cub::DeviceSelect::UniqueByKey(
//! d_temp_storage, temp_storage_bytes,
//! d_keys_in, d_values_in,
//! d_keys_out, d_values_out, d_num_selected_out, num_items);
//!
//! // d_keys_out <-- [0, 2, 9, 5, 8]
//! // d_values_out <-- [1, 2, 4, 5, 8]
//! // d_num_selected_out <-- [5]
//!
//! @endrst
//!
//! @tparam KeyInputIteratorT
//! **[inferred]** Random-access input iterator type for reading input keys @iterator
//!
//! @tparam ValueInputIteratorT
//! **[inferred]** Random-access input iterator type for reading input values @iterator
//!
//! @tparam KeyOutputIteratorT
//! **[inferred]** Random-access output iterator type for writing selected keys @iterator
//!
//! @tparam ValueOutputIteratorT
//! **[inferred]** Random-access output iterator type for writing selected values @iterator
//!
//! @tparam NumSelectedIteratorT
//! **[inferred]** Output iterator type for recording the number of items selected @iterator
//!
//! @tparam NumItemsT
//! **[inferred]** Type of num_items
//!
//! @tparam EqualityOpT
//! **[inferred]** Type of equality_op
//!
//! @param[in] d_temp_storage
//! Device-accessible allocation of temporary storage. When `nullptr`, the
//! required allocation size is written to `temp_storage_bytes` and no work is done.
//!
//! @param[in,out] temp_storage_bytes
//! Reference to size in bytes of `d_temp_storage` allocation
//!
//! @param[in] d_keys_in
//! Pointer to the input sequence of keys
//!
//! @param[in] d_values_in
//! Pointer to the input sequence of values
//!
//! @param[out] d_keys_out
//! Pointer to the output sequence of selected keys
//!
//! @param[out] d_values_out
//! Pointer to the output sequence of selected values
//!
//! @param[out] d_num_selected_out
//! Pointer to the total number of items selected (i.e., length of `d_keys_out` or `d_values_out`)
//!
//! @param[in] num_items
//! Total number of input items (i.e., length of `d_keys_in` or `d_values_in`)
//!
//! @param[in] equality_op
//! Binary predicate to determine equality
//!
//! @param[in] stream
//! @rst
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
//! @endrst
template <typename KeyInputIteratorT,
typename ValueInputIteratorT,
typename KeyOutputIteratorT,
typename ValueOutputIteratorT,
typename NumSelectedIteratorT,
typename NumItemsT,
typename EqualityOpT>
CUB_RUNTIME_FUNCTION __forceinline__ static //
typename ::cuda::std::enable_if< //
!::cuda::std::is_convertible<EqualityOpT, cudaStream_t>::value, //
cudaError_t>::type
UniqueByKey(
void* d_temp_storage,
size_t& temp_storage_bytes,
KeyInputIteratorT d_keys_in,
ValueInputIteratorT d_values_in,
KeyOutputIteratorT d_keys_out,
ValueOutputIteratorT d_values_out,
NumSelectedIteratorT d_num_selected_out,
NumItemsT num_items,
EqualityOpT equality_op,
cudaStream_t stream = 0)
{
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;

return DispatchUniqueByKey<
KeyInputIteratorT,
ValueInputIteratorT,
KeyOutputIteratorT,
ValueOutputIteratorT,
NumSelectedIteratorT,
EqualityOpT,
OffsetT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_values_in,
d_keys_out,
d_values_out,
d_num_selected_out,
equality_op,
static_cast<OffsetT>(num_items),
stream);
}

//! @rst
//! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive
//! equal-valued keys, only the first key and its value from each run is selectively copied
Expand Down Expand Up @@ -914,6 +1074,9 @@ struct DeviceSelect
//! @tparam NumSelectedIteratorT
//! **[inferred]** Output iterator type for recording the number of items selected @iterator
//!
//! @tparam NumItemsT
//! **[inferred]** Type of num_items
//!
//! @param[in] d_temp_storage
//! Device-accessible allocation of temporary storage. When `nullptr`, the
//! required allocation size is written to `temp_storage_bytes` and no work is done.
Expand Down Expand Up @@ -947,72 +1110,67 @@ struct DeviceSelect
typename ValueInputIteratorT,
typename KeyOutputIteratorT,
typename ValueOutputIteratorT,
typename NumSelectedIteratorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
UniqueByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
KeyInputIteratorT d_keys_in,
ValueInputIteratorT d_values_in,
KeyOutputIteratorT d_keys_out,
ValueOutputIteratorT d_values_out,
NumSelectedIteratorT d_num_selected_out,
int num_items,
cudaStream_t stream = 0)
typename NumSelectedIteratorT,
typename NumItemsT>
CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey(
void* d_temp_storage,
size_t& temp_storage_bytes,
KeyInputIteratorT d_keys_in,
ValueInputIteratorT d_values_in,
KeyOutputIteratorT d_keys_out,
ValueOutputIteratorT d_values_out,
NumSelectedIteratorT d_num_selected_out,
NumItemsT num_items,
cudaStream_t stream = 0)
{
using OffsetT = int;
using EqualityOp = Equality;

return DispatchUniqueByKey<KeyInputIteratorT,
ValueInputIteratorT,
KeyOutputIteratorT,
ValueOutputIteratorT,
NumSelectedIteratorT,
EqualityOp,
OffsetT>::Dispatch(d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_values_in,
d_keys_out,
d_values_out,
d_num_selected_out,
EqualityOp(),
num_items,
stream);
return UniqueByKey(
d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_values_in,
d_keys_out,
d_values_out,
d_num_selected_out,
num_items,
Equality{},
stream);
}

template <typename KeyInputIteratorT,
typename ValueInputIteratorT,
typename KeyOutputIteratorT,
typename ValueOutputIteratorT,
typename NumSelectedIteratorT>
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
UniqueByKey(void *d_temp_storage,
size_t &temp_storage_bytes,
KeyInputIteratorT d_keys_in,
ValueInputIteratorT d_values_in,
KeyOutputIteratorT d_keys_out,
ValueOutputIteratorT d_values_out,
NumSelectedIteratorT d_num_selected_out,
int num_items,
cudaStream_t stream,
bool debug_synchronous)
typename NumSelectedIteratorT,
typename NumItemsT>
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey(
void* d_temp_storage,
size_t& temp_storage_bytes,
KeyInputIteratorT d_keys_in,
ValueInputIteratorT d_values_in,
KeyOutputIteratorT d_keys_out,
ValueOutputIteratorT d_values_out,
NumSelectedIteratorT d_num_selected_out,
NumItemsT num_items,
cudaStream_t stream,
bool debug_synchronous)
{
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

return UniqueByKey<KeyInputIteratorT,
ValueInputIteratorT,
KeyOutputIteratorT,
ValueOutputIteratorT,
NumSelectedIteratorT>(d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_values_in,
d_keys_out,
d_values_out,
d_num_selected_out,
num_items,
stream);
NumSelectedIteratorT,
NumItemsT>(
d_temp_storage,
temp_storage_bytes,
d_keys_in,
d_values_in,
d_keys_out,
d_values_out,
d_num_selected_out,
num_items,
stream);
}
};

Expand Down
Loading