Skip to content

Commit 50ce545

Browse files
authored
Refactors thrust::unique_by_key to use cub::DeviceSelect::UniqueByKey (#1245)
* adds copy assignment for hugetype to enable std algorithm * adds vsmem option to unique-by-key * move huge data type to c2h utilities * ports unique_by_key implementation to cub * adds tests for large problem counts to thrust * adds tests for custom equality op * adds sfinae to equality_op overload to avoid ambiguity * prevent loop unrolling to alleviate perf degradation for u32 * fixes signedness of comparison in tests * addresses review comments
1 parent b4d490b commit 50ce545

File tree

6 files changed

+676
-892
lines changed

6 files changed

+676
-892
lines changed

cub/cub/agent/agent_unique_by_key.cuh

+6-1
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,9 @@ struct AgentUniqueByKey
302302

303303
CTA_SYNC();
304304

305+
// Preventing loop unrolling helps avoid perf degradation when switching from signed to unsigned 32-bit offset
306+
// types
307+
#pragma unroll(1)
305308
for (int item = threadIdx.x;
306309
item < num_tile_selections;
307310
item += BLOCK_THREADS)
@@ -626,7 +629,9 @@ struct AgentUniqueByKey
626629
{
627630
// Blocks are launched in increasing order, so just assign one tile per block
628631
int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index
629-
OffsetT tile_offset = tile_idx * ITEMS_PER_TILE; // Global offset for the current tile
632+
633+
// Global offset for the current tile
634+
OffsetT tile_offset = static_cast<OffsetT>(tile_idx) * static_cast<OffsetT>(ITEMS_PER_TILE);
630635

631636
if (tile_idx < num_tiles - 1)
632637
{

cub/cub/device/device_select.cuh

+213-55
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,14 @@
4242
# pragma system_header
4343
#endif // no system header
4444

45-
#include <iterator>
46-
#include <stdio.h>
47-
45+
#include <cub/detail/choose_offset.cuh>
4846
#include <cub/device/dispatch/dispatch_select_if.cuh>
4947
#include <cub/device/dispatch/dispatch_unique_by_key.cuh>
5048
#include <cub/util_deprecated.cuh>
5149

50+
#include <iterator>
51+
#include <stdio.h>
52+
5253
CUB_NAMESPACE_BEGIN
5354

5455

@@ -837,6 +838,165 @@ struct DeviceSelect
837838
stream);
838839
}
839840

841+
//! @rst
842+
//! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive
843+
//! equal-valued keys, only the first key and its value from each run is selectively copied
844+
//! to ``d_keys_out`` and ``d_values_out``.
845+
//! The total number of items selected is written to ``d_num_selected_out``.
846+
//!
847+
//! - The user-provided equality operator, `equality_op`, is used to determine whether keys are equivalent
848+
//! - Copies of the selected items are compacted into ``d_out`` and maintain
849+
//! their original relative ordering.
850+
//! - In-place operations are not supported. There must be no overlap between
851+
//! any of the provided ranges:
852+
//!
853+
//! - ``[d_keys_in, d_keys_in + num_items)``
854+
//! - ``[d_keys_out, d_keys_out + *d_num_selected_out)``
855+
//! - ``[d_values_in, d_values_in + num_items)``
856+
//! - ``[d_values_out, d_values_out + *d_num_selected_out)``
857+
//! - ``[d_num_selected_out, d_num_selected_out + 1)``
858+
//!
859+
//! - @devicestorage
860+
//!
861+
//! Snippet
862+
//! +++++++++++++++++++++++++++++++++++++++++++++
863+
//!
864+
//! The code snippet below illustrates the compaction of items selected from an ``int`` device vector.
865+
//!
866+
//! .. code-block:: c++
867+
//!
868+
//! #include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh>
869+
//!
870+
//! // Declare, allocate, and initialize device-accessible pointers
871+
//! // for input and output
872+
//! int num_items; // e.g., 8
873+
//! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
874+
//! int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
875+
//! int *d_keys_out; // e.g., [ , , , , , , , ]
876+
//! int *d_values_out; // e.g., [ , , , , , , , ]
877+
//! int *d_num_selected_out; // e.g., [ ]
878+
//! ...
879+
//!
880+
//! // Determine temporary device storage requirements
881+
//! void *d_temp_storage = NULL;
882+
//! size_t temp_storage_bytes = 0;
883+
//! cub::DeviceSelect::UniqueByKey(
884+
//! d_temp_storage, temp_storage_bytes,
885+
//! d_keys_in, d_values_in,
886+
//! d_keys_out, d_values_out, d_num_selected_out, num_items);
887+
//!
888+
//! // Allocate temporary storage
889+
//! cudaMalloc(&d_temp_storage, temp_storage_bytes);
890+
//!
891+
//! // Run selection
892+
//! cub::DeviceSelect::UniqueByKey(
893+
//! d_temp_storage, temp_storage_bytes,
894+
//! d_keys_in, d_values_in,
895+
//! d_keys_out, d_values_out, d_num_selected_out, num_items);
896+
//!
897+
//! // d_keys_out <-- [0, 2, 9, 5, 8]
898+
//! // d_values_out <-- [1, 2, 4, 5, 8]
899+
//! // d_num_selected_out <-- [5]
900+
//!
901+
//! @endrst
902+
//!
903+
//! @tparam KeyInputIteratorT
904+
//! **[inferred]** Random-access input iterator type for reading input keys @iterator
905+
//!
906+
//! @tparam ValueInputIteratorT
907+
//! **[inferred]** Random-access input iterator type for reading input values @iterator
908+
//!
909+
//! @tparam KeyOutputIteratorT
910+
//! **[inferred]** Random-access output iterator type for writing selected keys @iterator
911+
//!
912+
//! @tparam ValueOutputIteratorT
913+
//! **[inferred]** Random-access output iterator type for writing selected values @iterator
914+
//!
915+
//! @tparam NumSelectedIteratorT
916+
//! **[inferred]** Output iterator type for recording the number of items selected @iterator
917+
//!
918+
//! @tparam NumItemsT
919+
//! **[inferred]** Type of num_items
920+
//!
921+
//! @tparam EqualityOpT
922+
//! **[inferred]** Type of equality_op
923+
//!
924+
//! @param[in] d_temp_storage
925+
//! Device-accessible allocation of temporary storage. When `nullptr`, the
926+
//! required allocation size is written to `temp_storage_bytes` and no work is done.
927+
//!
928+
//! @param[in,out] temp_storage_bytes
929+
//! Reference to size in bytes of `d_temp_storage` allocation
930+
//!
931+
//! @param[in] d_keys_in
932+
//! Pointer to the input sequence of keys
933+
//!
934+
//! @param[in] d_values_in
935+
//! Pointer to the input sequence of values
936+
//!
937+
//! @param[out] d_keys_out
938+
//! Pointer to the output sequence of selected keys
939+
//!
940+
//! @param[out] d_values_out
941+
//! Pointer to the output sequence of selected values
942+
//!
943+
//! @param[out] d_num_selected_out
944+
//! Pointer to the total number of items selected (i.e., length of `d_keys_out` or `d_values_out`)
945+
//!
946+
//! @param[in] num_items
947+
//! Total number of input items (i.e., length of `d_keys_in` or `d_values_in`)
948+
//!
949+
//! @param[in] equality_op
950+
//! Binary predicate to determine equality
951+
//!
952+
//! @param[in] stream
953+
//! @rst
954+
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
955+
//! @endrst
956+
template <typename KeyInputIteratorT,
957+
typename ValueInputIteratorT,
958+
typename KeyOutputIteratorT,
959+
typename ValueOutputIteratorT,
960+
typename NumSelectedIteratorT,
961+
typename NumItemsT,
962+
typename EqualityOpT>
963+
CUB_RUNTIME_FUNCTION __forceinline__ static //
964+
typename ::cuda::std::enable_if< //
965+
!::cuda::std::is_convertible<EqualityOpT, cudaStream_t>::value, //
966+
cudaError_t>::type
967+
UniqueByKey(
968+
void* d_temp_storage,
969+
size_t& temp_storage_bytes,
970+
KeyInputIteratorT d_keys_in,
971+
ValueInputIteratorT d_values_in,
972+
KeyOutputIteratorT d_keys_out,
973+
ValueOutputIteratorT d_values_out,
974+
NumSelectedIteratorT d_num_selected_out,
975+
NumItemsT num_items,
976+
EqualityOpT equality_op,
977+
cudaStream_t stream = 0)
978+
{
979+
using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type;
980+
981+
return DispatchUniqueByKey<
982+
KeyInputIteratorT,
983+
ValueInputIteratorT,
984+
KeyOutputIteratorT,
985+
ValueOutputIteratorT,
986+
NumSelectedIteratorT,
987+
EqualityOpT,
988+
OffsetT>::Dispatch(d_temp_storage,
989+
temp_storage_bytes,
990+
d_keys_in,
991+
d_values_in,
992+
d_keys_out,
993+
d_values_out,
994+
d_num_selected_out,
995+
equality_op,
996+
static_cast<OffsetT>(num_items),
997+
stream);
998+
}
999+
8401000
//! @rst
8411001
//! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive
8421002
//! equal-valued keys, only the first key and its value from each run is selectively copied
@@ -914,6 +1074,9 @@ struct DeviceSelect
9141074
//! @tparam NumSelectedIteratorT
9151075
//! **[inferred]** Output iterator type for recording the number of items selected @iterator
9161076
//!
1077+
//! @tparam NumItemsT
1078+
//! **[inferred]** Type of num_items
1079+
//!
9171080
//! @param[in] d_temp_storage
9181081
//! Device-accessible allocation of temporary storage. When `nullptr`, the
9191082
//! required allocation size is written to `temp_storage_bytes` and no work is done.
@@ -947,72 +1110,67 @@ struct DeviceSelect
9471110
typename ValueInputIteratorT,
9481111
typename KeyOutputIteratorT,
9491112
typename ValueOutputIteratorT,
950-
typename NumSelectedIteratorT>
951-
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
952-
UniqueByKey(void *d_temp_storage,
953-
size_t &temp_storage_bytes,
954-
KeyInputIteratorT d_keys_in,
955-
ValueInputIteratorT d_values_in,
956-
KeyOutputIteratorT d_keys_out,
957-
ValueOutputIteratorT d_values_out,
958-
NumSelectedIteratorT d_num_selected_out,
959-
int num_items,
960-
cudaStream_t stream = 0)
1113+
typename NumSelectedIteratorT,
1114+
typename NumItemsT>
1115+
CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey(
1116+
void* d_temp_storage,
1117+
size_t& temp_storage_bytes,
1118+
KeyInputIteratorT d_keys_in,
1119+
ValueInputIteratorT d_values_in,
1120+
KeyOutputIteratorT d_keys_out,
1121+
ValueOutputIteratorT d_values_out,
1122+
NumSelectedIteratorT d_num_selected_out,
1123+
NumItemsT num_items,
1124+
cudaStream_t stream = 0)
9611125
{
962-
using OffsetT = int;
963-
using EqualityOp = Equality;
964-
965-
return DispatchUniqueByKey<KeyInputIteratorT,
966-
ValueInputIteratorT,
967-
KeyOutputIteratorT,
968-
ValueOutputIteratorT,
969-
NumSelectedIteratorT,
970-
EqualityOp,
971-
OffsetT>::Dispatch(d_temp_storage,
972-
temp_storage_bytes,
973-
d_keys_in,
974-
d_values_in,
975-
d_keys_out,
976-
d_values_out,
977-
d_num_selected_out,
978-
EqualityOp(),
979-
num_items,
980-
stream);
1126+
return UniqueByKey(
1127+
d_temp_storage,
1128+
temp_storage_bytes,
1129+
d_keys_in,
1130+
d_values_in,
1131+
d_keys_out,
1132+
d_values_out,
1133+
d_num_selected_out,
1134+
num_items,
1135+
Equality{},
1136+
stream);
9811137
}
9821138

9831139
template <typename KeyInputIteratorT,
9841140
typename ValueInputIteratorT,
9851141
typename KeyOutputIteratorT,
9861142
typename ValueOutputIteratorT,
987-
typename NumSelectedIteratorT>
988-
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
989-
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
990-
UniqueByKey(void *d_temp_storage,
991-
size_t &temp_storage_bytes,
992-
KeyInputIteratorT d_keys_in,
993-
ValueInputIteratorT d_values_in,
994-
KeyOutputIteratorT d_keys_out,
995-
ValueOutputIteratorT d_values_out,
996-
NumSelectedIteratorT d_num_selected_out,
997-
int num_items,
998-
cudaStream_t stream,
999-
bool debug_synchronous)
1143+
typename NumSelectedIteratorT,
1144+
typename NumItemsT>
1145+
CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey(
1146+
void* d_temp_storage,
1147+
size_t& temp_storage_bytes,
1148+
KeyInputIteratorT d_keys_in,
1149+
ValueInputIteratorT d_values_in,
1150+
KeyOutputIteratorT d_keys_out,
1151+
ValueOutputIteratorT d_values_out,
1152+
NumSelectedIteratorT d_num_selected_out,
1153+
NumItemsT num_items,
1154+
cudaStream_t stream,
1155+
bool debug_synchronous)
10001156
{
10011157
CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
10021158

10031159
return UniqueByKey<KeyInputIteratorT,
10041160
ValueInputIteratorT,
10051161
KeyOutputIteratorT,
10061162
ValueOutputIteratorT,
1007-
NumSelectedIteratorT>(d_temp_storage,
1008-
temp_storage_bytes,
1009-
d_keys_in,
1010-
d_values_in,
1011-
d_keys_out,
1012-
d_values_out,
1013-
d_num_selected_out,
1014-
num_items,
1015-
stream);
1163+
NumSelectedIteratorT,
1164+
NumItemsT>(
1165+
d_temp_storage,
1166+
temp_storage_bytes,
1167+
d_keys_in,
1168+
d_values_in,
1169+
d_keys_out,
1170+
d_values_out,
1171+
d_num_selected_out,
1172+
num_items,
1173+
stream);
10161174
}
10171175
};
10181176

0 commit comments

Comments
 (0)