|
42 | 42 | # pragma system_header
|
43 | 43 | #endif // no system header
|
44 | 44 |
|
45 |
| -#include <iterator> |
46 |
| -#include <stdio.h> |
47 |
| - |
| 45 | +#include <cub/detail/choose_offset.cuh> |
48 | 46 | #include <cub/device/dispatch/dispatch_select_if.cuh>
|
49 | 47 | #include <cub/device/dispatch/dispatch_unique_by_key.cuh>
|
50 | 48 | #include <cub/util_deprecated.cuh>
|
51 | 49 |
|
| 50 | +#include <iterator> |
| 51 | +#include <stdio.h> |
| 52 | + |
52 | 53 | CUB_NAMESPACE_BEGIN
|
53 | 54 |
|
54 | 55 |
|
@@ -837,6 +838,165 @@ struct DeviceSelect
|
837 | 838 | stream);
|
838 | 839 | }
|
839 | 840 |
|
| 841 | + //! @rst |
| 842 | + //! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive |
| 843 | + //! equal-valued keys, only the first key and its value from each run is selectively copied |
| 844 | + //! to ``d_keys_out`` and ``d_values_out``. |
| 845 | + //! The total number of items selected is written to ``d_num_selected_out``. |
| 846 | + //! |
| 847 | + //! - The user-provided equality operator, `equality_op`, is used to determine whether keys are equivalent |
| 848 | + //! - Copies of the selected items are compacted into ``d_out`` and maintain |
| 849 | + //! their original relative ordering. |
| 850 | + //! - In-place operations are not supported. There must be no overlap between |
| 851 | + //! any of the provided ranges: |
| 852 | + //! |
| 853 | + //! - ``[d_keys_in, d_keys_in + num_items)`` |
| 854 | + //! - ``[d_keys_out, d_keys_out + *d_num_selected_out)`` |
| 855 | + //! - ``[d_values_in, d_values_in + num_items)`` |
| 856 | + //! - ``[d_values_out, d_values_out + *d_num_selected_out)`` |
| 857 | + //! - ``[d_num_selected_out, d_num_selected_out + 1)`` |
| 858 | + //! |
| 859 | + //! - @devicestorage |
| 860 | + //! |
| 861 | + //! Snippet |
| 862 | + //! +++++++++++++++++++++++++++++++++++++++++++++ |
| 863 | + //! |
| 864 | + //! The code snippet below illustrates the compaction of items selected from an ``int`` device vector. |
| 865 | + //! |
| 866 | + //! .. code-block:: c++ |
| 867 | + //! |
| 868 | + //! #include <cub/cub.cuh> // or equivalently <cub/device/device_select.cuh> |
| 869 | + //! |
| 870 | + //! // Declare, allocate, and initialize device-accessible pointers |
| 871 | + //! // for input and output |
| 872 | + //! int num_items; // e.g., 8 |
| 873 | + //! int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] |
| 874 | + //! int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] |
| 875 | + //! int *d_keys_out; // e.g., [ , , , , , , , ] |
| 876 | + //! int *d_values_out; // e.g., [ , , , , , , , ] |
| 877 | + //! int *d_num_selected_out; // e.g., [ ] |
| 878 | + //! ... |
| 879 | + //! |
| 880 | + //! // Determine temporary device storage requirements |
| 881 | + //! void *d_temp_storage = NULL; |
| 882 | + //! size_t temp_storage_bytes = 0; |
| 883 | + //! cub::DeviceSelect::UniqueByKey( |
| 884 | + //! d_temp_storage, temp_storage_bytes, |
| 885 | + //! d_keys_in, d_values_in, |
| 886 | + //! d_keys_out, d_values_out, d_num_selected_out, num_items); |
| 887 | + //! |
| 888 | + //! // Allocate temporary storage |
| 889 | + //! cudaMalloc(&d_temp_storage, temp_storage_bytes); |
| 890 | + //! |
| 891 | + //! // Run selection |
| 892 | + //! cub::DeviceSelect::UniqueByKey( |
| 893 | + //! d_temp_storage, temp_storage_bytes, |
| 894 | + //! d_keys_in, d_values_in, |
| 895 | + //! d_keys_out, d_values_out, d_num_selected_out, num_items); |
| 896 | + //! |
| 897 | + //! // d_keys_out <-- [0, 2, 9, 5, 8] |
| 898 | + //! // d_values_out <-- [1, 2, 4, 5, 8] |
| 899 | + //! // d_num_selected_out <-- [5] |
| 900 | + //! |
| 901 | + //! @endrst |
| 902 | + //! |
| 903 | + //! @tparam KeyInputIteratorT |
| 904 | + //! **[inferred]** Random-access input iterator type for reading input keys @iterator |
| 905 | + //! |
| 906 | + //! @tparam ValueInputIteratorT |
| 907 | + //! **[inferred]** Random-access input iterator type for reading input values @iterator |
| 908 | + //! |
| 909 | + //! @tparam KeyOutputIteratorT |
| 910 | + //! **[inferred]** Random-access output iterator type for writing selected keys @iterator |
| 911 | + //! |
| 912 | + //! @tparam ValueOutputIteratorT |
| 913 | + //! **[inferred]** Random-access output iterator type for writing selected values @iterator |
| 914 | + //! |
| 915 | + //! @tparam NumSelectedIteratorT |
| 916 | + //! **[inferred]** Output iterator type for recording the number of items selected @iterator |
| 917 | + //! |
| 918 | + //! @tparam NumItemsT |
| 919 | + //! **[inferred]** Type of num_items |
| 920 | + //! |
| 921 | + //! @tparam EqualityOpT |
| 922 | + //! **[inferred]** Type of equality_op |
| 923 | + //! |
| 924 | + //! @param[in] d_temp_storage |
| 925 | + //! Device-accessible allocation of temporary storage. When `nullptr`, the |
| 926 | + //! required allocation size is written to `temp_storage_bytes` and no work is done. |
| 927 | + //! |
| 928 | + //! @param[in,out] temp_storage_bytes |
| 929 | + //! Reference to size in bytes of `d_temp_storage` allocation |
| 930 | + //! |
| 931 | + //! @param[in] d_keys_in |
| 932 | + //! Pointer to the input sequence of keys |
| 933 | + //! |
| 934 | + //! @param[in] d_values_in |
| 935 | + //! Pointer to the input sequence of values |
| 936 | + //! |
| 937 | + //! @param[out] d_keys_out |
| 938 | + //! Pointer to the output sequence of selected keys |
| 939 | + //! |
| 940 | + //! @param[out] d_values_out |
| 941 | + //! Pointer to the output sequence of selected values |
| 942 | + //! |
| 943 | + //! @param[out] d_num_selected_out |
| 944 | + //! Pointer to the total number of items selected (i.e., length of `d_keys_out` or `d_values_out`) |
| 945 | + //! |
| 946 | + //! @param[in] num_items |
| 947 | + //! Total number of input items (i.e., length of `d_keys_in` or `d_values_in`) |
| 948 | + //! |
| 949 | + //! @param[in] equality_op |
| 950 | + //! Binary predicate to determine equality |
| 951 | + //! |
| 952 | + //! @param[in] stream |
| 953 | + //! @rst |
| 954 | + //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. |
| 955 | + //! @endrst |
| 956 | + template <typename KeyInputIteratorT, |
| 957 | + typename ValueInputIteratorT, |
| 958 | + typename KeyOutputIteratorT, |
| 959 | + typename ValueOutputIteratorT, |
| 960 | + typename NumSelectedIteratorT, |
| 961 | + typename NumItemsT, |
| 962 | + typename EqualityOpT> |
| 963 | + CUB_RUNTIME_FUNCTION __forceinline__ static // |
| 964 | + typename ::cuda::std::enable_if< // |
| 965 | + !::cuda::std::is_convertible<EqualityOpT, cudaStream_t>::value, // |
| 966 | + cudaError_t>::type |
| 967 | + UniqueByKey( |
| 968 | + void* d_temp_storage, |
| 969 | + size_t& temp_storage_bytes, |
| 970 | + KeyInputIteratorT d_keys_in, |
| 971 | + ValueInputIteratorT d_values_in, |
| 972 | + KeyOutputIteratorT d_keys_out, |
| 973 | + ValueOutputIteratorT d_values_out, |
| 974 | + NumSelectedIteratorT d_num_selected_out, |
| 975 | + NumItemsT num_items, |
| 976 | + EqualityOpT equality_op, |
| 977 | + cudaStream_t stream = 0) |
| 978 | + { |
| 979 | + using OffsetT = typename detail::ChooseOffsetT<NumItemsT>::Type; |
| 980 | + |
| 981 | + return DispatchUniqueByKey< |
| 982 | + KeyInputIteratorT, |
| 983 | + ValueInputIteratorT, |
| 984 | + KeyOutputIteratorT, |
| 985 | + ValueOutputIteratorT, |
| 986 | + NumSelectedIteratorT, |
| 987 | + EqualityOpT, |
| 988 | + OffsetT>::Dispatch(d_temp_storage, |
| 989 | + temp_storage_bytes, |
| 990 | + d_keys_in, |
| 991 | + d_values_in, |
| 992 | + d_keys_out, |
| 993 | + d_values_out, |
| 994 | + d_num_selected_out, |
| 995 | + equality_op, |
| 996 | + static_cast<OffsetT>(num_items), |
| 997 | + stream); |
| 998 | + } |
| 999 | + |
840 | 1000 | //! @rst
|
841 | 1001 | //! Given an input sequence ``d_keys_in`` and ``d_values_in`` with runs of key-value pairs with consecutive
|
842 | 1002 | //! equal-valued keys, only the first key and its value from each run is selectively copied
|
@@ -914,6 +1074,9 @@ struct DeviceSelect
|
914 | 1074 | //! @tparam NumSelectedIteratorT
|
915 | 1075 | //! **[inferred]** Output iterator type for recording the number of items selected @iterator
|
916 | 1076 | //!
|
| 1077 | + //! @tparam NumItemsT |
| 1078 | + //! **[inferred]** Type of num_items |
| 1079 | + //! |
917 | 1080 | //! @param[in] d_temp_storage
|
918 | 1081 | //! Device-accessible allocation of temporary storage. When `nullptr`, the
|
919 | 1082 | //! required allocation size is written to `temp_storage_bytes` and no work is done.
|
@@ -947,72 +1110,67 @@ struct DeviceSelect
|
947 | 1110 | typename ValueInputIteratorT,
|
948 | 1111 | typename KeyOutputIteratorT,
|
949 | 1112 | typename ValueOutputIteratorT,
|
950 |
| - typename NumSelectedIteratorT> |
951 |
| - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t |
952 |
| - UniqueByKey(void *d_temp_storage, |
953 |
| - size_t &temp_storage_bytes, |
954 |
| - KeyInputIteratorT d_keys_in, |
955 |
| - ValueInputIteratorT d_values_in, |
956 |
| - KeyOutputIteratorT d_keys_out, |
957 |
| - ValueOutputIteratorT d_values_out, |
958 |
| - NumSelectedIteratorT d_num_selected_out, |
959 |
| - int num_items, |
960 |
| - cudaStream_t stream = 0) |
| 1113 | + typename NumSelectedIteratorT, |
| 1114 | + typename NumItemsT> |
| 1115 | + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey( |
| 1116 | + void* d_temp_storage, |
| 1117 | + size_t& temp_storage_bytes, |
| 1118 | + KeyInputIteratorT d_keys_in, |
| 1119 | + ValueInputIteratorT d_values_in, |
| 1120 | + KeyOutputIteratorT d_keys_out, |
| 1121 | + ValueOutputIteratorT d_values_out, |
| 1122 | + NumSelectedIteratorT d_num_selected_out, |
| 1123 | + NumItemsT num_items, |
| 1124 | + cudaStream_t stream = 0) |
961 | 1125 | {
|
962 |
| - using OffsetT = int; |
963 |
| - using EqualityOp = Equality; |
964 |
| - |
965 |
| - return DispatchUniqueByKey<KeyInputIteratorT, |
966 |
| - ValueInputIteratorT, |
967 |
| - KeyOutputIteratorT, |
968 |
| - ValueOutputIteratorT, |
969 |
| - NumSelectedIteratorT, |
970 |
| - EqualityOp, |
971 |
| - OffsetT>::Dispatch(d_temp_storage, |
972 |
| - temp_storage_bytes, |
973 |
| - d_keys_in, |
974 |
| - d_values_in, |
975 |
| - d_keys_out, |
976 |
| - d_values_out, |
977 |
| - d_num_selected_out, |
978 |
| - EqualityOp(), |
979 |
| - num_items, |
980 |
| - stream); |
| 1126 | + return UniqueByKey( |
| 1127 | + d_temp_storage, |
| 1128 | + temp_storage_bytes, |
| 1129 | + d_keys_in, |
| 1130 | + d_values_in, |
| 1131 | + d_keys_out, |
| 1132 | + d_values_out, |
| 1133 | + d_num_selected_out, |
| 1134 | + num_items, |
| 1135 | + Equality{}, |
| 1136 | + stream); |
981 | 1137 | }
|
982 | 1138 |
|
983 | 1139 | template <typename KeyInputIteratorT,
|
984 | 1140 | typename ValueInputIteratorT,
|
985 | 1141 | typename KeyOutputIteratorT,
|
986 | 1142 | typename ValueOutputIteratorT,
|
987 |
| - typename NumSelectedIteratorT> |
988 |
| - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED |
989 |
| - CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t |
990 |
| - UniqueByKey(void *d_temp_storage, |
991 |
| - size_t &temp_storage_bytes, |
992 |
| - KeyInputIteratorT d_keys_in, |
993 |
| - ValueInputIteratorT d_values_in, |
994 |
| - KeyOutputIteratorT d_keys_out, |
995 |
| - ValueOutputIteratorT d_values_out, |
996 |
| - NumSelectedIteratorT d_num_selected_out, |
997 |
| - int num_items, |
998 |
| - cudaStream_t stream, |
999 |
| - bool debug_synchronous) |
| 1143 | + typename NumSelectedIteratorT, |
| 1144 | + typename NumItemsT> |
| 1145 | + CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t UniqueByKey( |
| 1146 | + void* d_temp_storage, |
| 1147 | + size_t& temp_storage_bytes, |
| 1148 | + KeyInputIteratorT d_keys_in, |
| 1149 | + ValueInputIteratorT d_values_in, |
| 1150 | + KeyOutputIteratorT d_keys_out, |
| 1151 | + ValueOutputIteratorT d_values_out, |
| 1152 | + NumSelectedIteratorT d_num_selected_out, |
| 1153 | + NumItemsT num_items, |
| 1154 | + cudaStream_t stream, |
| 1155 | + bool debug_synchronous) |
1000 | 1156 | {
|
1001 | 1157 | CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG
|
1002 | 1158 |
|
1003 | 1159 | return UniqueByKey<KeyInputIteratorT,
|
1004 | 1160 | ValueInputIteratorT,
|
1005 | 1161 | KeyOutputIteratorT,
|
1006 | 1162 | ValueOutputIteratorT,
|
1007 |
| - NumSelectedIteratorT>(d_temp_storage, |
1008 |
| - temp_storage_bytes, |
1009 |
| - d_keys_in, |
1010 |
| - d_values_in, |
1011 |
| - d_keys_out, |
1012 |
| - d_values_out, |
1013 |
| - d_num_selected_out, |
1014 |
| - num_items, |
1015 |
| - stream); |
| 1163 | + NumSelectedIteratorT, |
| 1164 | + NumItemsT>( |
| 1165 | + d_temp_storage, |
| 1166 | + temp_storage_bytes, |
| 1167 | + d_keys_in, |
| 1168 | + d_values_in, |
| 1169 | + d_keys_out, |
| 1170 | + d_values_out, |
| 1171 | + d_num_selected_out, |
| 1172 | + num_items, |
| 1173 | + stream); |
1016 | 1174 | }
|
1017 | 1175 | };
|
1018 | 1176 |
|
|
0 commit comments