diff --git a/cub/cub/detail/choose_offset.cuh b/cub/cub/detail/choose_offset.cuh index ed703e26c9d..4154123973a 100644 --- a/cub/cub/detail/choose_offset.cuh +++ b/cub/cub/detail/choose_offset.cuh @@ -29,8 +29,10 @@ #include +#include +#include + #include -#include CUB_NAMESPACE_BEGIN @@ -45,18 +47,28 @@ template struct ChooseOffsetT { // NumItemsT must be an integral type (but not bool). - static_assert( - std::is_integral::value && - !std::is_same::type, bool>::value, - "NumItemsT must be an integral type, but not bool"); + static_assert(std::is_integral::value && + !std::is_same::type, bool>::value, + "NumItemsT must be an integral type, but not bool"); // Unsigned integer type for global offsets. - using Type = typename std::conditional::type; + using Type = + typename std::conditional::type; }; +/** + * common_iterator_value sets member type to the common_type of + * value_type for all argument types. used to get OffsetT in + * DeviceSegmentedReduce. + */ +template +struct common_iterator_value +{ + using type = ::cuda::std::__common_type_t<::cuda::std::__iter_value_type...>; +}; +template +using common_iterator_value_t = typename common_iterator_value::type; + } // namespace detail CUB_NAMESPACE_END - diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index 52706f2b133..a776ac0f9e7 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -13,9 +13,9 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; @@ -27,34 +27,34 @@ ******************************************************************************/ /** - * @file cub::DeviceSegmentedReduce provides device-wide, parallel operations - * for computing a batched reduction across multiple sequences of data + * @file cub::DeviceSegmentedReduce provides device-wide, parallel operations + * for computing a batched reduction across multiple sequences of data * items residing within device-accessible memory. */ #pragma once -#include - #include +#include #include #include #include #include #include -CUB_NAMESPACE_BEGIN +#include +CUB_NAMESPACE_BEGIN /** - * @brief DeviceSegmentedReduce provides device-wide, parallel operations for - * computing a reduction across multiple sequences of data items + * @brief DeviceSegmentedReduce provides device-wide, parallel operations for + * computing a reduction across multiple sequences of data items * residing within device-accessible memory. ![](reduce_logo.png) * @ingroup SegmentedModule * * @par Overview - * A *reduction* - * (or *fold*) uses a binary combining operator to compute a single aggregate + * A *reduction* + * (or *fold*) uses a binary combining operator to compute a single aggregate * from a sequence of input elements. * * @par Usage Considerations @@ -64,7 +64,7 @@ CUB_NAMESPACE_BEGIN struct DeviceSegmentedReduce { /** - * @brief Computes a device-wide segmented reduction using the specified + * @brief Computes a device-wide segmented reduction using the specified * binary `reduction_op` functor. * * @par @@ -78,19 +78,19 @@ struct DeviceSegmentedReduce * `segment_offsets` (of length `num_segments + 1`) can be aliased * for both the `d_begin_offsets` and `d_end_offsets` parameters (where * the latter is specified as `segment_offsets + 1`). - * - Let `s` be in `[0, num_segments)`. The range - * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not + * - Let `s` be in `[0, num_segments)`. The range + * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)`. * - @devicestorage * * @par Snippet - * The code snippet below illustrates a custom min-reduction of a device + * The code snippet below illustrates a custom min-reduction of a device * vector of `int` data elements. * @par * @code - * #include + * #include * // or equivalently * * // CustomMin functor @@ -103,7 +103,7 @@ struct DeviceSegmentedReduce * } * }; * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] @@ -131,116 +131,114 @@ struct DeviceSegmentedReduce * // d_out <-- [6, INT_MAX, 0] * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input * items \iterator * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced + * @tparam OutputIteratorT + * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam BeginOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam EndOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * - * @tparam ReductionOp - * **[inferred]** Binary reduction functor type having member + * @tparam ReductionOpT + * **[inferred]** Binary reduction functor type having member * `T operator()(const T &a, const T &b)` * - * @tparam T - * **[inferred]** Data element type that is convertible to the `value` type + * @tparam T + * **[inferred]** Data element type that is convertible to the `value` type * of `InputIteratorT` * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no * work is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of \p d_temp_storage allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output aggregate * - * @param[in] num_segments + * @param[in] num_segments * The number of segments that comprise the sorting data * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * - * @param[in] reduction_op - * Binary reduction functor + * @param[in] reduction_op + * Binary reduction functor * - * @param[in] initial_value + * @param[in] initial_value * Initial value of the reduction for each segment * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template - CUB_RUNTIME_FUNCTION static cudaError_t - Reduce(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - ReductionOp reduction_op, - T initial_value, - cudaStream_t stream = 0) + CUB_RUNTIME_FUNCTION static cudaError_t Reduce(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + ReductionOpT reduction_op, + T initial_value, + cudaStream_t stream = 0) { - // Signed integer type for global offsets - using OffsetT = int; + // Integer type for global offsets + using OffsetT = detail::common_iterator_value_t; return DispatchSegmentedReduce::Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - reduction_op, - initial_value, - stream); + ReductionOpT>::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + reduction_op, + initial_value, + stream); } template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION static cudaError_t + CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Reduce(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, @@ -248,7 +246,7 @@ struct DeviceSegmentedReduce int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, - ReductionOp reduction_op, + ReductionOpT reduction_op, T initial_value, cudaStream_t stream, bool debug_synchronous) @@ -259,7 +257,7 @@ struct DeviceSegmentedReduce OutputIteratorT, BeginOffsetIteratorT, EndOffsetIteratorT, - ReductionOp, + ReductionOpT, T>(d_temp_storage, temp_storage_bytes, d_in, @@ -273,7 +271,7 @@ struct DeviceSegmentedReduce } /** - * @brief Computes a device-wide segmented sum using the addition (`+`) + * @brief Computes a device-wide segmented sum using the addition (`+`) * operator. * * @par @@ -283,22 +281,22 @@ struct DeviceSegmentedReduce * for both the `d_begin_offsets` and `d_end_offsets` parameters (where * the latter is specified as `segment_offsets + 1`). * - Does not support `+` operators that are non-commutative. - * - Let `s` be in `[0, num_segments)`. The range - * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not + * - Let `s` be in `[0, num_segments)`. The range + * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)`. * - @devicestorage * * @par Snippet - * The code snippet below illustrates the sum reduction of a device vector of + * The code snippet below illustrates the sum reduction of a device vector of * `int` data elements. * @par * @code - * #include + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] @@ -324,102 +322,98 @@ struct DeviceSegmentedReduce * // d_out <-- [21, 0, 17] * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input * items \iterator * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced aggregate + * @tparam OutputIteratorT + * **[inferred]** Output iterator type for recording the reduced aggregate * \iterator * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam BeginOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam EndOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output aggregate * - * @param[in] num_segments + * @param[in] num_segments * The number of segments that comprise the sorting data * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * * @param[in] stream - * **[optional] CUDA stream to launch kernels within. + * **[optional] CUDA stream to launch kernels within. * Default is stream0. */ template - CUB_RUNTIME_FUNCTION static cudaError_t - Sum(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream = 0) + CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + cudaStream_t stream = 0) { - // Signed integer type for global offsets - using OffsetT = int; + // Integer type for global offsets + using OffsetT = detail::common_iterator_value_t; // The output value type using OutputT = - cub::detail::non_void_value_t>; + cub::detail::non_void_value_t>; - return DispatchSegmentedReduce< - InputIteratorT, - OutputIteratorT, - BeginOffsetIteratorT, - EndOffsetIteratorT, - OffsetT, - cub::Sum>::Dispatch(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::Sum(), - OutputT(), // zero-initialize - stream); + return DispatchSegmentedReduce::Dispatch(d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Sum(), + OutputT(), // zero-initialize + stream); } template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION static cudaError_t + CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Sum(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, @@ -432,47 +426,45 @@ struct DeviceSegmentedReduce { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - return Sum(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); + return Sum( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + stream); } /** - * @brief Computes a device-wide segmented minimum using the less-than + * @brief Computes a device-wide segmented minimum using the less-than * (`<`) operator. * * @par - * - Uses `std::numeric_limits::max()` as the initial value of the + * - Uses `std::numeric_limits::max()` as the initial value of the * reduction for each segment. * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased for both - * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is + * `segment_offsets` (of length `num_segments + 1`) can be aliased for both + * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter is * specified as `segment_offsets + 1`). * - Does not support `<` operators that are non-commutative. - * - Let `s` be in `[0, num_segments)`. The range - * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not + * - Let `s` be in `[0, num_segments)`. The range + * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)`. * - @devicestorage * * @par Snippet - * The code snippet below illustrates the min-reduction of a device vector of + * The code snippet below illustrates the min-reduction of a device vector of * `int` data elements. * @par * @code - * #include + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] @@ -498,72 +490,71 @@ struct DeviceSegmentedReduce * // d_out <-- [6, INT_MAX, 0] * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input * items \iterator * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced + * @tparam OutputIteratorT + * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam BeginOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam EndOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output aggregate * - * @param[in] num_segments + * @param[in] num_segments * The number of segments that comprise the sorting data * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template - CUB_RUNTIME_FUNCTION static cudaError_t - Min(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream = 0) + CUB_RUNTIME_FUNCTION static cudaError_t Min(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + cudaStream_t stream = 0) { - // Signed integer type for global offsets - using OffsetT = int; + // Integer type for global offsets + using OffsetT = detail::common_iterator_value_t; // The input value type using InputT = cub::detail::value_t; @@ -593,8 +584,7 @@ struct DeviceSegmentedReduce typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT> - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION static cudaError_t + CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Min(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, @@ -607,53 +597,51 @@ struct DeviceSegmentedReduce { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - return Min(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); + return Min( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + stream); } /** - * @brief Finds the first device-wide minimum in each segment using the - * less-than ('<') operator, also returning the in-segment index of + * @brief Finds the first device-wide minimum in each segment using the + * less-than ('<') operator, also returning the in-segment index of * that item. * * @par - * - The output value type of `d_out` is cub::KeyValuePair `` + * - The output value type of `d_out` is cub::KeyValuePair `` * (assuming the value type of `d_in` is `T`) - * - The minimum of the *i*th segment is written to - * `d_out[i].value` and its offset in that segment is written to + * - The minimum of the *i*th segment is written to + * `d_out[i].value` and its offset in that segment is written to * `d_out[i].key`. - * - The `{1, std::numeric_limits::max()}` tuple is produced for + * - The `{1, std::numeric_limits::max()}` tuple is produced for * zero-length inputs * - When input a contiguous sequence of segments, a single sequence - * `segment_offsets` (of length `num_segments + 1`) can be aliased for both - * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter + * `segment_offsets` (of length `num_segments + 1`) can be aliased for both + * the `d_begin_offsets` and `d_end_offsets` parameters (where the latter * is specified as `segment_offsets + 1`). * - Does not support `<` operators that are non-commutative. - * - Let `s` be in `[0, num_segments)`. The range - * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not + * - Let `s` be in `[0, num_segments)`. The range + * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)`. * - @devicestorage * * @par Snippet - * The code snippet below illustrates the argmin-reduction of a device vector + * The code snippet below illustrates the argmin-reduction of a device vector * of `int` data elements. * @par * @code - * #include + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] @@ -679,80 +667,80 @@ struct DeviceSegmentedReduce * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input items + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input items * (of some type `T`) \iterator * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced aggregate + * @tparam OutputIteratorT + * **[inferred]** Output iterator type for recording the reduced aggregate * (having value type `KeyValuePair`) \iterator * * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam EndOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output aggregate * - * @param[in] num_segments + * @param[in] num_segments * The number of segments that comprise the sorting data * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the * *i*th is considered empty. * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template - CUB_RUNTIME_FUNCTION static cudaError_t - ArgMin(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream = 0) + CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + cudaStream_t stream = 0) { - // Signed integer type for global offsets - using OffsetT = int; + // Integer type for global offsets + // Using common iterator value type is a breaking change, see: + // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615 + using OffsetT = int; // detail::common_iterator_value_t; // The input type using InputValueT = cub::detail::value_t; // The output tuple type using OutputTupleT = - cub::detail::non_void_value_t>; + cub::detail::non_void_value_t>; // The output value type using OutputValueT = typename OutputTupleT::Value; @@ -762,14 +750,13 @@ struct DeviceSegmentedReduce using InitT = detail::reduce::empty_problem_init_t; // Wrapped input iterator to produce index-value tuples - using ArgIndexInputIteratorT = - ArgIndexInputIterator; + using ArgIndexInputIteratorT = ArgIndexInputIterator; ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value // TODO Address https://github.com/NVIDIA/cub/issues/651 - InitT initial_value{AccumT(1, Traits::Max())}; + InitT initial_value{AccumT(1, Traits::Max())}; return DispatchSegmentedReduce - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION static cudaError_t + CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMin(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, @@ -808,47 +794,45 @@ struct DeviceSegmentedReduce { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - return ArgMin(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); + return ArgMin( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + stream); } /** - * @brief Computes a device-wide segmented maximum using the greater-than + * @brief Computes a device-wide segmented maximum using the greater-than * (`>`) operator. * * @par - * - Uses `std::numeric_limits::lowest()` as the initial value of the + * - Uses `std::numeric_limits::lowest()` as the initial value of the * reduction. * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased * for both the `d_begin_offsets` and `d_end_offsets` parameters (where * the latter is specified as `segment_offsets + 1`). * - Does not support `>` operators that are non-commutative. - * - Let `s` be in `[0, num_segments)`. The range - * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not + * - Let `s` be in `[0, num_segments)`. The range + * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)`. * - @devicestorage * * @par Snippet - * The code snippet below illustrates the max-reduction of a device vector + * The code snippet below illustrates the max-reduction of a device vector * of `int` data elements. * @par * @code - * #include + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] @@ -874,72 +858,71 @@ struct DeviceSegmentedReduce * // d_out <-- [8, INT_MIN, 9] * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input * items \iterator * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced + * @tparam OutputIteratorT + * **[inferred]** Output iterator type for recording the reduced * aggregate \iterator * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam BeginOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam EndOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output aggregate * - * @param[in] num_segments + * @param[in] num_segments * The number of segments that comprise the sorting data * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template - CUB_RUNTIME_FUNCTION static cudaError_t - Max(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream = 0) + CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + cudaStream_t stream = 0) { - // Signed integer type for global offsets - using OffsetT = int; + // Integer type for global offsets + using OffsetT = detail::common_iterator_value_t; // The input value type using InputT = cub::detail::value_t; @@ -969,8 +952,7 @@ struct DeviceSegmentedReduce typename OutputIteratorT, typename BeginOffsetIteratorT, typename EndOffsetIteratorT> - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION static cudaError_t + CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t Max(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, @@ -983,53 +965,51 @@ struct DeviceSegmentedReduce { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - return Max(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); + return Max( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + stream); } /** - * @brief Finds the first device-wide maximum in each segment using the - * greater-than ('>') operator, also returning the in-segment index of + * @brief Finds the first device-wide maximum in each segment using the + * greater-than ('>') operator, also returning the in-segment index of * that item * * @par - * - The output value type of `d_out` is `cub::KeyValuePair` + * - The output value type of `d_out` is `cub::KeyValuePair` * (assuming the value type of `d_in` is `T`) - * - The maximum of the *i*th segment is written to - * `d_out[i].value` and its offset in that segment is written to + * - The maximum of the *i*th segment is written to + * `d_out[i].value` and its offset in that segment is written to * `d_out[i].key`. - * - The `{1, std::numeric_limits::lowest()}` tuple is produced for + * - The `{1, std::numeric_limits::lowest()}` tuple is produced for * zero-length inputs * - When input a contiguous sequence of segments, a single sequence * `segment_offsets` (of length `num_segments + 1`) can be aliased * for both the `d_begin_offsets` and `d_end_offsets` parameters (where * the latter is specified as `segment_offsets + 1`). * - Does not support `>` operators that are non-commutative. - * - Let `s` be in `[0, num_segments)`. The range - * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not + * - Let `s` be in `[0, num_segments)`. The range + * `[d_out + d_begin_offsets[s], d_out + d_end_offsets[s])` shall not * overlap `[d_in + d_begin_offsets[s], d_in + d_end_offsets[s])`, * `[d_begin_offsets, d_begin_offsets + num_segments)` nor * `[d_end_offsets, d_end_offsets + num_segments)`. * - @devicestorage * * @par Snippet - * The code snippet below illustrates the argmax-reduction of a device vector + * The code snippet below illustrates the argmax-reduction of a device vector * of `int` data elements. * @par * @code - * #include + * #include * // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers + * // Declare, allocate, and initialize device-accessible pointers * // for input and output * int num_segments; // e.g., 3 * int *d_offsets; // e.g., [0, 3, 3, 7] @@ -1055,80 +1035,80 @@ struct DeviceSegmentedReduce * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}] * @endcode * - * @tparam InputIteratorT - * **[inferred]** Random-access input iterator type for reading input items + * @tparam InputIteratorT + * **[inferred]** Random-access input iterator type for reading input items * (of some type `T`) \iterator * - * @tparam OutputIteratorT - * **[inferred]** Output iterator type for recording the reduced aggregate + * @tparam OutputIteratorT + * **[inferred]** Output iterator type for recording the reduced aggregate * (having value type `KeyValuePair`) \iterator * - * @tparam BeginOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam BeginOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * beginning offsets \iterator * - * @tparam EndOffsetIteratorT - * **[inferred]** Random-access input iterator type for reading segment + * @tparam EndOffsetIteratorT + * **[inferred]** Random-access input iterator type for reading segment * ending offsets \iterator * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output aggregate * - * @param[in] num_segments + * @param[in] num_segments * The number of segments that comprise the sorting data * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ template - CUB_RUNTIME_FUNCTION static cudaError_t - ArgMax(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - int num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - cudaStream_t stream = 0) + CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + cudaStream_t stream = 0) { - // Signed integer type for global offsets - using OffsetT = int; + // Integer type for global offsets + // Using common iterator value type is a breaking change, see: + // https://github.com/NVIDIA/cccl/pull/414#discussion_r1330632615 + using OffsetT = int; // detail::common_iterator_value_t; // The input type using InputValueT = cub::detail::value_t; // The output tuple type using OutputTupleT = - cub::detail::non_void_value_t>; + cub::detail::non_void_value_t>; using AccumT = OutputTupleT; @@ -1138,14 +1118,13 @@ struct DeviceSegmentedReduce using OutputValueT = typename OutputTupleT::Value; // Wrapped input iterator to produce index-value tuples - using ArgIndexInputIteratorT = - ArgIndexInputIterator; + using ArgIndexInputIteratorT = ArgIndexInputIterator; ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value // TODO Address https://github.com/NVIDIA/cub/issues/651 - InitT initial_value{AccumT(1, Traits::Lowest())}; + InitT initial_value{AccumT(1, Traits::Lowest())}; return DispatchSegmentedReduce::Dispatch(d_temp_storage, - temp_storage_bytes, - d_indexed_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::ArgMax(), - initial_value, - stream); + temp_storage_bytes, + d_indexed_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::ArgMax(), + initial_value, + stream); } template - CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION static cudaError_t + CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED CUB_RUNTIME_FUNCTION static cudaError_t ArgMax(void *d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, @@ -1184,20 +1162,16 @@ struct DeviceSegmentedReduce { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG - return ArgMax(d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - stream); + return ArgMax( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + stream); } }; CUB_NAMESPACE_END - - diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 2dbcdc76fdd..39a777c8e3d 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -13,9 +13,9 @@ * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; @@ -27,16 +27,13 @@ ******************************************************************************/ /** - * @file cub::DeviceReduce provides device-wide, parallel operations for - * computing a reduction across a sequence of data items residing within + * @file cub::DeviceReduce provides device-wide, parallel operations for + * computing a reduction across a sequence of data items residing within * device-accessible memory. */ #pragma once -#include -#include - #include #include #include @@ -50,6 +47,10 @@ #include +#include + +#include + CUB_NAMESPACE_BEGIN namespace detail @@ -110,42 +111,42 @@ __host__ __device__ void finalize_and_store_aggregate(OutputIteratorT d_out, *****************************************************************************/ /** - * @brief Reduce region kernel entry point (multi-block). Computes privatized + * @brief Reduce region kernel entry point (multi-block). Computes privatized * reductions, one per thread block. * - * @tparam ChainedPolicyT + * @tparam ChainedPolicyT * Chained tuning policy * * @tparam InputIteratorT * Random-access input iterator type for reading input items \iterator - * + * * @tparam OffsetT * Signed integer type for global offsets - * + * * @tparam ReductionOpT - * Binary reduction functor type having member + * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` - * + * * @tparam InitT * Initial value type - * + * * @tparam AccumT * Accumulator type - * - * @param[in] d_in + * + * @param[in] d_in * Pointer to the input sequence of data items - * - * @param[out] d_out + * + * @param[out] d_out * Pointer to the output aggregate - * - * @param[in] num_items + * + * @param[in] num_items * Total number of input data items - * - * @param[in] even_share - * Even-share descriptor for mapping an equal number of tiles onto each + * + * @param[in] even_share + * Even-share descriptor for mapping an equal number of tiles onto each * thread block - * - * @param[in] reduction_op + * + * @param[in] reduction_op * Binary reduction functor */ template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) - CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceKernel(InputIteratorT d_in, - AccumT *d_out, - OffsetT num_items, - GridEvenShare even_share, - ReductionOpT reduction_op) +CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) + void DeviceReduceKernel(InputIteratorT d_in, + AccumT *d_out, + OffsetT num_items, + GridEvenShare even_share, + ReductionOpT reduction_op) { // Thread block type for reducing input tiles - using AgentReduceT = - AgentReduce; + using AgentReduceT = AgentReduce; // Shared memory storage __shared__ typename AgentReduceT::TempStorage temp_storage; // Consume input tiles - AccumT block_aggregate = - AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share); + AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share); // Output result if (threadIdx.x == 0) @@ -184,8 +183,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) } /** - * @brief Reduce a single tile kernel entry point (single-block). Can be used - * to aggregate privatized thread block reductions from a previous + * @brief Reduce a single tile kernel entry point (single-block). Can be used + * to aggregate privatized thread block reductions from a previous * multi-block reduction pass. * * @tparam ChainedPolicyT @@ -196,33 +195,33 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) * * @tparam OutputIteratorT * Output iterator type for recording the reduced aggregate \iterator - * + * * @tparam OffsetT * Signed integer type for global offsets - * + * * @tparam ReductionOpT - * Binary reduction functor type having member + * Binary reduction functor type having member * `T operator()(const T &a, const U &b)` - * + * * @tparam InitT * Initial value type * * @tparam AccumT - * Accumulator type + * Accumulator type * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output aggregate * - * @param[in] num_items + * @param[in] num_items * Total number of input data items * - * @param[in] reduction_op + * @param[in] reduction_op * Binary reduction functor * - * @param[in] init + * @param[in] init * The initial value of the reduction */ template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) // - CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceSingleTileKernel(InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_items, - ReductionOpT reduction_op, - InitT init) +CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) + void DeviceReduceSingleTileKernel(InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + InitT init) { // Thread block type for reducing input tiles - using AgentReduceT = - AgentReduce; + using AgentReduceT = AgentReduce; // Shared memory storage __shared__ typename AgentReduceT::TempStorage temp_storage; @@ -263,8 +261,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THRE } // Consume input tiles - AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op) - .ConsumeRange(OffsetT(0), num_items); + AccumT block_aggregate = + AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(OffsetT(0), num_items); // Output result if (threadIdx.x == 0) @@ -281,14 +279,11 @@ __device__ __forceinline__ void NormalizeReductionOutput(T & /*val*/, {} /// Normalize input iterator to segment offset (specialized for arg-index) -template -__device__ __forceinline__ void NormalizeReductionOutput( - KeyValuePairT &val, - OffsetT base_offset, - ArgIndexInputIterator /*itr*/) +template +__device__ __forceinline__ void +NormalizeReductionOutput(KeyValuePairT &val, + OffsetT base_offset, + ArgIndexInputIterator /*itr*/) { val.key -= base_offset; } @@ -297,7 +292,7 @@ __device__ __forceinline__ void NormalizeReductionOutput( * Segmented reduction (one block per segment) * @tparam ChainedPolicyT * Chained tuning policy - * + * * @tparam InputIteratorT * Random-access input iterator type for reading input items \iterator * @@ -305,48 +300,48 @@ __device__ __forceinline__ void NormalizeReductionOutput( * Output iterator type for recording the reduced aggregate \iterator * * @tparam BeginOffsetIteratorT - * Random-access input iterator type for reading segment beginning offsets + * Random-access input iterator type for reading segment beginning offsets * \iterator * * @tparam EndOffsetIteratorT - * Random-access input iterator type for reading segment ending offsets + * Random-access input iterator type for reading segment ending offsets * \iterator * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ReductionOpT - * Binary reduction functor type having member + * Binary reduction functor type having member * `T operator()(const T &a, const U &b)` * * @tparam InitT * Initial value type * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output aggregate * - * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first element + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first element * of the *i*th data segment in `d_keys_*` and `d_values_*` * - * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * - * @param[in] num_segments + * @param[in] num_segments * The number of segments that comprise the sorting data * - * @param[in] reduction_op + * @param[in] reduction_op * Binary reduction functor * - * @param[in] init + * @param[in] init * The initial value of the reduction */ template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) - CUB_DETAIL_KERNEL_ATTRIBUTES +CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) void DeviceSegmentedReduceKernel(InputIteratorT d_in, OutputIteratorT d_out, BeginOffsetIteratorT d_begin_offsets, @@ -369,13 +363,12 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) InitT init) { // Thread block type for reducing input tiles - using AgentReduceT = - AgentReduce; + using AgentReduceT = AgentReduce; // Shared memory storage __shared__ typename AgentReduceT::TempStorage temp_storage; @@ -394,8 +387,8 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) } // Consume input tiles - AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op) - .ConsumeRange(segment_begin, segment_end); + AccumT block_aggregate = + AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(segment_begin, segment_end); // Normalize as needed NormalizeReductionOutput(block_aggregate, segment_begin, d_in); @@ -421,13 +414,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) * Signed integer type for global offsets * * ReductionOpT - * Binary reduction functor type having member + * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` */ -template < - typename AccumT, - typename OffsetT, - typename ReductionOpT> +template struct DeviceReducePolicy { //--------------------------------------------------------------------------- @@ -504,14 +494,12 @@ struct DeviceReducePolicy using MaxPolicy = Policy600; }; - - /****************************************************************************** * Single-problem dispatch *****************************************************************************/ /** - * @brief Utility class for dispatching the appropriately-tuned kernels for + * @brief Utility class for dispatching the appropriately-tuned kernels for * device-wide reduction * * @tparam InputIteratorT @@ -524,27 +512,21 @@ struct DeviceReducePolicy * Signed integer type for global offsets * * @tparam ReductionOpT - * Binary reduction functor type having member + * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` * - * @tparam InitT + * @tparam InitT * Initial value type */ -template < - typename InputIteratorT, - typename OutputIteratorT, - typename OffsetT, - typename ReductionOpT, - typename InitT = - cub::detail::non_void_value_t< - OutputIteratorT, - cub::detail::value_t>, - typename AccumT = - detail::accumulator_t< - ReductionOpT, - InitT, - cub::detail::value_t>, - typename SelectedPolicy = DeviceReducePolicy> +template >, + typename AccumT = + detail::accumulator_t>, + typename SelectedPolicy = DeviceReducePolicy> struct DispatchReduce : SelectedPolicy { //--------------------------------------------------------------------------- @@ -602,29 +584,28 @@ struct DispatchReduce : SelectedPolicy , init(init) , stream(stream) , ptx_version(ptx_version) - {} + {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchReduce(void* d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_items, - ReductionOpT reduction_op, - InitT init, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : d_temp_storage(d_temp_storage) - , temp_storage_bytes(temp_storage_bytes) - , d_in(d_in) - , d_out(d_out) - , num_items(num_items) - , reduction_op(reduction_op) - , init(init) - , stream(stream) - , ptx_version(ptx_version) + CUB_RUNTIME_FUNCTION __forceinline__ DispatchReduce(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + InitT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : d_temp_storage(d_temp_storage) + , temp_storage_bytes(temp_storage_bytes) + , d_in(d_in) + , d_out(d_out) + , num_items(num_items) + , reduction_op(reduction_op) + , init(init) + , stream(stream) + , ptx_version(ptx_version) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG } @@ -643,7 +624,7 @@ struct DispatchReduce : SelectedPolicy * Function type of cub::DeviceReduceSingleTileKernel * * @param[in] single_tile_kernel - * Kernel function pointer to parameterization of + * Kernel function pointer to parameterization of * cub::DeviceReduceSingleTileKernel */ template @@ -661,18 +642,21 @@ struct DispatchReduce : SelectedPolicy break; } - // Log single_reduce_sweep_kernel configuration - #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +// Log single_reduce_sweep_kernel configuration +#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " "%d items per thread\n", ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long)stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); - #endif +#endif // Invoke single_reduce_sweep_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( - 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream) + 1, + ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, + 0, + stream) .doit(single_tile_kernel, d_in, d_out, num_items, reduction_op, init); // Check for failure to launch @@ -711,16 +695,13 @@ struct DispatchReduce : SelectedPolicy * @param[in] reduce_kernel * Kernel function pointer to parameterization of cub::DeviceReduceKernel * - * @param[in] single_tile_kernel - * Kernel function pointer to parameterization of + * @param[in] single_tile_kernel + * Kernel function pointer to parameterization of * cub::DeviceReduceSingleTileKernel */ - template + template CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t - InvokePasses(ReduceKernelT reduce_kernel, - SingleTileKernelT single_tile_kernel) + InvokePasses(ReduceKernelT reduce_kernel, SingleTileKernelT single_tile_kernel) { cudaError error = cudaSuccess; do @@ -786,8 +767,8 @@ struct DispatchReduce : SelectedPolicy // Get grid size for device_reduce_sweep_kernel int reduce_grid_size = even_share.grid_size; - // Log device_reduce_sweep_kernel configuration - #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +// Log device_reduce_sweep_kernel configuration +#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items " "per thread, %d SM occupancy\n", reduce_grid_size, @@ -795,7 +776,7 @@ struct DispatchReduce : SelectedPolicy (long long)stream, ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD, reduce_config.sm_occupancy); - #endif +#endif // Invoke DeviceReduceKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -803,12 +784,7 @@ struct DispatchReduce : SelectedPolicy ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream) - .doit(reduce_kernel, - d_in, - d_block_reductions, - num_items, - even_share, - reduction_op); + .doit(reduce_kernel, d_in, d_block_reductions, num_items, even_share, reduction_op); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); @@ -824,14 +800,14 @@ struct DispatchReduce : SelectedPolicy break; } - // Log single_reduce_sweep_kernel configuration - #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +// Log single_reduce_sweep_kernel configuration +#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), " "%d items per thread\n", ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long)stream, ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); - #endif +#endif // Invoke DeviceReduceSingleTileKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -876,35 +852,33 @@ struct DispatchReduce : SelectedPolicy typedef typename DispatchReduce::MaxPolicy MaxPolicyT; // Force kernel code-generation in all compiler passes - if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * - SingleTilePolicyT::ITEMS_PER_THREAD)) + if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) { // Small, single tile size - return InvokeSingleTile( - DeviceReduceSingleTileKernel); + return InvokeSingleTile(DeviceReduceSingleTileKernel); } else { // Regular size - return InvokePasses( - DeviceReduceKernel, - DeviceReduceSingleTileKernel); + return InvokePasses(DeviceReduceKernel, + DeviceReduceSingleTileKernel); } } @@ -915,42 +889,41 @@ struct DispatchReduce : SelectedPolicy /** * @brief Internal dispatch routine for computing a device-wide reduction * - * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * - * @param[in,out] temp_storage_bytes + * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * - * @param[in] d_in + * @param[in] d_in * Pointer to the input sequence of data items * - * @param[out] d_out + * @param[out] d_out * Pointer to the output aggregate * - * @param[in] num_items + * @param[in] num_items * Total number of input items (i.e., length of `d_in`) * - * @param[in] reduction_op + * @param[in] reduction_op * Binary reduction functor * - * @param[in] init + * @param[in] init * The initial value of the reduction * - * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * @param[in] stream + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ - CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t - Dispatch(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_items, - ReductionOpT reduction_op, - InitT init, - cudaStream_t stream) + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + InitT init, + cudaStream_t stream) { typedef typename DispatchReduce::MaxPolicy MaxPolicyT; @@ -959,7 +932,7 @@ struct DispatchReduce : SelectedPolicy { // Get PTX version int ptx_version = 0; - error = CubDebug(PtxVersion(ptx_version)); + error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; @@ -988,16 +961,15 @@ struct DispatchReduce : SelectedPolicy } CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t - Dispatch(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_items, - ReductionOpT reduction_op, - InitT init, - cudaStream_t stream, - bool debug_synchronous) + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + InitT init, + cudaStream_t stream, + bool debug_synchronous) { CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG @@ -1012,14 +984,12 @@ struct DispatchReduce : SelectedPolicy } }; - - /****************************************************************************** * Segmented dispatch *****************************************************************************/ /** - * @brief Utility class for dispatching the appropriately-tuned kernels for + * @brief Utility class for dispatching the appropriately-tuned kernels for * device-wide reduction * * @tparam InputIteratorT @@ -1029,40 +999,34 @@ struct DispatchReduce : SelectedPolicy * Output iterator type for recording the reduced aggregate \iterator * * @tparam BeginOffsetIteratorT - * Random-access input iterator type for reading segment beginning offsets + * Random-access input iterator type for reading segment beginning offsets * \iterator * * @tparam EndOffsetIteratorT - * Random-access input iterator type for reading segment ending offsets + * Random-access input iterator type for reading segment ending offsets * \iterator * * @tparam OffsetT * Signed integer type for global offsets * * @tparam ReductionOpT - * Binary reduction functor type having member + * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` * - * @tparam InitT + * @tparam InitT * value type */ -template < - typename InputIteratorT, - typename OutputIteratorT, - typename BeginOffsetIteratorT, - typename EndOffsetIteratorT, - typename OffsetT, - typename ReductionOpT, - typename InitT = - cub::detail::non_void_value_t< - OutputIteratorT, - cub::detail::value_t>, - typename AccumT = - detail::accumulator_t< - ReductionOpT, - InitT, - cub::detail::value_t>, - typename SelectedPolicy = DeviceReducePolicy> +template >, + typename AccumT = + detail::accumulator_t>, + typename SelectedPolicy = DeviceReducePolicy> struct DispatchSegmentedReduce : SelectedPolicy { //--------------------------------------------------------------------------- @@ -1084,7 +1048,7 @@ struct DispatchSegmentedReduce : SelectedPolicy OutputIteratorT d_out; /// The number of segments that comprise the sorting data - OffsetT num_segments; + int num_segments; /// Random-access input iterator to the sequence of beginning offsets of /// length `num_segments`, such that `d_begin_offsets[i]` is the first @@ -1115,18 +1079,17 @@ struct DispatchSegmentedReduce : SelectedPolicy //--------------------------------------------------------------------------- /// Constructor - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchSegmentedReduce(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - ReductionOpT reduction_op, - InitT init, - cudaStream_t stream, - int ptx_version) + CUB_RUNTIME_FUNCTION __forceinline__ DispatchSegmentedReduce(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + ReductionOpT reduction_op, + InitT init, + cudaStream_t stream, + int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) @@ -1141,19 +1104,18 @@ struct DispatchSegmentedReduce : SelectedPolicy {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchSegmentedReduce(void *d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - ReductionOpT reduction_op, - InitT init, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) + CUB_RUNTIME_FUNCTION __forceinline__ DispatchSegmentedReduce(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + ReductionOpT reduction_op, + InitT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) : d_temp_storage(d_temp_storage) , temp_storage_bytes(temp_storage_bytes) , d_in(d_in) @@ -1183,7 +1145,7 @@ struct DispatchSegmentedReduce : SelectedPolicy * Function type of cub::DeviceSegmentedReduceKernel * * @param[in] segmented_reduce_kernel - * Kernel function pointer to parameterization of + * Kernel function pointer to parameterization of * cub::DeviceSegmentedReduceKernel */ template @@ -1211,8 +1173,8 @@ struct DispatchSegmentedReduce : SelectedPolicy break; } - // Log device_reduce_sweep_kernel configuration - #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG +// Log device_reduce_sweep_kernel configuration +#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), " "%d items per thread, %d SM occupancy\n", num_segments, @@ -1220,7 +1182,7 @@ struct DispatchSegmentedReduce : SelectedPolicy (long long)stream, ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, segmented_reduce_config.sm_occupancy); - #endif +#endif // Invoke DeviceReduceKernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -1262,16 +1224,15 @@ struct DispatchSegmentedReduce : SelectedPolicy typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; // Force kernel code-generation in all compiler passes - return InvokePasses( - DeviceSegmentedReduceKernel); + return InvokePasses(DeviceSegmentedReduceKernel); } //--------------------------------------------------------------------------- @@ -1282,8 +1243,8 @@ struct DispatchSegmentedReduce : SelectedPolicy * @brief Internal dispatch routine for computing a device-wide reduction * * @param[in] d_temp_storage - * Device-accessible allocation of temporary storage. When `nullptr`, the - * required allocation size is written to `temp_storage_bytes` and no work + * Device-accessible allocation of temporary storage. When `nullptr`, the + * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes @@ -1299,16 +1260,16 @@ struct DispatchSegmentedReduce : SelectedPolicy * The number of segments that comprise the sorting data * * @param[in] d_begin_offsets - * Random-access input iterator to the sequence of beginning offsets of - * length `num_segments`, such that `d_begin_offsets[i]` is the first - * element of the *i*th data segment in `d_keys_*` and + * Random-access input iterator to the sequence of beginning offsets of + * length `num_segments`, such that `d_begin_offsets[i]` is the first + * element of the *i*th data segment in `d_keys_*` and * `d_values_*` * * @param[in] d_end_offsets - * Random-access input iterator to the sequence of ending offsets of length - * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of - * the *i*th data segment in `d_keys_*` and `d_values_*`. - * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is + * Random-access input iterator to the sequence of ending offsets of length + * `num_segments`, such that `d_end_offsets[i] - 1` is the last element of + * the *i*th data segment in `d_keys_*` and `d_values_*`. + * If `d_end_offsets[i] - 1 <= d_begin_offsets[i]`, the *i*th is * considered empty. * * @param[in] reduction_op @@ -1318,7 +1279,7 @@ struct DispatchSegmentedReduce : SelectedPolicy * The initial value of the reduction * * @param[in] stream - * **[optional]** CUDA stream to launch kernels within. + * **[optional]** CUDA stream to launch kernels within. * Default is stream0. */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t @@ -1346,7 +1307,7 @@ struct DispatchSegmentedReduce : SelectedPolicy { // Get PTX version int ptx_version = 0; - error = CubDebug(PtxVersion(ptx_version)); + error = CubDebug(PtxVersion(ptx_version)); if (cudaSuccess != error) { break; @@ -1405,6 +1366,4 @@ struct DispatchSegmentedReduce : SelectedPolicy } }; - CUB_NAMESPACE_END - diff --git a/cub/test/c2h/generators.cu b/cub/test/c2h/generators.cu index d86e5c73a03..02f46f578f8 100644 --- a/cub/test/c2h/generators.cu +++ b/cub/test/c2h/generators.cu @@ -452,10 +452,19 @@ gen_uniform_offsets(seed_t seed, T total_elements, T min_segment_size, T max_seg return segment_offsets; } +template thrust::device_vector gen_uniform_offsets(seed_t seed, + int32_t total_elements, + int32_t min_segment_size, + int32_t max_segment_size); + template thrust::device_vector gen_uniform_offsets(seed_t seed, uint32_t total_elements, uint32_t min_segment_size, uint32_t max_segment_size); +template thrust::device_vector gen_uniform_offsets(seed_t seed, + int64_t total_elements, + int64_t min_segment_size, + int64_t max_segment_size); template thrust::device_vector gen_uniform_offsets(seed_t seed, uint64_t total_elements, uint64_t min_segment_size, diff --git a/cub/test/catch2_test_device_segmented_reduce.cu b/cub/test/catch2_test_device_segmented_reduce.cu index 98e6f5a2d01..7ced7fd29d6 100644 --- a/cub/test/catch2_test_device_segmented_reduce.cu +++ b/cub/test/catch2_test_device_segmented_reduce.cu @@ -78,14 +78,17 @@ type_pair // clang-format on #endif +using offsets = c2h::type_list; + CUB_TEST("Device reduce works with all device interfaces", "[segmented][reduce][device]", - full_type_list) + full_type_list, + offsets) { - using params = params_t; - using item_t = typename params::item_t; - using output_t = typename params::output_t; - using offset_t = uint32_t; + using type_pair_t = typename c2h::get<0, TestType>; + using input_t = typename type_pair_t::input_t; + using output_t = typename type_pair_t::output_t; + using offset_t = typename c2h::get<1, TestType>; constexpr int min_items = 1; constexpr int max_items = 1000000; @@ -115,7 +118,7 @@ CUB_TEST("Device reduce works with all device interfaces", auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); // Generate input data - thrust::device_vector in_items(num_items); + thrust::device_vector in_items(num_items); c2h::gen(CUB_SEED(2), in_items); auto d_in_it = thrust::raw_pointer_cast(in_items.data()); @@ -127,7 +130,7 @@ CUB_TEST("Device reduce works with all device interfaces", auto reduction_op = unwrap_op(reference_extended_fp(d_in_it), op_t{}); // Prepare verification data - using accum_t = cub::detail::accumulator_t; + using accum_t = cub::detail::accumulator_t; thrust::host_vector expected_result(num_segments); compute_segmented_problem_reference(in_items, segment_offsets, @@ -157,7 +160,7 @@ CUB_TEST("Device reduce works with all device interfaces", SECTION("sum") { using op_t = cub::Sum; - using accum_t = cub::detail::accumulator_t; + using accum_t = cub::detail::accumulator_t; // Prepare verification data thrust::host_vector expected_result(num_segments); @@ -186,7 +189,7 @@ CUB_TEST("Device reduce works with all device interfaces", compute_segmented_problem_reference(in_items, segment_offsets, op_t{}, - cub::NumericTraits::Max(), + cub::NumericTraits::Max(), expected_result.begin()); // Run test @@ -211,7 +214,7 @@ CUB_TEST("Device reduce works with all device interfaces", compute_segmented_problem_reference(in_items, segment_offsets, op_t{}, - cub::NumericTraits::Lowest(), + cub::NumericTraits::Lowest(), expected_result.begin()); // Run test @@ -236,7 +239,6 @@ CUB_TEST("Device reduce works with all device interfaces", compute_segmented_argmax_reference(in_items, segment_offsets, expected_result.begin()); // Run test - using result_t = cub::KeyValuePair; thrust::device_vector out_result(num_segments); device_segmented_arg_max(d_in_it, thrust::raw_pointer_cast(out_result.data()), @@ -253,7 +255,7 @@ CUB_TEST("Device reduce works with all device interfaces", using result_t = cub::KeyValuePair; // Prepare verification data - thrust::host_vector host_items(in_items); + thrust::host_vector host_items(in_items); thrust::host_vector expected_result(num_segments); compute_segmented_argmin_reference(in_items, segment_offsets, expected_result.begin()); diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators.cu b/cub/test/catch2_test_device_segmented_reduce_iterators.cu index 39fae9191a2..8215cf6fe1f 100644 --- a/cub/test/catch2_test_device_segmented_reduce_iterators.cu +++ b/cub/test/catch2_test_device_segmented_reduce_iterators.cu @@ -51,15 +51,17 @@ DECLARE_CDP_WRAPPER(cub::DeviceSegmentedReduce::Sum, device_segmented_sum); // List of types to test using custom_t = c2h::custom_type_t; using iterator_type_list = c2h::type_list, type_pair>; +using offsets = c2h::type_list; CUB_TEST("Device segmented reduce works with fancy input iterators", "[reduce][device]", - iterator_type_list) + iterator_type_list, + offsets) { - using params = params_t; - using item_t = typename params::item_t; - using output_t = typename params::output_t; - using offset_t = uint32_t; + using type_pair_t = typename c2h::get<0, TestType>; + using item_t = typename type_pair_t::input_t; + using output_t = typename type_pair_t::output_t; + using offset_t = typename c2h::get<1, TestType>; constexpr int min_items = 1; constexpr int max_items = 1000000; diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu b/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu new file mode 100644 index 00000000000..f39a0d50112 --- /dev/null +++ b/cub/test/catch2_test_device_segmented_reduce_iterators_64bit.cu @@ -0,0 +1,96 @@ +/****************************************************************************** + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#include + +#include +#include +#include + +#include + +// Has to go after all cub headers. Otherwise, this test won't catch unused +// variables in cub kernels. +#include "catch2/catch.hpp" +#include "catch2_test_cdp_helper.h" +#include "catch2_test_helper.h" + +DECLARE_CDP_WRAPPER(cub::DeviceSegmentedReduce::Reduce, device_segmented_reduce); +DECLARE_CDP_WRAPPER(cub::DeviceSegmentedReduce::Sum, device_segmented_sum); + +// %PARAM% TEST_CDP cdp 0:1 + +// List of types to test +using offsets = c2h::type_list; + +CUB_TEST("Device segmented reduce works with fancy input iterators and 64-bit offsets", + "[reduce][device]", + offsets) +{ + using offset_t = typename c2h::get<0, TestType>; + using op_t = cub::Sum; + + constexpr offset_t offset_zero = 0; + constexpr offset_t offset_one = 1; + constexpr offset_t iterator_value = 2; + constexpr offset_t min_items_per_segment = offset_one << 31; + constexpr offset_t max_items_per_segment = offset_one << 33; + + constexpr int num_segments = 2; + + // generate individual segment lengths and store cumulative sum in segment_offsets + const offset_t num_items_in_first_segment = + GENERATE_COPY(take(2, random(min_items_per_segment, max_items_per_segment))); + const offset_t num_items_in_second_segment = + GENERATE_COPY(take(2, random(min_items_per_segment, max_items_per_segment))); + thrust::device_vector segment_offsets = {offset_zero, + num_items_in_first_segment, + num_items_in_first_segment + + num_items_in_second_segment}; + + // store expected result and initialize device output container + thrust::host_vector expected_result = {iterator_value * num_items_in_first_segment, + iterator_value * num_items_in_second_segment}; + thrust::device_vector device_result(num_segments); + + // prepare device iterators + auto in_it = thrust::make_constant_iterator(iterator_value); + auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); + auto d_out_it = thrust::raw_pointer_cast(device_result.data()); + + // reduce + device_segmented_reduce(in_it, + d_out_it, + num_segments, + d_offsets_it, + d_offsets_it + 1, + op_t{}, + offset_t{}); + + // verify result + REQUIRE(expected_result == device_result); +}