Skip to content

Commit 20c6ba5

Browse files
committed
Merge branch 'main' into inplace_vector
2 parents 222d8ad + 942f59f commit 20c6ba5

39 files changed

+647
-102
lines changed

cub/benchmarks/bench/scan/exclusive/base.cuh

+3-1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727

2828
#include <cub/device/device_scan.cuh>
2929

30+
#include <cuda/std/__functional/invoke.h>
31+
3032
#include <look_back_helper.cuh>
3133

3234
#if !TUNE_BASE
@@ -85,7 +87,7 @@ template <typename T, typename OffsetT>
8587
static void basic(nvbench::state& state, nvbench::type_list<T, OffsetT>)
8688
{
8789
using init_t = cub::detail::InputValue<T>;
88-
using accum_t = cub::detail::accumulator_t<op_t, T, T>;
90+
using accum_t = ::cuda::std::__accumulator_t<op_t, T, T>;
8991
using input_it_t = const T*;
9092
using output_it_t = T*;
9193
using offset_t = OffsetT;

cub/benchmarks/bench/scan/exclusive/by_key.cu

+1-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ static void scan(nvbench::state& state, nvbench::type_list<KeyT, ValueT, OffsetT
7777
{
7878
using init_value_t = ValueT;
7979
using op_t = cub::Sum;
80-
using accum_t = cub::detail::accumulator_t<op_t, init_value_t, ValueT>;
80+
using accum_t = ::cuda::std::__accumulator_t<op_t, ValueT, init_value_t>;
8181
using key_input_it_t = const KeyT*;
8282
using val_input_it_t = const ValueT*;
8383
using val_output_it_t = ValueT*;

cub/cub/detail/type_traits.cuh

-4
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,5 @@ using invoke_result_t =
6262
::cuda::std::invoke_result_t<Invokable, Args...>;
6363
#endif
6464

65-
/// The type of intermediate accumulator (according to P2322R6)
66-
template <typename Invokable, typename InitT, typename InputT>
67-
using accumulator_t = typename ::cuda::std::decay<invoke_result_t<Invokable, InitT, InputT>>::type;
68-
6965
} // namespace detail
7066
CUB_NAMESPACE_END

cub/cub/device/device_run_length_encode.cuh

+3-1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@
3434

3535
#include <cub/config.cuh>
3636

37+
#include <cuda/std/__functional/invoke.h>
38+
3739
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
3840
# pragma GCC system_header
3941
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -200,7 +202,7 @@ struct DeviceRunLengthEncode
200202
// Generator type for providing 1s values for run-length reduction
201203
using lengths_input_iterator_t = ConstantInputIterator<length_t, offset_t>;
202204

203-
using accum_t = detail::accumulator_t<reduction_op, length_t, length_t>;
205+
using accum_t = ::cuda::std::__accumulator_t<reduction_op, length_t, length_t>;
204206

205207
using key_t = cub::detail::non_void_value_t<UniqueOutputIteratorT, cub::detail::value_t<InputIteratorT>>;
206208

cub/cub/device/device_scan.cuh

+3-1
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
#include <cub/thread/thread_operators.cuh>
5050
#include <cub/util_deprecated.cuh>
5151

52+
#include <cuda/std/__functional/invoke.h>
53+
5254
CUB_NAMESPACE_BEGIN
5355

5456
//! @rst
@@ -1303,7 +1305,7 @@ struct DeviceScan
13031305

13041306
// Unsigned integer type for global offsets
13051307
using OffsetT = detail::choose_offset_t<NumItemsT>;
1306-
using AccumT = cub::detail::accumulator_t<ScanOpT, InitValueT, cub::detail::value_t<InputIteratorT>>;
1308+
using AccumT = ::cuda::std::__accumulator_t<ScanOpT, cub::detail::value_t<InputIteratorT>, InitValueT>;
13071309
constexpr bool ForceInclusive = true;
13081310

13091311
return DispatchScan<

cub/cub/device/dispatch/dispatch_reduce.cuh

+12-13
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ template <typename InputIteratorT,
316316
typename OffsetT,
317317
typename ReductionOpT,
318318
typename InitT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>,
319-
typename AccumT = detail::accumulator_t<ReductionOpT, InitT, cub::detail::value_t<InputIteratorT>>,
319+
typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::value_t<InputIteratorT>, InitT>,
320320
typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>,
321321
typename TransformOpT = ::cuda::std::__identity>
322322
struct DispatchReduce : SelectedPolicy
@@ -797,17 +797,16 @@ struct DispatchReduce : SelectedPolicy
797797
* @tparam InitT
798798
* Initial value type
799799
*/
800-
template <typename InputIteratorT,
801-
typename OutputIteratorT,
802-
typename OffsetT,
803-
typename ReductionOpT,
804-
typename TransformOpT,
805-
typename InitT,
806-
typename AccumT =
807-
detail::accumulator_t<ReductionOpT, //
808-
InitT,
809-
cub::detail::invoke_result_t<TransformOpT, cub::detail::value_t<InputIteratorT>>>,
810-
typename SelectedPolicyT = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
800+
template <
801+
typename InputIteratorT,
802+
typename OutputIteratorT,
803+
typename OffsetT,
804+
typename ReductionOpT,
805+
typename TransformOpT,
806+
typename InitT,
807+
typename AccumT = ::cuda::std::
808+
__accumulator_t<ReductionOpT, cub::detail::invoke_result_t<TransformOpT, cub::detail::value_t<InputIteratorT>>, InitT>,
809+
typename SelectedPolicyT = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
811810
using DispatchTransformReduce =
812811
DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, InitT, AccumT, SelectedPolicyT, TransformOpT>;
813812

@@ -850,7 +849,7 @@ template <typename InputIteratorT,
850849
typename OffsetT,
851850
typename ReductionOpT,
852851
typename InitT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::value_t<InputIteratorT>>,
853-
typename AccumT = detail::accumulator_t<ReductionOpT, InitT, cub::detail::value_t<InputIteratorT>>,
852+
typename AccumT = ::cuda::std::__accumulator_t<ReductionOpT, cub::detail::value_t<InputIteratorT>, InitT>,
854853
typename SelectedPolicy = DeviceReducePolicy<AccumT, OffsetT, ReductionOpT>>
855854
struct DispatchSegmentedReduce : SelectedPolicy
856855
{

cub/cub/device/dispatch/dispatch_reduce_by_key.cuh

+19-19
Original file line numberDiff line numberDiff line change
@@ -213,25 +213,25 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_TH
213213
* Implementation detail, do not specify directly, requirements on the
214214
* content of this type are subject to breaking change.
215215
*/
216-
template <
217-
typename KeysInputIteratorT,
218-
typename UniqueOutputIteratorT,
219-
typename ValuesInputIteratorT,
220-
typename AggregatesOutputIteratorT,
221-
typename NumRunsOutputIteratorT,
222-
typename EqualityOpT,
223-
typename ReductionOpT,
224-
typename OffsetT,
225-
typename AccumT = //
226-
detail::
227-
accumulator_t<ReductionOpT, cub::detail::value_t<ValuesInputIteratorT>, cub::detail::value_t<ValuesInputIteratorT>>,
228-
typename SelectedPolicy = //
229-
detail::device_reduce_by_key_policy_hub< //
230-
ReductionOpT, //
231-
AccumT, //
232-
cub::detail::non_void_value_t< //
233-
UniqueOutputIteratorT, //
234-
cub::detail::value_t<KeysInputIteratorT>>>>
216+
template <typename KeysInputIteratorT,
217+
typename UniqueOutputIteratorT,
218+
typename ValuesInputIteratorT,
219+
typename AggregatesOutputIteratorT,
220+
typename NumRunsOutputIteratorT,
221+
typename EqualityOpT,
222+
typename ReductionOpT,
223+
typename OffsetT,
224+
typename AccumT = //
225+
::cuda::std::__accumulator_t<ReductionOpT,
226+
cub::detail::value_t<ValuesInputIteratorT>,
227+
cub::detail::value_t<ValuesInputIteratorT>>,
228+
typename SelectedPolicy = //
229+
detail::device_reduce_by_key_policy_hub< //
230+
ReductionOpT, //
231+
AccumT, //
232+
cub::detail::non_void_value_t< //
233+
UniqueOutputIteratorT, //
234+
cub::detail::value_t<KeysInputIteratorT>>>>
235235
struct DispatchReduceByKey
236236
{
237237
//-------------------------------------------------------------------------

cub/cub/device/dispatch/dispatch_scan.cuh

+5-5
Original file line numberDiff line numberDiff line change
@@ -234,11 +234,11 @@ template <typename InputIteratorT,
234234
typename ScanOpT,
235235
typename InitValueT,
236236
typename OffsetT,
237-
typename AccumT = detail::accumulator_t<ScanOpT,
238-
::cuda::std::_If<std::is_same<InitValueT, NullType>::value,
239-
cub::detail::value_t<InputIteratorT>,
240-
typename InitValueT::value_type>,
241-
cub::detail::value_t<InputIteratorT>>,
237+
typename AccumT = ::cuda::std::__accumulator_t<ScanOpT,
238+
cub::detail::value_t<InputIteratorT>,
239+
::cuda::std::_If<std::is_same<InitValueT, NullType>::value,
240+
cub::detail::value_t<InputIteratorT>,
241+
typename InitValueT::value_type>>,
242242
typename SelectedPolicy = DeviceScanPolicy<AccumT, ScanOpT>,
243243
bool ForceInclusive = false>
244244
struct DispatchScan : SelectedPolicy

cub/cub/device/dispatch/dispatch_scan_by_key.cuh

+3-3
Original file line numberDiff line numberDiff line change
@@ -228,10 +228,10 @@ template <
228228
typename ScanOpT,
229229
typename InitValueT,
230230
typename OffsetT,
231-
typename AccumT = detail::accumulator_t<
231+
typename AccumT = ::cuda::std::__accumulator_t<
232232
ScanOpT,
233-
::cuda::std::_If<std::is_same<InitValueT, NullType>::value, cub::detail::value_t<ValuesInputIteratorT>, InitValueT>,
234-
cub::detail::value_t<ValuesInputIteratorT>>,
233+
cub::detail::value_t<ValuesInputIteratorT>,
234+
::cuda::std::_If<std::is_same<InitValueT, NullType>::value, cub::detail::value_t<ValuesInputIteratorT>, InitValueT>>,
235235
typename SelectedPolicy =
236236
DeviceScanByKeyPolicy<KeysInputIteratorT, AccumT, cub::detail::value_t<ValuesInputIteratorT>, ScanOpT>>
237237
struct DispatchScanByKey : SelectedPolicy

cub/cub/thread/thread_reduce.cuh

+3-3
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ template <int LENGTH,
6868
typename T,
6969
typename ReductionOp,
7070
typename PrefixT,
71-
typename AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
71+
typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>>
7272
_CCCL_DEVICE _CCCL_FORCEINLINE AccumT
7373
ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix, Int2Type<LENGTH> /*length*/)
7474
{
@@ -110,7 +110,7 @@ template <int LENGTH,
110110
typename T,
111111
typename ReductionOp,
112112
typename PrefixT,
113-
typename AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
113+
typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>>
114114
_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T* input, ReductionOp reduction_op, PrefixT prefix)
115115
{
116116
return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
@@ -170,7 +170,7 @@ template <int LENGTH,
170170
typename T,
171171
typename ReductionOp,
172172
typename PrefixT,
173-
typename AccumT = detail::accumulator_t<ReductionOp, PrefixT, T>>
173+
typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>>
174174
_CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op, PrefixT prefix)
175175
{
176176
return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());

cub/test/catch2_test_device_reduce.cu

+2-2
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
132132
auto reduction_op = unwrap_op(reference_extended_fp(d_in_it), op_t{});
133133

134134
// Prepare verification data
135-
using accum_t = cub::detail::accumulator_t<op_t, output_t, item_t>;
135+
using accum_t = ::cuda::std::__accumulator_t<op_t, item_t, output_t>;
136136
output_t expected_result =
137137
static_cast<output_t>(compute_single_problem_reference(in_items, reduction_op, accum_t{}));
138138

@@ -152,7 +152,7 @@ CUB_TEST("Device reduce works with all device interfaces", "[reduce][device]", f
152152
SECTION("sum")
153153
{
154154
using op_t = cub::Sum;
155-
using accum_t = cub::detail::accumulator_t<op_t, output_t, item_t>;
155+
using accum_t = ::cuda::std::__accumulator_t<op_t, item_t, output_t>;
156156

157157
// Prepare verification data
158158
output_t expected_result = static_cast<output_t>(compute_single_problem_reference(in_items, op_t{}, accum_t{}));

cub/test/catch2_test_device_reduce_by_key.cu

+1-1
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ CUB_TEST("Device reduce-by-key works", "[by_key][reduce][device]", full_type_lis
116116
auto reduction_op = unwrap_op(reference_extended_fp(d_values_it), op_t{});
117117

118118
// Prepare verification data
119-
using accum_t = cub::detail::accumulator_t<op_t, output_t, value_t>;
119+
using accum_t = ::cuda::std::__accumulator_t<op_t, value_t, output_t>;
120120
c2h::host_vector<output_t> expected_result(num_segments);
121121
compute_segmented_problem_reference(in_values, segment_offsets, reduction_op, accum_t{}, expected_result.begin());
122122
c2h::host_vector<key_t> expected_keys = compute_unique_keys_reference(segment_keys);

cub/test/catch2_test_device_reduce_by_key_iterators.cu

+1-1
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ CUB_TEST("Device reduce-by-key works with iterators", "[by_key][reduce][device]"
9090
using op_t = cub::Sum;
9191

9292
// Prepare verification data
93-
using accum_t = cub::detail::accumulator_t<op_t, output_t, value_t>;
93+
using accum_t = ::cuda::std::__accumulator_t<op_t, value_t, output_t>;
9494
c2h::host_vector<output_t> expected_result(num_segments);
9595
compute_segmented_problem_reference(value_it, segment_offsets, op_t{}, accum_t{}, expected_result.begin());
9696
c2h::host_vector<key_t> expected_keys = compute_unique_keys_reference(segment_keys);

cub/test/catch2_test_device_reduce_iterators.cu

+1-1
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ CUB_TEST("Device reduce works with fancy input iterators", "[reduce][device]", i
104104
auto reduction_op = op_t{};
105105

106106
// Prepare verification data
107-
using accum_t = cub::detail::accumulator_t<op_t, init_t, item_t>;
107+
using accum_t = ::cuda::std::__accumulator_t<op_t, item_t, init_t>;
108108
output_t expected_result = compute_single_problem_reference(in_it, in_it + num_items, reduction_op, accum_t{});
109109

110110
// Run test

cub/test/catch2_test_device_scan.cu

+6-6
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
127127
SECTION("inclusive sum")
128128
{
129129
using op_t = cub::Sum;
130-
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
130+
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;
131131

132132
// Prepare verification data
133133
c2h::host_vector<input_t> host_items(in_items);
@@ -155,7 +155,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
155155
SECTION("exclusive sum")
156156
{
157157
using op_t = cub::Sum;
158-
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
158+
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;
159159

160160
// Prepare verification data
161161
c2h::host_vector<input_t> host_items(in_items);
@@ -184,7 +184,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
184184
SECTION("inclusive scan")
185185
{
186186
using op_t = cub::Min;
187-
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
187+
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;
188188

189189
// Prepare verification data
190190
c2h::host_vector<input_t> host_items(in_items);
@@ -213,7 +213,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
213213
SECTION("inclusive scan with init value")
214214
{
215215
using op_t = cub::Sum;
216-
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
216+
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;
217217

218218
// Scan operator
219219
auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{});
@@ -248,7 +248,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
248248
SECTION("exclusive scan")
249249
{
250250
using op_t = cub::Sum;
251-
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
251+
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;
252252

253253
// Scan operator
254254
auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{});
@@ -281,7 +281,7 @@ CUB_TEST("Device scan works with all device interfaces", "[scan][device]", full_
281281
SECTION("exclusive scan with future-init value")
282282
{
283283
using op_t = cub::Sum;
284-
using accum_t = cub::detail::accumulator_t<op_t, input_t, input_t>;
284+
using accum_t = ::cuda::std::__accumulator_t<op_t, input_t, input_t>;
285285

286286
// Scan operator
287287
auto scan_op = unwrap_op(reference_extended_fp(d_in_it), op_t{});

cub/test/catch2_test_device_scan.cuh

+4-4
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ template <typename InputIt, typename OutputIt, typename InitT, typename BinaryOp
6161
void compute_exclusive_scan_reference(InputIt first, InputIt last, OutputIt result, InitT init, BinaryOp op)
6262
{
6363
using value_t = cub::detail::value_t<InputIt>;
64-
using accum_t = cub::detail::accumulator_t<BinaryOp, InitT, value_t>;
64+
using accum_t = ::cuda::std::__accumulator_t<BinaryOp, value_t, InitT>;
6565
using output_t = cub::detail::value_t<OutputIt>;
6666
accum_t acc = static_cast<accum_t>(init);
6767
for (; first != last; ++first)
@@ -75,7 +75,7 @@ template <typename InputIt, typename OutputIt, typename BinaryOp, typename InitT
7575
void compute_inclusive_scan_reference(InputIt first, InputIt last, OutputIt result, BinaryOp op, InitT init)
7676
{
7777
using value_t = cub::detail::value_t<InputIt>;
78-
using accum_t = cub::detail::accumulator_t<BinaryOp, InitT, value_t>;
78+
using accum_t = ::cuda::std::__accumulator_t<BinaryOp, value_t, InitT>;
7979
using output_t = cub::detail::value_t<OutputIt>;
8080
accum_t acc = static_cast<accum_t>(init);
8181
for (; first != last; ++first)
@@ -101,7 +101,7 @@ void compute_exclusive_scan_by_key_reference(
101101
std::size_t num_items)
102102
{
103103
using value_t = cub::detail::value_t<ValueInItT>;
104-
using accum_t = cub::detail::accumulator_t<ScanOpT, InitT, value_t>;
104+
using accum_t = ::cuda::std::__accumulator_t<ScanOpT, value_t, InitT>;
105105
using output_t = cub::detail::value_t<ValuesOutItT>;
106106

107107
if (num_items > 0)
@@ -152,7 +152,7 @@ void compute_inclusive_scan_by_key_reference(
152152
std::size_t num_items)
153153
{
154154
using value_t = cub::detail::value_t<ValueInItT>;
155-
using accum_t = cub::detail::accumulator_t<ScanOpT, value_t, value_t>;
155+
using accum_t = ::cuda::std::__accumulator_t<ScanOpT, value_t, value_t>;
156156
using output_t = cub::detail::value_t<ValuesOutItT>;
157157

158158
for (std::size_t i = 0; i < num_items;)

0 commit comments

Comments
 (0)