Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Pull previous changes for reduce by key
Browse files Browse the repository at this point in the history
  • Loading branch information
gevtushenko committed Aug 2, 2022
1 parent 83cdc5d commit b200436
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 64 deletions.
149 changes: 91 additions & 58 deletions cub/device/dispatch/dispatch_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include <cub/device/dispatch/dispatch_scan.cuh>
#include <cub/grid/grid_queue.cuh>
#include <cub/thread/thread_operators.cuh>
#include <cub/util_deprecated.cuh>
#include <cub/util_device.cuh>
#include <cub/util_math.cuh>

Expand Down Expand Up @@ -372,11 +373,6 @@ struct DispatchReduceByKey
* @param[in] stream
* CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
*
* @param[in] debug_synchronous
* Whether or not to synchronize the stream after every kernel launch to
* check for errors. Also causes launch configurations to be printed to
* the console. Default is `false`.
*
* @param[in] ptx_version
* PTX version of dispatch kernels
*
Expand Down Expand Up @@ -405,35 +401,11 @@ struct DispatchReduceByKey
ReductionOpT reduction_op,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous,
int /*ptx_version*/,
ScanInitKernelT init_kernel,
ReduceByKeyKernelT reduce_by_key_kernel,
KernelConfig reduce_by_key_config)
{

#ifndef CUB_RUNTIME_ENABLED
(void)d_temp_storage;
(void)temp_storage_bytes;
(void)d_keys_in;
(void)d_unique_out;
(void)d_values_in;
(void)d_aggregates_out;
(void)d_num_runs_out;
(void)equality_op;
(void)reduction_op;
(void)num_items;
(void)stream;
(void)debug_synchronous;
(void)init_kernel;
(void)reduce_by_key_kernel;
(void)reduce_by_key_config;

// Kernel launch not supported from this device
return CubDebug(cudaErrorNotSupported);

#else

cudaError error = cudaSuccess;
do
{
Expand Down Expand Up @@ -488,13 +460,13 @@ struct DispatchReduceByKey
// Log init_kernel configuration
int init_grid_size =
CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS));
if (debug_synchronous)
{
_CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n",
init_grid_size,
INIT_KERNEL_THREADS,
(long long)stream);
}

#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
_CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n",
init_grid_size,
INIT_KERNEL_THREADS,
(long long)stream);
#endif

// Invoke init_kernel to initialize tile descriptors
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
Expand All @@ -511,7 +483,8 @@ struct DispatchReduceByKey
}

// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream))))
error = detail::DebugSyncStream(stream);
if (CubDebug(error))
{
break;
}
Expand Down Expand Up @@ -546,17 +519,16 @@ struct DispatchReduceByKey
start_tile += scan_grid_size)
{
// Log reduce_by_key_kernel configuration
if (debug_synchronous)
{
_CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d "
"items per thread, %d SM occupancy\n",
start_tile,
scan_grid_size,
reduce_by_key_config.block_threads,
(long long)stream,
reduce_by_key_config.items_per_thread,
reduce_by_key_sm_occupancy);
}
#ifdef CUB_DETAIL_DEBUG_ENABLE_LOG
_CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d "
"items per thread, %d SM occupancy\n",
start_tile,
scan_grid_size,
reduce_by_key_config.block_threads,
(long long)stream,
reduce_by_key_config.items_per_thread,
reduce_by_key_sm_occupancy);
#endif

// Invoke reduce_by_key_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
Expand All @@ -583,16 +555,54 @@ struct DispatchReduceByKey
}

// Sync the stream if specified to flush runtime errors
if (debug_synchronous && (CubDebug(error = SyncStream(stream))))
error = detail::DebugSyncStream(stream);
if (CubDebug(error))
{
break;
}
}
} while (0);

return error;
}

#endif // CUB_RUNTIME_ENABLED
  /**
   * @brief Deprecated internal dispatch entry point retained for backward
   *        compatibility with callers that still pass `debug_synchronous`.
   *
   * The `debug_synchronous` argument is accepted but ignored: runtime debug
   * synchronization is now controlled by the `CUB_DETAIL_DEBUG_ENABLE_LOG` /
   * debug-sync build configuration instead of a per-call flag (the
   * CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED annotation marks this, and
   * CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG emits the corresponding notice).
   * All other arguments are forwarded unchanged to the primary overload that
   * takes the kernel pointers but no `debug_synchronous` parameter.
   */
  template <typename ScanInitKernelT, typename ReduceByKeyKernelT>
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
  CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
  Dispatch(void *d_temp_storage,
           size_t &temp_storage_bytes,
           KeysInputIteratorT d_keys_in,
           UniqueOutputIteratorT d_unique_out,
           ValuesInputIteratorT d_values_in,
           AggregatesOutputIteratorT d_aggregates_out,
           NumRunsOutputIteratorT d_num_runs_out,
           EqualityOpT equality_op,
           ReductionOpT reduction_op,
           OffsetT num_items,
           cudaStream_t stream,
           bool debug_synchronous,   // ignored; kept for source compatibility
           int ptx_version,
           ScanInitKernelT init_kernel,
           ReduceByKeyKernelT reduce_by_key_kernel,
           KernelConfig reduce_by_key_config)
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    // Forward to the non-deprecated overload, dropping debug_synchronous.
    return Dispatch<ScanInitKernelT, ReduceByKeyKernelT>(d_temp_storage,
                                                         temp_storage_bytes,
                                                         d_keys_in,
                                                         d_unique_out,
                                                         d_values_in,
                                                         d_aggregates_out,
                                                         d_num_runs_out,
                                                         equality_op,
                                                         reduction_op,
                                                         num_items,
                                                         stream,
                                                         ptx_version,
                                                         init_kernel,
                                                         reduce_by_key_kernel,
                                                         reduce_by_key_config);
  }

/**
Expand Down Expand Up @@ -633,11 +643,6 @@ struct DispatchReduceByKey
*
* @param[in] stream
* CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
*
* @param[in] debug_synchronous
* Whether or not to synchronize the stream after every kernel launch to
* check for errors. Also causes launch configurations to be printed to
* the console. Default is `false`.
*/
CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
Dispatch(void *d_temp_storage,
Expand All @@ -650,8 +655,7 @@ struct DispatchReduceByKey
EqualityOpT equality_op,
ReductionOpT reduction_op,
OffsetT num_items,
cudaStream_t stream,
bool debug_synchronous)
cudaStream_t stream)
{
cudaError error = cudaSuccess;

Expand Down Expand Up @@ -682,7 +686,6 @@ struct DispatchReduceByKey
reduction_op,
num_items,
stream,
debug_synchronous,
ptx_version,
DeviceCompactInitKernel<ScanTileStateT, NumRunsOutputIteratorT>,
DeviceReduceByKeyKernel<PtxReduceByKeyPolicy,
Expand All @@ -704,6 +707,36 @@ struct DispatchReduceByKey

return error;
}

  /**
   * @brief Deprecated public dispatch entry point retained for backward
   *        compatibility with callers that still pass `debug_synchronous`.
   *
   * The `debug_synchronous` argument is accepted but ignored: per-launch debug
   * synchronization/logging is now a compile-time configuration (see the
   * CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED annotation;
   * CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG emits the corresponding notice).
   * All other arguments are forwarded unchanged to the stream-only overload.
   */
  CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED
  CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t
  Dispatch(void *d_temp_storage,
           size_t &temp_storage_bytes,
           KeysInputIteratorT d_keys_in,
           UniqueOutputIteratorT d_unique_out,
           ValuesInputIteratorT d_values_in,
           AggregatesOutputIteratorT d_aggregates_out,
           NumRunsOutputIteratorT d_num_runs_out,
           EqualityOpT equality_op,
           ReductionOpT reduction_op,
           OffsetT num_items,
           cudaStream_t stream,
           bool debug_synchronous)   // ignored; kept for source compatibility
  {
    CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG

    // Forward to the non-deprecated overload, dropping debug_synchronous.
    return Dispatch(d_temp_storage,
                    temp_storage_bytes,
                    d_keys_in,
                    d_unique_out,
                    d_values_in,
                    d_aggregates_out,
                    d_num_runs_out,
                    equality_op,
                    reduction_op,
                    num_items,
                    stream);
  }
};

CUB_NAMESPACE_END
Expand Down
8 changes: 2 additions & 6 deletions test/test_device_scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1152,9 +1152,7 @@ void TestAccumulatorTypes()
d_out,
cub::Sum{},
init,
num_items,
0,
true));
num_items));

CubDebugExit(
g_allocator.DeviceAllocate((void **)&d_temp_storage, temp_storage_bytes));
Expand All @@ -1166,9 +1164,7 @@ void TestAccumulatorTypes()
d_out,
cub::Sum{},
init,
num_items,
0,
true));
num_items));

int ok{};
CubDebugExit(cudaMemcpy(&ok, d_ok_count, sizeof(int), cudaMemcpyDeviceToHost));
Expand Down

0 comments on commit b200436

Please sign in to comment.