diff --git a/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/device/dispatch/dispatch_reduce_by_key.cuh index 694ae6fbd4..738eef63da 100644 --- a/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -372,11 +373,6 @@ struct DispatchReduceByKey * @param[in] stream * CUDA stream to launch kernels within. Default is stream0. * - * @param[in] debug_synchronous - * Whether or not to synchronize the stream after every kernel launch to - * check for errors. Also causes launch configurations to be printed to - * the console. Default is `false`. - * * @param[in] ptx_version * PTX version of dispatch kernels * @@ -405,35 +401,11 @@ struct DispatchReduceByKey ReductionOpT reduction_op, OffsetT num_items, cudaStream_t stream, - bool debug_synchronous, int /*ptx_version*/, ScanInitKernelT init_kernel, ReduceByKeyKernelT reduce_by_key_kernel, KernelConfig reduce_by_key_config) { - -#ifndef CUB_RUNTIME_ENABLED - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_keys_in; - (void)d_unique_out; - (void)d_values_in; - (void)d_aggregates_out; - (void)d_num_runs_out; - (void)equality_op; - (void)reduction_op; - (void)num_items; - (void)stream; - (void)debug_synchronous; - (void)init_kernel; - (void)reduce_by_key_kernel; - (void)reduce_by_key_config; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - cudaError error = cudaSuccess; do { @@ -488,13 +460,13 @@ struct DispatchReduceByKey // Log init_kernel configuration int init_grid_size = CUB_MAX(1, cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS)); - if (debug_synchronous) - { - _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", - init_grid_size, - INIT_KERNEL_THREADS, - (long long)stream); - } + + #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG + _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", + init_grid_size, + INIT_KERNEL_THREADS, + (long long)stream); + #endif // Invoke init_kernel to initialize tile descriptors THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -511,7 +483,8 @@ struct DispatchReduceByKey } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) + error = detail::DebugSyncStream(stream); + if (CubDebug(error)) { break; } @@ -546,17 +519,16 @@ struct DispatchReduceByKey start_tile += scan_grid_size) { // Log reduce_by_key_kernel configuration - if (debug_synchronous) - { - _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d " - "items per thread, %d SM occupancy\n", - start_tile, - scan_grid_size, - reduce_by_key_config.block_threads, - (long long)stream, - reduce_by_key_config.items_per_thread, - reduce_by_key_sm_occupancy); - } + #ifdef CUB_DETAIL_DEBUG_ENABLE_LOG + _CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d " + "items per thread, %d SM occupancy\n", + start_tile, + scan_grid_size, + reduce_by_key_config.block_threads, + (long long)stream, + reduce_by_key_config.items_per_thread, + reduce_by_key_sm_occupancy); + #endif // Invoke reduce_by_key_kernel THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( @@ -583,7 +555,8 @@ struct DispatchReduceByKey } // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) + error = detail::DebugSyncStream(stream); + if (CubDebug(error)) { break; } @@ -591,8 +564,45 @@ struct DispatchReduceByKey } while (0); return error; + } -#endif // CUB_RUNTIME_ENABLED + template + CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + KeysInputIteratorT d_keys_in, + UniqueOutputIteratorT d_unique_out, + ValuesInputIteratorT d_values_in, + AggregatesOutputIteratorT d_aggregates_out, + NumRunsOutputIteratorT d_num_runs_out, + EqualityOpT equality_op, + ReductionOpT reduction_op, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version, + ScanInitKernelT init_kernel, + ReduceByKeyKernelT reduce_by_key_kernel, + KernelConfig reduce_by_key_config) + { + CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG + + return Dispatch(d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + equality_op, + reduction_op, + num_items, + stream, + ptx_version, + init_kernel, + reduce_by_key_kernel, + reduce_by_key_config); } /** @@ -633,11 +643,6 @@ struct DispatchReduceByKey * * @param[in] stream * CUDA stream to launch kernels within. Default is stream0. - * - * @param[in] debug_synchronous - * Whether or not to synchronize the stream after every kernel launch to - * check for errors. Also causes launch configurations to be printed to - * the console. Default is `false`. */ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, @@ -650,8 +655,7 @@ struct DispatchReduceByKey EqualityOpT equality_op, ReductionOpT reduction_op, OffsetT num_items, - cudaStream_t stream, - bool debug_synchronous) + cudaStream_t stream) { cudaError error = cudaSuccess; @@ -682,7 +686,6 @@ struct DispatchReduceByKey reduction_op, num_items, stream, - debug_synchronous, ptx_version, DeviceCompactInitKernel, DeviceReduceByKeyKernel