From 97dcbe1aa6f06c068fb1be72274b07725595fcdb Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Wed, 11 Oct 2023 21:23:34 +0000 Subject: [PATCH] Refactor inline comments --- cub/cub/agent/agent_histogram.cuh | 320 ++- cub/cub/agent/agent_radix_sort_downsweep.cuh | 104 +- cub/cub/agent/agent_radix_sort_upsweep.cuh | 79 +- cub/cub/agent/agent_rle.cuh | 238 ++- cub/cub/agent/agent_segment_fixup.cuh | 236 ++- cub/cub/agent/agent_select_if.cuh | 360 ++-- cub/cub/agent/agent_spmv_orig.cuh | 277 ++- cub/cub/agent/agent_unique_by_key.cuh | 134 +- cub/cub/agent/single_pass_scan_operators.cuh | 253 ++- cub/cub/block/block_adjacent_difference.cuh | 339 ++-- cub/cub/block/block_discontinuity.cuh | 756 ++++--- cub/cub/block/block_exchange.cuh | 598 ++++-- cub/cub/block/block_histogram.cuh | 214 +- cub/cub/block/block_load.cuh | 1183 +++++++---- cub/cub/block/block_radix_rank.cuh | 278 ++- cub/cub/block/block_radix_sort.cuh | 469 +++-- cub/cub/block/block_raking_layout.cuh | 31 +- cub/cub/block/block_reduce.cuh | 335 ++-- cub/cub/block/block_run_length_decode.cuh | 94 +- cub/cub/block/block_scan.cuh | 1776 ++++++++++------- cub/cub/block/block_shuffle.cuh | 184 +- cub/cub/block/block_store.cuh | 769 ++++--- .../block_histogram_atomic.cuh | 43 +- .../specializations/block_histogram_sort.cuh | 65 +- .../specializations/block_reduce_raking.cuh | 129 +- .../block_reduce_raking_commutative_only.cuh | 91 +- .../block_reduce_warp_reductions.cuh | 148 +- .../specializations/block_scan_raking.cuh | 351 +++- .../specializations/block_scan_warp_scans.cuh | 343 +++- cub/cub/device/device_spmv.cuh | 103 +- .../device/dispatch/dispatch_radix_sort.cuh | 849 +++++--- .../device/dispatch/dispatch_spmv_orig.cuh | 330 ++- .../dispatch/dispatch_unique_by_key.cuh | 321 ++- cub/cub/grid/grid_even_share.cuh | 89 +- cub/cub/grid/grid_queue.cuh | 49 +- cub/cub/iterator/arg_index_input_iterator.cuh | 96 +- .../cache_modified_input_iterator.cuh | 62 +- .../cache_modified_output_iterator.cuh | 72 +- cub/cub/iterator/constant_input_iterator.cuh | 81 +- cub/cub/iterator/counting_input_iterator.cuh | 72 +- cub/cub/iterator/discard_output_iterator.cuh | 51 +- cub/cub/iterator/tex_obj_input_iterator.cuh | 71 +- cub/cub/iterator/tex_ref_input_iterator.cuh | 35 +- cub/cub/iterator/transform_input_iterator.cuh | 91 +- cub/cub/thread/thread_load.cuh | 40 +- cub/cub/thread/thread_reduce.cuh | 176 +- cub/cub/thread/thread_scan.cuh | 329 +-- cub/cub/thread/thread_search.cuh | 68 +- cub/cub/thread/thread_store.cuh | 41 +- cub/cub/util_allocator.cuh | 297 +-- cub/cub/util_device.cuh | 32 +- cub/cub/util_ptx.cuh | 174 +- cub/cub/util_temporary_storage.cuh | 30 +- .../warp/specializations/warp_reduce_shfl.cuh | 410 ++-- .../warp/specializations/warp_reduce_smem.cuh | 182 +- .../warp/specializations/warp_scan_shfl.cuh | 373 +++- .../warp/specializations/warp_scan_smem.cuh | 178 +- 57 files changed, 9832 insertions(+), 5067 deletions(-) diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index c4226ba913e..44de7891d18 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -65,55 +65,122 @@ enum BlockHistogramMemoryPreference BLEND }; - /** * Parameterizable tuning policy type for AgentHistogram + * + * @tparam _BLOCK_THREADS + * Threads per thread block + * + * @tparam _PIXELS_PER_THREAD + * Pixels per thread (per tile of input) + * + * @tparam _LOAD_ALGORITHM + * The BlockLoad algorithm to use + * + * @tparam _LOAD_MODIFIER + * Cache load modifier for reading input elements + * 
+ * @tparam _RLE_COMPRESS + * Whether to perform localized RLE to compress samples before histogramming + * + * @tparam _MEM_PREFERENCE + * Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + * + * @tparam _WORK_STEALING + * Whether to dequeue tiles from a global work queue + * + * @tparam _VEC_SIZE + * Vector size for samples loading (1, 2, 4) */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming - BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) - bool _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue - int _VEC_SIZE = 4> ///< Vector size for samples loading (1, 2, 4) +template struct AgentHistogramPolicy { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - PIXELS_PER_THREAD = _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) - IS_RLE_COMPRESS = _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming - MEM_PREFERENCE = _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) - IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue - }; + enum + { + /// Threads per thread block + BLOCK_THREADS = _BLOCK_THREADS, - static constexpr int VEC_SIZE = _VEC_SIZE; ///< Vector size for samples loading (1, 2, 4) + /// Pixels per thread (per tile of input) + PIXELS_PER_THREAD = _PIXELS_PER_THREAD, - static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements -}; + /// Whether to perform localized RLE to compress samples before histogramming + IS_RLE_COMPRESS = _RLE_COMPRESS, + + /// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + MEM_PREFERENCE = _MEM_PREFERENCE, + + /// Whether to dequeue tiles from a global work queue + IS_WORK_STEALING = _WORK_STEALING, + }; + /// Vector size for samples loading (1, 2, 4) + static constexpr int VEC_SIZE = _VEC_SIZE; + + ///< The BlockLoad algorithm to use + static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + + ///< Cache load modifier for reading input elements + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; +}; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** - * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . + * @brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating + * in device-wide histogram . + * + * @tparam AgentHistogramPolicyT + * Parameterized AgentHistogramPolicy tuning policy type + * + * @tparam PRIVATIZED_SMEM_BINS + * Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized + * counters to be maintained in device-accessible memory. 
+ * + * @tparam NUM_CHANNELS + * Number of channels interleaved in the input data. Supports up to four channels. + * + * @tparam NUM_ACTIVE_CHANNELS + * Number of channels actively being histogrammed + * + * @tparam SampleIteratorT + * Random-access input iterator type for reading samples + * + * @tparam CounterT + * Integer type for counting sample occurrences per histogram bin + * + * @tparam PrivatizedDecodeOpT + * The transform operator type for determining privatized counter indices from samples, one for + * each channel + * + * @tparam OutputDecodeOpT + * The transform operator type for determining output bin-ids from privatized counter indices, one + * for each channel + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam LEGACY_PTX_ARCH + * PTX compute capability (unused) */ -template < - typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type - int PRIVATIZED_SMEM_BINS, ///< Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized counters to be maintained in device-accessible memory. - int NUM_CHANNELS, ///< Number of channels interleaved in the input data. Supports up to four channels. - int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename SampleIteratorT, ///< Random-access input iterator type for reading samples - typename CounterT, ///< Integer type for counting sample occurrences per histogram bin - typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel - typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel - typename OffsetT, ///< Signed integer type for global offsets - int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused) +template struct AgentHistogram { //--------------------------------------------------------------------- @@ -198,16 +265,22 @@ struct AgentHistogram /// Shared memory type required by this thread block struct _TempStorage { - CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; // Smem needed for block-privatized smem histogram (with 1 word of padding) + // Smem needed for block-privatized smem histogram (with 1 word of padding) + CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; int tile_idx; // Aliasable storage layout union Aliasable { - typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples - typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels - typename BlockLoadVecT::TempStorage vec_load; // Smem needed for loading a tile of vecs + // Smem needed for loading a tile of samples + typename BlockLoadSampleT::TempStorage sample_load; + + // Smem needed for loading a tile of pixels + typename BlockLoadPixelT::TempStorage pixel_load; + + // Smem needed for loading a tile of vecs + typename BlockLoadVecT::TempStorage vec_load; } aliasable; }; @@ -575,10 +648,16 @@ struct AgentHistogram // Tile processing //--------------------------------------------------------------------- - // Consume a tile of data samples - template < - bool IS_ALIGNED, // Whether the tile offset is aligned (vec-aligned for single-channel, pixel-aligned for multi-channel) - bool IS_FULL_TILE> // Whether the tile is full + /** + * @brief Consume a tile of data samples + * + * @tparam IS_ALIGNED + * Whether the tile offset is aligned (vec-aligned for 
single-channel, pixel-aligned for multi-channel) + * + * @tparam IS_FULL_TILE + Whether the tile is full + */ + template __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples) { SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS]; @@ -610,15 +689,28 @@ struct AgentHistogram } - // Consume row tiles. Specialized for work-stealing from queue + /** + * @brief Consume row tiles. Specialized for work-stealing from queue + * + * @param num_row_pixels + * The number of multi-channel pixels per row in the region of interest + * + * @param num_rows + * The number of rows in the region of interest + * + * @param row_stride_samples + * The number of samples between starts of consecutive rows in the region of interest + * + * @param tiles_per_row + * Number of image tiles per row + */ template - __device__ __forceinline__ void ConsumeTiles( - OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< The number of rows in the region of interest - OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest - int tiles_per_row, ///< Number of image tiles per row - GridQueue tile_queue, - Int2Type is_work_stealing) + __device__ __forceinline__ void ConsumeTiles(OffsetT num_row_pixels, + OffsetT num_rows, + OffsetT row_stride_samples, + int tiles_per_row, + GridQueue tile_queue, + Int2Type is_work_stealing) { int num_tiles = num_rows * tiles_per_row; @@ -658,15 +750,28 @@ struct AgentHistogram } - // Consume row tiles. Specialized for even-share (striped across thread blocks) + /** + * @brief Consume row tiles. Specialized for even-share (striped across thread blocks) + * + * @param num_row_pixels + * The number of multi-channel pixels per row in the region of interest + * + * @param num_rows + * The number of rows in the region of interest + * + * @param row_stride_samples + * The number of samples between starts of consecutive rows in the region of interest + * + * @param tiles_per_row + * Number of image tiles per row + */ template - __device__ __forceinline__ void ConsumeTiles( - OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< The number of rows in the region of interest - OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest - int tiles_per_row, ///< Number of image tiles per row - GridQueue tile_queue, - Int2Type is_work_stealing) + __device__ __forceinline__ void ConsumeTiles(OffsetT num_row_pixels, + OffsetT num_rows, + OffsetT row_stride_samples, + int tiles_per_row, + GridQueue tile_queue, + Int2Type is_work_stealing) { for (int row = blockIdx.y; row < num_rows; row += gridDim.y) { @@ -722,31 +827,53 @@ struct AgentHistogram /** - * Constructor + * @brief Constructor + * + * @param temp_storage + * Reference to temp_storage + * + * @param d_samples + * Input data to reduce + * + * @param num_output_bins + * The number bins per final output histogram + * + * @param num_privatized_bins + * The number bins per privatized histogram + * + * @param d_output_histograms + * Reference to final output histograms + * + * @param d_privatized_histograms + * Reference to privatized histograms + * + * @param output_decode_op + * The transform operator for determining output bin-ids from privatized counter indices, one for each channel + * + * @param privatized_decode_op + * The transform operator for determining privatized counter 
indices from samples, one for each channel */ - __device__ __forceinline__ AgentHistogram( - TempStorage &temp_storage, ///< Reference to temp_storage - SampleIteratorT d_samples, ///< Input data to reduce - int (&num_output_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per final output histogram - int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per privatized histogram - CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to final output histograms - CounterT* (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to privatized histograms - OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel - PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) ///< The transform operator for determining privatized counter indices from samples, one for each channel - : - temp_storage(temp_storage.Alias()), - d_wrapped_samples(d_samples), - d_native_samples(NativePointer(d_wrapped_samples)), - num_output_bins(num_output_bins), - num_privatized_bins(num_privatized_bins), - d_output_histograms(d_output_histograms), - output_decode_op(output_decode_op), - privatized_decode_op(privatized_decode_op), - prefer_smem((MEM_PREFERENCE == SMEM) ? - true : // prefer smem privatized histograms - (MEM_PREFERENCE == GMEM) ? - false : // prefer gmem privatized histograms - blockIdx.x & 1) // prefer blended privatized histograms + __device__ __forceinline__ + AgentHistogram(TempStorage &temp_storage, + SampleIteratorT d_samples, + int (&num_output_bins)[NUM_ACTIVE_CHANNELS], + int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], + CounterT *(&d_output_histograms)[NUM_ACTIVE_CHANNELS], + CounterT *(&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], + OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], + PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) + : temp_storage(temp_storage.Alias()) + , d_wrapped_samples(d_samples) + , d_native_samples(NativePointer(d_wrapped_samples)) + , num_output_bins(num_output_bins) + , num_privatized_bins(num_privatized_bins) + , d_output_histograms(d_output_histograms) + , output_decode_op(output_decode_op) + , privatized_decode_op(privatized_decode_op) + , prefer_smem((MEM_PREFERENCE == SMEM) ? true : // prefer smem privatized histograms + (MEM_PREFERENCE == GMEM) ? 
false + : // prefer gmem privatized histograms + blockIdx.x & 1) // prefer blended privatized histograms { int blockId = (blockIdx.y * gridDim.x) + blockIdx.x; @@ -755,16 +882,29 @@ struct AgentHistogram this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]); } - /** - * Consume image + * @brief Consume image + * + * @param num_row_pixels + * The number of multi-channel pixels per row in the region of interest + * + * @param num_rows + * The number of rows in the region of interest + * + * @param row_stride_samples + * The number of samples between starts of consecutive rows in the region of interest + * + * @param tiles_per_row + * Number of image tiles per row + * + * @param tile_queue + * Queue descriptor for assigning tiles of work to thread blocks */ - __device__ __forceinline__ void ConsumeTiles( - OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< The number of rows in the region of interest - OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest - int tiles_per_row, ///< Number of image tiles per row - GridQueue tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks + __device__ __forceinline__ void ConsumeTiles(OffsetT num_row_pixels, + OffsetT num_rows, + OffsetT row_stride_samples, + int tiles_per_row, + GridQueue tile_queue) { // Check whether all row starting offsets are vec-aligned (in single-channel) or pixel-aligned (in multi-channel) int vec_mask = AlignBytes::ALIGN_BYTES - 1; diff --git a/cub/cub/agent/agent_radix_sort_downsweep.cuh b/cub/cub/agent/agent_radix_sort_downsweep.cuh index a28bbdfa597..d893d836ee1 100644 --- a/cub/cub/agent/agent_radix_sort_downsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_downsweep.cuh @@ -63,30 +63,61 @@ CUB_NAMESPACE_BEGIN ******************************************************************************/ /** - * Parameterizable tuning policy type for AgentRadixSortDownsweep + * @brief Parameterizable tuning policy type for AgentRadixSortDownsweep + * + * @tparam NOMINAL_BLOCK_THREADS_4B + * Threads per thread block + * + * @tparam NOMINAL_ITEMS_PER_THREAD_4B + * Items per thread (per tile of input) + * + * @tparam ComputeT + * Dominant compute type + * + * @tparam _LOAD_ALGORITHM + * The BlockLoad algorithm to use + * + * @tparam _LOAD_MODIFIER + * Cache load modifier for reading keys (and values) + * + * @tparam _RANK_ALGORITHM + * The radix ranking algorithm to use + * + * @tparam _SCAN_ALGORITHM + * The block scan algorithm to use + * + * @tparam _RADIX_BITS + * The number of radix bits, i.e., log2(bins) */ -template < - int NOMINAL_BLOCK_THREADS_4B, ///< Threads per thread block - int NOMINAL_ITEMS_PER_THREAD_4B, ///< Items per thread (per tile of input) - typename ComputeT, ///< Dominant compute type - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values) - RadixRankAlgorithm _RANK_ALGORITHM, ///< The radix ranking algorithm to use - BlockScanAlgorithm _SCAN_ALGORITHM, ///< The block scan algorithm to use - int _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) - typename ScalingType = RegBoundScaling > -struct AgentRadixSortDownsweepPolicy : - ScalingType +template > +struct AgentRadixSortDownsweepPolicy : ScalingType { - enum - { - RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., 
log2(bins) - }; + enum + { + /// The number of radix bits, i.e., log2(bins) + RADIX_BITS = _RADIX_BITS, + }; + + /// The BlockLoad algorithm to use + static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + + /// Cache load modifier for reading keys (and values) + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values) - static constexpr RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; ///< The radix ranking algorithm to use - static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use + /// The radix ranking algorithm to use + static constexpr RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; + + /// The BlockScan algorithm to use + static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; }; @@ -99,15 +130,30 @@ struct AgentRadixSortDownsweepPolicy : /** - * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . + * @brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in + * device-wide radix sort downsweep . + * + * @tparam AgentRadixSortDownsweepPolicy + * Parameterized AgentRadixSortDownsweepPolicy tuning policy type + * + * @tparam IS_DESCENDING + * Whether or not the sorted-order is high-to-low + * + * @tparam KeyT + * KeyT type + * + * @tparam ValueT + * ValueT type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename AgentRadixSortDownsweepPolicy, ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< KeyT type - typename ValueT, ///< ValueT type - typename OffsetT, ///< Signed integer type for global offsets - typename DecomposerT = detail::identity_decomposer_t> +template struct AgentRadixSortDownsweep { //--------------------------------------------------------------------- diff --git a/cub/cub/agent/agent_radix_sort_upsweep.cuh b/cub/cub/agent/agent_radix_sort_upsweep.cuh index da048835da9..c21f1b41baa 100644 --- a/cub/cub/agent/agent_radix_sort_upsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_upsweep.cuh @@ -56,39 +56,63 @@ CUB_NAMESPACE_BEGIN ******************************************************************************/ /** - * Parameterizable tuning policy type for AgentRadixSortUpsweep + * @brief Parameterizable tuning policy type for AgentRadixSortUpsweep + * + * @tparam NOMINAL_BLOCK_THREADS_4B + * Threads per thread block + * + * @tparam NOMINAL_ITEMS_PER_THREAD_4B + * Items per thread (per tile of input) + * + * @tparam ComputeT + * Dominant compute type + * + * @tparam _LOAD_MODIFIER + * Cache load modifier for reading keys + * + * @tparam _RADIX_BITS + * The number of radix bits, i.e., log2(bins) */ -template < - int NOMINAL_BLOCK_THREADS_4B, ///< Threads per thread block - int NOMINAL_ITEMS_PER_THREAD_4B, ///< Items per thread (per tile of input) - typename ComputeT, ///< Dominant compute type - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys - int _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) - typename ScalingType = RegBoundScaling > -struct AgentRadixSortUpsweepPolicy : - ScalingType +template > +struct 
AgentRadixSortUpsweepPolicy : ScalingType { - enum - { - RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) - }; - - static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys + enum + { + /// The number of radix bits, i.e., log2(bins) + RADIX_BITS = _RADIX_BITS, + }; + + /// Cache load modifier for reading keys + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; }; - /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** - * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . + * @brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for + * participating in device-wide radix sort upsweep . + * + * @tparam AgentRadixSortUpsweepPolicy + * Parameterized AgentRadixSortUpsweepPolicy tuning policy type + * + * @tparam KeyT + * KeyT type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename AgentRadixSortUpsweepPolicy, ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type - typename KeyT, ///< KeyT type - typename OffsetT, - typename DecomposerT = detail::identity_decomposer_t> ///< Signed integer type for global offsets +template <typename AgentRadixSortUpsweepPolicy, + typename KeyT, + typename OffsetT, + typename DecomposerT = detail::identity_decomposer_t> struct AgentRadixSortUpsweep { @@ -483,11 +507,14 @@ struct AgentRadixSortUpsweep /** - * Extract counts + * @brief Extract counts + * + * @param[out] bin_count + * The exclusive prefix sum for the digits + * [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] */ template - __device__ __forceinline__ void ExtractCounts( - OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + __device__ __forceinline__ void ExtractCounts(OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; unsigned int warp_tid = LaneId(); diff --git a/cub/cub/agent/agent_rle.cuh b/cub/cub/agent/agent_rle.cuh index 15f27bd4c53..733dea60020 100644 --- a/cub/cub/agent/agent_rle.cuh +++ b/cub/cub/agent/agent_rle.cuh @@ -96,41 +96,66 @@ template > struct AgentRlePolicy { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) - }; + enum + { + /// Threads per thread block + BLOCK_THREADS = _BLOCK_THREADS, - static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use + /// Items per thread (per tile of input) + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - struct detail - { - using delay_constructor_t = DelayConstructorT; - }; -}; + /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced + /// among block-warps during any store-related data transpositions (versus each warp having its + /// own storage) + STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, + }; + /// The BlockLoad algorithm to use + static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + /// Cache load modifier for reading input elements + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + /// The BlockScan algorithm to use + static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; + struct detail + { + using delay_constructor_t = DelayConstructorT; + }; +}; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** - * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode + * @brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode + * + * @tparam AgentRlePolicyT + * Parameterized AgentRlePolicyT tuning policy type + * + * @tparam InputIteratorT + * Random-access input iterator type for data + * + * @tparam OffsetsOutputIteratorT + * Random-access output iterator type for offset values + * + * @tparam LengthsOutputIteratorT + * Random-access output iterator type for length values + * + * @tparam EqualityOpT + * T equality operator type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type - typename InputIteratorT, ///< Random-access input iterator type for data - typename OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values - typename LengthsOutputIteratorT, ///< Random-access output iterator type for length values - typename EqualityOpT, ///< T equality operator type - typename OffsetT> ///< Signed integer type 
for global offsets +template struct AgentRle { //--------------------------------------------------------------------- @@ -153,22 +178,23 @@ struct AgentRle // Constants enum { - WARP_THREADS = CUB_WARP_THREADS(0), - BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD, - WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - /// Whether or not to sync after loading data - SYNC_AFTER_LOAD = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), - - /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) - STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING, - ACTIVE_EXCHANGE_WARPS = (STORE_WARP_TIME_SLICING) ? 1 : WARPS, + WARP_THREADS = CUB_WARP_THREADS(0), + BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD, + WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + /// Whether or not to sync after loading data + SYNC_AFTER_LOAD = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), + + /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced + /// among block-warps during any store-related data transpositions (versus each warp having + /// its own storage) + STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING, + ACTIVE_EXCHANGE_WARPS = (STORE_WARP_TIME_SLICING) ? 1 : WARPS, }; - /** * Special operator that signals all out-of-bounds items are not equal to everything else, * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked @@ -248,10 +274,17 @@ struct AgentRle { struct ScanStorage { - typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection - typename WarpScanPairs::TempStorage warp_scan[WARPS]; // Smem needed for warp-synchronous scans - Uninitialized warp_aggregates; // Smem needed for sharing warp-wide aggregates - typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + // Smem needed for discontinuity detection + typename BlockDiscontinuityT::TempStorage discontinuity; + + // Smem needed for warp-synchronous scans + typename WarpScanPairs::TempStorage warp_scan[WARPS]; + + // Smem needed for sharing warp-wide aggregates + Uninitialized warp_aggregates; + + // Smem needed for cooperative prefix callback + typename TilePrefixCallbackOpT::TempStorage prefix; } scan_storage; // Smem needed for input loading @@ -268,9 +301,9 @@ struct AgentRle } aliasable; - OffsetT tile_idx; // Shared tile index - LengthOffsetPair tile_inclusive; // Inclusive tile prefix - LengthOffsetPair tile_exclusive; // Exclusive tile prefix + OffsetT tile_idx; // Shared tile index + LengthOffsetPair tile_inclusive; // Inclusive tile prefix + LengthOffsetPair tile_exclusive; // Exclusive tile prefix }; // Alias wrapper allowing storage to be unioned @@ -281,41 +314,54 @@ struct AgentRle // Per-thread fields //--------------------------------------------------------------------- - _TempStorage& temp_storage; ///< Reference to temp_storage - - WrappedInputIteratorT d_in; ///< Pointer to input sequence of data items - OffsetsOutputIteratorT d_offsets_out; ///< Input run offsets - 
LengthsOutputIteratorT d_lengths_out; ///< Output run lengths + _TempStorage &temp_storage; ///< Reference to temp_storage - EqualityOpT equality_op; ///< T equality operator - ReduceBySegmentOpT scan_op; ///< Reduce-length-by-flag scan operator - OffsetT num_items; ///< Total number of input items + WrappedInputIteratorT d_in; ///< Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out; ///< Input run offsets + LengthsOutputIteratorT d_lengths_out; ///< Output run lengths + EqualityOpT equality_op; ///< T equality operator + ReduceBySegmentOpT scan_op; ///< Reduce-length-by-flag scan operator + OffsetT num_items; ///< Total number of input items //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- - // Constructor - __device__ __forceinline__ - AgentRle( - TempStorage &temp_storage, ///< [in] Reference to temp_storage - InputIteratorT d_in, ///< [in] Pointer to input sequence of data items - OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run offsets - LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run lengths - EqualityOpT equality_op, ///< [in] T equality operator - OffsetT num_items) ///< [in] Total number of input items - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_offsets_out(d_offsets_out), - d_lengths_out(d_lengths_out), - equality_op(equality_op), - scan_op(cub::Sum()), - num_items(num_items) + /** + * @param[in] temp_storage + * Reference to temp_storage + * + * @param[in] d_in + * Pointer to input sequence of data items + * + * @param[out] d_offsets_out + * Pointer to output sequence of run offsets + * + * @param[out] d_lengths_out + * Pointer to output sequence of run lengths + * + * @param[in] equality_op + * Equality operator + * + * @param[in] num_items + * Total number of input items + */ + __device__ __forceinline__ AgentRle(TempStorage &temp_storage, + InputIteratorT d_in, + OffsetsOutputIteratorT d_offsets_out, + LengthsOutputIteratorT d_lengths_out, + EqualityOpT equality_op, + OffsetT num_items) + : temp_storage(temp_storage.Alias()) + , d_in(d_in) + , d_offsets_out(d_offsets_out) + , d_lengths_out(d_lengths_out) + , equality_op(equality_op) + , scan_op(cub::Sum()) + , num_items(num_items) {} - //--------------------------------------------------------------------- // Utility methods for initializing the selections //--------------------------------------------------------------------- @@ -683,16 +729,29 @@ struct AgentRle //--------------------------------------------------------------------- /** - * Process a tile of input (dynamic chained scan) + * @brief Process a tile of input (dynamic chained scan) + * + * @param num_items + * Total number of global input items + * + * @param num_remaining + * Number of global input items remaining (including this tile) + * + * @param tile_idx + * Tile index + * + * @param tile_offset + * Tile offset + * + * @param &tile_status + * Global list of tile status */ - template < - bool LAST_TILE> - __device__ __forceinline__ LengthOffsetPair ConsumeTile( - OffsetT num_items, ///< Total number of global input items - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT &tile_status) ///< Global list of tile status + template + __device__ __forceinline__ LengthOffsetPair ConsumeTile(OffsetT num_items, + OffsetT 
num_remaining, + int tile_idx, + OffsetT tile_offset, + ScanTileStateT &tile_status) { if (tile_idx == 0) { @@ -892,13 +951,24 @@ struct AgentRle /** - * Scan tiles of items as part of a dynamic chained scan + * @brief Scan tiles of items as part of a dynamic chained scan + * + * @param num_tiles + * Total number of input tiles + * + * @param tile_status + * Global list of tile status + * + * @param d_num_runs_out + * Output pointer for total number of runs identified + * + * @tparam NumRunsIteratorT + * Output iterator type for recording number of items selected */ - template ///< Output iterator type for recording number of items selected - __device__ __forceinline__ void ConsumeRange( - int num_tiles, ///< Total number of input tiles - ScanTileStateT& tile_status, ///< Global list of tile status - NumRunsIteratorT d_num_runs_out) ///< Output pointer for total number of runs identified + template + __device__ __forceinline__ void ConsumeRange(int num_tiles, + ScanTileStateT &tile_status, + NumRunsIteratorT d_num_runs_out) { // Blocks are launched in increasing order, so just assign one tile per block int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index diff --git a/cub/cub/agent/agent_segment_fixup.cuh b/cub/cub/agent/agent_segment_fixup.cuh index 84d8af5aa6a..205970eaef3 100644 --- a/cub/cub/agent/agent_segment_fixup.cuh +++ b/cub/cub/agent/agent_segment_fixup.cuh @@ -59,42 +59,81 @@ CUB_NAMESPACE_BEGIN ******************************************************************************/ /** - * Parameterizable tuning policy type for AgentSegmentFixup + * @brief Parameterizable tuning policy type for AgentSegmentFixup + * + * @tparam _BLOCK_THREADS + * Threads per thread block + * + * @tparam _ITEMS_PER_THREAD + * Items per thread (per tile of input) + * + * @tparam _LOAD_ALGORITHM + * The BlockLoad algorithm to use + * + * @tparam _LOAD_MODIFIER + * Cache load modifier for reading input elements + * + * @tparam _SCAN_ALGORITHM + * The BlockScan algorithm to use */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +template struct AgentSegmentFixupPolicy { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - }; + enum + { + /// Threads per thread block + BLOCK_THREADS = _BLOCK_THREADS, - static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use -}; + /// Items per thread (per tile of input) + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + }; + + /// The BlockLoad algorithm to use + static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + /// Cache load modifier for reading input elements + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + + /// The BlockScan algorithm to use + static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; +}; /****************************************************************************** * Thread block abstractions 
******************************************************************************/ /** - * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key + * @brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for + * participating in device-wide reduce-value-by-key + * + * @tparam AgentSegmentFixupPolicyT + * Parameterized AgentSegmentFixupPolicy tuning policy type + * + * @tparam PairsInputIteratorT + * Random-access input iterator type for keys + * + * @tparam AggregatesOutputIteratorT + * Random-access output iterator type for values + * + * @tparam EqualityOpT + * KeyT equality operator type + * + * @tparam ReductionOpT + * ValueT reduction operator type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type - typename PairsInputIteratorT, ///< Random-access input iterator type for keys - typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values - typename EqualityOpT, ///< KeyT equality operator type - typename ReductionOpT, ///< ValueT reduction operator type - typename OffsetT> ///< Signed integer type for global offsets +template struct AgentSegmentFixup { //--------------------------------------------------------------------- @@ -172,8 +211,11 @@ struct AgentSegmentFixup { struct ScanStorage { - typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + // Smem needed for tile scanning + typename BlockScanT::TempStorage scan; + + // Smem needed for cooperative prefix callback + typename TilePrefixCallbackOpT::TempStorage prefix; } scan_storage; // Smem needed for loading keys @@ -188,53 +230,77 @@ struct AgentSegmentFixup // Per-thread fields //--------------------------------------------------------------------- - _TempStorage& temp_storage; ///< Reference to temp_storage - WrappedPairsInputIteratorT d_pairs_in; ///< Input keys - AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates - WrappedFixupInputIteratorT d_fixup_in; ///< Fixup input values - InequalityWrapper inequality_op; ///< KeyT inequality operator - ReductionOpT reduction_op; ///< Reduction operator - ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator - + _TempStorage &temp_storage; ///< Reference to temp_storage + WrappedPairsInputIteratorT d_pairs_in; ///< Input keys + AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates + WrappedFixupInputIteratorT d_fixup_in; ///< Fixup input values + InequalityWrapper inequality_op; ///< KeyT inequality operator + ReductionOpT reduction_op; ///< Reduction operator + ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- - // Constructor - __device__ __forceinline__ - AgentSegmentFixup( - TempStorage& temp_storage, ///< Reference to temp_storage - PairsInputIteratorT d_pairs_in, ///< Input keys - AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates - EqualityOpT equality_op, ///< KeyT equality operator - ReductionOpT reduction_op) ///< ValueT reduction operator - : - temp_storage(temp_storage.Alias()), - d_pairs_in(d_pairs_in), - d_aggregates_out(d_aggregates_out), - 
d_fixup_in(d_aggregates_out), - inequality_op(equality_op), - reduction_op(reduction_op), - scan_op(reduction_op) + /** + * @param temp_storage + * Reference to temp_storage + * + * @param d_pairs_in + * Input keys + * + * @param d_aggregates_out + * Output value aggregates + * + * @param equality_op + * KeyT equality operator + * + * @param reduction_op + * ValueT reduction operator + */ + __device__ __forceinline__ AgentSegmentFixup(TempStorage &temp_storage, + PairsInputIteratorT d_pairs_in, + AggregatesOutputIteratorT d_aggregates_out, + EqualityOpT equality_op, + ReductionOpT reduction_op) + : temp_storage(temp_storage.Alias()) + , d_pairs_in(d_pairs_in) + , d_aggregates_out(d_aggregates_out) + , d_fixup_in(d_aggregates_out) + , inequality_op(equality_op) + , reduction_op(reduction_op) + , scan_op(reduction_op) {} - //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** - * Process input tile. Specialized for atomic-fixup + * @brief Process input tile. Specialized for atomic-fixup + * + * @param num_remaining + * Number of global input items remaining (including this tile) + * + * @param tile_idx + * Tile index + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor + * + * @param use_atomic_fixup + * Marker whether to use atomicAdd (instead of reduce-by-key) */ template - __device__ __forceinline__ void ConsumeTile( - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state, ///< Global tile state descriptor - Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) + __device__ __forceinline__ void ConsumeTile(OffsetT num_remaining, + int tile_idx, + OffsetT tile_offset, + ScanTileStateT &tile_state, + Int2Type use_atomic_fixup) { KeyValuePairT pairs[ITEMS_PER_THREAD]; @@ -264,17 +330,30 @@ struct AgentSegmentFixup atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value); } - /** - * Process input tile. Specialized for reduce-by-key fixup + * @brief Process input tile. 
Specialized for reduce-by-key fixup + * + * @param num_remaining + * Number of global input items remaining (including this tile) + * + * @param tile_idx + * Tile index + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor + * + * @param use_atomic_fixup + * Marker whether to use atomicAdd (instead of reduce-by-key) */ template - __device__ __forceinline__ void ConsumeTile( - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state, ///< Global tile state descriptor - Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) + __device__ __forceinline__ void ConsumeTile(OffsetT num_remaining, + int tile_idx, + OffsetT tile_offset, + ScanTileStateT &tile_state, + Int2Type use_atomic_fixup) { KeyValuePairT pairs[ITEMS_PER_THREAD]; KeyValuePairT scatter_pairs[ITEMS_PER_THREAD]; @@ -346,19 +425,26 @@ struct AgentSegmentFixup } } - /** - * Scan tiles of items as part of a dynamic chained scan + * @brief Scan tiles of items as part of a dynamic chained scan + * + * @param num_items + * Total number of input items + * + * @param num_tiles + * Total number of input tiles + * + * @param tile_state + * Global tile state descriptor */ - __device__ __forceinline__ void ConsumeRange( - OffsetT num_items, ///< Total number of input items - int num_tiles, ///< Total number of input tiles - ScanTileStateT& tile_state) ///< Global tile state descriptor + __device__ __forceinline__ void ConsumeRange(OffsetT num_items, + int num_tiles, + ScanTileStateT &tile_state) { // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile - OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) if (num_remaining > TILE_ITEMS) { diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 564316b30dd..e5534605716 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -89,46 +89,77 @@ template > struct AgentSelectIfPolicy { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - }; + enum + { + /// Threads per thread block + BLOCK_THREADS = _BLOCK_THREADS, - static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use + /// Items per thread (per tile of input) + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + }; - struct detail - { - using delay_constructor_t = DelayConstructorT; - }; -}; + /// The BlockLoad algorithm to use + static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + /// Cache load modifier for reading input elements + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + /// The 
BlockScan algorithm to use + static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; + struct detail + { + using delay_constructor_t = DelayConstructorT; + }; +}; /****************************************************************************** * Thread block abstractions ******************************************************************************/ - /** - * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection + * @brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in + * device-wide selection * * Performs functor-based selection if SelectOpT functor type != NullType * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType * Otherwise performs discontinuity selection (keep unique) + * + * @tparam AgentSelectIfPolicyT + * Parameterized AgentSelectIfPolicy tuning policy type + * + * @tparam InputIteratorT + * Random-access input iterator type for selection items + * + * @tparam FlagsInputIteratorT + * Random-access input iterator type for selections (NullType* if a selection functor or + * discontinuity flagging is to be used for selection) + * + * @tparam SelectedOutputIteratorT + * Random-access output iterator type for selection_flags items + * + * @tparam SelectOpT + * Selection operator type (NullType if selections or discontinuity flagging is to be used for + * selection) + * + * @tparam EqualityOpT + * Equality operator type (NullType if selection functor or selections is to be used for + * selection) + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam KEEP_REJECTS + * Whether or not we push rejected items to the back of the output */ -template < - typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type - typename InputIteratorT, ///< Random-access input iterator type for selection items - typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) - typename SelectedOutputIteratorT, ///< Random-access output iterator type for selection_flags items - typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) - typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) - typename OffsetT, ///< Signed integer type for global offsets - bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +template struct AgentSelectIf { //--------------------------------------------------------------------- @@ -215,9 +246,14 @@ struct AgentSelectIf { struct ScanStorage { - typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback - typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + // Smem needed for tile scanning + typename BlockScanT::TempStorage scan; + + // Smem needed for cooperative prefix callback + typename TilePrefixCallbackOpT::TempStorage prefix; + + // Smem needed for discontinuity detection + typename BlockDiscontinuityT::TempStorage discontinuity; } scan_storage; // Smem needed for loading items @@ -238,40 +274,56 @@ struct AgentSelectIf // Per-thread fields //--------------------------------------------------------------------- - 
_TempStorage& temp_storage; ///< Reference to temp_storage - WrappedInputIteratorT d_in; ///< Input items - SelectedOutputIteratorT d_selected_out; ///< Unique output items - WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable) - InequalityWrapper inequality_op; ///< T inequality operator - SelectOpT select_op; ///< Selection operator - OffsetT num_items; ///< Total number of input items - + _TempStorage &temp_storage; ///< Reference to temp_storage + WrappedInputIteratorT d_in; ///< Input items + SelectedOutputIteratorT d_selected_out; ///< Unique output items + WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable) + InequalityWrapper inequality_op; ///< T inequality operator + SelectOpT select_op; ///< Selection operator + OffsetT num_items; ///< Total number of input items //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- - // Constructor - __device__ __forceinline__ - AgentSelectIf( - TempStorage &temp_storage, ///< Reference to temp_storage - InputIteratorT d_in, ///< Input data - FlagsInputIteratorT d_flags_in, ///< Input selection flags (if applicable) - SelectedOutputIteratorT d_selected_out, ///< Output data - SelectOpT select_op, ///< Selection operator - EqualityOpT equality_op, ///< Equality operator - OffsetT num_items) ///< Total number of input items - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_selected_out(d_selected_out), - d_flags_in(d_flags_in), - inequality_op(equality_op), - select_op(select_op), - num_items(num_items) + /** + * @param temp_storage + * Reference to temp_storage + * + * @param d_in + * Input data + * + * @param d_flags_in + * Input selection flags (if applicable) + * + * @param d_selected_out + * Output data + * + * @param select_op + * Selection operator + * + * @param equality_op + * Equality operator + * + * @param num_items + * Total number of input items + */ + __device__ __forceinline__ AgentSelectIf(TempStorage &temp_storage, + InputIteratorT d_in, + FlagsInputIteratorT d_flags_in, + SelectedOutputIteratorT d_selected_out, + SelectOpT select_op, + EqualityOpT equality_op, + OffsetT num_items) + : temp_storage(temp_storage.Alias()) + , d_in(d_in) + , d_selected_out(d_selected_out) + , d_flags_in(d_flags_in) + , inequality_op(equality_op) + , select_op(select_op) + , num_items(num_items) {} - //--------------------------------------------------------------------- // Utility methods for initializing the selections //--------------------------------------------------------------------- @@ -401,20 +453,33 @@ struct AgentSelectIf } } - /** - * Scatter flagged items to output offsets (specialized for two-phase scattering) + * @brief Scatter flagged items to output offsets (specialized for two-phase scattering) + * + * @param num_tile_items + * Number of valid items in this tile + * + * @param num_tile_selections + * Number of selections in this tile + * + * @param num_selections_prefix + * Total number of selections prior to this tile + * + * @param num_rejected_prefix + * Total number of rejections prior to this tile + * + * @param is_keep_rejects + * Marker type indicating whether to keep rejected items in the second partition */ template - __device__ __forceinline__ void ScatterTwoPhase( - InputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - OffsetT (&selection_indices)[ITEMS_PER_THREAD], - int /*num_tile_items*/, ///< Number of valid items 
in this tile - int num_tile_selections, ///< Number of selections in this tile - OffsetT num_selections_prefix, ///< Total number of selections prior to this tile - OffsetT /*num_rejected_prefix*/, ///< Total number of rejections prior to this tile - Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition + __device__ __forceinline__ void ScatterTwoPhase(InputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int /*num_tile_items*/, + int num_tile_selections, + OffsetT num_selections_prefix, + OffsetT /*num_rejected_prefix*/, + Int2Type /*is_keep_rejects*/) { CTA_SYNC(); @@ -437,20 +502,33 @@ struct AgentSelectIf } } - /** - * Scatter flagged items to output offsets (specialized for two-phase scattering) + * @brief Scatter flagged items to output offsets (specialized for two-phase scattering) + * + * @param num_tile_items + * Number of valid items in this tile + * + * @param num_tile_selections + * Number of selections in this tile + * + * @param num_selections_prefix + * Total number of selections prior to this tile + * + * @param num_rejected_prefix + * Total number of rejections prior to this tile + * + * @param is_keep_rejects + * Marker type indicating whether to keep rejected items in the second partition */ template - __device__ __forceinline__ void ScatterTwoPhase( - InputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - OffsetT (&selection_indices)[ITEMS_PER_THREAD], - int num_tile_items, ///< Number of valid items in this tile - int num_tile_selections, ///< Number of selections in this tile - OffsetT num_selections_prefix, ///< Total number of selections prior to this tile - OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile - Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition + __device__ __forceinline__ void ScatterTwoPhase(InputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int num_tile_items, + int num_tile_selections, + OffsetT num_selections_prefix, + OffsetT num_rejected_prefix, + Int2Type /*is_keep_rejects*/) { CTA_SYNC(); @@ -492,20 +570,33 @@ struct AgentSelectIf } } - /** - * Scatter flagged items + * @brief Scatter flagged items + * + * @param num_tile_items + * Number of valid items in this tile + * + * @param num_tile_selections + * Number of selections in this tile + * + * @param num_selections_prefix + * Total number of selections prior to this tile + * + * @param num_rejected_prefix + * Total number of rejections prior to this tile + * + * @param num_selections + * Total number of selections including this tile */ template - __device__ __forceinline__ void Scatter( - InputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - OffsetT (&selection_indices)[ITEMS_PER_THREAD], - int num_tile_items, ///< Number of valid items in this tile - int num_tile_selections, ///< Number of selections in this tile - OffsetT num_selections_prefix, ///< Total number of selections prior to this tile - OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile - OffsetT num_selections) ///< Total number of selections including this tile + __device__ __forceinline__ void Scatter(InputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT 
(&selection_indices)[ITEMS_PER_THREAD], + int num_tile_items, + int num_tile_selections, + OffsetT num_selections_prefix, + OffsetT num_rejected_prefix, + OffsetT num_selections) { // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS))) @@ -536,13 +627,23 @@ struct AgentSelectIf /** - * Process first tile of input (dynamic chained scan). Returns the running count of selections (including this tile) + * @brief Process first tile of input (dynamic chained scan). + * + * @param num_tile_items + * Number of input items comprising this tile + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor + * + * @return The running count of selections (including this tile) */ template - __device__ __forceinline__ OffsetT ConsumeFirstTile( - int num_tile_items, ///< Number of input items comprising this tile - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor + __device__ __forceinline__ OffsetT ConsumeFirstTile(int num_tile_items, + OffsetT tile_offset, + ScanTileStateT &tile_state) { InputT items[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; @@ -593,16 +694,28 @@ struct AgentSelectIf return num_tile_selections; } - /** - * Process subsequent tile of input (dynamic chained scan). Returns the running count of selections (including this tile) + * @brief Process subsequent tile of input (dynamic chained scan). + * + * @param num_tile_items + * Number of input items comprising this tile + * + * @param tile_idx + * Tile index + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor + * + * @return The running count of selections (including this tile) */ template - __device__ __forceinline__ OffsetT ConsumeSubsequentTile( - int num_tile_items, ///< Number of input items comprising this tile - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor + __device__ __forceinline__ OffsetT ConsumeSubsequentTile(int num_tile_items, + int tile_idx, + OffsetT tile_offset, + ScanTileStateT &tile_state) { InputT items[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; @@ -657,14 +770,23 @@ struct AgentSelectIf /** - * Process a tile of input + * @brief Process a tile of input + * + * @param num_tile_items + * Number of input items comprising this tile + * + * @param tile_idx + * Tile index + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor */ template - __device__ __forceinline__ OffsetT ConsumeTile( - int num_tile_items, ///< Number of input items comprising this tile - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor + __device__ __forceinline__ OffsetT + ConsumeTile(int num_tile_items, int tile_idx, OffsetT tile_offset, ScanTileStateT &tile_state) { OffsetT num_selections; if (tile_idx == 0) @@ -679,19 +801,29 @@ struct AgentSelectIf return num_selections; } - /** - * Scan tiles of items as part of a dynamic chained scan + * @brief Scan tiles of items as part of a dynamic chained scan + * + * @param num_tiles + * Total number of input tiles + * + * @param tile_state + * Global tile state descriptor + * + * @param d_num_selected_out + * Output total 
number selection_flags + * + * @tparam NumSelectedIteratorT + * Output iterator type for recording number of items selection_flags */ - template ///< Output iterator type for recording number of items selection_flags - __device__ __forceinline__ void ConsumeRange( - int num_tiles, ///< Total number of input tiles - ScanTileStateT& tile_state, ///< Global tile state descriptor - NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags + template + __device__ __forceinline__ void ConsumeRange(int num_tiles, + ScanTileStateT &tile_state, + NumSelectedIteratorT d_num_selected_out) { // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile if (tile_idx < num_tiles - 1) { diff --git a/cub/cub/agent/agent_spmv_orig.cuh b/cub/cub/agent/agent_spmv_orig.cuh index 4d9ce6ce582..4db94ed6693 100644 --- a/cub/cub/agent/agent_spmv_orig.cuh +++ b/cub/cub/agent/agent_spmv_orig.cuh @@ -60,69 +60,155 @@ CUB_NAMESPACE_BEGIN ******************************************************************************/ /** - * Parameterizable tuning policy type for AgentSpmv + * @param Parameterizable tuning policy type for AgentSpmv + * + * @tparam _BLOCK_THREADS + * Threads per thread block + * + * @tparam _ITEMS_PER_THREAD + * Items per thread (per tile of input) + * + * @tparam _ROW_OFFSETS_SEARCH_LOAD_MODIFIER + * Cache load modifier for reading CSR row-offsets during search + * + * @tparam _ROW_OFFSETS_LOAD_MODIFIER + * Cache load modifier for reading CSR row-offsets + * + * @tparam _COLUMN_INDICES_LOAD_MODIFIER + * Cache load modifier for reading CSR column-indices + * + * @tparam _VALUES_LOAD_MODIFIER + * Cache load modifier for reading CSR values + * + * @tparam _VECTOR_VALUES_LOAD_MODIFIER + * Cache load modifier for reading vector values + * + * @tparam _DIRECT_LOAD_NONZEROS + * Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through + * shared memory) + * + * @tparam _SCAN_ALGORITHM + * The BlockScan algorithm to use */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - CacheLoadModifier _ROW_OFFSETS_SEARCH_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets during search - CacheLoadModifier _ROW_OFFSETS_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets - CacheLoadModifier _COLUMN_INDICES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR column-indices - CacheLoadModifier _VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR values - CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading vector values - bool _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. 
pre-staged through shared memory) - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +template struct AgentSpmvPolicy { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory) - }; + enum + { + /// Threads per thread block + BLOCK_THREADS = _BLOCK_THREADS, - static constexpr CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets - static constexpr CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets - static constexpr CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR column-indices - static constexpr CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR values - static constexpr CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading vector values - static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use + /// Items per thread (per tile of input) + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, -}; + /// Whether to load nonzeros directly from global during sequential merging (pre-staged through + /// shared memory) + DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, + }; + + /// Cache load modifier for reading CSR row-offsets + static constexpr CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = + _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; + + /// Cache load modifier for reading CSR row-offsets + static constexpr CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; + /// Cache load modifier for reading CSR column-indices + static constexpr CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; + + /// Cache load modifier for reading CSR values + static constexpr CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; + + /// Cache load modifier for reading vector values + static constexpr CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; + + /// The BlockScan algorithm to use + static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; +}; /****************************************************************************** * Thread block abstractions ******************************************************************************/ -template < - typename ValueT, ///< Matrix and vector value type - typename OffsetT> ///< Signed integer type for sequence offsets +/** + * @tparam ValueT + * Matrix and vector value type + * + * @tparam OffsetT + * Signed integer type for sequence offsets + */ +template struct SpmvParams { - const ValueT* d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. - const OffsetT* d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values - const OffsetT* d_column_indices; ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) 
- const ValueT* d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector x - ValueT* d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector y - int num_rows; ///< Number of rows of matrix A. - int num_cols; ///< Number of columns of matrix A. - int num_nonzeros; ///< Number of nonzero elements of matrix A. - ValueT alpha; ///< Alpha multiplicand - ValueT beta; ///< Beta addend-multiplicand -}; + /// Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix + /// A. + const ValueT *d_values; + + /// Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices + /// and \p d_values + const OffsetT *d_row_end_offsets; + + /// Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements + /// of matrix A. (Indices are zero-valued.) + const OffsetT *d_column_indices; + + /// Pointer to the array of \p num_cols values corresponding to the dense input vector x + const ValueT *d_vector_x; + + /// Pointer to the array of \p num_rows values corresponding to the dense output vector y + ValueT *d_vector_y; + + /// Number of rows of matrix A. + int num_rows; + + /// Number of columns of matrix A. + int num_cols; + + /// Number of nonzero elements of matrix A. + int num_nonzeros; + /// Alpha multiplicand + ValueT alpha; + + /// Beta addend-multiplicand + ValueT beta; +}; /** - * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. + * @brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. + * + * @tparam AgentSpmvPolicyT + * Parameterized AgentSpmvPolicy tuning policy type + * + * @tparam ValueT + * Matrix and vector value type + * + * @tparam OffsetT + * Signed integer type for sequence offsets + * + * @tparam HAS_ALPHA + * Whether the input parameter \p alpha is 1 + * + * @tparam HAS_BETA + * Whether the input parameter \p beta is 0 + * + * @tparam LEGACY_PTX_ARCH + * PTX compute capability (unused) */ -template < - typename AgentSpmvPolicyT, ///< Parameterized AgentSpmvPolicy tuning policy type - typename ValueT, ///< Matrix and vector value type - typename OffsetT, ///< Signed integer type for sequence offsets - bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1 - bool HAS_BETA, ///< Whether the input parameter \p beta is 0 - int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused) +template struct AgentSpmv { //--------------------------------------------------------------------- @@ -252,49 +338,66 @@ struct AgentSpmv // Per-thread fields //--------------------------------------------------------------------- + /// Reference to temp_storage + _TempStorage &temp_storage; + + SpmvParams &spmv_params; + + /// Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements + /// of matrix A. + ValueIteratorT wd_values; - _TempStorage& temp_storage; /// Reference to temp_storage + /// Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p + /// d_column_indices and \p d_values + RowOffsetsIteratorT wd_row_end_offsets; - SpmvParams& spmv_params; + /// Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero + /// elements of matrix A. (Indices are zero-valued.) 
+ ColumnIndicesIteratorT wd_column_indices; - ValueIteratorT wd_values; ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. - RowOffsetsIteratorT wd_row_end_offsets; ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values - ColumnIndicesIteratorT wd_column_indices; ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) - VectorValueIteratorT wd_vector_x; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x - VectorValueIteratorT wd_vector_y; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x + /// Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector + /// x + VectorValueIteratorT wd_vector_x; + /// Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector + /// x + VectorValueIteratorT wd_vector_y; //--------------------------------------------------------------------- // Interface //--------------------------------------------------------------------- /** - * Constructor + * @param temp_storage + * Reference to temp_storage + * + * @param spmv_params + * SpMV input parameter bundle */ - __device__ __forceinline__ AgentSpmv( - TempStorage& temp_storage, ///< Reference to temp_storage - SpmvParams& spmv_params) ///< SpMV input parameter bundle - : - temp_storage(temp_storage.Alias()), - spmv_params(spmv_params), - wd_values(spmv_params.d_values), - wd_row_end_offsets(spmv_params.d_row_end_offsets), - wd_column_indices(spmv_params.d_column_indices), - wd_vector_x(spmv_params.d_vector_x), - wd_vector_y(spmv_params.d_vector_y) + __device__ __forceinline__ AgentSpmv(TempStorage &temp_storage, + SpmvParams &spmv_params) + : temp_storage(temp_storage.Alias()) + , spmv_params(spmv_params) + , wd_values(spmv_params.d_values) + , wd_row_end_offsets(spmv_params.d_row_end_offsets) + , wd_column_indices(spmv_params.d_column_indices) + , wd_vector_x(spmv_params.d_vector_x) + , wd_vector_y(spmv_params.d_vector_y) {} /** - * Consume a merge tile, specialized for direct-load of nonzeros + * @brief Consume a merge tile, specialized for direct-load of nonzeros + * + * @param is_direct_load + * Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch */ - __device__ __forceinline__ KeyValuePairT ConsumeTile( - int tile_idx, - CoordinateT tile_start_coord, - CoordinateT tile_end_coord, - Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch + __device__ __forceinline__ KeyValuePairT ConsumeTile(int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + Int2Type is_direct_load) { int tile_num_rows = tile_end_coord.x - tile_start_coord.x; int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; @@ -413,13 +516,15 @@ struct AgentSpmv /** - * Consume a merge tile, specialized for indirect load of nonzeros + * @brief Consume a merge tile, specialized for indirect load of nonzeros + * + * @param is_direct_load + * Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch */ - __device__ __forceinline__ KeyValuePairT ConsumeTile( - int tile_idx, - CoordinateT tile_start_coord, - CoordinateT tile_end_coord, - Int2Type is_direct_load) ///< Marker type 
indicating whether to load nonzeros directly during path-discovery or beforehand in batch + __device__ __forceinline__ KeyValuePairT ConsumeTile(int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + Int2Type is_direct_load) { int tile_num_rows = tile_end_coord.x - tile_start_coord.x; int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; @@ -601,12 +706,20 @@ struct AgentSpmv /** - * Consume input tile + * @brief Consume input tile + * + * @param[in] d_tile_coordinates + * Pointer to the temporary array of tile starting coordinates + * + * @param[out] d_tile_carry_pairs + * Pointer to the temporary array carry-out dot product row-ids, one per block + * + * @param[in] num_merge_tiles + * Number of merge tiles */ - __device__ __forceinline__ void ConsumeTile( - CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates - KeyValuePairT* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block - int num_merge_tiles) ///< [in] Number of merge tiles + __device__ __forceinline__ void ConsumeTile(CoordinateT *d_tile_coordinates, + KeyValuePairT *d_tile_carry_pairs, + int num_merge_tiles) { int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index diff --git a/cub/cub/agent/agent_unique_by_key.cuh b/cub/cub/agent/agent_unique_by_key.cuh index a7b9e0a367b..0124759e192 100644 --- a/cub/cub/agent/agent_unique_by_key.cuh +++ b/cub/cub/agent/agent_unique_by_key.cuh @@ -26,7 +26,7 @@ ******************************************************************************/ /** - * \file + * @file * cub::AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide unique-by-key. */ @@ -93,18 +93,38 @@ struct AgentUniqueByKeyPolicy * Thread block abstractions ******************************************************************************/ - /** - * \brief AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide unique-by-key + * @brief AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating + * in device-wide unique-by-key + * + * @tparam AgentUniqueByKeyPolicyT + * Parameterized AgentUniqueByKeyPolicy tuning policy type + * + * @tparam KeyInputIteratorT + * Random-access input iterator type for keys + * + * @tparam ValueInputIteratorT + * Random-access input iterator type for values + * + * @tparam KeyOutputIteratorT + * Random-access output iterator type for keys + * + * @tparam ValueOutputIteratorT + * Random-access output iterator type for values + * + * @tparam EqualityOpT + * Equality operator type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename AgentUniqueByKeyPolicyT, ///< Parameterized AgentUniqueByKeyPolicy tuning policy type - typename KeyInputIteratorT, ///< Random-access input iterator type for keys - typename ValueInputIteratorT, ///< Random-access input iterator type for values - typename KeyOutputIteratorT, ///< Random-access output iterator type for keys - typename ValueOutputIteratorT, ///< Random-access output iterator type for values - typename EqualityOpT, ///< Equality operator type - typename OffsetT> ///< Signed integer type for global offsets +template struct AgentUniqueByKey { //--------------------------------------------------------------------- @@ -295,15 +315,24 @@ struct AgentUniqueByKey // Cooperatively scan a device-wide sequence of tiles with other CTAs 
//--------------------------------------------------------------------- - /** - * Process first tile of input (dynamic chained scan). Returns the running count of selections (including this tile) + * @brief Process first tile of input (dynamic chained scan). + * + * @param num_tile_items + * Number of input items comprising this tile + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor + * + * @return The running count of selections (including this tile) */ template - __device__ __forceinline__ OffsetT ConsumeFirstTile( - int num_tile_items, ///< Number of input items comprising this tile - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor + __device__ __forceinline__ OffsetT ConsumeFirstTile(int num_tile_items, + OffsetT tile_offset, + ScanTileStateT &tile_state) { KeyT keys[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; @@ -411,14 +440,27 @@ struct AgentUniqueByKey } /** - * Process subsequent tile of input (dynamic chained scan). Returns the running count of selections (including this tile) + * @brief Process subsequent tile of input (dynamic chained scan). + * + * @param num_tile_items + * Number of input items comprising this tile + * + * @param tile_idx + * Tile index + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor + * + * @return Returns the running count of selections (including this tile) */ template - __device__ __forceinline__ OffsetT ConsumeSubsequentTile( - int num_tile_items, ///< Number of input items comprising this tile - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor + __device__ __forceinline__ OffsetT ConsumeSubsequentTile(int num_tile_items, + int tile_idx, + OffsetT tile_offset, + ScanTileStateT &tile_state) { KeyT keys[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; @@ -527,16 +569,24 @@ struct AgentUniqueByKey return num_selections; } - /** - * Process a tile of input + * @brief Process a tile of input + * + * @param num_tile_items + * Number of input items comprising this tile + * + * @param tile_idx + * Tile index + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor */ template - __device__ __forceinline__ OffsetT ConsumeTile( - int num_tile_items, ///< Number of input items comprising this tile - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor + __device__ __forceinline__ OffsetT + ConsumeTile(int num_tile_items, int tile_idx, OffsetT tile_offset, ScanTileStateT &tile_state) { OffsetT num_selections; if (tile_idx == 0) @@ -552,13 +602,25 @@ struct AgentUniqueByKey } /** - * Scan tiles of items as part of a dynamic chained scan + * @brief Scan tiles of items as part of a dynamic chained scan + * + * @param num_tiles + * Total number of input tiles + * + * @param tile_state + * Global tile state descriptor + * + * @param d_num_selected_out + * Output total number selection_flags + * + * @tparam NumSelectedIteratorT + * Output iterator type for recording number of items selection_flags + * */ - template ///< Output iterator type for recording number of items selection_flags - __device__ __forceinline__ void ConsumeRange( - int num_tiles, ///< Total number of input tiles - ScanTileStateT& tile_state, ///< Global tile state descriptor - NumSelectedIteratorT 
d_num_selected_out) ///< Output total number selection_flags + template + __device__ __forceinline__ void ConsumeRange(int num_tiles, + ScanTileStateT &tile_state, + NumSelectedIteratorT d_num_selected_out) { // Blocks are launched in increasing order, so just assign one tile per block int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index ff8a4e18347..26481b48356 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -65,43 +65,47 @@ CUB_NAMESPACE_BEGIN * Stateful callback operator type for supplying BlockScan prefixes. * Maintains a running prefix that can be applied to consecutive * BlockScan operations. + * + * @tparam T + * BlockScan value type + * + * @tparam ScanOpT + * Wrapped scan operator type */ -template < - typename T, ///< BlockScan value type - typename ScanOpT> ///< Wrapped scan operator type +template struct BlockScanRunningPrefixOp { - ScanOpT op; ///< Wrapped scan operator - T running_total; ///< Running block-wide prefix - - /// Constructor - __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op) - : - op(op) - {} - - /// Constructor - __device__ __forceinline__ BlockScanRunningPrefixOp( - T starting_prefix, - ScanOpT op) - : - op(op), - running_total(starting_prefix) - {} - - /** - * Prefix callback operator. Returns the block-wide running_total in thread-0. - */ - __device__ __forceinline__ T operator()( - const T &block_aggregate) ///< The aggregate sum of the BlockScan inputs - { - T retval = running_total; - running_total = op(running_total, block_aggregate); - return retval; - } + /// Wrapped scan operator + ScanOpT op; + + /// Running block-wide prefix + T running_total; + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op) + : op(op) + {} + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp(T starting_prefix, ScanOpT op) + : op(op) + , running_total(starting_prefix) + {} + + /** + * Prefix callback operator. Returns the block-wide running_total in thread-0. + * + * @param block_aggregate + * The aggregate sum of the BlockScan inputs + */ + __device__ __forceinline__ T operator()(const T &block_aggregate) + { + T retval = running_total; + running_total = op(running_total, block_aggregate); + return retval; + } }; - /****************************************************************************** * Generic tile status interface types for block-cooperative scans ******************************************************************************/ @@ -534,32 +538,45 @@ struct ScanTileState d_tile_descriptors(NULL) {} - - /// Initializer - __host__ __device__ __forceinline__ - cudaError_t Init( - int /*num_tiles*/, ///< [in] Number of tiles - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation + /** + * @brief Initializer + * + * @param[in] num_tiles + * Number of tiles + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to \p temp_storage_bytes and no work is + * done. 
+ * + * @param[in] temp_storage_bytes + * Size in bytes of \t d_temp_storage allocation + */ + __host__ __device__ __forceinline__ cudaError_t Init(int /*num_tiles*/, + void *d_temp_storage, + size_t /*temp_storage_bytes*/) { - d_tile_descriptors = reinterpret_cast(d_temp_storage); + d_tile_descriptors = reinterpret_cast(d_temp_storage); return cudaSuccess; } - /** - * Compute device memory needed for tile status + * @brief Compute device memory needed for tile status + * + * @param[in] num_tiles + * Number of tiles + * + * @param[out] temp_storage_bytes + * Size in bytes of \t d_temp_storage allocation */ - __host__ __device__ __forceinline__ - static cudaError_t AllocationSize( - int num_tiles, ///< [in] Number of tiles - size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + __host__ __device__ __forceinline__ static cudaError_t + AllocationSize(int num_tiles, size_t &temp_storage_bytes) { - temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord); // bytes needed for tile status descriptors + // bytes needed for tile status descriptors + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord); return cudaSuccess; } - /** * Initialize (from device) */ @@ -689,13 +706,24 @@ struct ScanTileState d_tile_inclusive(NULL) {} - + /** + * @brief Initializer + * + * @param[in] num_tiles + * Number of tiles + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to \p temp_storage_bytes and no work is + * done. + * + * @param[in] temp_storage_bytes + * Size in bytes of \t d_temp_storage allocation + */ /// Initializer - __host__ __device__ __forceinline__ - cudaError_t Init( - int num_tiles, ///< [in] Number of tiles - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation + __host__ __device__ __forceinline__ cudaError_t Init(int num_tiles, + void *d_temp_storage, + size_t temp_storage_bytes) { cudaError_t error = cudaSuccess; do @@ -703,9 +731,14 @@ struct ScanTileState void* allocations[3] = {}; size_t allocation_sizes[3]; - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + // bytes needed for tile status descriptors + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); + + // bytes needed for partials + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); + + // bytes needed for inclusives + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // Compute allocation pointers into the single storage blob error = CubDebug( @@ -726,20 +759,29 @@ struct ScanTileState return error; } - /** - * Compute device memory needed for tile status + * @brief Compute device memory needed for tile status + * + * @param[in] num_tiles + * Number of tiles + * + * @param[out] temp_storage_bytes + * Size in bytes of \t d_temp_storage allocation */ - __host__ __device__ __forceinline__ - static cudaError_t AllocationSize( - int num_tiles, ///< [in] Number of tiles - size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + __host__ __device__ __forceinline__ static cudaError_t + AllocationSize(int num_tiles, size_t &temp_storage_bytes) { // Specify storage allocation requirements size_t allocation_sizes[3]; - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + + // bytes needed for tile status descriptors + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); + + // bytes needed for partials + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); + + // bytes needed for inclusives + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // Set the necessary size of the blob void* allocations[3] = {}; @@ -928,32 +970,44 @@ struct ReduceByKeyScanTileState d_tile_descriptors(NULL) {} - - /// Initializer - __host__ __device__ __forceinline__ - cudaError_t Init( - int /*num_tiles*/, ///< [in] Number of tiles - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation + /** + * @brief Initializer + * + * @param[in] num_tiles + * Number of tiles + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When NULL, the required allocation size + * is written to \p temp_storage_bytes and no work is done. 
+ * + * @param[in] temp_storage_bytes + * Size in bytes of \t d_temp_storage allocation + */ + __host__ __device__ __forceinline__ cudaError_t Init(int /*num_tiles*/, + void *d_temp_storage, + size_t /*temp_storage_bytes*/) { - d_tile_descriptors = reinterpret_cast(d_temp_storage); + d_tile_descriptors = reinterpret_cast(d_temp_storage); return cudaSuccess; } - /** - * Compute device memory needed for tile status + * @brief Compute device memory needed for tile status + * + * @param[in] num_tiles + * Number of tiles + * + * @param[out] temp_storage_bytes + * Size in bytes of \t d_temp_storage allocation */ - __host__ __device__ __forceinline__ - static cudaError_t AllocationSize( - int num_tiles, ///< [in] Number of tiles - size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + __host__ __device__ __forceinline__ static cudaError_t + AllocationSize(int num_tiles, size_t &temp_storage_bytes) { - temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord); // bytes needed for tile status descriptors + // bytes needed for tile status descriptors + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord); return cudaSuccess; } - /** * Initialize (from device) */ @@ -1096,12 +1150,12 @@ struct TilePrefixCallbackOp typedef typename ScanTileStateT::StatusWord StatusWord; // Fields - _TempStorage& temp_storage; ///< Reference to a warp-reduction instance - ScanTileStateT& tile_status; ///< Interface to tile status - ScanOpT scan_op; ///< Binary scan operator - int tile_idx; ///< The current tile index - T exclusive_prefix; ///< Exclusive prefix for the tile - T inclusive_prefix; ///< Inclusive prefix for the tile + _TempStorage &temp_storage; ///< Reference to a warp-reduction instance + ScanTileStateT &tile_status; ///< Interface to tile status + ScanOpT scan_op; ///< Binary scan operator + int tile_idx; ///< The current tile index + T exclusive_prefix; ///< Exclusive prefix for the tile + T inclusive_prefix; ///< Inclusive prefix for the tile // Constructs prefix functor for a given tile index. // Precondition: thread blocks processing all of the predecessor tiles were scheduled. 
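For reference, the AllocationSize / Init pair documented above follows CUB's usual two-phase temporary-storage pattern: query the size, let the caller allocate, then bind the state object to the allocation. A minimal host-side sketch, assuming a ScanTileState<int> specialization and a caller-supplied num_tiles (illustration only, not taken from the patched sources):

    #include <cub/agent/single_pass_scan_operators.cuh>
    #include <cuda_runtime.h>

    // Sketch: prepare tile-status storage for a decoupled look-back scan.
    inline cudaError_t PrepareTileState(cub::ScanTileState<int> &tile_state,
                                        int num_tiles,
                                        void *&d_temp_storage,
                                        size_t &temp_storage_bytes)
    {
      // First pass: compute how many bytes the tile status descriptors need
      cudaError_t error =
        cub::ScanTileState<int>::AllocationSize(num_tiles, temp_storage_bytes);
      if (error != cudaSuccess)
      {
        return error;
      }

      // Caller-owned device allocation (freed by the caller once the scan completes)
      error = cudaMalloc(&d_temp_storage, temp_storage_bytes);
      if (error != cudaSuccess)
      {
        return error;
      }

      // Second pass: bind the tile state to the allocation; the descriptors themselves
      // are reset separately on the device before the scan runs.
      return tile_state.Init(num_tiles, d_temp_storage, temp_storage_bytes);
    }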
@@ -1123,14 +1177,23 @@ struct TilePrefixCallbackOp : TilePrefixCallbackOp(tile_status, temp_storage, scan_op, blockIdx.x) {} - // Block until all predecessors within the warp-wide window have non-invalid status + /** + * @brief Block until all predecessors within the warp-wide window have non-invalid status + * + * @param predecessor_idx + * Preceding tile index to inspect + * + * @param[out] predecessor_status + * Preceding tile status + * + * @param[out] window_aggregate + * Relevant partial reduction from this window of preceding tiles + */ template > - __device__ __forceinline__ - void ProcessWindow( - int predecessor_idx, ///< Preceding tile index to inspect - StatusWord &predecessor_status, ///< [out] Preceding tile status - T &window_aggregate, ///< [out] Relevant partial reduction from this window of preceding tiles - DelayT delay = {}) + __device__ __forceinline__ void ProcessWindow(int predecessor_idx, + StatusWord &predecessor_status, + T &window_aggregate, + DelayT delay = {}) { T value; tile_status.WaitForValid(predecessor_idx, predecessor_status, value, delay); diff --git a/cub/cub/block/block_adjacent_difference.cuh b/cub/cub/block/block_adjacent_difference.cuh index b24b4bf5c5d..9dd1e096c1a 100644 --- a/cub/cub/block/block_adjacent_difference.cuh +++ b/cub/cub/block/block_adjacent_difference.cuh @@ -1069,18 +1069,26 @@ public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * + * @param[out] output + * Calling thread's discontinuity head_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[out] preds + * Calling thread's predecessor items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - CUB_DEPRECATED __device__ __forceinline__ void FlagHeads( - FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + CUB_DEPRECATED __device__ __forceinline__ void FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + T (&preds)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; @@ -1103,18 +1111,31 @@ public: } /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * + * @param[out] output + * Calling thread's discontinuity result + * + * @param[in] input + * Calling thread's input items + * + * @param[out] preds + * Calling thread's predecessor items + * + * @param[in] flag_op + * Binary boolean flag predicate + * + * @param[in] tile_predecessor_item + * [thread0 only] Item with which to compare the first tile item + * (input0 from thread0). 
*/ - template - CUB_DEPRECATED __device__ __forceinline__ void FlagHeads( - FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + template + CUB_DEPRECATED __device__ __forceinline__ void FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + T (&preds)[ITEMS_PER_THREAD], + FlagOp flag_op, + T tile_predecessor_item) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; @@ -1135,51 +1156,71 @@ public: #endif // DOXYGEN_SHOULD_SKIP_THIS /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * + * @param[out] output + * Calling thread's discontinuity result + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template - CUB_DEPRECATED __device__ __forceinline__ void - FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + CUB_DEPRECATED __device__ __forceinline__ void FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { T preds[ITEMS_PER_THREAD]; FlagHeads(output, input, preds, flag_op); } /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * + * @param[out] output + * Calling thread's discontinuity result + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate + * + * @param[in] tile_predecessor_item + * [thread0 only] Item with which to compare the first tile item + * (input0 from thread0). */ - template - CUB_DEPRECATED __device__ __forceinline__ void - FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + template + CUB_DEPRECATED __device__ __forceinline__ void FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op, + T tile_predecessor_item) { T preds[ITEMS_PER_THREAD]; FlagHeads(output, input, preds, flag_op, tile_predecessor_item); } - /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagTails - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractRight instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagTails + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractRight instead. 
+ * + * @param output + * [out] Calling thread's discontinuity result + * + * @param input + * [in] Calling thread's input items + * + * @param flag_op + * [in] Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - CUB_DEPRECATED __device__ __forceinline__ void FlagTails( - FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + CUB_DEPRECATED __device__ __forceinline__ void FlagTails(FlagT (&output)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first item temp_storage.first_items[linear_tid] = input[0]; @@ -1199,20 +1240,29 @@ public: Iterate::FlagTails(linear_tid, output, input, flag_op); } - /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagTails - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractRight instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagTails + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractRight instead. + * + * @param[out] output + * Calling thread's discontinuity result + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate + * + * @param[in] tile_successor_item + * [threadBLOCK_THREADS-1 only] Item with which to compare + * the last tile item (inputITEMS_PER_THREAD-1 from + * threadBLOCK_THREADS-1). */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - CUB_DEPRECATED __device__ __forceinline__ void FlagTails( - FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + template + CUB_DEPRECATED __device__ __forceinline__ void FlagTails(FlagT (&output)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op, + T tile_successor_item) { // Share first item temp_storage.first_items[linear_tid] = input[0]; @@ -1234,21 +1284,29 @@ public: Iterate::FlagTails(linear_tid, output, input, flag_op); } - /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or - * cub::BlockAdjacentDifference::SubtractRight instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or + * cub::BlockAdjacentDifference::SubtractRight instead. 
+ * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + CUB_DEPRECATED __device__ __forceinline__ void + FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; @@ -1290,22 +1348,35 @@ public: Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } - /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or - * cub::BlockAdjacentDifference::SubtractRight instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or + * cub::BlockAdjacentDifference::SubtractRight instead. + * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] tile_successor_item + * [threadBLOCK_THREADS-1 only] Item with which to compare + * the last tile item (inputITEMS_PER_THREAD-1 from + * threadBLOCK_THREADS-1). + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + CUB_DEPRECATED __device__ __forceinline__ void + FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T tile_successor_item, + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; @@ -1349,20 +1420,33 @@ public: } /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or - * cub::BlockAdjacentDifference::SubtractRight instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or + * cub::BlockAdjacentDifference::SubtractRight instead. 
+ * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[in] tile_predecessor_item + * [thread0 only] Item with which to compare the first tile item + * (input0 from thread0). + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + CUB_DEPRECATED __device__ __forceinline__ void + FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + T tile_predecessor_item, + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; @@ -1399,23 +1483,40 @@ public: Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } - /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or - * cub::BlockAdjacentDifference::SubtractRight instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or + * cub::BlockAdjacentDifference::SubtractRight instead. + * + * @param head_flags + * [out] Calling thread's discontinuity head_flags + * + * @param tile_predecessor_item + * [in] [thread0 only] Item with which to compare the first tile + * item (input0 from thread0). + * + * @param tail_flags + * [out] Calling thread's discontinuity tail_flags + * + * @param tile_successor_item + * [in] [threadBLOCK_THREADS-1 only] Item with which to + * compare the last tile item (inputITEMS_PER_THREAD-1 from + * threadBLOCK_THREADS-1). + * + * @param input + * [in] Calling thread's input items + * + * @param flag_op + * [in] Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). 
- T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + CUB_DEPRECATED __device__ __forceinline__ void + FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + T tile_predecessor_item, + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T tile_successor_item, + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh index b94e125e93c..af09e6d99c5 100644 --- a/cub/cub/block/block_discontinuity.cuh +++ b/cub/cub/block/block_discontinuity.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. + * @file + * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for + * flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. */ #pragma once @@ -47,32 +48,44 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png) - * \ingroup BlockModule + * @brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for + * flagging discontinuities within an ordered set of items partitioned across a CUDA thread + * block. ![](discont_logo.png) * - * \tparam T The data type to be flagged. - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. + * @ingroup BlockModule * - * \par Overview + * @tparam T + * The data type to be flagged. + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items * that differ from their predecessors (or successors). For example, head flags are convenient * for demarcating disjoint data segments as part of a segmented scan or reduction. * - \blocked * - * \par Performance Considerations + * @par Performance Considerations * - \granularity * - * \par A Simple Example + * @par A Simple Example * \blockcollective{BlockDiscontinuity} - * \par + * @par * The code snippet below illustrates the head flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) 
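The flag predicate passed to FlagHeads in these snippets, cub::Inequality(), is just a binary functor that returns true at a discontinuity. A minimal stand-in with the same shape, shown only for illustration:

    // Illustrative flag predicate: reports a discontinuity whenever two
    // neighboring items differ; cub::Inequality provides the same behavior.
    struct NotEqual
    {
      template <typename T>
      __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
      {
        return a != b;
      }
    };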
@@ -91,21 +104,22 @@ CUB_NAMESPACE_BEGIN * int head_flags[4]; * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); * - * \endcode - * \par + * @endcode + * @par * Suppose the set of input \p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. * The corresponding output \p head_flags in those threads will be * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. * - * \par Performance Considerations + * @par Performance Considerations * - Incurs zero bank conflicts for most types * - * \par Re-using dynamically allocating shared memory + * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: - * example_block_reduce_dyn_smem.cu + * example_block_reduce_dyn_smem.cu * * This example can be easily adapted to the storage required by BlockDiscontinuity. */ @@ -176,17 +190,27 @@ private: /// Templated unrolling of item comparison (inductive case) struct Iterate { - // Head flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagHeads( - int linear_tid, - FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + /** + * @brief Head flags + * + * @param[out] flags + * Calling thread's discontinuity head_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[out] preds + * Calling thread's predecessor items + * + * @param[in] flag_op + * Binary boolean flag predicate + */ + template + static __device__ __forceinline__ void FlagHeads(int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + T (&preds)[ITEMS_PER_THREAD], + FlagOp flag_op) { #pragma unroll for (int i = 1; i < ITEMS_PER_THREAD; ++i) { @@ -199,16 +223,23 @@ private: } } - // Tail flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagTails( - int linear_tid, - FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + /** + * @brief Tail flags + * + * @param[out] flags + * Calling thread's discontinuity head_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate + */ + template + static __device__ __forceinline__ void FlagTails(int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD - 1; ++i) { @@ -235,17 +266,18 @@ private: public: - /// \smemstorage{BlockDiscontinuity} + /// @smemstorage{BlockDiscontinuity} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. 
+ * @brief Collective constructor using a private static allocation of shared memory as temporary + * storage. */ __device__ __forceinline__ BlockDiscontinuity() : @@ -253,18 +285,17 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. + * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockDiscontinuity( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockDiscontinuity(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - //@} end member group /******************************************************************//** * \name Head flag operations @@ -274,15 +305,24 @@ public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + /** + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[out] preds + * Calling thread's predecessor items + * + * @param[in] flag_op + * Binary boolean flag predicate + */ + template + __device__ __forceinline__ void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + T (&preds)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; @@ -304,16 +344,29 @@ public: Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); } - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + /** + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[out] preds + * Calling thread's predecessor items + * + * @param[in] flag_op + * Binary boolean flag predicate + * + * @param[in] tile_predecessor_item + * [thread0 only] Item with which to compare the first tile item + * (input0 from thread0). 
+ */ + template + __device__ __forceinline__ void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + T (&preds)[ITEMS_PER_THREAD], + FlagOp flag_op, + T tile_predecessor_item) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; @@ -333,11 +386,11 @@ public: #endif // DOXYGEN_SHOULD_SKIP_THIS - /** - * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged. + * @brief Sets head flags indicating discontinuities between items partitioned across the thread + * block, for which the first item has no reference and is always flagged. * - * \par + * @par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) @@ -348,12 +401,12 @@ public: * - \granularity * - \smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the head-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -372,35 +425,49 @@ public: * int head_flags[4]; * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); * - * \endcode - * \par + * @endcode + * @par * Suppose the set of input \p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. * The corresponding output \p head_flags in those threads will be * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true + * if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank + * of b in the aggregate tile of data. 
+ * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + __device__ __forceinline__ void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { T preds[ITEMS_PER_THREAD]; FlagHeads(head_flags, input, preds, flag_op); } - /** - * \brief Sets head flags indicating discontinuities between items partitioned across the thread block. + * @brief Sets head flags indicating discontinuities between items partitioned across the thread + * block. * - * \par + * @par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) @@ -412,12 +479,12 @@ public: * - \granularity * - \smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the head-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -441,26 +508,44 @@ public: * BlockDiscontinuity(temp_storage).FlagHeads( * head_flags, thread_data, cub::Inequality(), tile_predecessor_item); * - * \endcode - * \par + * @endcode + * @par * Suppose the set of input \p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }, - * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those threads will be - * { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those + * threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), + * and returning \p true if a discontinuity exists between \p a and \p b, + * otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[in] input + * Calling thread's input items * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @param[in] flag_op + * Binary boolean flag predicate + * + * @param[in] tile_predecessor_item + * [thread0 only] Item with which to compare the first tile item + * (input0 from thread0). 
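The @p tile_predecessor_item overload above is what keeps head flags consistent across tile boundaries when each thread block processes one tile of a larger input. A minimal sketch of that pattern (illustrative only, not part of this header; d_in, d_head_flags and the full-tile assumption are hypothetical):

    #include <cub/cub.cuh>

    // Sketch: per-tile head flags that remain correct across tile boundaries.
    // Assumes a grid of full tiles; d_in / d_head_flags are placeholder names.
    __global__ void FlagTileHeads(const int *d_in, int *d_head_flags)
    {
        constexpr int BLOCK_THREADS    = 128;
        constexpr int ITEMS_PER_THREAD = 4;
        constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

        using BlockDiscontinuityT = cub::BlockDiscontinuity<int, BLOCK_THREADS>;
        __shared__ typename BlockDiscontinuityT::TempStorage temp_storage;

        int tile_base = blockIdx.x * TILE_ITEMS;

        // Blocked arrangement: thread t owns items [t * 4, t * 4 + 4)
        int thread_data[ITEMS_PER_THREAD];
        cub::LoadDirectBlocked(threadIdx.x, d_in + tile_base, thread_data);

        int head_flags[ITEMS_PER_THREAD];
        if (tile_base == 0)
        {
            // First tile has no predecessor: the first item is always flagged.
            BlockDiscontinuityT(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
        }
        else
        {
            // thread0 compares its first item against the last item of the previous tile.
            int tile_predecessor_item = d_in[tile_base - 1];
            BlockDiscontinuityT(temp_storage).FlagHeads(
                head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
        }

        cub::StoreDirectBlocked(threadIdx.x, d_head_flags + tile_base, head_flags);
    }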
*/ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + template + __device__ __forceinline__ void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op, + T tile_predecessor_item) { T preds[ITEMS_PER_THREAD]; FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); @@ -470,15 +555,15 @@ public: //@} end member group /******************************************************************//** - * \name Tail flag operations + * @name Tail flag operations *********************************************************************/ //@{ - /** - * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged. + * @brief Sets tail flags indicating discontinuities between items partitioned across the thread + * block, for which the last item has no reference and is always flagged. * - * \par + * @par * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) @@ -486,16 +571,16 @@ public: * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is always flagged. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -514,25 +599,39 @@ public: * int tail_flags[4]; * BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }. - * The corresponding output \p tail_flags in those threads will be + * The corresponding output @p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. 
+ * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true + * if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the + * rank of b in the aggregate tile of data. + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagTails( - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + __device__ __forceinline__ void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first item temp_storage.first_items[linear_tid] = input[0]; @@ -552,29 +651,29 @@ public: Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } - /** - * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block. + * @brief Sets tail flags indicating discontinuities between items partitioned across the thread + * block. * - * \par + * @par * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item + * returns @p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is compared - * against \p tile_successor_item. + * against @p tile_successor_item. * - \blocked * - \granularity * - \smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -598,26 +697,45 @@ public: * BlockDiscontinuity(temp_storage).FlagTails( * tail_flags, thread_data, cub::Inequality(), tile_successor_item); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } - * and that \p tile_successor_item is \p 125. The corresponding output \p tail_flags in those threads will be - * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * and that @p tile_successor_item is @p 125. The corresponding output @p tail_flags in those + * threads will be { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true + * if a discontinuity exists between @p a and @p b, otherwise @p false. 
@p b_index is the + * rank of b in the aggregate tile of data. + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate + * + * @param[in] tile_successor_item + * [threadBLOCK_THREADS-1 only] Item with which to + * compare the last tile item (inputITEMS_PER_THREAD-1 from + * threadBLOCK_THREADS-1). */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagTails( - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + template + __device__ __forceinline__ void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op, + T tile_successor_item) { // Share first item temp_storage.first_items[linear_tid] = input[0]; @@ -642,25 +760,25 @@ public: //@} end member group /******************************************************************//** - * \name Head & tail flag operations + * @name Head & tail flag operations *********************************************************************/ //@{ - /** - * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * @brief Sets both head and tail flags indicating discontinuities between items partitioned + * across the thread block. * - * \par + * @par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item + * returns @p true (where previous-item is either the preceding item * in the same thread or the last item in the previous thread). * - For thread0, item input0 is always flagged. * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item + * returns @p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is always flagged. @@ -668,12 +786,12 @@ public: * - \granularity * - \smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the head- and tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) 
@@ -694,28 +812,45 @@ public: * BlockDiscontinuity(temp_storage).FlagTails( * head_flags, tail_flags, thread_data, cub::Inequality()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } - * and that the tile_successor_item is \p 125. The corresponding output \p head_flags + * and that the tile_successor_item is @p 125. The corresponding output @p head_flags * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * and the corresponding output \p tail_flags in those threads will be + * and the corresponding output @p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true + * if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the + * rank of b in the aggregate tile of data. + * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + __device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; @@ -757,35 +892,35 @@ public: Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } - /** - * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * @brief Sets both head and tail flags indicating discontinuities between items partitioned + * across the thread block. * - * \par + * @par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item + * returns @p true (where previous-item is either the preceding item * in the same thread or the last item in the previous thread). 
* - For thread0, item input0 is always flagged. * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item + * returns @p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is compared - * against \p tile_predecessor_item. + * against @p tile_predecessor_item. * - \blocked * - \granularity * - \smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the head- and tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -810,29 +945,51 @@ public: * BlockDiscontinuity(temp_storage).FlagTails( * head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } - * and that the tile_successor_item is \p 125. The corresponding output \p head_flags + * and that the tile_successor_item is @p 125. The corresponding output @p head_flags * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * and the corresponding output \p tail_flags in those threads will be + * and the corresponding output @p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true + * if a discontinuity exists between @p a and @p b, otherwise @p false. @p b_index is the + * rank of b in the aggregate tile of data. + * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] tile_successor_item + * [threadBLOCK_THREADS-1 only] Item with which to compare + * the last tile item (inputITEMS_PER_THREAD-1 from + * threadBLOCK_THREADS-1). 
+ * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + __device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T tile_successor_item, + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; @@ -875,22 +1032,22 @@ public: Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } - /** - * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * @brief Sets both head and tail flags indicating discontinuities between items partitioned + * across the thread block. * - * \par + * @par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item + * returns @p true (where previous-item is either the preceding item * in the same thread or the last item in the previous thread). * - For thread0, item input0 is compared - * against \p tile_predecessor_item. + * against @p tile_predecessor_item. * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item + * returns @p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is always flagged. @@ -898,12 +1055,12 @@ public: * - \granularity * - \smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the head- and tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -933,30 +1090,51 @@ public: * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, * thread_data, cub::Inequality()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, - * that the \p tile_predecessor_item is \p 0, and that the - * \p tile_successor_item is \p 125. The corresponding output \p head_flags + * that the @p tile_predecessor_item is @p 0, and that the + * @p tile_successor_item is @p 125. The corresponding output @p head_flags * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * and the corresponding output \p tail_flags in those threads will be + * and the corresponding output @p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. 
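As the class overview notes, head and tail flags usually feed a follow-up collective. A short sketch that stitches both tile edges (as in the snippet above) and then counts how many runs begin in the tile (illustrative only, not part of this header; d_in, d_num_runs and the interior-tile assumption are hypothetical):

    #include <cub/cub.cuh>

    // Sketch: stitched head/tail flags, then a per-tile run count.
    // Interior tiles only (neighboring-tile reads are unguarded); names are placeholders.
    __global__ void CountRunsKernel(const int *d_in, int *d_num_runs)
    {
        constexpr int BLOCK_THREADS    = 128;
        constexpr int ITEMS_PER_THREAD = 4;
        constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

        using DiscontinuityT = cub::BlockDiscontinuity<int, BLOCK_THREADS>;
        using ReduceT        = cub::BlockReduce<int, BLOCK_THREADS>;

        __shared__ union {
            typename DiscontinuityT::TempStorage discontinuity;
            typename ReduceT::TempStorage        reduce;
        } temp_storage;

        int tile_base = blockIdx.x * TILE_ITEMS;

        int items[ITEMS_PER_THREAD];
        cub::LoadDirectBlocked(threadIdx.x, d_in + tile_base, items);

        // Compare the tile edges against the neighboring tiles.
        int tile_predecessor_item = d_in[tile_base - 1];
        int tile_successor_item   = d_in[tile_base + TILE_ITEMS];

        int head_flags[ITEMS_PER_THREAD];
        int tail_flags[ITEMS_PER_THREAD];
        DiscontinuityT(temp_storage.discontinuity)
          .FlagHeadsAndTails(head_flags, tile_predecessor_item,
                             tail_flags, tile_successor_item,
                             items, cub::Inequality());

        // Each head flag marks the first item of a run, so their sum is the
        // number of runs that begin in this tile.
        int thread_heads = 0;
        #pragma unroll
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        {
            thread_heads += head_flags[i];
        }

        __syncthreads(); // required before re-using the union'd shared memory

        int tile_runs = ReduceT(temp_storage.reduce).Sum(thread_heads);
        if (threadIdx.x == 0)
        {
            atomicAdd(d_num_runs, tile_runs);
        }
    }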
* - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true + * if a discontinuity exists between @p a and @p b, otherwise @p false. @p b_index is the rank + * of b in the aggregate tile of data. + * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[in] tile_predecessor_item + * [thread0 only] Item with which to compare the first tile item + * (input0 from thread0). + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + __device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + T tile_predecessor_item, + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; @@ -993,36 +1171,36 @@ public: Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } - /** - * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * @brief Sets both head and tail flags indicating discontinuities between items partitioned + * across the thread block. * - * \par + * @par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item + * returns @p true (where previous-item is either the preceding item * in the same thread or the last item in the previous thread). * - For thread0, item input0 is compared - * against \p tile_predecessor_item. + * against @p tile_predecessor_item. * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item + * returns @p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is compared - * against \p tile_successor_item. - * - \blocked - * - \granularity - * - \smemreuse + * against @p tile_successor_item. 
+ * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the head- and tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1052,31 +1230,57 @@ public: * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, * thread_data, cub::Inequality()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, - * that the \p tile_predecessor_item is \p 0, and that the - * \p tile_successor_item is \p 125. The corresponding output \p head_flags + * that the @p tile_predecessor_item is @p 0, and that the + * @p tile_successor_item is @p 125. The corresponding output @p head_flags * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * and the corresponding output \p tail_flags in those threads will be + * and the corresponding output @p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true + * if a discontinuity exists between @p a and @p b, otherwise @p false. @p b_index is the rank + * of b in the aggregate tile of data. + * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[in] tile_predecessor_item + * [thread0 only] Item with which to compare the first tile item + * (input0 from thread0). + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] tile_successor_item + * [threadBLOCK_THREADS-1 only] Item with which to compare + * the last tile item (inputITEMS_PER_THREAD-1 from + * threadBLOCK_THREADS-1). + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). 
- FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + __device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + T tile_predecessor_item, + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T tile_successor_item, + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index d6a7cfbbbe8..259027dbc61 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -202,15 +202,20 @@ private: return private_storage; } - /** - * Transposes data items from blocked arrangement to striped arrangement. Specialized for no timeslicing. + * @brief Transposes data items from blocked arrangement to striped + * arrangement. Specialized for no timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. */ template - __device__ __forceinline__ void BlockedToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) + __device__ __forceinline__ void BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -232,15 +237,20 @@ private: } } - /** - * Transposes data items from blocked arrangement to striped arrangement. Specialized for warp-timeslicing. + * @brief Transposes data items from blocked arrangement to striped + * arrangement. Specialized for warp-timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. */ template - __device__ __forceinline__ void BlockedToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) + __device__ __forceinline__ void BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { InputT temp_items[ITEMS_PER_THREAD]; @@ -293,15 +303,20 @@ private: } } - /** - * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for no timeslicing + * @brief Transposes data items from blocked arrangement to warp-striped + * arrangement. Specialized for no timeslicing + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. 
*/ template - __device__ __forceinline__ void BlockedToWarpStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) + __device__ __forceinline__ void BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -324,13 +339,19 @@ private: } /** - * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for warp-timeslicing + * @brief Transposes data items from blocked arrangement to warp-striped + * arrangement. Specialized for warp-timeslicing + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. */ template - __device__ __forceinline__ void BlockedToWarpStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) + __device__ __forceinline__ void BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { if (warp_id == 0) { @@ -383,15 +404,20 @@ private: } } - /** - * Transposes data items from striped arrangement to blocked arrangement. Specialized for no timeslicing. + * @brief Transposes data items from striped arrangement to blocked + * arrangement. Specialized for no timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. */ template - __device__ __forceinline__ void StripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) + __device__ __forceinline__ void StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -414,15 +440,20 @@ private: } } - /** - * Transposes data items from striped arrangement to blocked arrangement. Specialized for warp-timeslicing. + * @brief Transposes data items from striped arrangement to blocked + * arrangement. Specialized for warp-timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. */ template - __device__ __forceinline__ void StripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- Int2Type /*time_slicing*/) + __device__ __forceinline__ void StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { // Warp time-slicing InputT temp_items[ITEMS_PER_THREAD]; @@ -477,15 +508,20 @@ private: } } - /** - * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for no timeslicing + * @brief Transposes data items from warp-striped arrangement to blocked + * arrangement. Specialized for no timeslicing + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. */ template - __device__ __forceinline__ void WarpStripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) + __device__ __forceinline__ void WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -508,15 +544,20 @@ private: } } - /** - * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for warp-timeslicing + * @brief Transposes data items from warp-striped arrangement to blocked + * arrangement. Specialized for warp-timeslicing + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. */ template - __device__ __forceinline__ void WarpStripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) + __device__ __forceinline__ void WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) @@ -547,16 +588,24 @@ private: } } - /** - * Exchanges data items annotated by rank into blocked arrangement. Specialized for no timeslicing. + * @brief Exchanges data items annotated by rank into blocked arrangement. Specialized + * for no timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks */ template - __device__ __forceinline__ void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) + __device__ __forceinline__ void ScatterToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -579,14 +628,23 @@ private: } /** - * Exchanges data items annotated by rank into blocked arrangement. Specialized for warp-timeslicing. + * @brief Exchanges data items annotated by rank into blocked arrangement. Specialized + * for warp-timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks */ template - __device__ __forceinline__ void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) + __device__ __forceinline__ void ScatterToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT ranks[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { InputT temp_items[ITEMS_PER_THREAD]; @@ -631,16 +689,24 @@ private: } } - /** - * Exchanges data items annotated by rank into striped arrangement. Specialized for no timeslicing. + * @brief Exchanges data items annotated by rank into striped arrangement. Specialized + * for no timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks */ template - __device__ __forceinline__ void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) + __device__ __forceinline__ void ScatterToStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -662,16 +728,24 @@ private: } } - /** - * Exchanges data items annotated by rank into striped arrangement. Specialized for warp-timeslicing. + * @brief Exchanges data items annotated by rank into striped arrangement. Specialized + * for warp-timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks */ template - __device__ __forceinline__ void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. 
- OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) + __device__ __forceinline__ void ScatterToStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { InputT temp_items[ITEMS_PER_THREAD]; @@ -728,12 +802,12 @@ private: public: /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockExchange() : @@ -744,38 +818,39 @@ public: warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. + * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockExchange( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), - lane_id(LaneId()), - warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), - warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + __device__ __forceinline__ BlockExchange(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + , lane_id(LaneId()) + , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) + , warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} //@} end member group /******************************************************************//** - * \name Structured exchanges + * @name Structured exchanges *********************************************************************/ //@{ /** - * \brief Transposes data items from striped arrangement to blocked arrangement. + * @brief Transposes data items from striped arrangement to blocked + * arrangement. * - * \par - * - \smemreuse + * @par + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -793,34 +868,38 @@ public: * // Collectively exchange data into a blocked arrangement across threads * BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); * - * \endcode - * \par - * Suppose the set of striped input \p thread_data across the block of threads is - * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from device-accessible memory. - * The corresponding output \p thread_data in those threads will be + * @endcode + * @par + * Suppose the set of striped input @p thread_data across the block of threads is + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from + * device-accessible memory. 
The corresponding output @p thread_data in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. * + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. */ template - __device__ __forceinline__ void StripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + __device__ __forceinline__ void StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD]) { StripedToBlocked(input_items, output_items, Int2Type()); } - /** - * \brief Transposes data items from blocked arrangement to striped arrangement. + * @brief Transposes data items from blocked arrangement to striped + * arrangement. * - * \par - * - \smemreuse + * @par + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -841,36 +920,40 @@ public: * // Store data striped across block threads into an ordered tile * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); * - * \endcode - * \par - * Suppose the set of blocked input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of blocked input @p thread_data across the block of threads is * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * The corresponding output \p thread_data in those threads will be + * The corresponding output @p thread_data in those threads will be * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in * preparation for storing to device-accessible memory. * + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. */ template - __device__ __forceinline__ void BlockedToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + __device__ __forceinline__ void BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD]) { BlockedToStriped(input_items, output_items, Int2Type()); } - - /** - * \brief Transposes data items from warp-striped arrangement to blocked arrangement. + * @brief Transposes data items from warp-striped arrangement to blocked + * arrangement. * - * \par - * - \smemreuse + * @par + * - @smemreuse * - * \par Snippet - * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code + * @par Snippet + * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" + * arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 + * items. 
+ * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -888,37 +971,41 @@ public: * // Collectively exchange data into a blocked arrangement across threads * BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); * - * \endcode - * \par - * Suppose the set of warp-striped input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of warp-striped input @p thread_data across the block of threads is * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } * after loading from device-accessible memory. (The first 128 items are striped across * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) - * The corresponding output \p thread_data in those threads will be + * The corresponding output @p thread_data in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. * + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. */ template - __device__ __forceinline__ void WarpStripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + __device__ __forceinline__ void WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD]) { WarpStripedToBlocked(input_items, output_items, Int2Type()); } - - /** - * \brief Transposes data items from blocked arrangement to warp-striped arrangement. + * @brief Transposes data items from blocked arrangement to warp-striped + * arrangement. * - * \par - * - \smemreuse + * @par + * - @smemreuse * - * \par Snippet - * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code + * @par Snippet + * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" + * arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 + * items. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -939,20 +1026,25 @@ public: * // Store data striped across warp threads into an ordered tile * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); * - * \endcode - * \par - * Suppose the set of blocked input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of blocked input @p thread_data across the block of threads is * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * The corresponding output \p thread_data in those threads will be + * The corresponding output @p thread_data in those threads will be * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } - * in preparation for storing to device-accessible memory. (The first 128 items are striped across - * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + * in preparation for storing to device-accessible memory. (The first 128 items are striped + * across the first warp of 32 threads, the second 128 items are striped across the second warp, + * etc.) 
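The warp-striped layout described above reduces to simple index arithmetic: in a blocked arrangement, item i of thread t sits at offset t * ITEMS_PER_THREAD + i, while in a warp-striped arrangement it sits at warp * (WARP_THREADS * ITEMS_PER_THREAD) + i * WARP_THREADS + lane. The host-side sketch below is purely illustrative (constants mirror the 128-thread, 4-item configuration of the snippets above) and reproduces the values quoted in this passage; BlockedToWarpStriped moves data from the first column to the second, and WarpStripedToBlocked is its inverse.

#include <cstdio>

int main()
{
    const int ITEMS_PER_THREAD = 4;
    const int WARP_THREADS     = 32;
    const int WARP_ITEMS       = WARP_THREADS * ITEMS_PER_THREAD;  // items owned by one warp

    // Spot-check a few threads of a 128-thread block
    const int tids[] = {0, 1, 2, 127};
    for (int tid : tids)
    {
        int lane = tid % WARP_THREADS;
        int warp = tid / WARP_THREADS;

        std::printf("thread %3d  blocked:", tid);
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            std::printf(" %3d", tid * ITEMS_PER_THREAD + i);                   // blocked offset

        std::printf("   warp-striped:");
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            std::printf(" %3d", warp * WARP_ITEMS + i * WARP_THREADS + lane);  // warp-striped offset

        std::printf("\n");
    }
    return 0;
}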
+ * + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. * + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. */ template - __device__ __forceinline__ void BlockedToWarpStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + __device__ __forceinline__ void BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD]) { BlockedToWarpStriped(input_items, output_items, Int2Type()); } @@ -961,62 +1053,86 @@ public: //@} end member group /******************************************************************//** - * \name Scatter exchanges + * @name Scatter exchanges *********************************************************************/ //@{ - /** - * \brief Exchanges data items annotated by rank into blocked arrangement. + * @brief Exchanges data items annotated by rank into blocked arrangement. * - * \par - * - \smemreuse + * @par + * - @smemreuse * - * \tparam OffsetT [inferred] Signed integer type for local offsets + * @tparam OffsetT + * [inferred] Signed integer type for local offsets + * + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks */ template - __device__ __forceinline__ void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + __device__ __forceinline__ void ScatterToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToBlocked(input_items, output_items, ranks, Int2Type()); } - - /** - * \brief Exchanges data items annotated by rank into striped arrangement. + * @brief Exchanges data items annotated by rank into striped arrangement. + * + * @par + * - @smemreuse * - * \par - * - \smemreuse + * @tparam OffsetT + * [inferred] Signed integer type for local offsets * - * \tparam OffsetT [inferred] Signed integer type for local offsets + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks */ template - __device__ __forceinline__ void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. 
- OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + __device__ __forceinline__ void ScatterToStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStriped(input_items, output_items, ranks, Int2Type()); } - - /** - * \brief Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged. + * @brief Exchanges data items annotated by rank into striped arrangement. + * Items with rank -1 are not exchanged. + * + * @par + * - @smemreuse + * + * @tparam OffsetT + * [inferred] Signed integer type for local offsets * - * \par - * - \smemreuse + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. * - * \tparam OffsetT [inferred] Signed integer type for local offsets + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks */ template - __device__ __forceinline__ void ScatterToStripedGuarded( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + __device__ __forceinline__ void + ScatterToStripedGuarded(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -1038,24 +1154,36 @@ public: } } - - - /** - * \brief Exchanges valid data items annotated by rank into striped arrangement. + * @brief Exchanges valid data items annotated by rank into striped arrangement. + * + * @par + * - @smemreuse + * + * @tparam OffsetT + * [inferred] Signed integer type for local offsets * - * \par - * - \smemreuse + * @tparam ValidFlag + * [inferred] FlagT type denoting which items are valid * - * \tparam OffsetT [inferred] Signed integer type for local offsets - * \tparam ValidFlag [inferred] FlagT type denoting which items are valid + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks + * + * @param[in] is_valid + * Corresponding flag denoting item validity */ template - __device__ __forceinline__ void ScatterToStripedFlagged( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. 
- OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - ValidFlag (&is_valid)[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity + __device__ __forceinline__ void + ScatterToStripedFlagged(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD], + ValidFlag (&is_valid)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -1084,60 +1212,98 @@ public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - __device__ __forceinline__ void StripedToBlocked( - InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. + */ + __device__ __forceinline__ void StripedToBlocked(InputT (&items)[ITEMS_PER_THREAD]) { StripedToBlocked(items, items); } - __device__ __forceinline__ void BlockedToStriped( - InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. + */ + __device__ __forceinline__ void BlockedToStriped(InputT (&items)[ITEMS_PER_THREAD]) { BlockedToStriped(items, items); } - __device__ __forceinline__ void WarpStripedToBlocked( - InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. + */ + __device__ __forceinline__ void WarpStripedToBlocked(InputT (&items)[ITEMS_PER_THREAD]) { WarpStripedToBlocked(items, items); } - __device__ __forceinline__ void BlockedToWarpStriped( - InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. + */ + __device__ __forceinline__ void BlockedToWarpStriped(InputT (&items)[ITEMS_PER_THREAD]) { BlockedToWarpStriped(items, items); } + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks + */ template - __device__ __forceinline__ void ScatterToBlocked( - InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + __device__ __forceinline__ void ScatterToBlocked(InputT (&items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToBlocked(items, items, ranks); } + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks + */ template - __device__ __forceinline__ void ScatterToStriped( - InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + __device__ __forceinline__ void ScatterToStriped(InputT (&items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks); } + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. 
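The scatter exchanges documented above carry no snippet of their own. As a purely illustrative aside (hypothetical kernel name; identity ranks stand in for a real ranking step such as cub::BlockRadixRank), the sketch below shows the call shape of ScatterToStriped for a 128-thread block owning 4 integer items per thread; with identity ranks the result is equivalent to BlockedToStriped.

#include <cub/block/block_exchange.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>

__global__ void ExampleScatterKernel(int *d_data)
{
    // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
    typedef cub::BlockExchange<int, 128, 4> BlockExchange;
    __shared__ typename BlockExchange::TempStorage temp_storage;

    // Load a blocked segment of items
    int thread_data[4];
    cub::LoadDirectBlocked(threadIdx.x, d_data, thread_data);

    // Identity ranks: item i of thread t keeps its original block-wide position
    int ranks[4];
    for (int i = 0; i < 4; ++i)
        ranks[i] = threadIdx.x * 4 + i;

    // Scatter items to the striped positions given by ranks (in-place is allowed)
    BlockExchange(temp_storage).ScatterToStriped(thread_data, thread_data, ranks);

    // Store the resulting striped arrangement back to device memory
    cub::StoreDirectStriped<128>(threadIdx.x, d_data, thread_data);
}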
+ * + * @param[in] ranks + * Corresponding scatter ranks + */ template - __device__ __forceinline__ void ScatterToStripedGuarded( - InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + __device__ __forceinline__ void ScatterToStripedGuarded(InputT (&items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStripedGuarded(items, items, ranks); } + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks + * + * @param[in] is_valid + * Corresponding flag denoting item validity + */ template - __device__ __forceinline__ void ScatterToStripedFlagged( - InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - ValidFlag (&is_valid)[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity + __device__ __forceinline__ void ScatterToStripedFlagged(InputT (&items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD], + ValidFlag (&is_valid)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks, is_valid); } diff --git a/cub/cub/block/block_histogram.cuh b/cub/cub/block/block_histogram.cuh index 949bd2bcb34..ee750077915 100644 --- a/cub/cub/block/block_histogram.cuh +++ b/cub/cub/block/block_histogram.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + * @file + * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for + * constructing block-wide histograms from data samples partitioned across a CUDA thread block. */ #pragma once @@ -53,28 +54,29 @@ CUB_NAMESPACE_BEGIN ******************************************************************************/ /** - * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms. + * @brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of + * block-wide histograms. */ enum BlockHistogramAlgorithm { /** - * \par Overview + * @par Overview * Sorting followed by differentiation. Execution is comprised of two phases: * -# Sort the data using efficient radix sort * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts. * - * \par Performance Considerations + * @par Performance Considerations * Delivers consistent throughput regardless of sample bin distribution. 
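As a hedged illustration of how the two histogramming strategies are selected in practice, the sketch below (hypothetical kernel name; 256 bins, 128 threads, and 4 unsigned-char samples per thread, matching this file's other snippets) passes the enumerator as BlockHistogram's ALGORITHM template parameter; swapping in BLOCK_HISTO_ATOMIC is a one-line change.

#include <cub/block/block_histogram.cuh>
#include <cub/block/block_load.cuh>

__global__ void HistogramAlgorithmExample(unsigned char *d_samples, unsigned int *d_histogram)
{
    // Sort-then-differentiate: consistent throughput regardless of sample distribution
    typedef cub::BlockHistogram<unsigned char, 128, 4, 256, cub::BLOCK_HISTO_SORT> SortHistogram;

    // Atomic updates: fastest for uniformly distributed samples, sensitive to bin contention
    // typedef cub::BlockHistogram<unsigned char, 128, 4, 256, cub::BLOCK_HISTO_ATOMIC> AtomicHistogram;

    __shared__ typename SortHistogram::TempStorage temp_storage;
    __shared__ unsigned int smem_histogram[256];

    // Each thread contributes 4 consecutive samples
    unsigned char thread_samples[4];
    cub::LoadDirectBlocked(threadIdx.x, d_samples, thread_samples);

    // Build the block-wide histogram in shared memory
    SortHistogram(temp_storage).Histogram(thread_samples, smem_histogram);

    __syncthreads();

    // Copy the shared-memory counts out to global memory, two bins per thread
    for (int bin = threadIdx.x; bin < 256; bin += 128)
        d_histogram[bin] = smem_histogram[bin];
}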
*/ BLOCK_HISTO_SORT, /** - * \par Overview + * @par Overview * Use atomic addition to update byte counts directly * - * \par Performance Considerations + * @par Performance Considerations * Performance is strongly tied to the hardware implementation of atomic * addition, and may be significantly degraded for non uniformly-random * input distributions where many concurrent updates are likely to be @@ -89,47 +91,68 @@ enum BlockHistogramAlgorithm * Block histogram ******************************************************************************/ - /** - * \brief The BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png) - * \ingroup BlockModule + * @brief The BlockHistogram class provides [collective](index.html#sec0) methods for + * constructing block-wide histograms from data samples partitioned across a CUDA thread + * block. ![](histogram_logo.png) + * + * @ingroup BlockModule + * + * @tparam T + * The sample type being histogrammed (must be castable to an integer bin identifier) + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam ITEMS_PER_THREAD + * The number of items per thread + * + * @tparam BINS + * The number bins within the histogram * - * \tparam T The sample type being histogrammed (must be castable to an integer bin identifier) - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ITEMS_PER_THREAD The number of items per thread - * \tparam BINS The number bins within the histogram - * \tparam ALGORITHM [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. + * @tparam ALGORITHM + * [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm + * to use (default: cub::BLOCK_HISTO_SORT) * - * \par Overview + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview * - A histogram - * counts the number of observations that fall into each of the disjoint categories (known as bins). + * counts the number of observations that fall into each of the disjoint categories (known as + * bins). * - The `T` type must be implicitly castable to an integer type. * - BlockHistogram expects each integral `input[i]` value to satisfy * `0 <= input[i] < BINS`. Values outside of this range result in undefined * behavior. * - BlockHistogram can be optionally specialized to use different algorithms: - * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm) - * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm) + * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref + * cub::BlockHistogramAlgorithm) + * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly. 
+ * [More...](\ref cub::BlockHistogramAlgorithm) * - * \par Performance Considerations - * - \granularity + * @par Performance Considerations + * - @granularity * - * \par A Simple Example - * \blockcollective{BlockHistogram} - * \par + * @par A Simple Example + * @blockcollective{BlockHistogram} + * @par * The code snippet below illustrates a 256-bin histogram of 512 integer samples that * are partitioned across 128 threads where each thread owns 4 samples. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) * { - * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character + * samples each typedef cub::BlockHistogram BlockHistogram; * * // Allocate shared memory for BlockHistogram * __shared__ typename BlockHistogram::TempStorage temp_storage; @@ -144,18 +167,19 @@ enum BlockHistogramAlgorithm * // Compute the block-wide histogram * BlockHistogram(temp_storage).Histogram(data, smem_histogram); * - * \endcode + * @endcode * - * \par Performance and Usage Considerations + * @par Performance and Usage Considerations * - All input values must fall between [0, BINS), or behavior is undefined. * - The histogram output can be constructed in shared or device-accessible memory * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives * - * \par Re-using dynamically allocating shared memory + * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: - * example_block_reduce_dyn_smem.cu + * example_block_reduce_dyn_smem.cu * * This example can be easily adapted to the storage required by BlockHistogram. */ @@ -223,17 +247,17 @@ private: public: - /// \smemstorage{BlockHistogram} + /// @smemstorage{BlockHistogram} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockHistogram() : @@ -241,34 +265,34 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. 
+ * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockHistogram( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockHistogram(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Histogram operations + * @name Histogram operations *********************************************************************/ //@{ /** - * \brief Initialize the shared histogram counters to zero. + * @brief Initialize the shared histogram counters to zero. * - * \par Snippet + * @par Snippet * The code snippet below illustrates a the initialization and update of a * histogram of 512 integer samples that are partitioned across 128 threads * where each thread owns 4 samples. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -292,12 +316,13 @@ public: * // Update the block-wide histogram * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); * - * \endcode + * @endcode * - * \tparam CounterT [inferred] Histogram counter type + * @tparam CounterT + * [inferred] Histogram counter type */ - template - __device__ __forceinline__ void InitHistogram(CounterT histogram[BINS]) + template + __device__ __forceinline__ void InitHistogram(CounterT histogram[BINS]) { // Initialize histogram bin counts to zeros int histo_offset = 0; @@ -314,25 +339,26 @@ public: } } - /** - * \brief Constructs a block-wide histogram in shared/device-accessible memory. Each thread contributes an array of input elements. + * @brief Constructs a block-wide histogram in shared/device-accessible memory. + * Each thread contributes an array of input elements. * - * \par - * - \granularity - * - \smemreuse + * @par + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a 256-bin histogram of 512 integer samples that * are partitioned across 128 threads where each thread owns 4 samples. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) 
* { - * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 + * character samples each typedef cub::BlockHistogram + * BlockHistogram; * * // Allocate shared memory for BlockHistogram * __shared__ typename BlockHistogram::TempStorage temp_storage; @@ -347,15 +373,20 @@ public: * // Compute the block-wide histogram * BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram); * - * \endcode + * @endcode + * + * @tparam CounterT + * [inferred] Histogram counter type + * + * @param[in] items + * Calling thread's input values to histogram * - * \tparam CounterT [inferred] Histogram counter type + * @param[out] histogram + * Reference to shared/device-accessible memory histogram */ - template < - typename CounterT > - __device__ __forceinline__ void Histogram( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + template + __device__ __forceinline__ void Histogram(T (&items)[ITEMS_PER_THREAD], + CounterT histogram[BINS]) { // Initialize histogram bin counts to zeros InitHistogram(histogram); @@ -366,27 +397,27 @@ public: InternalBlockHistogram(temp_storage).Composite(items, histogram); } - - /** - * \brief Updates an existing block-wide histogram in shared/device-accessible memory. Each thread composites an array of input elements. + * @brief Updates an existing block-wide histogram in shared/device-accessible memory. + * Each thread composites an array of input elements. * - * \par - * - \granularity - * - \smemreuse + * @par + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a the initialization and update of a * histogram of 512 integer samples that are partitioned across 128 threads * where each thread owns 4 samples. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) 
* { - * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 + * character samples each typedef cub::BlockHistogram + * BlockHistogram; * * // Allocate shared memory for BlockHistogram * __shared__ typename BlockHistogram::TempStorage temp_storage; @@ -404,15 +435,20 @@ public: * // Update the block-wide histogram * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); * - * \endcode + * @endcode + * + * @tparam CounterT + * [inferred] Histogram counter type + * + * @param[in] items + * Calling thread's input values to histogram * - * \tparam CounterT [inferred] Histogram counter type + * @param[out] histogram + * Reference to shared/device-accessible memory histogram */ - template < - typename CounterT > - __device__ __forceinline__ void Composite( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + template + __device__ __forceinline__ void Composite(T (&items)[ITEMS_PER_THREAD], + CounterT histogram[BINS]) { InternalBlockHistogram(temp_storage).Composite(items, histogram); } diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index 5af8abca9e1..3c8bb154c1c 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Operations for reading linear tiles of data into the CUDA thread block. */ @@ -52,34 +52,44 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIo + * @addtogroup UtilIo * @{ */ /******************************************************************//** - * \name Blocked arrangement I/O (direct) + * @name Blocked arrangement I/O (direct) *********************************************************************/ //@{ - /** - * \brief Load a linear segment of items into a blocked arrangement across the thread block. + * @brief Load a linear segment of items into a blocked arrangement across the thread block. + * + * @blocked + * + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. * - * \blocked + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
+ * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +template +__device__ __forceinline__ void LoadDirectBlocked(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { // Load directly in thread-blocked order #pragma unroll @@ -89,25 +99,39 @@ __device__ __forceinline__ void LoadDirectBlocked( } } - /** - * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. + * @brief Load a linear segment of items into a blocked arrangement across the thread block, guarded + * by range. + * + * @blocked + * + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from * - * \blocked + * @param[out] items + * Data to load * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @param[in] valid_items + * Number of valid items to load */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load +template +__device__ __forceinline__ void LoadDirectBlocked(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { #pragma unroll @@ -120,27 +144,43 @@ __device__ __forceinline__ void LoadDirectBlocked( } } - /** - * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.. + * @brief Load a linear segment of items into a blocked arrangement across the thread block, guarded + * by range, with a fall-back assignment of out-of-bound elements.. + * + * @blocked + * + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. 
+ * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) * - * \blocked + * @param[in] block_itr + * The thread block's base input iterator for loading from * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items */ -template < - typename InputT, - typename DefaultT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +template +__device__ __forceinline__ void LoadDirectBlocked(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -153,16 +193,22 @@ __device__ __forceinline__ void LoadDirectBlocked( #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** - * Internal implementation for load vectorization + * @brief Internal implementation for load vectorization + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_ptr + * Input pointer for loading from + * + * @param[out] items + * Data to load */ -template < - CacheLoadModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD> -__device__ __forceinline__ void InternalLoadDirectBlockedVectorized( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - T *block_ptr, ///< [in] Input pointer for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +template +__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(int linear_tid, + T *block_ptr, + T (&items)[ITEMS_PER_THREAD]) { // Biggest memory access word that T is a whole multiple of typedef typename UnitWord::DeviceWord DeviceWord; @@ -206,28 +252,39 @@ __device__ __forceinline__ void InternalLoadDirectBlockedVectorized( #endif // DOXYGEN_SHOULD_SKIP_THIS - /** - * \brief Load a linear segment of items into a blocked arrangement across the thread block. + * @brief Load a linear segment of items into a blocked arrangement across the thread block. + * + * @blocked + * + * The input offset (@p block_ptr + @p block_offset) must be quad-item aligned + * + * The following conditions will prevent vectorization and loading will fall back to + * cub::BLOCK_LOAD_DIRECT: + * - @p ITEMS_PER_THREAD is odd + * - The data type @p T is not a built-in primitive or CUDA vector type + * (e.g., @p short, @p int2, @p double, @p float2, etc.) * - * \blocked + * @tparam T + * [inferred] The data type to load. 
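As an illustrative aside (hypothetical kernel name; d_in assumed to be 16-byte aligned, as a cudaMalloc allocation would be), the sketch below shows a call that satisfies the vectorization conditions listed above, so each thread can issue one four-wide load; an odd ITEMS_PER_THREAD or an iterator in place of the raw pointer would instead have to use the scalar cub::LoadDirectBlocked path.

#include <cub/block/block_load.cuh>

__global__ void VectorizedLoadExample(int *d_in, int *d_out)
{
    const int TILE_ITEMS = 128 * 4;          // 128 threads, 4 items per thread
    int tile_offset = blockIdx.x * TILE_ITEMS;

    // Vectorizable: 4 consecutive ints per thread from an aligned, quad-item-aligned base
    int thread_data[4];
    cub::LoadDirectBlockedVectorized(threadIdx.x, d_in + tile_offset, thread_data);

    // Write the items back out in blocked order
    for (int i = 0; i < 4; ++i)
        d_out[tile_offset + threadIdx.x * 4 + i] = thread_data[i];
}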
* - * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. * - * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * @param[in] block_ptr + * Input pointer for loading from + * + * @param[out] items + * Data to load */ -template < - typename T, - int ITEMS_PER_THREAD> -__device__ __forceinline__ void LoadDirectBlockedVectorized( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - T *block_ptr, ///< [in] Input pointer for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +template +__device__ __forceinline__ void LoadDirectBlockedVectorized(int linear_tid, + T *block_ptr, + T (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } @@ -235,30 +292,41 @@ __device__ __forceinline__ void LoadDirectBlockedVectorized( //@} end member group /******************************************************************//** - * \name Striped arrangement I/O (direct) + * @name Striped arrangement I/O (direct) *********************************************************************/ //@{ - /** - * \brief Load a linear segment of items into a striped arrangement across the thread block. + * @brief Load a linear segment of items into a striped arrangement across the thread block. * - * \striped + * @striped * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @tparam BLOCK_THREADS + * The thread block size in threads + * + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. 
+ * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load */ -template < - int BLOCK_THREADS, - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +template +__device__ __forceinline__ void LoadDirectStriped(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -267,27 +335,42 @@ __device__ __forceinline__ void LoadDirectStriped( } } - /** - * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range + * @brief Load a linear segment of items into a striped arrangement across the thread block, guarded + * by range + * + * @striped + * + * @tparam BLOCK_THREADS + * The thread block size in threads + * + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) * - * \striped + * @param[in] block_itr + * The thread block's base input iterator for loading from * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load */ -template < - int BLOCK_THREADS, - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load +template +__device__ __forceinline__ void LoadDirectStriped(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -299,29 +382,50 @@ __device__ __forceinline__ void LoadDirectStriped( } } - /** - * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. + * @brief Load a linear segment of items into a striped arrangement across the thread block, guarded + * by range, with a fall-back assignment of out-of-bound elements. 
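A minimal sketch of the guarded overloads in use (hypothetical kernel and tile bookkeeping; 128 threads, 4 items per thread): full tiles take the unguarded path, while the last, partial tile supplies valid_items and an out-of-bounds default so that no out-of-range reads occur.

#include <cub/block/block_load.cuh>

__global__ void GuardedStripedLoadExample(const int *d_in, int num_items)
{
    const int TILE_ITEMS = 128 * 4;
    int tile_offset = blockIdx.x * TILE_ITEMS;
    int valid_items = num_items - tile_offset;   // may be < TILE_ITEMS for the last tile

    int thread_data[4];
    if (valid_items >= TILE_ITEMS)
    {
        // Full tile: unguarded striped load
        cub::LoadDirectStriped<128>(threadIdx.x, d_in + tile_offset, thread_data);
    }
    else
    {
        // Partial tile: out-of-range items receive the default value 0 instead of being read
        cub::LoadDirectStriped<128>(threadIdx.x, d_in + tile_offset, thread_data, valid_items, 0);
    }

    // ... consume thread_data ...
}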
+ * + * @striped + * + * @tparam BLOCK_THREADS + * The thread block size in threads + * + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. * - * \striped + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items */ -template < - int BLOCK_THREADS, - typename InputT, - typename DefaultT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +template +__device__ __forceinline__ void LoadDirectStriped(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -334,31 +438,41 @@ __device__ __forceinline__ void LoadDirectStriped( //@} end member group /******************************************************************//** - * \name Warp-striped arrangement I/O (direct) + * @name Warp-striped arrangement I/O (direct) *********************************************************************/ //@{ - /** - * \brief Load a linear segment of items into a warp-striped arrangement across the thread block. + * @brief Load a linear segment of items into a warp-striped arrangement across the thread block. * - * \warpstriped + * @warpstriped * - * \par Usage Considerations + * @par Usage Considerations * The number of threads in the thread block must be a multiple of the architecture's warp size. * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. 
+ * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +template +__device__ __forceinline__ void LoadDirectWarpStriped(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; @@ -372,28 +486,42 @@ __device__ __forceinline__ void LoadDirectWarpStriped( } } - /** - * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range + * @brief Load a linear segment of items into a warp-striped arrangement across the thread block, + * guarded by range * - * \warpstriped + * @warpstriped * - * \par Usage Considerations + * @par Usage Considerations * The number of threads in the thread block must be a multiple of the architecture's warp size. * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load +template +__device__ __forceinline__ void LoadDirectWarpStriped(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; @@ -410,30 +538,46 @@ __device__ __forceinline__ void LoadDirectWarpStriped( } } - /** - * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. 
+ * @brief Load a linear segment of items into a warp-striped arrangement across the thread block, + * guarded by range, with a fall-back assignment of out-of-bound elements. * - * \warpstriped + * @warpstriped * - * \par Usage Considerations + * @par Usage Considerations * The number of threads in the thread block must be a multiple of the architecture's warp size. * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items */ -template < - typename InputT, - typename DefaultT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +template +__device__ __forceinline__ void LoadDirectWarpStriped(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { // Load directly in warp-striped order #pragma unroll @@ -456,64 +600,65 @@ __device__ __forceinline__ void LoadDirectWarpStriped( //----------------------------------------------------------------------------- /** - * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. + * @brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a + * linear segment of data from memory into a blocked arrangement across a CUDA thread block. */ enum BlockLoadAlgorithm { /** - * \par Overview + * @par Overview * * A [blocked arrangement](index.html#sec5sec3) of data is read * directly from memory. * - * \par Performance Considerations + * @par Performance Considerations * The utilization of memory transactions (coalescing) decreases as the * access stride between threads increases (i.e., the number items per thread). */ BLOCK_LOAD_DIRECT, /** - * \par Overview + * @par Overview * * A [striped arrangement](index.html#sec5sec3) of data is read * directly from memory. * - * \par Performance Considerations + * @par Performance Considerations * The utilization of memory transactions (coalescing) doesn't depend on * the number of items per thread. 
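As a hedged illustration of when BLOCK_LOAD_STRIPED is the natural choice, the sketch below (hypothetical kernel name) feeds a commutative block-wide reduction: the per-thread ordering of items is irrelevant there, so the striped load stays coalesced without paying for a shared-memory transpose.

#include <cub/block/block_load.cuh>
#include <cub/block/block_reduce.cuh>

__global__ void StripedLoadReduceExample(const int *d_in, int *d_block_sums)
{
    typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_STRIPED> BlockLoad;
    typedef cub::BlockReduce<int, 128> BlockReduce;

    // The load and the reduction never use shared memory at the same time,
    // so their temporary storage can share one allocation
    __shared__ union
    {
        typename BlockLoad::TempStorage   load;
        typename BlockReduce::TempStorage reduce;
    } temp_storage;

    // Striped load of this block's tile (order across threads does not matter here)
    int items[4];
    BlockLoad(temp_storage.load).Load(d_in + blockIdx.x * 128 * 4, items);

    __syncthreads();  // conservative barrier before re-using the union'd shared memory

    // Commutative block-wide sum of all 512 items
    int block_sum = BlockReduce(temp_storage.reduce).Sum(items);
    if (threadIdx.x == 0)
        d_block_sums[blockIdx.x] = block_sum;
}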
*/ BLOCK_LOAD_STRIPED, /** - * \par Overview + * @par Overview * * A [blocked arrangement](index.html#sec5sec3) of data is read * from memory using CUDA's built-in vectorized loads as a coalescing optimization. * For example, ld.global.v4.s32 instructions will be generated - * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * when @p T = @p int and @p ITEMS_PER_THREAD % 4 == 0. * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high until the the * access stride between threads (i.e., the number items per thread) exceeds the * maximum vector load width (typically 4 items or 64B, whichever is lower). * - The following conditions will prevent vectorization and loading will fall * back to cub::BLOCK_LOAD_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The \p InputIteratorT is not a simple pointer type + * - @p ITEMS_PER_THREAD is odd + * - The @p InputIteratorT is not a simple pointer type * - The block input offset is not quadword-aligned - * - The data type \p T is not a built-in primitive or CUDA vector type - * (e.g., \p short, \p int2, \p double, \p float2, etc.) + * - The data type @p T is not a built-in primitive or CUDA vector type + * (e.g., @p short, @p int2, @p double, @p float2, etc.) */ BLOCK_LOAD_VECTORIZE, /** - * \par Overview + * @par Overview * * A [striped arrangement](index.html#sec5sec3) of data is read * efficiently from memory and then locally transposed into a * [blocked arrangement](index.html#sec5sec3). * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items loaded per thread. * - The local reordering incurs slightly longer latencies and throughput than the @@ -522,16 +667,16 @@ enum BlockLoadAlgorithm BLOCK_LOAD_TRANSPOSE, /** - * \par Overview + * @par Overview * * A [warp-striped arrangement](index.html#sec5sec3) of data is * read efficiently from memory and then locally transposed into a * [blocked arrangement](index.html#sec5sec3). * - * \par Usage Considerations + * @par Usage Considerations * - BLOCK_THREADS must be a multiple of WARP_THREADS * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items loaded per thread. * - The local reordering incurs slightly larger latencies than the @@ -542,18 +687,18 @@ enum BlockLoadAlgorithm BLOCK_LOAD_WARP_TRANSPOSE, /** - * \par Overview + * @par Overview * - * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [warp-striped arrangement](index.html#sec5sec3) + * Like @p BLOCK_LOAD_WARP_TRANSPOSE, a [warp-striped arrangement](index.html#sec5sec3) * of data is read directly from memory and then is locally transposed into a * [blocked arrangement](index.html#sec5sec3). To reduce the shared memory * requirement, only one warp's worth of shared memory is provisioned and is * subsequently time-sliced among warps. * - * \par Usage Considerations + * @par Usage Considerations * - BLOCK_THREADS must be a multiple of WARP_THREADS * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items loaded per thread. 
* - Provisions less shared memory temporary storage, but incurs larger @@ -564,27 +709,49 @@ enum BlockLoadAlgorithm /** - * \brief The BlockLoad class provides [collective](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [blocked arrangement](index.html#sec5sec3) across a CUDA thread block. ![](block_load_logo.png) - * \ingroup BlockModule - * \ingroup UtilIo - * - * \tparam InputT The data type to read into (which must be convertible from the input iterator's value type). - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. - * \tparam ALGORITHM [optional] cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT. - * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. - * - * \par Overview + * @brief The BlockLoad class provides [collective](index.html#sec0) + * data movement methods for loading a linear segment of items from memory + * into a [blocked arrangement](index.html#sec5sec3) across a + * CUDA thread block. ![](block_load_logo.png) + * + * @ingroup BlockModule + * + * @ingroup UtilIo + * + * @tparam InputT + * The data type to read into (which must be convertible from the input iterator's value type). + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam ITEMS_PER_THREAD + * The number of consecutive items partitioned onto each thread. + * + * @tparam ALGORITHM + * [optional] cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT. + * + * @tparam WARP_TIME_SLICING + * [optional] Whether or not only one warp's worth of shared memory should be + * allocated and time-sliced among block-warps during any load-related data transpositions + * (versus each warp having its own storage). (default: false) + * + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview * - The BlockLoad class provides a single data movement abstraction that can be specialized * to implement different cub::BlockLoadAlgorithm strategies. This facilitates different * performance policies for different architectures, data types, granularity sizes, etc. * - BlockLoad can be optionally specialized by different data movement strategies: * -# cub::BLOCK_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec3) * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) -* -# cub::BLOCK_LOAD_STRIPED,. A [striped arrangement](index.html#sec5sec3) + * -# cub::BLOCK_LOAD_STRIPED,. A [striped arrangement](index.html#sec5sec3) * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) * -# cub::BLOCK_LOAD_VECTORIZE. 
A [blocked arrangement](index.html#sec5sec3) * of data is read directly from memory using CUDA's built-in vectorized loads as a @@ -600,16 +767,16 @@ enum BlockLoadAlgorithm * [blocked arrangement](index.html#sec5sec3) one warp at a time. [More...](\ref cub::BlockLoadAlgorithm) * - \rowmajor * - * \par A Simple Example - * \blockcollective{BlockLoad} - * \par + * @par A Simple Example + * @blockcollective{BlockLoad} + * @par * The code snippet below illustrates the loading of a linear * segment of 512 integers into a "blocked" arrangement across 128 threads where each * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, * meaning memory references are efficiently coalesced using a warp-striped access * pattern (after which items are locally reordered among threads). - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -624,13 +791,13 @@ enum BlockLoadAlgorithm * int thread_data[4]; * BlockLoad(temp_storage).Load(d_data, thread_data); * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... - * The set of \p thread_data across the block of threads in those threads will be + * @endcode + * @par + * Suppose the input @p d_data is 0, 1, 2, 3, 4, 5, .... + * The set of @p thread_data across the block of threads in those threads will be * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. * - * \par Re-using dynamically allocating shared memory + * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: @@ -691,32 +858,63 @@ private: linear_tid(linear_tid) {} - /// Load a linear segment of items from memory + /** + * @brief Load a linear segment of items from memory + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectBlocked(linear_tid, block_itr, items); } - /// Load a linear segment of items from memory, guarded by range + /** + * @brief Load a linear segment of items from memory, guarded by range + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + /** + * @brief Load a linear segment of items from memory, guarded by range, with a fall-back + * assignment of out-of-bound elements + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * 
Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); } @@ -744,32 +942,63 @@ private: linear_tid(linear_tid) {} - /// Load a linear segment of items from memory + /** + * @brief Load a linear segment of items from memory + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectStriped(linear_tid, block_itr, items); } - /// Load a linear segment of items from memory, guarded by range + /** + * @brief Load a linear segment of items from memory, guarded by range + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { LoadDirectStriped(linear_tid, block_itr, items, valid_items); } - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + /** + * @brief Load a linear segment of items from memory, guarded by range, with a fall-back + * assignment of out-of-bound elements + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); } @@ -797,62 +1026,115 @@ private: linear_tid(linear_tid) {} - /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + /** + * @brief Load a linear segment of items from memory, specialized for native pointer types 
+ * (attempts vectorization) + * + * @param[in] block_ptr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ template - __device__ __forceinline__ void Load( - InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + __device__ __forceinline__ void Load(InputT *block_ptr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } - /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + /** + * @brief Load a linear segment of items from memory, specialized for native pointer types + * (attempts vectorization) + * + * @param[in] block_ptr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ template - __device__ __forceinline__ void Load( - const InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + __device__ __forceinline__ void Load(const InputT *block_ptr, + InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } - /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) - template < - CacheLoadModifier MODIFIER, - typename ValueType, - typename OffsetT> - __device__ __forceinline__ void Load( - CacheModifiedInputIterator block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + /** + * @brief Load a linear segment of items from memory, specialized for native pointer types + * (attempts vectorization) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ + template + __device__ __forceinline__ void + Load(CacheModifiedInputIterator block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); } - /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization) + /** + * @brief Load a linear segment of items from memory, specialized for opaque input iterators + * (skips vectorization) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ template - __device__ __forceinline__ void Load( - _InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + __device__ __forceinline__ void Load(_InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectBlocked(linear_tid, block_itr, items); } - /// Load a linear segment of items from memory, guarded by range (skips vectorization) + /** + * @brief Load a linear segment of items from memory, guarded by range (skips vectorization) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load + __device__ 
__forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization) + /** + * @brief Load a linear segment of items from memory, guarded by range, with a fall-back + * assignment of out-of-bound elements (skips vectorization) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); } @@ -891,34 +1173,65 @@ private: linear_tid(linear_tid) {} - /// Load a linear segment of items from memory + /** + * @brief Load a linear segment of items from memory + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectStriped(linear_tid, block_itr, items); BlockExchange(temp_storage).StripedToBlocked(items, items); } - /// Load a linear segment of items from memory, guarded by range + /** + * @brief Load a linear segment of items from memory, guarded by range + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { LoadDirectStriped(linear_tid, block_itr, items, valid_items); BlockExchange(temp_storage).StripedToBlocked(items, items); } - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + /** + * @brief Load a linear segment of items from memory, guarded by range, with a fall-back + * assignment of out-of-bound elements + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input 
iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); BlockExchange(temp_storage).StripedToBlocked(items, items); @@ -966,35 +1279,65 @@ private: linear_tid(linear_tid) {} - /// Load a linear segment of items from memory + /** + * @brief Load a linear segment of items from memory + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectWarpStriped(linear_tid, block_itr, items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } - /// Load a linear segment of items from memory, guarded by range + /** + * @brief Load a linear segment of items from memory, guarded by range + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + /** + * @brief Load a linear segment of items from memory, guarded by range, with a fall-back + * assignment of out-of-bound elements + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); @@ -1041,35 +1384,65 @@ private: linear_tid(linear_tid) {} - /// Load a linear segment of items from memory + /** + * @brief Load a linear segment of items from memory + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items 
+ * Data to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectWarpStriped(linear_tid, block_itr, items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } - /// Load a linear segment of items from memory, guarded by range + /** + * @brief Load a linear segment of items from memory, guarded by range + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + /** + * @brief Load a linear segment of items from memory, guarded by range, with a fall-back + * assignment of out-of-bound elements + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); @@ -1113,17 +1486,18 @@ private: public: - /// \smemstorage{BlockLoad} + /// @smemstorage{BlockLoad} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary + * storage. */ __device__ __forceinline__ BlockLoad() : @@ -1131,15 +1505,15 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. 
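 The TempStorage-taking constructor above is what enables the shared-memory reuse described in the class overview; a minimal sketch, assuming illustrative kernel and buffer names not taken from this patch, of aliasing one shared allocation between a BlockLoad and a BlockStore:

 #include <cub/block/block_load.cuh>
 #include <cub/block/block_store.cuh>

 __global__ void ReuseSmemKernel(const int *d_in, int *d_out)
 {
     using BlockLoadT  = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
     using BlockStoreT = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE>;

     // One shared allocation serves both collectives; the union makes the reuse explicit
     __shared__ union
     {
         typename BlockLoadT::TempStorage  load;
         typename BlockStoreT::TempStorage store;
     } temp_storage;

     int thread_data[4];
     BlockLoadT(temp_storage.load).Load(d_in, thread_data);

     __syncthreads(); // required before repurposing the shared memory

     BlockStoreT(temp_storage.store).Store(d_out, thread_data);
 }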
+ * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockLoad( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockLoad(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} @@ -1147,26 +1521,25 @@ public: //@} end member group /******************************************************************//** - * \name Data movement + * @name Data movement *********************************************************************/ //@{ - /** - * \brief Load a linear segment of items from memory. + * @brief Load a linear segment of items from memory. * - * \par - * - \blocked - * - \smemreuse + * @par + * - @blocked + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the loading of a linear * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * thread owns 4 consecutive items. The load is specialized for @p BLOCK_LOAD_WARP_TRANSPOSE, * meaning memory references are efficiently coalesced using a warp-striped access * pattern (after which items are locally reordered among threads). - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -1181,37 +1554,40 @@ public: * int thread_data[4]; * BlockLoad(temp_storage).Load(d_data, thread_data); * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... - * The set of \p thread_data across the block of threads in those threads will be + * @endcode + * @par + * Suppose the input @p d_data is 0, 1, 2, 3, 4, 5, .... + * The set of @p thread_data across the block of threads in those threads will be * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items); } - /** - * \brief Load a linear segment of items from memory, guarded by range. + * @brief Load a linear segment of items from memory, guarded by range. * - * \par - * - \blocked - * - \smemreuse + * @par + * - @blocked + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the guarded loading of a linear * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * thread owns 4 consecutive items. The load is specialized for @p BLOCK_LOAD_WARP_TRANSPOSE, * meaning memory references are efficiently coalesced using a warp-striped access * pattern (after which items are locally reordered among threads). - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, int valid_items, ...) 
@@ -1226,39 +1602,46 @@ public: * int thread_data[4]; * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6... and \p valid_items is \p 5. - * The set of \p thread_data across the block of threads in those threads will be + * @endcode + * @par + * Suppose the input @p d_data is 0, 1, 2, 3, 4, 5, 6... and @p valid_items is @p 5. + * The set of @p thread_data across the block of threads in those threads will be * { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }, with only the first two threads * being unmasked to load portions of valid data (and other items remaining unassigned). * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); } - /** - * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + * @brief Load a linear segment of items from memory, guarded by range, with a fall-back + * assignment of out-of-bound elements * - * \par - * - \blocked - * - \smemreuse + * @par + * - @blocked + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the guarded loading of a linear * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * thread owns 4 consecutive items. The load is specialized for @p BLOCK_LOAD_WARP_TRANSPOSE, * meaning memory references are efficiently coalesced using a warp-striped access * pattern (after which items are locally reordered among threads). - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, int valid_items, ...) @@ -1273,21 +1656,31 @@ public: * int thread_data[4]; * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6..., - * \p valid_items is \p 5, and the out-of-bounds default is \p -1. - * The set of \p thread_data across the block of threads in those threads will be + * @endcode + * @par + * Suppose the input @p d_data is 0, 1, 2, 3, 4, 5, 6..., + * @p valid_items is @p 5, and the out-of-bounds default is @p -1. 
+ * The set of @p thread_data across the block of threads in those threads will be * { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }, with only the first two threads - * being unmasked to load portions of valid data (and other items are assigned \p -1) + * being unmasked to load portions of valid data (and other items are assigned @p -1) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load * + * @param[in] oob_default + * Default value to assign out-of-bound items */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); } diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 8495947789c..13655a5548c 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block */ @@ -54,11 +54,10 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN - /** - * \brief Radix ranking algorithm, the algorithm used to implement stable ranking of the - * keys from a single tile. Note that different ranking algorithms require different - * initial arrangements of keys to function properly. + * @brief Radix ranking algorithm, the algorithm used to implement stable ranking of the + * keys from a single tile. Note that different ranking algorithms require different + * initial arrangements of keys to function properly. */ enum RadixRankAlgorithm { @@ -130,31 +129,53 @@ struct warp_in_block_matcher_t } // namespace detail #endif // DOXYGEN_SHOULD_SKIP_THIS - /** - * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block. - * \ingroup BlockModule + * @brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread + * block. + * + * @ingroup BlockModule + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam RADIX_BITS + * The number of radix bits per digit place + * + * @tparam IS_DESCENDING + * Whether or not the sorted-order is high-to-low + * + * @tparam MEMOIZE_OUTER_SCAN + * [optional] Whether or not to buffer outer raking scan + * partials to incur fewer shared memory reads at the expense of higher register pressure + * (default: true for architectures SM35 and newer, false otherwise). + * See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. 
* - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam RADIX_BITS The number of radix bits per digit place - * \tparam IS_DESCENDING Whether or not the sorted-order is high-to-low - * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. - * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) - * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. + * @tparam INNER_SCAN_ALGORITHM + * [optional] The cub::BlockScanAlgorithm algorithm to use (default: + * cub::BLOCK_SCAN_WARP_SCANS) * - * \par Overview + * @tparam SMEM_CONFIG + * [optional] Shared memory bank mode (default: @p cudaSharedMemBankSizeFourByte) + * + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview * Blah... * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits). - * - \blocked + * - @blocked * - * \par Performance Considerations - * - \granularity + * @par Performance Considerations + * - @granularity * - * \par - * \code + * @par + * @code * #include * * __global__ void ExampleKernel(...) @@ -163,6 +184,7 @@ struct warp_in_block_matcher_t * constexpr int radix_bits = 5; * * // Specialize BlockRadixRank for a 1D block of 2 threads + * // Specialize BlockRadixRank for a 1D block of 2 threads * using block_radix_rank = cub::BlockRadixRank; * using storage_t = typename block_radix_rank::TempStorage; * @@ -178,11 +200,11 @@ struct warp_in_block_matcher_t * block_radix_rank(temp_storage).RankKeys(keys, ranks, extractor); * * ... - * \endcode + * @endcode * Suppose the set of input `keys` across the block of threads is `{ [16,10], [9,11] }`. * The corresponding output `ranks` in those threads will be `{ [3,1], [0,2] }`. 
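 The line-wrapped snippet above loses the template arguments on the block_radix_rank alias; a self-contained sketch of the same two-thread ranking example follows. The BFEDigitExtractor spelling and its (bit_start, num_bits) constructor are assumptions about the digit-extractor API rather than something stated in this patch.

 #include <cstdint>
 #include <cub/cub.cuh>

 __global__ void RankKernel()
 {
     constexpr int block_threads = 2;
     constexpr int radix_bits    = 5;

     // Specialize BlockRadixRank for a 1D block of 2 threads
     using block_radix_rank = cub::BlockRadixRank<block_threads, radix_bits>;
     using storage_t        = typename block_radix_rank::TempStorage;

     __shared__ storage_t temp_storage;

     // Input across the block is { [16,10], [9,11] }, as in the documentation above
     std::uint16_t keys[2];
     keys[0] = (threadIdx.x == 0) ? 16 : 9;
     keys[1] = (threadIdx.x == 0) ? 10 : 11;

     int ranks[2];

     // Assumed digit extractor: take the lowest radix_bits bits of each key
     cub::BFEDigitExtractor<std::uint16_t> extractor(0, radix_bits);

     block_radix_rank(temp_storage).RankKeys(keys, ranks, extractor);

     // Expected ranks across the block: { [3,1], [0,2] }
 }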
* - * \par Re-using dynamically allocating shared memory + * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: @@ -236,7 +258,8 @@ private: PACKING_RATIO = static_cast(sizeof(PackedCounter) / sizeof(DigitCounter)), LOG_PACKING_RATIO = Log2::VALUE, - LOG_COUNTER_LANES = CUB_MAX((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0), // Always at least one lane + // Always at least one lane + LOG_COUNTER_LANES = CUB_MAX((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0), COUNTER_LANES = 1 << LOG_COUNTER_LANES, // The number of packed counters per thread (plus one for padding) @@ -414,17 +437,18 @@ private: public: - /// \smemstorage{BlockScan} + /// @smemstorage{BlockScan} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary + * storage. */ __device__ __forceinline__ BlockRadixRank() : @@ -432,35 +456,40 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. + * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockRadixRank( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockRadixRank(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Raking + * @name Raking *********************************************************************/ //@{ /** - * \brief Rank keys. + * @brief Rank keys. + * + * @param[in] keys + * Keys for this tile + * + * @param[out] ranks + * For each key, the local rank within the tile + * + * @param[in] digit_extractor + * The digit extractor */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD, - typename DigitExtractorT> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile - DigitExtractorT digit_extractor) ///< [in] The digit extractor + template + __device__ __forceinline__ void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor) { static_assert(BLOCK_THREADS * KEYS_PER_THREAD <= max_tile_size, "DigitCounter type is too small to hold this number of keys"); @@ -515,19 +544,30 @@ public: } } - /** - * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + * @brief Rank keys. 
For the lower @p RADIX_DIGITS threads, digit counts for each digit are + * provided for the corresponding thread. + * + * @param[in] keys + * Keys for this tile + * + * @param[out] ranks + * For each key, the local rank within the tile (out parameter) + * + * @param[in] digit_extractor + * The digit extractor + * + * @param[out] exclusive_digit_prefix + * The exclusive prefix sum for the digits + * [(threadIdx.x * BINS_TRACKED_PER_THREAD) + * ... + * (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD, - typename DigitExtractorT> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) - DigitExtractorT digit_extractor, ///< [in] The digit extractor - int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + template + __device__ __forceinline__ void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor, + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) { static_assert(BLOCK_THREADS * KEYS_PER_THREAD <= max_tile_size, "DigitCounter type is too small to hold this number of keys"); @@ -654,36 +694,37 @@ private: public: - /// \smemstorage{BlockScan} + /// @smemstorage{BlockScan} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. + * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockRadixRankMatch( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockRadixRankMatch(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Raking + * @name Raking *********************************************************************/ //@{ - /** \brief Computes the count of keys for each digit value, and calls the - * callback with the array of key counts. - + /** + * @brief Computes the count of keys for each digit value, and calls the + * callback with the array of key counts. + * * @tparam CountsCallback The callback type. It should implement an instance * overload of operator()(int (&bins)[BINS_TRACKED_PER_THREAD]), where bins * is an array of key counts for each digit value distributed in block @@ -726,18 +767,25 @@ public: } /** - * \brief Rank keys. + * @brief Rank keys. 
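 The CountsCallback contract described for BlockRadixRankMatch above is easier to see as code; a minimal sketch of a functor that satisfies it, with an illustrative name and a plain copy as its action:

 // Records the per-thread slice of the digit histogram that
 // BlockRadixRankMatch passes to its callback during ranking
 template <int BINS_TRACKED_PER_THREAD>
 struct StoreCountsCallback
 {
     int (&counts)[BINS_TRACKED_PER_THREAD];

     __device__ explicit StoreCountsCallback(int (&counts)[BINS_TRACKED_PER_THREAD])
         : counts(counts)
     {}

     // Required overload: bins holds this thread's digit counts
     __device__ void operator()(int (&bins)[BINS_TRACKED_PER_THREAD])
     {
         #pragma unroll
         for (int i = 0; i < BINS_TRACKED_PER_THREAD; ++i)
         {
             counts[i] = bins[i];
         }
     }
 };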
+ * + * @param[in] keys + * Keys for this tile + * + * @param[out] ranks + * For each key, the local rank within the tile + * + * @param[in] digit_extractor + * The digit extractor */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD, - typename DigitExtractorT, - typename CountsCallback> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile - DigitExtractorT digit_extractor, ///< [in] The digit extractor - CountsCallback callback) + template + __device__ __forceinline__ void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor, + CountsCallback callback) { // Initialize shared digit counters @@ -840,19 +888,33 @@ public: } /** - * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + * @brief Rank keys. For the lower @p RADIX_DIGITS threads, digit counts for each digit are + * provided for the corresponding thread. + * + * @param[in] keys + * Keys for this tile + * + * @param[out] ranks + * For each key, the local rank within the tile (out parameter) + * + * @param[in] digit_extractor + * The digit extractor + * + * @param[out] exclusive_digit_prefix + * The exclusive prefix sum for the digits + * [(threadIdx.x * BINS_TRACKED_PER_THREAD) + * ... + * (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD, - typename DigitExtractorT, - typename CountsCallback> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) - DigitExtractorT digit_extractor, ///< [in] The digit extractor - int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD], ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] - CountsCallback callback) + template + __device__ __forceinline__ void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor, + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD], + CountsCallback callback) { RankKeys(keys, ranks, digit_extractor, callback); @@ -872,15 +934,24 @@ public: } } - template < - typename UnsignedBits, - int KEYS_PER_THREAD, - typename DigitExtractorT> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) - DigitExtractorT digit_extractor, - int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + /** + * @param[in] keys + * Keys for this tile + * + * @param[out] ranks + * For each key, the local rank within the tile (out parameter) + * + * @param[out] exclusive_digit_prefix + * The exclusive prefix sum for the digits + * [(threadIdx.x * BINS_TRACKED_PER_THREAD) + * ... 
+ * (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + */ + template + __device__ __forceinline__ void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor, + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) { RankKeys(keys, ranks, digit_extractor, exclusive_digit_prefix, BlockRadixRankEmptyCallback()); @@ -1158,7 +1229,8 @@ struct BlockRadixRankMatchEarlyCounts (TempStorage& temp_storage) : temp_storage(temp_storage) {} /** - * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + * @brief Rank keys. For the lower @p RADIX_DIGITS threads, digit counts for each digit are + * provided for the corresponding thread. */ template diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 1a98a66f646..7b1522a6743 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -27,11 +27,11 @@ ******************************************************************************/ /** - * \file - * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block. + * @file + * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix + * sorting of items partitioned across a CUDA thread block. */ - #pragma once #include "../config.cuh" @@ -376,16 +376,35 @@ private: Int2Type /*is_blocked*/) {} - /// Sort blocked arrangement + /** + * @brief Sort blocked arrangement + * + * @param keys + * Keys to sort + * + * @param values + * Values to sort + * + * @param begin_bit + * The beginning (least-significant) bit index needed for key comparison + * + * @param end_bit + * The past-the-end (most-significant) bit index needed for key comparison + * + * @param is_descending + * Tag whether is a descending-order sort + * + * @param is_keys_only + * Tag whether is keys-only sort + */ template - __device__ __forceinline__ void SortBlocked( - KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort - int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison - Int2Type is_descending, ///< Tag whether is a descending-order sort - Int2Type is_keys_only, ///< Tag whether is keys-only sort - DecomposerT decomposer = {}) + __device__ __forceinline__ void SortBlocked(KeyT (&keys)[ITEMS_PER_THREAD], + ValueT (&values)[ITEMS_PER_THREAD], + int begin_bit, + int end_bit, + Int2Type is_descending, + Int2Type is_keys_only, + DecomposerT decomposer = {}) { bit_ordered_type (&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast(keys); @@ -433,16 +452,35 @@ public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - /// Sort blocked -> striped arrangement + /** + * @brief Sort blocked -> striped arrangement + * + * @param keys + * Keys to sort + * + * @param values + * Values to sort + * + * @param begin_bit + * The beginning (least-significant) bit index needed for key comparison + * + * @param end_bit + * The past-the-end (most-significant) bit index needed for key comparison + * + * @param is_descending + * Tag whether is a descending-order sort + * + * @param is_keys_only + * Tag whether is keys-only sort + */ template - __device__ __forceinline__ void SortBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort - 
ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort - int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison - Int2Type is_descending, ///< Tag whether is a descending-order sort - Int2Type is_keys_only, ///< Tag whether is keys-only sort - DecomposerT decomposer = {}) + __device__ __forceinline__ void SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], + ValueT (&values)[ITEMS_PER_THREAD], + int begin_bit, + int end_bit, + Int2Type is_descending, + Int2Type is_keys_only, + DecomposerT decomposer = {}) { bit_ordered_type (&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast(keys); @@ -498,17 +536,17 @@ public: #endif // DOXYGEN_SHOULD_SKIP_THIS - /// \smemstorage{BlockRadixSort} + /// @smemstorage{BlockRadixSort} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockRadixSort() : @@ -516,37 +554,38 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. + * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockRadixSort( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockRadixSort(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Sorting (blocked arrangements) + * @name Sorting (blocked arrangements) *********************************************************************/ //@{ /** - * \brief Performs an ascending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. + * @brief Performs an ascending block-wide radix sort over a [blocked + * arrangement](index.html#sec5sec3) of keys. * - * \par - * - \granularity - * - \smemreuse + * @par + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive keys. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -564,17 +603,25 @@ public: * // Collectively sort the keys * BlockRadixSort(temp_storage).Sort(thread_keys); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. 
- * The corresponding output \p thread_keys in those threads will be + * The corresponding output @p thread_keys in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + * @param[in-out] keys + * Keys to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void Sort( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { NullType values[ITEMS_PER_THREAD]; @@ -712,29 +759,30 @@ public: } /** - * \brief Performs an ascending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * @brief Performs an ascending block-wide radix sort across a + * [blocked arrangement](index.html#sec5sec3) of keys and values. * - * \par + * @par * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" * more than one tile of values, simply perform a key-value sort of the keys paired * with a temporary value array that enumerates the key indices. The reordered indices * can then be used as a gather-vector for exchanging other associated tile data through * shared memory. - * - \granularity - * - \smemreuse + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys and values that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive pairs. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and + * values each typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; @@ -747,19 +795,29 @@ public: * // Collectively sort the keys and values among block threads * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be + * corresponding output @p thread_keys in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. 
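 The wrapped snippet above drops the template arguments from the BlockRadixSort typedef and folds them into the comment; a self-contained version of the same 128-thread key-value sort, with illustrative kernel and buffer names:

 #include <cub/cub.cuh>

 __global__ void KeyValueSortKernel(int *d_keys, int *d_values)
 {
     // Specialize BlockRadixSort for a 1D block of 128 threads owning
     // 4 integer keys and 4 integer values each
     using BlockRadixSortT = cub::BlockRadixSort<int, 128, 4, int>;

     // Allocate shared memory for BlockRadixSort
     __shared__ typename BlockRadixSortT::TempStorage temp_storage;

     // Obtain a blocked arrangement of consecutive keys and values per thread
     int thread_keys[4];
     int thread_values[4];
     cub::LoadDirectBlocked(threadIdx.x, d_keys, thread_keys);
     cub::LoadDirectBlocked(threadIdx.x, d_values, thread_values);

     // Collectively sort the keys and values among block threads
     BlockRadixSortT(temp_storage).Sort(thread_keys, thread_values);
 }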
* + * @param[in-out] keys + * Keys to sort + * + * @param[in-out] values + * Values to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void Sort( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD], + ValueT (&values)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } @@ -919,18 +977,19 @@ public: } /** - * \brief Performs a descending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. + * @brief Performs a descending block-wide radix sort over a + * [blocked arrangement](index.html#sec5sec3) of keys. * - * \par - * - \granularity - * - \smemreuse + * @par + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive keys. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -948,17 +1007,25 @@ public: * // Collectively sort the keys * BlockRadixSort(temp_storage).Sort(thread_keys); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. - * The corresponding output \p thread_keys in those threads will be + * The corresponding output @p thread_keys in those threads will be * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. + * + * @param[in-out] keys + * Keys to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void SortDescending( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { NullType values[ITEMS_PER_THREAD]; @@ -1107,29 +1174,30 @@ public: } /** - * \brief Performs a descending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * @brief Performs a descending block-wide radix sort across a [blocked + * arrangement](index.html#sec5sec3) of keys and values. * - * \par + * @par * - BlockRadixSort can only accommodate one associated tile of values. 
To "truck along" * more than one tile of values, simply perform a key-value sort of the keys paired * with a temporary value array that enumerates the key indices. The reordered indices * can then be used as a gather-vector for exchanging other associated tile data through * shared memory. - * - \granularity - * - \smemreuse + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys and values that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive pairs. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and + * values each typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; @@ -1142,19 +1210,29 @@ public: * // Collectively sort the keys and values among block threads * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be + * corresponding output @p thread_keys in those threads will be * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. * + * @param[in-out] keys + * Keys to sort + * + * @param[in-out] values + * Values to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void SortDescending( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], + ValueT (&values)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } @@ -1317,24 +1395,25 @@ public: //@} end member group /******************************************************************//** - * \name Sorting (blocked arrangement -> striped arrangement) + * @name Sorting (blocked arrangement -> striped arrangement) *********************************************************************/ //@{ - /** - * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * @brief Performs an ascending radix sort across a + * [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a + * [striped arrangement](index.html#sec5sec3). 
* - * \par - * - \granularity - * - \smemreuse + * @par + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive keys. The final partitioning is striped. - * \par - * \code + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 + * threads where each thread owns 4 consecutive keys. The final partitioning is striped. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1352,18 +1431,25 @@ public: * // Collectively sort the keys * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be + * corresponding output @p thread_keys in those threads will be * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. * + * @param[in-out] keys + * Keys to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void SortBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { NullType values[ITEMS_PER_THREAD]; @@ -1514,29 +1600,31 @@ public: } /** - * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * @brief Performs an ascending radix sort across a + * [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them + * in a [striped arrangement](index.html#sec5sec3). * - * \par + * @par * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" * more than one tile of values, simply perform a key-value sort of the keys paired * with a temporary value array that enumerates the key indices. The reordered indices * can then be used as a gather-vector for exchanging other associated tile data through * shared memory. - * - \granularity - * - \smemreuse + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys and values that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive pairs. The final partitioning is striped. - * \par - * \code + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 + * threads where each thread owns 4 consecutive pairs. The final partitioning is striped. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) 
* { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and + * values each typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; @@ -1549,19 +1637,29 @@ public: * // Collectively sort the keys and values among block threads * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be + * corresponding output @p thread_keys in those threads will be * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. * + * @param[in-out] keys + * Keys to sort + * + * @param[in-out] values + * Values to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void SortBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], + ValueT (&values)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } @@ -1715,18 +1813,20 @@ public: } /** - * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * @brief Performs a descending radix sort across a + * [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a + * [striped arrangement](index.html#sec5sec3). * - * \par - * - \granularity - * - \smemreuse + * @par + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive keys. The final partitioning is striped. - * \par - * \code + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 + * threads where each thread owns 4 consecutive keys. The final partitioning is striped. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1744,18 +1844,25 @@ public: * // Collectively sort the keys * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. 
The - * corresponding output \p thread_keys in those threads will be + * corresponding output @p thread_keys in those threads will be * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. * + * @param[in-out] keys + * Keys to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void SortDescendingBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { NullType values[ITEMS_PER_THREAD]; @@ -1906,29 +2013,31 @@ public: } /** - * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * @brief Performs a descending radix sort across a + * [blocked arrangement](index.html#sec5sec3) of keys and values, + * leaving them in a [striped arrangement](index.html#sec5sec3). * - * \par + * @par * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" * more than one tile of values, simply perform a key-value sort of the keys paired * with a temporary value array that enumerates the key indices. The reordered indices * can then be used as a gather-vector for exchanging other associated tile data through * shared memory. - * - \granularity - * - \smemreuse + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys and values that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive pairs. The final partitioning is striped. - * \par - * \code + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 + * threads where each thread owns 4 consecutive pairs. The final partitioning is striped. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and + * values each typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; @@ -1941,19 +2050,30 @@ public: * // Collectively sort the keys and values among block threads * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. 
The - * corresponding output \p thread_keys in those threads will be + * corresponding output @p thread_keys in those threads will be * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. * + * @param[in-out] keys + * Keys to sort + * + * @param[in-out] values + * Values to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void SortDescendingBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void + SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], + ValueT (&values)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } @@ -2115,4 +2235,3 @@ public: */ CUB_NAMESPACE_END - diff --git a/cub/cub/block/block_raking_layout.cuh b/cub/cub/block/block_raking_layout.cuh index fbe332c9e07..0f7588d8881 100644 --- a/cub/cub/block/block_raking_layout.cuh +++ b/cub/cub/block/block_raking_layout.cuh @@ -27,11 +27,11 @@ ******************************************************************************/ /** - * \file - * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. + * @file + * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking + * across thread block data. */ - #pragma once #include "../config.cuh" @@ -47,19 +47,26 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) - * \ingroup BlockModule + * @brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking + * across thread block data. ![](raking.png) + * + * @ingroup BlockModule * - * \par Overview + * @par Overview * This type facilitates a shared memory usage pattern where a block of CUDA * threads places elements into shared memory and then reduces the active * parallelism to one "raking" warp of threads for serially aggregating consecutive * sequences of shared items. Padding is inserted to eliminate bank conflicts * (for most data types). * - * \tparam T The data type to be exchanged. - * \tparam BLOCK_THREADS The thread block size in threads. - * \tparam LEGACY_PTX_ARCH [optional] Unused. + * @tparam T + * The data type to be exchanged. + * + * @tparam BLOCK_THREADS + * The thread block size in threads. + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. 
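A minimal sketch of the raking pattern described above, assuming the layout exposes RAKING_THREADS and SEGMENT_LENGTH constants and that a 128-thread block yields full segments (no guarding needed); the kernel name and pointers are illustrative, and a warp-level reduction of the per-thread partials would normally follow:

#include <cub/cub.cuh>

__global__ void RakingPartialsKernel(int *d_in, int *d_partials)
{
    // Conflict-free layout for 128 integers, raked by a single warp
    typedef cub::BlockRakingLayout<int, 128> BlockRakingLayout;
    __shared__ typename BlockRakingLayout::TempStorage temp_storage;

    int tid = threadIdx.x;

    // Every thread places one element into the raking grid
    *BlockRakingLayout::PlacementPtr(temp_storage, tid) = d_in[tid];
    __syncthreads();

    // Only the raking threads stay active; each serially reduces its segment
    if (tid < BlockRakingLayout::RAKING_THREADS)
    {
        int *segment = BlockRakingLayout::RakingPtr(temp_storage, tid);
        int partial  = segment[0];
        for (int i = 1; i < BlockRakingLayout::SEGMENT_LENGTH; ++i)
        {
            partial += segment[i];
        }
        d_partials[tid] = partial;
    }
}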
*/ template < typename T, @@ -105,7 +112,7 @@ struct BlockRakingLayout /** - * \brief Shared memory storage type + * @brief Shared memory storage type */ struct __align__(16) _TempStorage { @@ -117,7 +124,7 @@ struct BlockRakingLayout /** - * \brief Returns the location for the calling thread to place data into the grid + * @brief Returns the location for the calling thread to place data into the grid */ static __device__ __forceinline__ T* PlacementPtr( TempStorage &temp_storage, @@ -138,7 +145,7 @@ struct BlockRakingLayout /** - * \brief Returns the location for the calling thread to begin sequential raking + * @brief Returns the location for the calling thread to begin sequential raking */ static __device__ __forceinline__ T* RakingPtr( TempStorage &temp_storage, diff --git a/cub/cub/block/block_reduce.cuh b/cub/cub/block/block_reduce.cuh index 8e0814604e0..6c7a4352ee7 100644 --- a/cub/cub/block/block_reduce.cuh +++ b/cub/cub/block/block_reduce.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. + * @file + * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing + * a parallel reduction of items partitioned across a CUDA thread block. */ #pragma once @@ -64,11 +65,11 @@ enum BlockReduceAlgorithm { /** - * \par Overview + * @par Overview * An efficient "raking" reduction algorithm that only supports commutative * reduction operators (true for most operations, e.g., addition). * - * \par + * @par * Execution is comprised of three phases: * -# Upsweep sequential reduction in registers (if threads contribute more * than one input each). Threads in warps other than the first warp place @@ -77,11 +78,11 @@ enum BlockReduceAlgorithm * warp continue to accumulate by raking across segments of shared partial reductions * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. * - * \par - * \image html block_reduce.png + * @par + * @image html block_reduce.png *
\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
* - * \par Performance Considerations + * @par Performance Considerations * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE * and is preferable when the reduction operator is commutative. This variant * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall @@ -93,12 +94,12 @@ enum BlockReduceAlgorithm /** - * \par Overview + * @par Overview * An efficient "raking" reduction algorithm that supports commutative * (e.g., addition) and non-commutative (e.g., string concatenation) reduction * operators. \blocked. * - * \par + * @par * Execution is comprised of three phases: * -# Upsweep sequential reduction in registers (if threads contribute more * than one input each). Each thread then places the partial reduction @@ -107,11 +108,11 @@ enum BlockReduceAlgorithm * single warp rake across segments of shared partial reductions. * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. * - * \par - * \image html block_reduce.png + * @par + * @image html block_reduce.png *
\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
* - * \par Performance Considerations + * @par Performance Considerations * - This variant performs more communication than BLOCK_REDUCE_RAKING * and is only preferable when the reduction operator is non-commutative. This variant * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall @@ -123,12 +124,12 @@ enum BlockReduceAlgorithm /** - * \par Overview + * @par Overview * A quick "tiled warp-reductions" reduction algorithm that supports commutative * (e.g., addition) and non-commutative (e.g., string concatenation) reduction * operators. * - * \par + * @par * Execution is comprised of four phases: * -# Upsweep sequential reduction in registers (if threads contribute more * than one input each). Each thread then places the partial reduction @@ -138,11 +139,11 @@ enum BlockReduceAlgorithm * -# A propagation phase where the warp reduction outputs in each warp are * updated with the aggregate from each preceding warp. * - * \par - * \image html block_scan_warpscans.png + * @par + * @image html block_scan_warpscans.png *
\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
* - * \par Performance Considerations + * @par Performance Considerations * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING * or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall * throughput across the GPU. However turn-around latency may be lower and @@ -157,43 +158,67 @@ enum BlockReduceAlgorithm ******************************************************************************/ /** - * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png) - * \ingroup BlockModule + * @brief The BlockReduce class provides [collective](index.html#sec0) + * methods for computing a parallel reduction of items partitioned across + * a CUDA thread block. ![](reduce_logo.png) * - * \tparam T Data type being reduced - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. + * @ingroup BlockModule * - * \par Overview - * - A reduction (or fold) - * uses a binary combining operator to compute a single aggregate from a list of input elements. - * - \rowmajor - * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles: - * -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY. An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) - * -# cub::BLOCK_REDUCE_RAKING. An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) - * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * @tparam T + * Data type being reduced * - * \par Performance Considerations - * - \granularity + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam ALGORITHM + * [optional] cub::BlockReduceAlgorithm enumerator specifying + * the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) + * + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview + * - A reduction + * (or fold) uses a binary combining operator to compute a single aggregate from a list of + * input elements. + * - @rowmajor + * - BlockReduce can be optionally specialized by algorithm to accommodate different + * latency/throughput workload profiles: + * -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY. + * An efficient "raking" reduction algorithm that only + * supports commutative reduction operators. + * [More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_RAKING. + * An efficient "raking" reduction algorithm that supports commutative and + * non-commutative reduction operators. 
[More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. + * A quick "tiled warp-reductions" reduction algorithm that supports commutative and + * non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * + * @par Performance Considerations + * - @granularity * - Very efficient (only one synchronization barrier). * - Incurs zero bank conflicts for most types * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: * - Summation (vs. generic reduction) - * - \p BLOCK_THREADS is a multiple of the architecture's warp size + * - @p BLOCK_THREADS is a multiple of the architecture's warp size * - Every thread has a valid input (i.e., full vs. partial-tiles) * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives * - * \par A Simple Example - * \blockcollective{BlockReduce} - * \par + * @par A Simple Example + * @blockcollective{BlockReduce} + * @par * The code snippet below illustrates a sum reduction of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -211,9 +236,9 @@ enum BlockReduceAlgorithm * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data); * - * \endcode + * @endcode * - * \par Re-using dynamically allocating shared memory + * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: @@ -282,17 +307,17 @@ private: public: - /// \smemstorage{BlockReduce} + /// @smemstorage{BlockReduce} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockReduce() : @@ -300,38 +325,38 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. + * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockReduce( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockReduce(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Generic reductions + * @name Generic reductions *********************************************************************/ //@{ - /** - * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. 
Each thread contributes one input element. + * @brief Computes a block-wide reduction for thread0 using the specified binary + * reduction functor. Each thread contributes one input element. * - * \par + * @par * - The return value is undefined in threads other than thread0. - * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a max reduction of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -349,33 +374,39 @@ public: * // Compute the block-wide max for thread0 * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); * - * \endcode + * @endcode + * + * @tparam ReductionOp + * [inferred] Binary reduction functor type having member + * T operator()(const T &a, const T &b) * - * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * @param[in] input + * Calling thread's input + * + * @param[in] reduction_op + * Binary reduction functor */ template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op) ///< [in] Binary reduction functor + __device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op) { return InternalBlockReduce(temp_storage).template Reduce(input, BLOCK_THREADS, reduction_op); } - /** - * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements. + * @brief Computes a block-wide reduction for thread0 using the specified binary + * reduction functor. Each thread contributes an array of consecutive input elements. * - * \par + * @par * - The return value is undefined in threads other than thread0. - * - \granularity - * - \smemreuse + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a max reduction of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -393,37 +424,43 @@ public: * // Compute the block-wide max for thread0 * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); * - * \endcode + * @endcode + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * @tparam ReductionOp + * [inferred] Binary reduction functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] inputs + * Calling thread's input segment + * + * @param[in] reduction_op + * Binary reduction functor */ - template < - int ITEMS_PER_THREAD, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment - ReductionOp reduction_op) ///< [in] Binary reduction functor + template + __device__ __forceinline__ T Reduce(T (&inputs)[ITEMS_PER_THREAD], ReductionOp reduction_op) { // Reduce partials T partial = internal::ThreadReduce(inputs, reduction_op); return Reduce(partial, reduction_op); } - /** - * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first \p num_valid threads each contribute one input element. + * @brief Computes a block-wide reduction for thread0 using the specified binary + * reduction functor. The first @p num_valid threads each contribute one input element. * - * \par + * @par * - The return value is undefined in threads other than thread0. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a max reduction of a partially-full tile of integer items that - * are partitioned across 128 threads. - * \par - * \code + * - @rowmajor + * - @smemreuse + * + * @par Snippet + * The code snippet below illustrates a max reduction of a partially-full tile of integer items + * that are partitioned across 128 threads. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int num_valid, ...) @@ -441,15 +478,23 @@ public: * // Compute the block-wide max for thread0 * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid); * - * \endcode + * @endcode + * + * @tparam ReductionOp + * [inferred] Binary reduction functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input * - * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * @param[in] reduction_op + * Binary reduction functor + * + * @param[in] num_valid + * Number of threads containing valid elements (may be less than BLOCK_THREADS) */ template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op, ///< [in] Binary reduction functor - int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + __device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op, int num_valid) { // Determine if we skip bounds checking if (num_valid >= BLOCK_THREADS) @@ -465,24 +510,24 @@ public: //@} end member group /******************************************************************//** - * \name Summation reductions + * @name Summation reductions *********************************************************************/ //@{ - /** - * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element. + * @brief Computes a block-wide reduction for thread0 using addition (+) + * as the reduction operator. Each thread contributes one input element. * - * \par + * @par * - The return value is undefined in threads other than thread0. 
- * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sum reduction of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -500,29 +545,32 @@ public: * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data); * - * \endcode + * @endcode * + * @param[in] input + * Calling thread's input */ - __device__ __forceinline__ T Sum( - T input) ///< [in] Calling thread's input + __device__ __forceinline__ T Sum(T input) { return InternalBlockReduce(temp_storage).template Sum(input, BLOCK_THREADS); } /** - * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements. + * @brief Computes a block-wide reduction for thread0 using addition (+) + * as the reduction operator. Each thread contributes an array of consecutive input + * elements. * - * \par + * @par * - The return value is undefined in threads other than thread0. - * - \granularity - * - \smemreuse + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sum reduction of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -540,33 +588,37 @@ public: * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data); * - * \endcode + * @endcode * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @param[in] inputs + * Calling thread's input segment */ template - __device__ __forceinline__ T Sum( - T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment + __device__ __forceinline__ T Sum(T (&inputs)[ITEMS_PER_THREAD]) { // Reduce partials T partial = internal::ThreadReduce(inputs, cub::Sum()); return Sum(partial); } - /** - * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element. + * @brief Computes a block-wide reduction for thread0 using addition (+) + * as the reduction operator. The first @p num_valid threads each contribute one input + * element. * - * \par + * @par * - The return value is undefined in threads other than thread0. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that - * are partitioned across 128 threads. - * \par - * \code + * - @rowmajor + * - @smemreuse + * + * @par Snippet + * The code snippet below illustrates a sum reduction of a partially-full tile of integer items + * that are partitioned across 128 threads. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int num_valid, ...) 
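A minimal sketch of a partially-full-tile sum built on this num_valid overload; the kernel name, the pointers, and the choice of the commutative-only raking algorithm are illustrative:

#include <cub/cub.cuh>

__global__ void PartialTileSumKernel(int *d_in, int *d_block_sums, int num_valid)
{
    // Specialize BlockReduce for 128 threads; the commutative-only raking
    // variant is sufficient for integer addition
    typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    const int tid = static_cast<int>(threadIdx.x);

    // Only the first num_valid threads contribute a valid element
    int thread_data = (tid < num_valid) ? d_in[tid] : 0;

    // Compute the block-wide sum for thread0
    int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);

    // The aggregate is only defined in thread0
    if (tid == 0)
    {
        d_block_sums[blockIdx.x] = aggregate;
    }
}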
@@ -585,12 +637,15 @@ public: * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid); * - * \endcode + * @endcode + * + * @param[in] input + * Calling thread's input * + * @param[in] num_valid + * Number of threads containing valid elements (may be less than BLOCK_THREADS) */ - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input - int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + __device__ __forceinline__ T Sum(T input, int num_valid) { // Determine if we skip bounds checking if (num_valid >= BLOCK_THREADS) @@ -608,7 +663,7 @@ public: }; /** - * \example example_block_reduce.cu + * @example example_block_reduce.cu */ CUB_NAMESPACE_END diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index c14d36fa015..4e26f641f2f 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -47,19 +47,20 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \brief The BlockRunLengthDecode class supports decoding a run-length encoded array of items. That is, given - * the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] many times in the output - * array. - * Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded - * array is runtime-dependent and potentially without any upper bound. To address this, BlockRunLengthDecode allows - * retrieving a "window" from the run-length decoded array. The window's offset can be specified and BLOCK_THREADS * - * DECODED_ITEMS_PER_THREAD (i.e., referred to as window_size) decoded items from the specified window will be returned. + * @brief The BlockRunLengthDecode class supports decoding a run-length encoded array of items. That + * is, given the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] + * many times in the output array. Due to the nature of the run-length decoding algorithm + * ("decompression"), the output size of the run-length decoded array is runtime-dependent and + * potentially without any upper bound. To address this, BlockRunLengthDecode allows retrieving a + * "window" from the run-length decoded array. The window's offset can be specified and + * BLOCK_THREADS * DECODED_ITEMS_PER_THREAD (i.e., referred to as window_size) decoded items from + * the specified window will be returned. * - * \note: Trailing runs of length 0 are supported (i.e., they may only appear at the end of the run_lengths array). - * A run of length zero may not be followed by a run length that is not zero. + * @note: Trailing runs of length 0 are supported (i.e., they may only appear at the end of the + * run_lengths array). A run of length zero may not be followed by a run length that is not zero. * - * \par - * \code + * @par + * @code * __global__ void ExampleKernel(...) * { * // Specialising BlockRunLengthDecode to run-length decode items of type uint64_t @@ -107,23 +108,38 @@ CUB_NAMESPACE_BEGIN * ... * } * } - * \endcode - * \par - * Suppose the set of input \p run_values across the block of threads is + * @endcode + * @par + * Suppose the set of input @p run_values across the block of threads is * { [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] } and - * \p run_lengths is { [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }. 
- * The corresponding output \p decoded_items in those threads will be { [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4], - * [4, 4, 4, 5], ..., [169, 169, 170, 171] } and \p relative_offsets will be { [0, 0, 1, 0], [1, 2, 0, 1], [2, - * 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] } during the first iteration of the while loop. + * @p run_lengths is { [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }. + * The corresponding output @p decoded_items in those threads will be + * { [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4], [4, 4, 4, 5], ..., [169, 169, 170, 171] } + * and @p relative_offsets will be + * { [0, 0, 1, 0], [1, 2, 0, 1], [2, 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] } during the + * first iteration of the while loop. * - * \tparam ItemT The data type of the items being run-length decoded - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam RUNS_PER_THREAD The number of consecutive runs that each thread contributes - * \tparam DECODED_ITEMS_PER_THREAD The maximum number of decoded items that each thread holds - * \tparam DecodedOffsetT Type used to index into the block's decoded items (large enough to hold the sum over all the - * runs' lengths) - * \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension - * \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension + * @tparam ItemT + * The data type of the items being run-length decoded + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam RUNS_PER_THREAD + * The number of consecutive runs that each thread contributes + * + * @tparam DECODED_ITEMS_PER_THREAD + * The maximum number of decoded items that each thread holds + * + * @tparam DecodedOffsetT + * Type used to index into the block's decoded items (large enough to hold the sum over all the + * runs' lengths) + * + * @tparam BLOCK_DIM_Y + * The thread block length in threads along the Y dimension + * + * @tparam BLOCK_DIM_Z + * The thread block length in threads along the Z dimension */ template - __device__ __forceinline__ OffsetT StaticUpperBound(InputIteratorT input, ///< [in] Input sequence - OffsetT num_items, ///< [in] Input sequence length - T val) ///< [in] Search key + template + __device__ __forceinline__ OffsetT StaticUpperBound(InputIteratorT input, + OffsetT num_items, + T val) { OffsetT lower_bound = 0; OffsetT upper_bound = num_items; diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh index 6629fc8b087..90f60e5ea24 100644 --- a/cub/cub/block/block_scan.cuh +++ b/cub/cub/block/block_scan.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. + * @file + * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a + * parallel prefix sum/scan of items partitioned across a CUDA thread block. */ #pragma once @@ -54,13 +55,14 @@ CUB_NAMESPACE_BEGIN ******************************************************************************/ /** - * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block. + * @brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a + * parallel prefix scan across a CUDA thread block. 
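The enumerator below is supplied as BlockScan's third (ALGORITHM) template argument; a small sketch, with the element type and block size chosen purely for illustration:

#include <cub/cub.cuh>

// Latency-oriented specialization: tiled warp-scans
typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS> LowLatencyBlockScan;

// Throughput-oriented specialization: the default raking variant
typedef cub::BlockScan<int, 128> RakingBlockScan;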
*/ enum BlockScanAlgorithm { /** - * \par Overview + * @par Overview * An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases: * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions. @@ -68,11 +70,11 @@ enum BlockScanAlgorithm * -# Downsweep sequential exclusive scan in shared memory. Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output. * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. * - * \par - * \image html block_scan_raking.png + * @par + * @image html block_scan_raking.png *
\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
* - * \par Performance Considerations + * @par Performance Considerations * - Although this variant may suffer longer turnaround latencies when the * GPU is under-occupied, it can often provide higher overall throughput * across the GPU when suitably occupied. @@ -81,7 +83,7 @@ enum BlockScanAlgorithm /** - * \par Overview + * @par Overview * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at * the expense of higher register pressure. Raking threads preserve their * "upsweep" segment of values in registers while performing warp-synchronous @@ -91,18 +93,18 @@ enum BlockScanAlgorithm /** - * \par Overview + * @par Overview * A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases: * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp. * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp. * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. * - * \par - * \image html block_scan_warpscans.png + * @par + * @image html block_scan_warpscans.png *
\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
* - * \par Performance Considerations + * @par Performance Considerations * - Although this variant may suffer lower overall throughput across the * GPU because due to a heavy reliance on inefficient warpscans, it can * often provide lower turnaround latencies when the GPU is under-occupied. @@ -116,50 +118,70 @@ enum BlockScanAlgorithm ******************************************************************************/ /** - * \brief The BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png) - * \ingroup BlockModule + * @brief The BlockScan class provides [collective](index.html#sec0) methods for + * computing a parallel prefix sum/scan of items partitioned across a + * CUDA thread block. ![](block_scan_logo.png) * - * \tparam T Data type being scanned - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ALGORITHM [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. + * @ingroup BlockModule * - * \par Overview + * @tparam T + * Data type being scanned + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam ALGORITHM + * [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) + * + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) * produces an output list where each element is computed to be the reduction * of the elements occurring earlier in the input list. Prefix sum - * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * connotes a prefix scan with the addition operator. The term @em inclusive indicates * that the ith output reduction incorporates the ith input. - * The term \em exclusive indicates the ith input is not incorporated into + * The term @em exclusive indicates the ith input is not incorporated into * the ith output reduction. * - \rowmajor * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles: - * -# cub::BLOCK_SCAN_RAKING. An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) - * -# cub::BLOCK_SCAN_RAKING_MEMOIZE. Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm) - * -# cub::BLOCK_SCAN_WARP_SCANS. A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_RAKING. + * An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. + * [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_RAKING_MEMOIZE. 
+ * Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional + * register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_WARP_SCANS. + * A quick (low latency) "tiled warpscans" prefix scan algorithm. + * [More...](\ref cub::BlockScanAlgorithm) * - * \par Performance Considerations - * - \granularity - * - Uses special instructions when applicable (e.g., warp \p SHFL) + * @par Performance Considerations + * - @granularity + * - Uses special instructions when applicable (e.g., warp @p SHFL) * - Uses synchronization-free communication between warp lanes when applicable * - Invokes a minimal number of minimal block-wide synchronization barriers (only * one or two depending on algorithm selection) * - Incurs zero bank conflicts for most types * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: * - Prefix sum variants (vs. generic scan) - * - \blocksize + * - @blocksize * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives * - * \par A Simple Example - * \blockcollective{BlockScan} - * \par + * @par A Simple Example + * @blockcollective{BlockScan} + * @par * The code snippet below illustrates an exclusive prefix sum of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -177,14 +199,14 @@ enum BlockScanAlgorithm * // Collectively compute the block-wide exclusive prefix sum * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * {[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}. - * The corresponding output \p thread_data in those threads will be + * The corresponding output @p thread_data in those threads will be * {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. * - * \par Re-using dynamically allocating shared memory + * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: @@ -265,17 +287,17 @@ private: ******************************************************************************/ public: - /// \smemstorage{BlockScan} + /// @smemstorage{BlockScan} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockScan() : @@ -283,39 +305,41 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. 
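Passing an explicit TempStorage is what allows one shared allocation to back several collectives in turn; a sketch of that pattern, assuming a union of the two storage types and a barrier between uses (kernel name and data are illustrative):

#include <cub/cub.cuh>

__global__ void ReusedStorageKernel(int *d_data)
{
    typedef cub::BlockScan<int, 128>   BlockScan;
    typedef cub::BlockReduce<int, 128> BlockReduce;

    // One shared allocation serves both collectives, one after the other
    __shared__ union
    {
        typename BlockScan::TempStorage   scan;
        typename BlockReduce::TempStorage reduce;
    } temp_storage;

    int thread_data = d_data[threadIdx.x];

    // First collective: block-wide exclusive prefix sum
    BlockScan(temp_storage.scan).ExclusiveSum(thread_data, thread_data);

    // The storage may only be repurposed once all threads are done with it
    __syncthreads();

    // Second collective: block-wide sum of the scanned values (valid in thread0 only)
    int aggregate = BlockReduce(temp_storage.reduce).Sum(thread_data);

    if (threadIdx.x == 0)
    {
        d_data[0] = aggregate;
    }
}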
+ * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockScan( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockScan(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Exclusive prefix sum operations + * @name Exclusive prefix sum operations *********************************************************************/ //@{ - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. + * @brief Computes an exclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes one input element. + * The value of 0 is applied as the initial value, and is assigned to + * @p output in thread0. * - * \par - * - \identityzero - * - \rowmajor - * - \smemreuse + * @par + * - @identityzero + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an exclusive prefix sum of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -333,35 +357,42 @@ public: * // Collectively compute the block-wide exclusive prefix sum * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 1, 1, ..., 1. The corresponding output @p thread_data in those + * threads will be 0, 1, ..., 127. + * + * @param[in] input + * Calling thread's input item * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + __device__ __forceinline__ void ExclusiveSum(T input, T &output) { T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum()); } - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - \rowmajor - * - \smemreuse - * - * \par Snippet + * @brief Computes an exclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes one input element. + * The value of 0 is applied as the initial value, and is assigned to + * @p output in thread0. Also provides every thread + * with the block-wide @p block_aggregate of all inputs. 
+ * + * @par + * - @identityzero + * - @rowmajor + * - @smemreuse + * + * @par Snippet * The code snippet below illustrates an exclusive prefix sum of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -380,43 +411,56 @@ public: * int block_aggregate; * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. - * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 1, 1, ..., 1. The corresponding output @p thread_data in those + * threads will be 0, 1, ..., 127. Furthermore the value @p 128 will + * be stored in @p block_aggregate for all threads. + * + * @param[in] input + * Calling thread's input item * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &block_aggregate) { T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); } - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \rowmajor - * - \smemreuse - * - * \par Snippet + * @brief Computes an exclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes one input element. + * Instead of using 0 as the block-wide prefix, the call-back functor + * @p block_prefix_callback_op is invoked by the first warp in the block, + * and the value returned by lane0 in that warp is used + * as the "seed" value that logically prefixes the thread block's scan inputs. + * Also provides every thread with the block-wide @p block_aggregate of all inputs. + * + * @par + * - @identityzero + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). The functor's input parameter + * @p block_aggregate is the same value also returned by the scan operation. 
+ * The functor will be invoked by the first warp of threads in the block, + * however only the return value from lane0 is applied + * as the block-wide prefix. Can be stateful. + * - @rowmajor + * - @smemreuse + * + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an exclusive prefix sum over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -464,19 +508,30 @@ public: * // Store scanned items to output segment * d_data[block_offset] = thread_data; * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * @endcode + * @par + * Suppose the input @p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... * The corresponding output for the first segment will be 0, 1, ..., 127. * The output for the second segment will be 128, 129, ..., 255. * - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a + * block-wide prefix to be applied to the logical input sequence. */ template - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + __device__ __forceinline__ void ExclusiveSum(T input, + T &output, + BlockPrefixCallbackOp &block_prefix_callback_op) { ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); } @@ -484,26 +539,28 @@ public: //@} end member group /******************************************************************//** - * \name Exclusive prefix sum operations (multiple data per thread) + * @name Exclusive prefix sum operations (multiple data per thread) *********************************************************************/ //@{ - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. - * - * \par - * - \identityzero - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet + * @brief Computes an exclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes an array of consecutive + * input elements. The value of 0 is applied as the initial value, and is + * assigned to @p output[0] in thread0. + * + * @par + * - @identityzero + * - @blocked + * - @granularity + * - @smemreuse + * + * @par Snippet * The code snippet below illustrates an exclusive prefix sum of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. 
- * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -521,39 +578,50 @@ public: * // Collectively compute the block-wide exclusive prefix sum * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The corresponding output + * @p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to @p input) */ template - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + __device__ __forceinline__ void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD]) { T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum()); } - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet + * @brief Computes an exclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes an array of consecutive + * input elements. The value of 0 is applied as the initial value, and is + * assigned to @p output[0] in thread0. Also provides + * every thread with the block-wide @p block_aggregate of all inputs. + * + * @par + * - @identityzero + * - @blocked + * - @granularity + * - @smemreuse + * + * @par Snippet * The code snippet below illustrates an exclusive prefix sum of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -572,19 +640,30 @@ public: * int block_aggregate; * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output @p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. 
+ * Furthermore the value @p 512 will be stored in @p block_aggregate for all threads. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @param[in] input + * Calling thread's input items * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * @param[out] output + * Calling thread's output items (may be aliased to \p input) + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ template - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + __device__ __forceinline__ void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + T &block_aggregate) { // Reduce consecutive thread items in registers T initial_value{}; @@ -592,28 +671,38 @@ public: ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); } - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet + * @brief Computes an exclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes an array of consecutive + * input elements. Instead of using 0 as the block-wide prefix, the + * call-back functor @p block_prefix_callback_op is invoked by the first warp + * in the block, and the value returned by lane0 in that + * warp is used as the "seed" value that logically prefixes the thread block's + * scan inputs. Also provides every thread with the block-wide + * @p block_aggregate of all inputs. + * + * @par + * - @identityzero + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). + * The functor's input parameter @p block_aggregate is the same value also returned + * by the scan operation. The functor will be invoked by the first warp of threads in + * the block, however only the return value from + * lane0 is applied as the block-wide prefix. + * Can be stateful. + * - @blocked + * - @granularity + * - @smemreuse + * + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an exclusive prefix sum over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. 
Each tile consists - * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) - * across 128 threads where each thread owns 4 consecutive items. - * \par - * \code + * of 512 integer items that are partitioned in a [blocked + * arrangement](index.html#sec5sec3) across 128 threads where each thread owns 4 + * consecutive items. + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -671,22 +760,34 @@ public: * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); * CTA_SYNC(); * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 0, 1, 2, 3, ..., 510, 511. - * The output for the second segment will be 512, 513, 514, 515, ..., 1022, 1023. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @endcode + * @par + * Suppose the input @p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be + * 0, 1, 2, 3, ..., 510, 511. The output for the second segment + * will be 512, 513, 514, 515, ..., 1022, 1023. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to \p input) + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a + * block-wide prefix to be applied to the logical input sequence. */ - template < - int ITEMS_PER_THREAD, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + template + __device__ __forceinline__ void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + BlockPrefixCallbackOp &block_prefix_callback_op) { ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); } @@ -695,24 +796,24 @@ public: //@} end member group // Exclusive prefix sums /******************************************************************//** - * \name Exclusive prefix scan operations + * @name Exclusive prefix scan operations *********************************************************************/ //@{ - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + * @brief Computes an exclusive block-wide prefix scan using the specified binary + * @p scan_op functor. Each thread contributes one input element. * - * \par + * @par * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that * are partitioned across 128 threads. 
- * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -730,37 +831,51 @@ public: * // Collectively compute the block-wide exclusive prefix max scan * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 0, -1, 2, -3, ..., 126, -127. The corresponding output @p thread_data + * in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[in] initial_value + * Initial value to seed the exclusive scan (and is assigned to @p output[0] in + * thread0) + * + * @param[in] scan_op + * Binary scan functor */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op) ///< [in] Binary scan functor + __device__ __forceinline__ void + ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); } - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * @brief Computes an exclusive block-wide prefix scan using the specified + * binary @p scan_op functor. Each thread contributes one input element. + * Also provides every thread with the block-wide @p block_aggregate of all inputs. * - * \par + * @par * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -779,45 +894,66 @@ public: * int block_aggregate; * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. - * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 0, -1, 2, -3, ..., 126, -127. The corresponding output + * @p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * Furthermore the value @p 126 will be stored in @p block_aggregate for all threads. 
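+ *
+ * @par
+ * The scan operator does not have to be one of the functors shipped with CUB.
+ * A minimal sketch using a user-defined operator (illustrative only; @p CustomMin
+ * and @p d_out are placeholders defined by the example):
+ * @code
+ * #include <cub/cub.cuh>
+ * #include <climits>
+ *
+ * // User-defined associative binary operator
+ * struct CustomMin
+ * {
+ *     __device__ __forceinline__ int operator()(const int &a, const int &b) const
+ *     {
+ *         return (b < a) ? b : a;
+ *     }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_out)
+ * {
+ *     using BlockScan = cub::BlockScan<int, 128>;
+ *     __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ *     int thread_data = 128 - static_cast<int>(threadIdx.x);
+ *     int block_aggregate;
+ *
+ *     // Exclusive prefix min, seeded with INT_MAX
+ *     BlockScan(temp_storage).ExclusiveScan(
+ *         thread_data, thread_data, INT_MAX, CustomMin(), block_aggregate);
+ *
+ *     // block_aggregate is the minimum of all 128 inputs (1 for this input)
+ *     if (threadIdx.x == 0) { *d_out = block_aggregate; }
+ * }
+ * @endcode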
+ * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[in] initial_value + * Initial value to seed the exclusive scan (and is assigned to + * @p output[0] in thread0) * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[in] scan_op + * Binary scan functor + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + __device__ __forceinline__ void + ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op, T &block_aggregate) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); } - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes one input element. + * the call-back functor @p block_prefix_callback_op is invoked by the first warp + * in the block, and the value returned by lane0 in that warp + * is used as the "seed" value that logically prefixes the thread block's scan + * inputs. Also provides every thread with the block-wide @p block_aggregate of + * all inputs. + * + * @par + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). The functor's input parameter @p block_aggregate + * is the same value also returned by the scan operation. The functor will be invoked by the + * first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an exclusive prefix max scan over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. 
- * \par - * \code + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -865,23 +1001,39 @@ public: * // Store scanned items to output segment * d_data[block_offset] = thread_data; * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 124, 126. - * The output for the second segment will be 126, 128, 128, 130, ..., 252, 254. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @endcode + * @par + * Suppose the input @p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be + * INT_MIN, 0, 0, 2, ..., 124, 126. The output for the second segment + * will be 126, 128, 128, 130, ..., 252, 254. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a block-wide + * prefix to be applied to the logical input sequence. */ - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + template + __device__ __forceinline__ void ExclusiveScan(T input, + T &output, + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); } @@ -889,26 +1041,27 @@ public: //@} end member group // Inclusive prefix sums /******************************************************************//** - * \name Exclusive prefix scan operations (multiple data per thread) + * @name Exclusive prefix scan operations (multiple data per thread) *********************************************************************/ //@{ - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes an + * array of consecutive input elements. * - * \par + * @par * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet - * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. 
- * \par - * \code + * @par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 512 integer + * items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + * across 128 threads where each thread owns 4 consecutive items. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -926,24 +1079,38 @@ public: * // Collectively compute the block-wide exclusive prefix max scan * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. - * The corresponding output \p thread_data in those threads will be + * The corresponding output @p thread_data in those threads will be * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[in] initial_value + * Initial value to seed the exclusive scan (and is assigned to @p output[0] in + * thread0) + * + * @param[in] scan_op + * Binary scan functor */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op) ///< [in] Binary scan functor + template + __device__ __forceinline__ void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + T initial_value, + ScanOp scan_op) { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); @@ -955,22 +1122,24 @@ public: internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); } - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes an + * array of consecutive input elements. Also provides every thread + * with the block-wide @p block_aggregate of all inputs. * - * \par + * @par * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. 
- * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -989,24 +1158,43 @@ public: * int block_aggregate; * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The - * corresponding output \p thread_data in those threads will be { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. - * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output @p thread_data in those threads will be + * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. + * Furthermore the value @p 510 will be stored in @p block_aggregate for all threads. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param input + * [in] Calling thread's input items + * + * @param output + * [out] Calling thread's output items (may be aliased to @p input) + * + * @param initial_value + * [in] Initial value to seed the exclusive scan + * (and is assigned to @p output[0] in thread0) + * + * @param scan_op + * [in] Binary scan functor + * + * @param block_aggregate + * [out] block-wide aggregate reduction of input items */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + template + __device__ __forceinline__ void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + T initial_value, + ScanOp scan_op, + T &block_aggregate) { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); @@ -1018,27 +1206,33 @@ public: internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); } - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. 
- * The functor will be invoked by the first warp of threads in the block, however only the return value from + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes an + * array of consecutive input elements. The call-back functor + * @p block_prefix_callback_op is invoked by the first warp in the block, + * and the value returned by lane0 in that warp is used as + * the "seed" value that logically prefixes the thread block's scan inputs. + * Also provides every thread with the block-wide @p block_aggregate of all inputs. + * + * @par + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). The functor's input parameter @p block_aggregate + * is the same value also returned by the scan operation. The functor will be invoked by the + * first warp of threads in the block, however only the return value from * lane0 is applied as the block-wide prefix. Can be stateful. * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an exclusive prefix max scan over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -1095,25 +1289,43 @@ public: * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); * CTA_SYNC(); * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. - * The output for the second segment will be 510, 512, 512, 514, 514, 516, ..., 1020, 1022. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @endcode + * @par + * Suppose the input @p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be + * INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. + * The output for the second segment will be + * 510, 512, 512, 514, 514, 516, ..., 1020, 1022. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param input + * [in] Calling thread's input items + * + * @param output + * [out] Calling thread's output items (may be aliased to @p input) + * + * @param scan_op + * [in] Binary scan functor + * + * @param block_prefix_callback_op + * [in-out] [warp0 only] Call-back functor for + * specifying a block-wide prefix to be applied to the logical input sequence. 
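+ *
+ * @par
+ * A sketch of how the single-block kernel shown above might be driven from the
+ * host (illustrative only; the sizes, names, and input pattern are assumptions
+ * of this example):
+ * @code
+ * #include <vector>
+ *
+ * int main()
+ * {
+ *     int num_items = 4096;                      // a multiple of the 512-item tile size
+ *     std::vector<int> h_data(num_items);
+ *     for (int i = 0; i < num_items; ++i)        // 0, -1, 2, -3, ... as assumed above
+ *         h_data[i] = (i % 2) ? -i : i;
+ *
+ *     int *d_data;
+ *     cudaMalloc(&d_data, num_items * sizeof(int));
+ *     cudaMemcpy(d_data, h_data.data(), num_items * sizeof(int), cudaMemcpyHostToDevice);
+ *
+ *     // A single block of 128 threads cooperatively scans all tiles in order
+ *     ExampleKernel<<<1, 128>>>(d_data, num_items);
+ *     cudaDeviceSynchronize();
+ *
+ *     cudaFree(d_data);
+ *     return 0;
+ * }
+ * @endcode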
*/ - template < - int ITEMS_PER_THREAD, - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + template + __device__ __forceinline__ void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); @@ -1130,77 +1342,113 @@ public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans /******************************************************************//** - * \name Exclusive prefix scan operations (no initial value, single datum per thread) + * @name Exclusive prefix scan operations (no initial value, single datum per thread) *********************************************************************/ //@{ - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes + * one input element. With no initial value, the output computed + * for thread0 is undefined. * - * \par + * @par * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[in] scan_op + * Binary scan functor */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor + __device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); } - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes + * one input element. Also provides every thread with the block-wide + * @p block_aggregate of all inputs. With no initial value, the output + * computed for thread0 is undefined. * - * \par + * @par * - Supports non-commutative scan operators. 
- * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input item * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[out] output + * Calling thread's output item (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + __device__ __forceinline__ void + ExclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); } //@} end member group /******************************************************************//** - * \name Exclusive prefix scan operations (no initial value, multiple data per thread) + * @name Exclusive prefix scan operations (no initial value, multiple data per thread) *********************************************************************/ //@{ - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no initial value, the output computed for thread0 is undefined. + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes an + * array of consecutive input elements. With no initial value, the + * output computed for thread0 is undefined. * - * \par + * @par * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor + template + __device__ __forceinline__ void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + ScanOp scan_op) { // Reduce consecutive thread items in registers T thread_partial = internal::ThreadReduce(input, scan_op); @@ -1212,27 +1460,44 @@ public: internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); } - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. 
Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. - * - * \par + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes an + * array of consecutive input elements. Also provides every thread + * with the block-wide @p block_aggregate of all inputs. + * With no initial value, the output computed for + * thread0 is undefined. + * + * @par * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan functor + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + template + __device__ __forceinline__ void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + ScanOp scan_op, + T &block_aggregate) { // Reduce consecutive thread items in registers T thread_partial = internal::ThreadReduce(input, scan_op); @@ -1249,23 +1514,23 @@ public: #endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans /******************************************************************//** - * \name Inclusive prefix sum operations + * @name Inclusive prefix sum operations *********************************************************************/ //@{ - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. + * @brief Computes an inclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes one input element. * - * \par - * - \rowmajor - * - \smemreuse + * @par + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an inclusive prefix sum of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1283,32 +1548,37 @@ public: * // Collectively compute the block-wide inclusive prefix sum * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. 
+ * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 1, 1, ..., 1. The corresponding output @p thread_data in those threads + * will be 1, 2, ..., 128. * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + __device__ __forceinline__ void InclusiveSum(T input, T &output) { InclusiveScan(input, output, cub::Sum()); } - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * @brief Computes an inclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes one input element. + * Also provides every thread with the block-wide @p block_aggregate of all inputs. * - * \par - * - \rowmajor - * - \smemreuse + * @par + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an inclusive prefix sum of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1327,41 +1597,54 @@ public: * int block_aggregate; * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. - * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 1, 1, ..., 1. The corresponding output @p thread_data in those + * threads will be 1, 2, ..., 128. Furthermore the value @p 128 will + * be stored in @p block_aggregate for all threads. + * + * @param[in] input + * Calling thread's input item * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + __device__ __forceinline__ void InclusiveSum(T input, T &output, T &block_aggregate) { InclusiveScan(input, output, cub::Sum(), block_aggregate); } - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. 
- * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \rowmajor - * - \smemreuse - * - * \par Snippet + * @brief Computes an inclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes one input element. + * Instead of using 0 as the block-wide prefix, the call-back functor + * @p block_prefix_callback_op is invoked by the first warp in the block, + * and the value returned by lane0 in that warp is + * used as the "seed" value that logically prefixes the thread block's + * scan inputs. Also provides every thread with the block-wide + * @p block_aggregate of all inputs. + * + * @par + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). The functor's input parameter + * @p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, + * however only the return value from lane0 is applied + * as the block-wide prefix. Can be stateful. + * - @rowmajor + * - @smemreuse + * + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an inclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code + * prefix functor to maintain a running total between block-wide scans. + * Each tile consists of 128 integer items that are partitioned across 128 threads. + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -1409,19 +1692,30 @@ public: * // Store scanned items to output segment * d_data[block_offset] = thread_data; * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * @endcode + * @par + * Suppose the input @p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... * The corresponding output for the first segment will be 1, 2, ..., 128. * The output for the second segment will be 129, 130, ..., 256. * - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a + * block-wide prefix to be applied to the logical input sequence. */ template - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ __device__ __forceinline__ void InclusiveSum(T input, + T &output, + BlockPrefixCallbackOp &block_prefix_callback_op) { InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); } @@ -1429,25 +1723,26 @@ public: //@} end member group /******************************************************************//** - * \name Inclusive prefix sum operations (multiple data per thread) + * @name Inclusive prefix sum operations (multiple data per thread) *********************************************************************/ //@{ - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. + * @brief Computes an inclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes an array of + * consecutive input elements. * - * \par - * - \blocked - * - \granularity - * - \smemreuse + * @par + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an inclusive prefix sum of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1465,17 +1760,25 @@ public: * // Collectively compute the block-wide inclusive prefix sum * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The corresponding output + * @p thread_data in those threads will be + * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @param[in] input + * Calling thread's input items * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * @param[out] output + * Calling thread's output items (may be aliased to @p input) */ template - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + __device__ __forceinline__ void InclusiveSum(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD]) { if (ITEMS_PER_THREAD == 1) { @@ -1495,21 +1798,23 @@ public: } } - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * @brief Computes an inclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes an array of consecutive + * input elements. Also provides every thread with the block-wide + * @p block_aggregate of all inputs. 
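+ *
+ * @par
+ * A minimal sketch highlighting that, for an inclusive sum, the aggregate equals
+ * the last item produced by the last thread (illustrative only; @p d_check is a
+ * placeholder output):
+ * @code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_check)
+ * {
+ *     using BlockScan = cub::BlockScan<int, 128>;
+ *     __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ *     int thread_data[4] = {1, 1, 1, 1};
+ *     int block_aggregate;
+ *
+ *     // Inclusive prefix sum over 128 threads x 4 items
+ *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
+ *
+ *     // Both values below are 512 for this all-ones input
+ *     if (threadIdx.x == 127)
+ *     {
+ *         d_check[0] = thread_data[3];
+ *         d_check[1] = block_aggregate;
+ *     }
+ * }
+ * @endcode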
* - * \par - * - \blocked - * - \granularity - * - \smemreuse + * @par + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an inclusive prefix sum of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1528,22 +1833,34 @@ public: * int block_aggregate; * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be + * corresponding output @p thread_data in those threads will be * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. - * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * Furthermore the value @p 512 will be stored in @p block_aggregate for all threads. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ template - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + __device__ __forceinline__ void InclusiveSum(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + T &block_aggregate) { if (ITEMS_PER_THREAD == 1) { @@ -1563,27 +1880,36 @@ public: } } - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet + * @brief Computes an inclusive block-wide prefix scan using addition (+) + * as the scan operator. 
Each thread contributes an array of consecutive + * input elements. Instead of using 0 as the block-wide prefix, the + * call-back functor @p block_prefix_callback_op is invoked by the first + * warp in the block, and the value returned by lane0 + * in that warp is used as the "seed" value that logically prefixes the + * thread block's scan inputs. Also provides every thread with the + * block-wide @p block_aggregate of all inputs. + * + * @par + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). The functor's input parameter + * @p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, + * however only the return value from lane0 is applied + * as the block-wide prefix. Can be stateful. + * - @blocked + * - @granularity + * - @smemreuse + * + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an inclusive prefix sum over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) - * across 128 threads where each thread owns 4 consecutive items. - * \par - * \code + * of 512 integer items that are partitioned in a + * [blocked arrangement](index.html#sec5sec3) across 128 threads where each thread + * owns 4 consecutive items. + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -1640,22 +1966,34 @@ public: * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); * CTA_SYNC(); * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 1, 2, 3, 4, ..., 511, 512. - * The output for the second segment will be 513, 514, 515, 516, ..., 1023, 1024. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @endcode + * @par + * Suppose the input @p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be + * 1, 2, 3, 4, ..., 511, 512. The output for the second segment will be + * 513, 514, 515, 516, ..., 1023, 1024. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a + * block-wide prefix to be applied to the logical input sequence. */ - template < - int ITEMS_PER_THREAD, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ template + __device__ __forceinline__ void InclusiveSum(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + BlockPrefixCallbackOp &block_prefix_callback_op) { if (ITEMS_PER_THREAD == 1) { @@ -1678,24 +2016,25 @@ public: //@} end member group /******************************************************************//** - * \name Inclusive prefix scan operations + * @name Inclusive prefix scan operations *********************************************************************/ //@{ - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + * @brief Computes an inclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes + * one input element. * - * \par + * @par * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1713,36 +2052,47 @@ public: * // Collectively compute the block-wide inclusive prefix max scan * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 0, -1, 2, -3, ..., 126, -127. The corresponding output @p thread_data + * in those threads will be 0, 0, 2, 2, ..., 126, 126. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param input + * [in] Calling thread's input item * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param output + * [out] Calling thread's output item (may be aliased to @p input) + * + * @param scan_op + * [in] Binary scan functor */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor + __device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op) { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); } - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * @brief Computes an inclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes + * one input element. Also provides every thread with the block-wide + * @p block_aggregate of all inputs. * - * \par + * @par * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. 
- * \par - * \code + * - @rowmajor + * - @smemreuse + * + * @par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 128 + * integer items that are partitioned across 128 threads. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1761,44 +2111,63 @@ public: * int block_aggregate; * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. - * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 0, -1, 2, -3, ..., 126, -127. The corresponding output @p thread_data + * in those threads will be 0, 0, 2, 2, ..., 126, 126. Furthermore the value + * @p 126 will be stored in @p block_aggregate for all threads. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + __device__ __forceinline__ void + InclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate) { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); } - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. + * @brief Computes an inclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes + * one input element. The call-back functor @p block_prefix_callback_op + * is invoked by the first warp in the block, and the value returned by + * lane0 in that warp is used as the "seed" value + * that logically prefixes the thread block's scan inputs. + * Also provides every thread with the block-wide @p block_aggregate of all inputs. 
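For readers skimming the refactored comments, a minimal sketch of a functor satisfying the T operator()(T block_aggregate) contract described above may help; the struct name BlockPrefixCallbackOp and the use of int are illustrative assumptions, mirroring the stateful-functor pattern referenced by the snippets in this file:

// Illustrative sketch only: a stateful callback functor that keeps a running
// total between consecutive tile-wide scans.
struct BlockPrefixCallbackOp
{
    // Running total carried across tiles
    int running_total;

    __device__ BlockPrefixCallbackOp(int running_total)
        : running_total(running_total)
    {}

    // Invoked by the first warp of the block once per tile. block_aggregate
    // is the aggregate of the current tile's inputs; only the value returned
    // by lane0 is applied as the block-wide prefix for that tile's scan.
    __device__ int operator()(int block_aggregate)
    {
        int old_prefix = running_total;
        running_total += block_aggregate;
        return old_prefix;
    }
};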
+ * + * @par + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). The functor's input parameter + * @p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, + * however only the return value from lane0 is applied + * as the block-wide prefix. Can be stateful. * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an inclusive prefix max scan over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -1846,23 +2215,39 @@ public: * // Store scanned items to output segment * d_data[block_offset] = thread_data; * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 126, 126. - * The output for the second segment will be 128, 128, 130, 130, ..., 254, 254. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @endcode + * @par + * Suppose the input @p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be + * 0, 0, 2, 2, ..., 126, 126. The output for the second segment + * will be 128, 128, 130, 130, ..., 254, 254. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a + * block-wide prefix to be applied to the logical input sequence. */ - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ template + __device__ __forceinline__ void InclusiveScan(T input, + T &output, + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op); } @@ -1870,26 +2255,27 @@ public: //@} end member group /******************************************************************//** - * \name Inclusive prefix scan operations (multiple data per thread) + * @name Inclusive prefix scan operations (multiple data per thread) *********************************************************************/ //@{ - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * @brief Computes an inclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes + * an array of consecutive input elements. * - * \par + * @par * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1907,21 +2293,33 @@ public: * // Collectively compute the block-wide inclusive prefix max scan * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The - * corresponding output \p thread_data in those threads will be { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output @p thread_data in those threads will be + * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input items * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor + template + __device__ __forceinline__ void InclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + ScanOp scan_op) { if (ITEMS_PER_THREAD == 1) { @@ -1940,22 +2338,24 @@ public: } } - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. 
Also provides every thread with the block-wide \p block_aggregate of all inputs. + * @brief Computes an inclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes + * an array of consecutive input elements. Also provides every thread + * with the block-wide @p block_aggregate of all inputs. * - * \par + * @par * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1972,27 +2372,41 @@ public: * * // Collectively compute the block-wide inclusive prefix max scan * int block_aggregate; - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), + * block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. - * The corresponding output \p thread_data in those threads will be + * The corresponding output @p thread_data in those threads will be * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. - * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * Furthermore the value @p 510 will be stored in @p block_aggregate for all threads. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input items * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + template + __device__ __forceinline__ void InclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + ScanOp scan_op, + T &block_aggregate) { if (ITEMS_PER_THREAD == 1) { @@ -2011,27 +2425,34 @@ public: } } - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. 
the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. + * @brief Computes an inclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes an + * array of consecutive input elements. The call-back functor + * @p block_prefix_callback_op is invoked by the first warp in the block, + * and the value returned by lane0 in that warp is used + * as the "seed" value that logically prefixes the thread block's scan inputs. + * Also provides every thread with the block-wide @p block_aggregate of all inputs. + * + * @par + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). The functor's input parameter + * @p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, + * however only the return value from lane0 is applied + * as the block-wide prefix. Can be stateful. * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an inclusive prefix max scan over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -2088,25 +2509,42 @@ public: * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); * CTA_SYNC(); * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be 0, 0, 2, 2, 4, 4, ..., 510, 510. - * The output for the second segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @endcode + * @par + * Suppose the input @p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be + * 0, 0, 2, 2, 4, 4, ..., 510, 510. The output for the second + * segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. 
+ * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a + * block-wide prefix to be applied to the logical input sequence. */ - template < - int ITEMS_PER_THREAD, - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + template + __device__ __forceinline__ void InclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { if (ITEMS_PER_THREAD == 1) { diff --git a/cub/cub/block/block_shuffle.cuh b/cub/cub/block/block_shuffle.cuh index 4118344df7a..3a15499370e 100644 --- a/cub/cub/block/block_shuffle.cuh +++ b/cub/cub/block/block_shuffle.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * The cub::BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. + * @file + * The cub::BlockShuffle class provides [collective](index.html#sec0) methods for shuffling + * data partitioned across a CUDA thread block. */ #pragma once @@ -47,16 +48,27 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \brief The BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. - * \ingroup BlockModule + * @brief The BlockShuffle class provides [collective](index.html#sec0) + * methods for shuffling data partitioned across a CUDA thread block. * - * \tparam T The data type to be exchanged. - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. + * @ingroup BlockModule * - * \par Overview + * @tparam T + * The data type to be exchanged. + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview * It is commonplace for blocks of threads to rearrange data items between * threads. The BlockShuffle abstraction allows threads to efficiently shift items * either (a) up to their successor or (b) down to their predecessor. 
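Since the BlockShuffle comments below carry no usage snippet of their own, a brief illustrative kernel is sketched here; the kernel name, the 128-thread block, and the int data type are assumptions, and only members documented in this patch (the TempStorage constructor and Offset) are used:

#include <cub/block/block_shuffle.cuh>

__global__ void ExampleKernel(int *d_data)
{
    // Specialize BlockShuffle for a 1D block of 128 threads owning int items
    typedef cub::BlockShuffle<int, 128> BlockShuffle;

    // Allocate shared memory for BlockShuffle
    __shared__ typename BlockShuffle::TempStorage temp_storage;

    // Each thread obtains one item
    int thread_data = d_data[threadIdx.x];

    // Each thread i receives the item contributed by thread i+1; threads for
    // which i + distance falls outside the block keep their original output
    int successor_item = thread_data;
    BlockShuffle(temp_storage).Offset(thread_data, successor_item, 1);
}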
@@ -127,12 +139,13 @@ private: public: /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of + * shared memory as temporary storage. */ __device__ __forceinline__ BlockShuffle() : @@ -140,35 +153,46 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation + * as temporary storage. + * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockShuffle( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockShuffle(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Shuffle movement + * @name Shuffle movement *********************************************************************/ //@{ - /** - * \brief Each threadi obtains the \p input provided by threadi+distance. The offset \p distance may be negative. + * @brief Each threadi obtains the @p input provided by + * threadi+distance. + * The offset @p distance may be negative. * - * \par - * - \smemreuse + * @par + * - @smemreuse + * + * @param[in] input + * The input item from the calling thread (threadi) + * + * @param[out] output + * The @p input item from the successor (or predecessor) thread + * threadi+distance (may be aliased to @p input). + * This value is only updated for for threadi when + * 0 <= (i + \p distance) < BLOCK_THREADS-1 + * + * @param[in] distance + * Offset distance (may be negative) */ - __device__ __forceinline__ void Offset( - T input, ///< [in] The input item from the calling thread (threadi) - T& output, ///< [out] The \p input item from the successor (or predecessor) thread threadi+distance (may be aliased to \p input). This value is only updated for for threadi when 0 <= (i + \p distance) < BLOCK_THREADS-1 - int distance = 1) ///< [in] Offset distance (may be negative) + __device__ __forceinline__ void Offset(T input, T &output, int distance = 1) { temp_storage[linear_tid] = input; @@ -181,17 +205,26 @@ public: } } - /** - * \brief Each threadi obtains the \p input provided by threadi+distance. + * @brief Each threadi obtains the @p input + * provided by threadi+distance. * - * \par - * - \smemreuse + * @par + * - @smemreuse + * + * @param[in] input + * The calling thread's input item + * + * @param[out] output + * The @p input item from thread + * thread(i+distance>)%BLOCK_THREADS + * (may be aliased to @p input). 
This value is not updated for + * threadBLOCK_THREADS-1 + * + * @param[in] distance + * Offset distance (0 < @p distance < BLOCK_THREADS) */ - __device__ __forceinline__ void Rotate( - T input, ///< [in] The calling thread's input item - T& output, ///< [out] The \p input item from thread thread(i+distance>)%BLOCK_THREADS (may be aliased to \p input). This value is not updated for threadBLOCK_THREADS-1 - unsigned int distance = 1) ///< [in] Offset distance (0 < \p distance < BLOCK_THREADS) + __device__ __forceinline__ void Rotate(T input, T &output, unsigned int distance = 1) { temp_storage[linear_tid] = input; @@ -204,19 +237,25 @@ public: output = temp_storage[offset]; } - /** - * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item + * @brief The thread block rotates its + * [blocked arrangement](index.html#sec5sec3) of + * @p input items, shifting it up by one item. * - * \par - * - \blocked - * - \granularity - * - \smemreuse + * @par + * - @blocked + * - @granularity + * - @smemreuse + * + * @param[in] input + * The calling thread's input items + * + * @param[out] prev + * The corresponding predecessor items (may be aliased to @p input). + * The item @p prev[0] is not updated for thread0. */ template - __device__ __forceinline__ void Up( - T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items - T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. + __device__ __forceinline__ void Up(T (&input)[ITEMS_PER_THREAD], T (&prev)[ITEMS_PER_THREAD]) { temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1]; @@ -249,19 +288,25 @@ public: block_suffix = temp_storage[BLOCK_THREADS - 1]; } - /** - * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it down by one item + * @brief The thread block rotates its + * [blocked arrangement](index.html#sec5sec3) of + * @p input items, shifting it down by one item. * - * \par - * - \blocked - * - \granularity - * - \smemreuse + * @par + * - @blocked + * - @granularity + * - @smemreuse + * + * @param[in] input + * The calling thread's input items + * + * @param[out] prev + * The corresponding predecessor items (may be aliased to @p input). + * The value @p prev[0] is not updated for threadBLOCK_THREADS-1. */ template - __device__ __forceinline__ void Down( - T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items - T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. + __device__ __forceinline__ void Down(T (&input)[ITEMS_PER_THREAD], T (&prev)[ITEMS_PER_THREAD]) { temp_storage[linear_tid] = input[0]; @@ -275,20 +320,31 @@ public: prev[ITEMS_PER_THREAD - 1] = temp_storage[linear_tid + 1]; } - /** - * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of input items, shifting it down by one item. All threads receive \p input[0] provided by thread0. + * @brief The thread block rotates its + * [blocked arrangement](index.html#sec5sec3) of input items, + * shifting it down by one item. All threads receive @p input[0] + * provided by thread0. 
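As a companion to the single-item Offset above, here is a short sketch of the blocked-arrangement Up method just described, used to form adjacent differences; the kernel name, the 128-thread block, and four items per thread are assumptions:

#include <cub/block/block_shuffle.cuh>

__global__ void AdjacentDifferenceKernel(int *d_in, int *d_out)
{
    typedef cub::BlockShuffle<int, 128> BlockShuffle;
    __shared__ typename BlockShuffle::TempStorage temp_storage;

    // Blocked arrangement: each thread owns 4 consecutive items of the tile
    int items[4];
    for (int i = 0; i < 4; ++i)
        items[i] = d_in[threadIdx.x * 4 + i];

    // prev[i] receives the item immediately preceding items[i] in the tile;
    // prev[0] of thread0 is not updated because it has no predecessor
    int prev[4];
    prev[0] = items[0];
    BlockShuffle(temp_storage).Up(items, prev);

    for (int i = 0; i < 4; ++i)
        d_out[threadIdx.x * 4 + i] = items[i] - prev[i];
}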
* - * \par - * - \blocked - * - \granularity - * - \smemreuse + * @par + * - @blocked + * - @granularity + * - @smemreuse + * + * @param[in] input + * The calling thread's input items + * + * @param[out] prev + * The corresponding predecessor items (may be aliased to @p input). + * The value @p prev[0] is not updated for threadBLOCK_THREADS-1. + * + * @param[out] block_prefix + * The item @p input[0] from thread0, provided to all threads */ template - __device__ __forceinline__ void Down( - T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items - T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. - T &block_prefix) ///< [out] The item \p input[0] from thread0, provided to all threads + __device__ __forceinline__ void Down(T (&input)[ITEMS_PER_THREAD], + T (&prev)[ITEMS_PER_THREAD], + T &block_prefix) { Down(input, prev); block_prefix = temp_storage[0]; diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh index 2c7c93347ea..2ed823ed8be 100644 --- a/cub/cub/block/block_store.cuh +++ b/cub/cub/block/block_store.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Operations for writing linear segments of data from the CUDA thread block */ @@ -51,33 +51,44 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIo + * @addtogroup UtilIo * @{ */ /******************************************************************//** - * \name Blocked arrangement I/O (direct) + * @name Blocked arrangement I/O (direct) *********************************************************************/ //@{ /** - * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * @brief Store a blocked arrangement of items across a thread block into a linear segment of items. * - * \blocked + * @blocked * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + * @tparam T + * [inferred] The data type to store. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam OutputIteratorT + * [inferred] The random-access iterator type for output \iterator. 
+ * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +template +__device__ __forceinline__ void StoreDirectBlocked(int linear_tid, + OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); @@ -89,25 +100,40 @@ __device__ __forceinline__ void StoreDirectBlocked( } } - /** - * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range + * @brief Store a blocked arrangement of items across a + * thread block into a linear segment of items, guarded by range + * + * @blocked + * + * @tparam T + * [inferred] The data type to store. * - * \blocked + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam OutputIteratorT + * [inferred] The random-access iterator type for output \iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write +template +__device__ __forceinline__ void StoreDirectBlocked(int linear_tid, + OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); @@ -122,31 +148,42 @@ __device__ __forceinline__ void StoreDirectBlocked( } } - /** - * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * @brief Store a blocked arrangement of items across a + * thread block into a linear segment of items. 
+ * + * @blocked + * + * The output offset (@p block_ptr + @p block_offset) must be quad-item aligned, + * which is the default starting offset returned by @p cudaMalloc() * - * \blocked + * @par + * The following conditions will prevent vectorization and storing will + * fall back to cub::BLOCK_STORE_DIRECT: + * - @p ITEMS_PER_THREAD is odd + * - The data type @p T is not a built-in primitive or CUDA vector type + * (e.g., \p short, \p int2, \p double, \p float2, etc.) * - * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned, - * which is the default starting offset returned by \p cudaMalloc() + * @tparam T + * [inferred] The data type to store. * - * \par - * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) * + * @param[in] block_ptr + * Input pointer for storing from + * + * @param[in] items + * Data to store */ -template < - typename T, - int ITEMS_PER_THREAD> -__device__ __forceinline__ void StoreDirectBlockedVectorized( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - T *block_ptr, ///< [in] Input pointer for storing from - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +template +__device__ __forceinline__ void StoreDirectBlockedVectorized(int linear_tid, + T *block_ptr, + T (&items)[ITEMS_PER_THREAD]) { enum { @@ -186,30 +223,42 @@ __device__ __forceinline__ void StoreDirectBlockedVectorized( //@} end member group /******************************************************************//** - * \name Striped arrangement I/O (direct) + * @name Striped arrangement I/O (direct) *********************************************************************/ //@{ - /** - * \brief Store a striped arrangement of data across the thread block into a linear segment of items. + * @brief Store a striped arrangement of data across the thread block into a + * linear segment of items. + * + * @striped + * + * @tparam BLOCK_THREADS + * The thread block size in threads + * + * @tparam T + * [inferred] The data type to store. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam OutputIteratorT + * [inferred] The random-access iterator type for output @iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) * - * \striped + * @param[in] block_itr + * The thread block's base output iterator for storing to * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ * @param[in] items + * Data to store */ -template < - int BLOCK_THREADS, - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +template +__device__ __forceinline__ void StoreDirectStriped(int linear_tid, + OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { OutputIteratorT thread_itr = block_itr + linear_tid; @@ -221,27 +270,42 @@ __device__ __forceinline__ void StoreDirectStriped( } } - /** - * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range + * @brief Store a striped arrangement of data across the thread block into + * a linear segment of items, guarded by range + * + * @striped + * + * @tparam BLOCK_THREADS + * The thread block size in threads + * + * @tparam T + * [inferred] The data type to store. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam OutputIteratorT + * [inferred] The random-access iterator type for output \iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base output iterator for storing to * - * \striped + * @param[in] items + * Data to store * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + * @param[in] valid_items + * Number of valid items to write */ -template < - int BLOCK_THREADS, - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write +template +__device__ __forceinline__ void StoreDirectStriped(int linear_tid, + OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { OutputIteratorT thread_itr = block_itr + linear_tid; @@ -260,31 +324,42 @@ __device__ __forceinline__ void StoreDirectStriped( //@} end member group /******************************************************************//** - * \name Warp-striped arrangement I/O (direct) + * @name Warp-striped arrangement I/O (direct) *********************************************************************/ //@{ - /** - * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items. + * @brief Store a warp-striped arrangement of data across the + * thread block into a linear segment of items. * - * \warpstriped + * @warpstriped * - * \par Usage Considerations + * @par Usage Considerations * The number of threads in the thread block must be a multiple of the architecture's warp size. 
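A compact sketch contrasting the blocked, striped, and range-guarded store helpers documented in this hunk; the kernel name, the 128-thread block, four items per thread, and the d_out / valid_items parameters are assumptions, and in practice a kernel would pick one arrangement rather than issuing all three:

#include <cub/block/block_store.cuh>

__global__ void ExampleStoreKernel(int *d_out, int valid_items)
{
    // Per-thread data to be written (placeholder values)
    int items[4] = {0, 1, 2, 3};

    // Blocked arrangement: thread t writes d_out[t*4 + 0 .. t*4 + 3]
    cub::StoreDirectBlocked(threadIdx.x, d_out, items);

    // Striped arrangement: thread t writes d_out[t], d_out[t + 128], ...
    cub::StoreDirectStriped<128>(threadIdx.x, d_out, items);

    // Guarded variant: only the first valid_items outputs of the tile are written
    cub::StoreDirectBlocked(threadIdx.x, d_out, items, valid_items);
}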
* - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + * @tparam T + * [inferred] The data type to store. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam OutputIteratorT + * [inferred] The random-access iterator type for output \iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[out] items + * Data to load */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +template +__device__ __forceinline__ void StoreDirectWarpStriped(int linear_tid, + OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; @@ -300,28 +375,42 @@ __device__ __forceinline__ void StoreDirectWarpStriped( } } - /** - * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range + * @brief Store a warp-striped arrangement of data across the thread block into a + * linear segment of items, guarded by range * - * \warpstriped + * @warpstriped * - * \par Usage Considerations + * @par Usage Considerations * The number of threads in the thread block must be a multiple of the architecture's warp size. * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + * @tparam T + * [inferred] The data type to store. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam OutputIteratorT + * [inferred] The random-access iterator type for output \iterator. 
+ * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write +template +__device__ __forceinline__ void StoreDirectWarpStriped(int linear_tid, + OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; @@ -352,59 +441,62 @@ __device__ __forceinline__ void StoreDirectWarpStriped( //----------------------------------------------------------------------------- /** - * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. + * @brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a + * blocked arrangement of items across a CUDA thread block to a linear segment of memory. */ enum BlockStoreAlgorithm { /** - * \par Overview + * @par Overview * * A [blocked arrangement](index.html#sec5sec3) of data is written * directly to memory. * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) decreases as the * access stride between threads increases (i.e., the number items per thread). */ BLOCK_STORE_DIRECT, /** - * \par Overview + * @par Overview * A [striped arrangement](index.html#sec5sec3) of data is written * directly to memory. * - * \par Performance Considerations + * @par Performance Considerations * The utilization of memory transactions (coalescing) remains high regardless * of items written per thread. */ BLOCK_STORE_STRIPED, /** - * \par Overview + * @par Overview * * A [blocked arrangement](index.html#sec5sec3) of data is written directly * to memory using CUDA's built-in vectorized stores as a coalescing optimization. * For example, st.global.v4.s32 instructions will be generated - * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * when @p T = @p int and @p ITEMS_PER_THREAD % 4 == 0. * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high until the the * access stride between threads (i.e., the number items per thread) exceeds the * maximum vector store width (typically 4 items or 64B, whichever is lower). 
* - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The \p OutputIteratorT is not a simple pointer type + * - @p ITEMS_PER_THREAD is odd + * - The @p OutputIteratorT is not a simple pointer type * - The block output offset is not quadword-aligned - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * - The data type @p T is not a built-in primitive or CUDA vector type + * (e.g., @p short, @p int2, @p double, @p float2, etc.) */ BLOCK_STORE_VECTORIZE, /** - * \par Overview + * @par Overview * A [blocked arrangement](index.html#sec5sec3) is locally - * transposed and then efficiently written to memory as a [striped arrangement](index.html#sec5sec3). + * transposed and then efficiently written to memory as a + * [striped arrangement](index.html#sec5sec3). * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items written per thread. * - The local reordering incurs slightly longer latencies and throughput than the @@ -413,15 +505,15 @@ enum BlockStoreAlgorithm BLOCK_STORE_TRANSPOSE, /** - * \par Overview + * @par Overview * A [blocked arrangement](index.html#sec5sec3) is locally * transposed and then efficiently written to memory as a * [warp-striped arrangement](index.html#sec5sec3) * - * \par Usage Considerations + * @par Usage Considerations * - BLOCK_THREADS must be a multiple of WARP_THREADS * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items written per thread. * - The local reordering incurs slightly longer latencies and throughput than the @@ -430,17 +522,17 @@ enum BlockStoreAlgorithm BLOCK_STORE_WARP_TRANSPOSE, /** - * \par Overview + * @par Overview * A [blocked arrangement](index.html#sec5sec3) is locally * transposed and then efficiently written to memory as a * [warp-striped arrangement](index.html#sec5sec3) * To reduce the shared memory requirement, only one warp's worth of shared * memory is provisioned and is subsequently time-sliced among warps. * - * \par Usage Considerations + * @par Usage Considerations * - BLOCK_THREADS must be a multiple of WARP_THREADS * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items written per thread. * - Provisions less shared memory temporary storage, but incurs larger @@ -451,19 +543,38 @@ enum BlockStoreAlgorithm /** - * \brief The BlockStore class provides [collective](index.html#sec0) data movement methods for writing a [blocked arrangement](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory. ![](block_store_logo.png) - * \ingroup BlockModule - * \ingroup UtilIo - * - * \tparam T The type of data to be written. - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. - * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. 
- * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. - * - * \par Overview + * @brief The BlockStore class provides [collective](index.html#sec0) data movement + * methods for writing a [blocked arrangement](index.html#sec5sec3) of items + * partitioned across a CUDA thread block to a linear segment of memory. + * ![](block_store_logo.png) + * + * @ingroup BlockModule + * + * @ingroup UtilIo + * + * @tparam T + * The type of data to be written. + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam ITEMS_PER_THREAD + * The number of consecutive items partitioned onto each thread. + * + * @tparam ALGORITHM + * [optional] cub::BlockStoreAlgorithm tuning policy enumeration. + * default: cub::BLOCK_STORE_DIRECT. + * + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview * - The BlockStore class provides a single data movement abstraction that can be specialized * to implement different cub::BlockStoreAlgorithm strategies. This facilitates different * performance policies for different architectures, data types, granularity sizes, etc. @@ -487,16 +598,16 @@ enum BlockStoreAlgorithm * memory is provisioned and is subsequently time-sliced among warps. [More...](\ref cub::BlockStoreAlgorithm) * - \rowmajor * - * \par A Simple Example + * @par A Simple Example * \blockcollective{BlockStore} - * \par + * @par * The code snippet below illustrates the storing of a "blocked" arrangement * of 512 integers across 128 threads (where each thread owns 4 consecutive items) - * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * into a linear segment of memory. The store is specialized for @p BLOCK_STORE_WARP_TRANSPOSE, * meaning items are locally reordered among threads so that memory references will be * efficiently coalesced using a warp-striped access pattern. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -514,13 +625,13 @@ enum BlockStoreAlgorithm * // Store items to linear memory * BlockStore(temp_storage).Store(d_data, thread_data); * - * \endcode - * \par - * Suppose the set of \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of @p thread_data across the block of threads is * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... + * The output @p d_data will be 0, 1, 2, 3, 4, 5, .... 
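For comparison with the warp-transpose snippet above, here is a minimal sketch (illustrative only, not part of this patch; kernel name is hypothetical) that specializes the same store for BLOCK_STORE_VECTORIZE. Because the destination is a raw int pointer and ITEMS_PER_THREAD is a multiple of 4, the vectorization path described for that algorithm can apply:

#include <cub/block/block_store.cuh>   // or equivalently <cub/cub.cuh>

__global__ void VectorizedStoreKernel(int *d_data)
{
    // Specialize BlockStore for 128 threads, each owning 4 consecutive ints
    using BlockStoreT = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_VECTORIZE>;

    // Allocate shared memory for BlockStore
    __shared__ typename BlockStoreT::TempStorage temp_storage;

    // Each thread obtains its 4 consecutive items (computation omitted in this sketch)
    int thread_data[4] = {0, 0, 0, 0};

    // Store items; with a raw, suitably aligned pointer this attempts vectorized stores
    BlockStoreT(temp_storage).Store(d_data, thread_data);
}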
* - * \par Re-using dynamically allocating shared memory + * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: @@ -580,21 +691,38 @@ private: linear_tid(linear_tid) {} - /// Store items into a linear segment of memory + /** + * @brief Store items into a linear segment of memory + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { StoreDirectBlocked(linear_tid, block_itr, items); } - /// Store items into a linear segment of memory, guarded by range + /** + * @brief Store items into a linear segment of memory, guarded by range + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { StoreDirectBlocked(linear_tid, block_itr, items, valid_items); } @@ -621,21 +749,38 @@ private: linear_tid(linear_tid) {} - /// Store items into a linear segment of memory + /** + * @brief Store items into a linear segment of memory + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { StoreDirectStriped(linear_tid, block_itr, items); } - /// Store items into a linear segment of memory, guarded by range + /** + * @brief Store items into a linear segment of memory, guarded by range + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { StoreDirectStriped(linear_tid, block_itr, items, valid_items); } @@ -662,29 +807,54 @@ private: linear_tid(linear_tid) {} - /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization) - __device__ __forceinline__ void Store( - T *block_ptr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + /** + * @brief Store 
items into a linear segment of memory, + * specialized for native pointer types (attempts vectorization) + * + * @param[in] block_ptr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + */ + __device__ __forceinline__ void Store(T *block_ptr, T (&items)[ITEMS_PER_THREAD]) { StoreDirectBlockedVectorized(linear_tid, block_ptr, items); } - /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) + /** + * @brief Store items into a linear segment of memory, + * specialized for opaque input iterators (skips vectorization) + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { StoreDirectBlocked(linear_tid, block_itr, items); } - /// Store items into a linear segment of memory, guarded by range + /** + * @brief Store items into a linear segment of memory, guarded by range + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { StoreDirectBlocked(linear_tid, block_itr, items, valid_items); } @@ -725,26 +895,47 @@ private: linear_tid(linear_tid) {} - /// Store items into a linear segment of memory + /** + * @brief Store items into a linear segment of memory + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { BlockExchange(temp_storage).BlockedToStriped(items); StoreDirectStriped(linear_tid, block_itr, items); } - /// Store items into a linear segment of memory, guarded by range + /** + * @brief Store items into a linear segment of memory, guarded by range + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { BlockExchange(temp_storage).BlockedToStriped(items); if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads 
+ { + // Move through volatile smem as a workaround to prevent RF spilling on + // subsequent loads + temp_storage.valid_items = valid_items; + } CTA_SYNC(); StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); } @@ -793,26 +984,47 @@ private: linear_tid(linear_tid) {} - /// Store items into a linear segment of memory + /** + * @brief Store items into a linear segment of memory + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { BlockExchange(temp_storage).BlockedToWarpStriped(items); StoreDirectWarpStriped(linear_tid, block_itr, items); } - /// Store items into a linear segment of memory, guarded by range + /** + * @brief Store items into a linear segment of memory, guarded by range + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { BlockExchange(temp_storage).BlockedToWarpStriped(items); if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + { + // Move through volatile smem as a workaround to prevent RF spilling on + // subsequent loads + temp_storage.valid_items = valid_items; + } CTA_SYNC(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } @@ -861,26 +1073,47 @@ private: linear_tid(linear_tid) {} - /// Store items into a linear segment of memory + /** + * @brief Store items into a linear segment of memory + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { BlockExchange(temp_storage).BlockedToWarpStriped(items); StoreDirectWarpStriped(linear_tid, block_itr, items); } - /// Store items into a linear segment of memory, guarded by range + /** + * @brief Store items into a linear segment of memory, guarded by range + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T 
(&items)[ITEMS_PER_THREAD], + int valid_items) { BlockExchange(temp_storage).BlockedToWarpStriped(items); if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + { + // Move through volatile smem as a workaround to prevent RF spilling on + // subsequent loads + temp_storage.valid_items = valid_items; + } CTA_SYNC(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } @@ -924,17 +1157,17 @@ private: public: - /// \smemstorage{BlockStore} + /// @smemstorage{BlockStore} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockStore() : @@ -942,40 +1175,39 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. + * + * @param temp_storage[in] + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockStore( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockStore(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Data movement + * @name Data movement *********************************************************************/ //@{ - /** - * \brief Store items into a linear segment of memory. + * @brief Store items into a linear segment of memory. * - * \par - * - \blocked - * - \smemreuse + * @par + * - @blocked + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the storing of a "blocked" arrangement * of 512 integers across 128 threads (where each thread owns 4 consecutive items) - * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * into a linear segment of memory. The store is specialized for @p BLOCK_STORE_WARP_TRANSPOSE, * meaning items are locally reordered among threads so that memory references will be * efficiently coalesced using a warp-striped access pattern. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -994,36 +1226,40 @@ public: * int thread_data[4]; * BlockStore(temp_storage).Store(d_data, thread_data); * - * \endcode - * \par - * Suppose the set of \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of @p thread_data across the block of threads is * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... + * The output @p d_data will be 0, 1, 2, 3, 4, 5, .... 
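Related to the shared-memory re-use note in this class's documentation, a minimal sketch (illustrative only; it uses statically allocated shared memory rather than the dynamic allocation shown in the examples/block sample, and the kernel name is hypothetical) that places BlockLoad and BlockStore temporary storage in one union so the same region serves both collectives:

#include <cub/cub.cuh>

__global__ void LoadStoreReuseKernel(int *d_in, int *d_out)
{
    using BlockLoadT  = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
    using BlockStoreT = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE>;

    // The two collectives are used in sequence, so their storage may alias
    __shared__ union
    {
        typename BlockLoadT::TempStorage  load;
        typename BlockStoreT::TempStorage store;
    } temp_storage;

    int thread_data[4];
    BlockLoadT(temp_storage.load).Load(d_in, thread_data);

    __syncthreads();  // required before re-using the aliased shared memory

    BlockStoreT(temp_storage.store).Store(d_out, thread_data);
}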
+ * + * @param block_itr[out] + * The thread block's base output iterator for storing to + * + * @param items[in] + * Data to store * */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [out] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { InternalStore(temp_storage, linear_tid).Store(block_itr, items); } /** - * \brief Store items into a linear segment of memory, guarded by range. + * @brief Store items into a linear segment of memory, guarded by range. * - * \par - * - \blocked - * - \smemreuse + * @par + * - @blocked + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the guarded storing of a "blocked" arrangement * of 512 integers across 128 threads (where each thread owns 4 consecutive items) - * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * into a linear segment of memory. The store is specialized for @p BLOCK_STORE_WARP_TRANSPOSE, * meaning items are locally reordered among threads so that memory references will be * efficiently coalesced using a warp-striped access pattern. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, int valid_items, ...) @@ -1042,19 +1278,26 @@ public: * int thread_data[4]; * BlockStore(temp_storage).Store(d_data, thread_data, valid_items); * - * \endcode - * \par - * Suppose the set of \p thread_data across the block of threads is - * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and \p valid_items is \p 5. - * The output \p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with + * @endcode + * @par + * Suppose the set of @p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and @p valid_items is @p 5. + * The output @p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with * only the first two threads being unmasked to store portions of valid data. * + * @param block_itr[out] + * The thread block's base output iterator for storing to + * + * @param items[in] + * Data to store + * + * @param valid_items[in] + * Number of valid items to write */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [out] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); } diff --git a/cub/cub/block/specializations/block_histogram_atomic.cuh b/cub/cub/block/specializations/block_histogram_atomic.cuh index 367599b3dc2..3360ad0ed55 100644 --- a/cub/cub/block/specializations/block_histogram_atomic.cuh +++ b/cub/cub/block/specializations/block_histogram_atomic.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + * @file + * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide + * histograms from data samples partitioned across a CUDA thread block. 
*/ #pragma once @@ -43,9 +44,9 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN - /** - * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + * @brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide + * histograms from data samples partitioned across a CUDA thread block. */ template struct BlockHistogramAtomic @@ -59,24 +60,26 @@ struct BlockHistogramAtomic TempStorage &temp_storage) {} - - /// Composite data onto an existing histogram - template < - typename T, - typename CounterT, - int ITEMS_PER_THREAD> - __device__ __forceinline__ void Composite( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + /** + * @brief Composite data onto an existing histogram + * + * @param[in] items + * Calling thread's input values to histogram + * + * @param[out] histogram + * Reference to shared/device-accessible memory histogram + */ + template + __device__ __forceinline__ void Composite(T (&items)[ITEMS_PER_THREAD], + CounterT histogram[BINS]) { - // Update histogram - #pragma unroll - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { - atomicAdd(histogram + items[i], 1); - } + // Update histogram + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { + atomicAdd(histogram + items[i], 1); + } } - }; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_histogram_sort.cuh b/cub/cub/block/specializations/block_histogram_sort.cuh index 4ae46fc9af4..38dc70315a0 100644 --- a/cub/cub/block/specializations/block_histogram_sort.cuh +++ b/cub/cub/block/specializations/block_histogram_sort.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + * @file + * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide + * histograms from data samples partitioned across a CUDA thread block. */ #pragma once @@ -47,19 +48,38 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN - - /** - * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + * @brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide + * histograms from data samples partitioned across a CUDA thread block. 
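Before moving on to the sorting-based specialization, a minimal sketch (illustrative only; kernel name is hypothetical, and the load of samples is omitted) showing how the atomic Composite shown above is typically reached through the public cub::BlockHistogram front-end with BLOCK_HISTO_ATOMIC:

#include <cub/cub.cuh>

__global__ void AtomicHistogramKernel(unsigned char *d_samples, unsigned int *d_histogram)
{
    // 128 threads, 4 samples per thread, 256 bins, atomic specialization
    using BlockHistogramT =
        cub::BlockHistogram<unsigned char, 128, 4, 256, cub::BLOCK_HISTO_ATOMIC>;

    __shared__ typename BlockHistogramT::TempStorage temp_storage;
    __shared__ unsigned int smem_histogram[256];

    // Placeholder samples; real code would load them from d_samples
    unsigned char thread_samples[4] = {0, 0, 0, 0};

    BlockHistogramT block_histogram(temp_storage);

    // Zero the bins, then composite the samples (one atomicAdd per item)
    block_histogram.InitHistogram(smem_histogram);
    __syncthreads();
    block_histogram.Composite(thread_samples, smem_histogram);
    __syncthreads();

    // The shared bins could now be flushed to d_histogram (omitted)
}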
+ * + * @tparam T + * Sample type + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam ITEMS_PER_THREAD + * The number of samples per thread + * + * @tparam BINS + * The number of bins into which histogram samples may fall + * + * @tparam BLOCK_DIM_Y + * The thread block length in threads along the Y dimension + * + * @tparam BLOCK_DIM_Z + * The thread block length in threads along the Z dimension + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective (unused) */ -template < - typename T, ///< Sample type - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int ITEMS_PER_THREAD, ///< The number of samples per thread - int BINS, ///< The number of bins into which histogram samples may fall - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective (unused) +template struct BlockHistogramSort { /// Constants @@ -156,13 +176,18 @@ struct BlockHistogramSort } }; - - // Composite data onto an existing histogram - template < - typename CounterT > - __device__ __forceinline__ void Composite( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + /** + * @brief Composite data onto an existing histogram + * + * @param[in] items + * Calling thread's input values to histogram + * + * @param[out] histogram + * Reference to shared/device-accessible memory histogram + */ + template + __device__ __forceinline__ void Composite(T (&items)[ITEMS_PER_THREAD], + CounterT histogram[BINS]) { enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; diff --git a/cub/cub/block/specializations/block_reduce_raking.cuh b/cub/cub/block/specializations/block_reduce_raking.cuh index 98ab45c794d..f8db9326fdb 100644 --- a/cub/cub/block/specializations/block_reduce_raking.cuh +++ b/cub/cub/block/specializations/block_reduce_raking.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + * @file + * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread + * block. Supports non-commutative reduction operators. */ #pragma once @@ -48,9 +49,9 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN - /** - * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + * @brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread + * block. Supports non-commutative reduction operators. * * Supports non-commutative binary reduction operators. Unlike commutative * reduction operators (e.g., addition), the application of a non-commutative @@ -61,13 +62,23 @@ CUB_NAMESPACE_BEGIN * Compared to the implementation of BlockReduceRakingCommutativeOnly (which * does not support non-commutative operators), this implementation requires a * few extra rounds of inter-thread communication. 
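Since this specialization is normally reached through the public cub::BlockReduce front-end rather than used directly, a minimal sketch (illustrative only; kernel name is hypothetical) of a block-wide reduction that selects the raking algorithm:

#include <cub/cub.cuh>

__global__ void RakingReduceKernel(int *d_out)
{
    // Specialize the public front-end for 128 threads and the raking specialization
    using BlockReduceT = cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING>;

    __shared__ typename BlockReduceT::TempStorage temp_storage;

    int thread_data = threadIdx.x;  // each thread contributes one partial

    // Reduce with a binary operator; the result is valid in thread0 only
    int block_max = BlockReduceT(temp_storage).Reduce(thread_data, cub::Max());

    if (threadIdx.x == 0)
    {
        *d_out = block_max;
    }
}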
+ * + * @tparam T + * Data type being reduced + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam BLOCK_DIM_Y + * The thread block length in threads along the Y dimension + * + * @tparam BLOCK_DIM_Z + * The thread block length in threads along the Z dimension + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, ///< Data type being reduced - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct BlockReduceRaking { /// Constants @@ -103,14 +114,15 @@ struct BlockReduceRaking }; - /// Shared memory storage layout type union _TempStorage { - typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid - }; + /// Storage for warp-synchronous reduction + typename WarpReduce::TempStorage warp_storage; + /// Padded thread block raking grid + typename BlockRakingLayout::TempStorage raking_grid; + }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; @@ -129,14 +141,22 @@ struct BlockReduceRaking linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - + /** + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] partial + * [lane0 only] Warp-wide aggregate reduction of input items + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ template - __device__ __forceinline__ T RakingReduction( - ReductionOp reduction_op, ///< [in] Binary reduction operator - T *raking_segment, - T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*iteration*/) + __device__ __forceinline__ T RakingReduction(ReductionOp reduction_op, + T *raking_segment, + T partial, + int num_valid, + Int2Type /*iteration*/) { // Update partial if addend is in range if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) @@ -147,27 +167,42 @@ struct BlockReduceRaking return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); } + /** + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] partial + * [lane0 only] Warp-wide aggregate reduction of input items + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ template - __device__ __forceinline__ T RakingReduction( - ReductionOp /*reduction_op*/, ///< [in] Binary reduction operator - T * /*raking_segment*/, - T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*iteration*/) + __device__ __forceinline__ T RakingReduction(ReductionOp /*reduction_op*/, + T * /*raking_segment*/, + T partial, + int /*num_valid*/, + Int2Type /*iteration*/) { return partial; } - - - /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. 
The return value is only valid for thread0. - template < - bool IS_FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - ReductionOp reduction_op) ///< [in] Binary reduction operator + /** + * @brief Computes a thread block-wide reduction using the specified reduction operator. The + * first num_valid threads each contribute one reduction partial. The return value is + * only valid for thread0. + * + * @param[in] partial + * Calling thread's input partial reductions + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + * + * @param[in] reduction_op + * Binary reduction operator + */ + template + __device__ __forceinline__ T Reduce(T partial, int num_valid, ReductionOp reduction_op) { if (WARP_SYNCHRONOUS) { @@ -208,20 +243,24 @@ struct BlockReduceRaking return partial; } - - /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + /** + * @brief Computes a thread block-wide reduction using addition (+) as the reduction operator. + * The first num_valid threads each contribute one reduction partial. The return value is + * only valid for thread0. + * + * @param[in] partial + * Calling thread's input partial reductions + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ template - __device__ __forceinline__ T Sum( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + __device__ __forceinline__ T Sum(T partial, int num_valid) { cub::Sum reduction_op; return Reduce(partial, num_valid, reduction_op); } - - - }; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh index f4178f31edd..34a4e3ff236 100644 --- a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh +++ b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. + * @file + * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across + * a CUDA thread block. Does not support non-commutative reduction operators. */ #pragma once @@ -48,16 +49,27 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN - /** - * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size. + * @brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction + * across a CUDA thread block. Does not support non-commutative reduction operators. Does not + * support block sizes that are not a multiple of the warp size. 
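For contrast with the general raking specialization, a minimal sketch (illustrative only; kernel name is hypothetical) that reaches the commutative-only specialization through the public front-end. It is restricted to commutative operators such as addition, and the block size must be a multiple of the warp size:

#include <cub/cub.cuh>

__global__ void CommutativeOnlyReduceKernel(int *d_out)
{
    // 128 threads is a multiple of the warp size, as this specialization requires
    using BlockReduceT =
        cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY>;

    __shared__ typename BlockReduceT::TempStorage temp_storage;

    int thread_data = threadIdx.x;

    // Sum is commutative, so the cheaper communication pattern applies;
    // the aggregate is valid in thread0 only
    int block_sum = BlockReduceT(temp_storage).Sum(thread_data);

    if (threadIdx.x == 0)
    {
        *d_out = block_sum;
    }
}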
+ * + * @tparam T + * Data type being reduced + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam BLOCK_DIM_Y + * The thread block length in threads along the Y dimension + * + * @tparam BLOCK_DIM_Z + * The thread block length in threads along the Z dimension + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, ///< Data type being reduced - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct BlockReduceRakingCommutativeOnly { /// Constants @@ -98,15 +110,18 @@ struct BlockReduceRakingCommutativeOnly /// Shared memory storage layout type union _TempStorage { - struct DefaultStorage - { - typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid - } default_storage; + struct DefaultStorage + { + /// Storage for warp-synchronous reduction + typename WarpReduce::TempStorage warp_storage; - typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block reduction - }; + /// Padded thread block raking grid + typename BlockRakingLayout::TempStorage raking_grid; + } default_storage; + /// Fall-back storage for non-commutative block reduction + typename FallBack::TempStorage fallback_storage; + }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; @@ -125,12 +140,19 @@ struct BlockReduceRakingCommutativeOnly linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - - /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + /** + * @brief Computes a thread block-wide reduction using addition (+) as the reduction operator. + * The first num_valid threads each contribute one reduction partial. + * The return value is only valid for thread0. + * + * @param[in] partial + * Calling thread's input partial reductions + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ template - __device__ __forceinline__ T Sum( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + __device__ __forceinline__ T Sum(T partial, int num_valid) { if (USE_FALLBACK || !FULL_TILE) { @@ -159,15 +181,22 @@ struct BlockReduceRakingCommutativeOnly return partial; } - - /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template < - bool FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - ReductionOp reduction_op) ///< [in] Binary reduction operator + /** + * @brief Computes a thread block-wide reduction using the specified reduction operator. 
+ * The first num_valid threads each contribute one reduction partial. + * The return value is only valid for thread0. + * + * @param[in] partial + * Calling thread's input partial reductions + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + * + * @param[in] reduction_op + * Binary reduction operator + */ + template + __device__ __forceinline__ T Reduce(T partial, int num_valid, ReductionOp reduction_op) { if (USE_FALLBACK || !FULL_TILE) { diff --git a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh index 3b68a283bcb..b9653366b00 100644 --- a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + * @file + * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction + * across a CUDA thread block. Supports non-commutative reduction operators. */ #pragma once @@ -47,16 +48,25 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN - /** - * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + * @brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction + * across a CUDA thread block. Supports non-commutative reduction operators. + * @tparam T + * Data type being reduced + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam BLOCK_DIM_Y + * The thread block length in threads along the Y dimension + * + * @tparam BLOCK_DIM_Z + * The thread block length in threads along the Z dimension + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, ///< Data type being reduced - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct BlockReduceWarpReductions { /// Constants @@ -82,13 +92,17 @@ struct BlockReduceWarpReductions /// WarpReduce utility type typedef typename WarpReduce::InternalWarpReduce WarpReduce; - /// Shared memory storage layout type struct _TempStorage { - typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous reduction - T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous reduction - T block_prefix; ///< Shared prefix for the entire thread block + /// Buffer for warp-synchronous reduction + typename WarpReduce::TempStorage warp_reduce[WARPS]; + + /// Shared totals from each warp-synchronous reduction + T warp_aggregates[WARPS]; + + /// Shared prefix for the entire thread block + T block_prefix; }; /// Alias wrapper allowing storage to be unioned @@ -112,13 +126,21 @@ struct BlockReduceWarpReductions lane_id(LaneId()) {} - + /** + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] warp_aggregate + * [lane0 only] Warp-wide aggregate 
reduction of input items + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ template - __device__ __forceinline__ T ApplyWarpAggregates( - ReductionOp reduction_op, ///< [in] Binary reduction operator - T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*successor_warp*/) + __device__ __forceinline__ T ApplyWarpAggregates(ReductionOp reduction_op, + T warp_aggregate, + int num_valid, + Int2Type /*successor_warp*/) { if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) { @@ -128,25 +150,41 @@ struct BlockReduceWarpReductions return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type()); } + /** + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] warp_aggregate + * [lane0 only] Warp-wide aggregate reduction of input items + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ template - __device__ __forceinline__ T ApplyWarpAggregates( - ReductionOp /*reduction_op*/, ///< [in] Binary reduction operator - T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*successor_warp*/) + __device__ __forceinline__ T ApplyWarpAggregates(ReductionOp /*reduction_op*/, + T warp_aggregate, + int /*num_valid*/, + Int2Type /*successor_warp*/) { return warp_aggregate; } - - /// Returns block-wide aggregate in thread0. - template < - bool FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T ApplyWarpAggregates( - ReductionOp reduction_op, ///< [in] Binary reduction operator - T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + /** + * @brief Returns block-wide aggregate in thread0. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] warp_aggregate + * [lane0 only] Warp-wide aggregate reduction of input items + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ + template + __device__ __forceinline__ T ApplyWarpAggregates(ReductionOp reduction_op, + T warp_aggregate, + int num_valid) { // Share lane aggregates if (lane_id == 0) @@ -166,12 +204,19 @@ struct BlockReduceWarpReductions return warp_aggregate; } - - /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + /** + * @brief Computes a thread block-wide reduction using addition (+) as the reduction operator. + * The first num_valid threads each contribute one reduction partial. The return value is + * only valid for thread0. 
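As with the other reduction specializations, these entry points are reached through cub::BlockReduce; a minimal sketch (illustrative only; kernel name is hypothetical) selecting the warp-reductions algorithm:

#include <cub/cub.cuh>

__global__ void WarpReductionsSumKernel(int *d_out)
{
    using BlockReduceT = cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_WARP_REDUCTIONS>;

    __shared__ typename BlockReduceT::TempStorage temp_storage;

    // Each warp reduces its own partials, then the warp aggregates are combined;
    // the block-wide sum is valid in thread0 only
    int block_sum = BlockReduceT(temp_storage).Sum(static_cast<int>(threadIdx.x));

    if (threadIdx.x == 0)
    {
        *d_out = block_sum;
    }
}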
+ * + * @param[in] input + * Calling thread's input partial reductions + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ template - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input partial reductions - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + __device__ __forceinline__ T Sum(T input, int num_valid) { cub::Sum reduction_op; int warp_offset = (warp_id * LOGICAL_WARP_SIZE); @@ -189,15 +234,22 @@ struct BlockReduceWarpReductions return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); } - - /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template < - bool FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input partial reductions - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - ReductionOp reduction_op) ///< [in] Binary reduction operator + /** + * @brief Computes a thread block-wide reduction using the specified reduction operator. + * The first num_valid threads each contribute one reduction partial. + * The return value is only valid for thread0. + * + * @param[in] input + * Calling thread's input partial reductions + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + * + * @param[in] reduction_op + * Binary reduction operator + */ + template + __device__ __forceinline__ T Reduce(T input, int num_valid, ReductionOp reduction_op) { int warp_offset = warp_id * LOGICAL_WARP_SIZE; int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? diff --git a/cub/cub/block/specializations/block_scan_raking.cuh b/cub/cub/block/specializations/block_scan_raking.cuh index b1aad44e04e..e3e57aa0107 100644 --- a/cub/cub/block/specializations/block_scan_raking.cuh +++ b/cub/cub/block/specializations/block_scan_raking.cuh @@ -26,10 +26,10 @@ * ******************************************************************************/ - /** - * \file - * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. + * @file + * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a + * CUDA thread block. */ #pragma once @@ -51,17 +51,35 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN - /** - * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. + * @brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA + * thread block. 
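BlockScanRaking is selected through the public cub::BlockScan front-end; a minimal sketch (illustrative only; kernel name is hypothetical) of a block-wide exclusive prefix sum using the raking algorithm (BLOCK_SCAN_RAKING_MEMOIZE instead enables the MEMOIZE buffering described below):

#include <cub/cub.cuh>

__global__ void RakingScanKernel(int *d_data)
{
    // Raking scan specialization of the public front-end
    using BlockScanT = cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING>;

    __shared__ typename BlockScanT::TempStorage temp_storage;

    int thread_data = d_data[threadIdx.x];

    // Exclusive prefix sum across the thread block
    BlockScanT(temp_storage).ExclusiveSum(thread_data, thread_data);

    d_data[threadIdx.x] = thread_data;
}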
+ * + * @tparam T + * Data type being scanned + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam BLOCK_DIM_Y + * The thread block length in threads along the Y dimension + * + * @tparam BLOCK_DIM_Z + * The thread block length in threads along the Z dimension + * + * @tparam MEMOIZE + * Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the + * expense of higher register pressure + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, ///< Data type being scanned - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - bool MEMOIZE, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct BlockScanRaking { //--------------------------------------------------------------------- @@ -97,9 +115,14 @@ struct BlockScanRaking /// Shared memory storage layout type struct _TempStorage { - typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid - T block_aggregate; ///< Block aggregate + /// Buffer for warp-synchronous scan + typename WarpScan::TempStorage warp_scan; + + /// Padded thread block raking grid + typename BlockRakingLayout::TempStorage raking_grid; + + /// Block aggregate + T block_aggregate; }; @@ -121,13 +144,23 @@ struct BlockScanRaking // Utility methods //--------------------------------------------------------------------- - /// Templated reduction + /** + * @brief Templated reduction + * + * @param[in] raking_ptr + * Input array + * + * @param[in] scan_op + * Binary reduction operator + * + * @param[in] raking_partial + * Prefix to seed reduction with + */ template - __device__ __forceinline__ T GuardedReduce( - T* raking_ptr, ///< [in] Input array - ScanOp scan_op, ///< [in] Binary reduction operator - T raking_partial, ///< [in] Prefix to seed reduction with - Int2Type /*iteration*/) + __device__ __forceinline__ T GuardedReduce(T *raking_ptr, + ScanOp scan_op, + T raking_partial, + Int2Type /*iteration*/) { if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS)) { @@ -138,39 +171,57 @@ struct BlockScanRaking return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type()); } - - /// Templated reduction (base case) + /** + * @brief Templated reduction (base case) + * + * @param[in] raking_ptr + * Input array + * + * @param[in] scan_op + * Binary reduction operator + * + * @param[in] raking_partial + * Prefix to seed reduction with + */ template - __device__ __forceinline__ T GuardedReduce( - T* /*raking_ptr*/, ///< [in] Input array - ScanOp /*scan_op*/, ///< [in] Binary reduction operator - T raking_partial, ///< [in] Prefix to seed reduction with - Int2Type /*iteration*/) + __device__ __forceinline__ T GuardedReduce(T * /*raking_ptr*/, + ScanOp /*scan_op*/, + T raking_partial, + Int2Type /*iteration*/) { return raking_partial; } - - /// Templated copy + /** + * @brief Templated copy + * + * @param out + * [out] Out array + * + * @param in + * [in] Input array + */ 
template - __device__ __forceinline__ void CopySegment( - T* out, ///< [out] Out array - T* in, ///< [in] Input array - Int2Type /*iteration*/) + __device__ __forceinline__ void CopySegment(T *out, T *in, Int2Type /*iteration*/) { out[ITERATION] = in[ITERATION]; CopySegment(out, in, Int2Type()); } - - /// Templated copy (base case) - __device__ __forceinline__ void CopySegment( - T* /*out*/, ///< [out] Out array - T* /*in*/, ///< [in] Input array - Int2Type /*iteration*/) + /** + * @brief Templated copy (base case) + * + * @param[out] out + * Out array + * + * @param[in] in + * Input array + */ + __device__ __forceinline__ void CopySegment(T * /*out*/, + T * /*in*/, + Int2Type /*iteration*/) {} - /// Performs upsweep raking reduction, returning the aggregate template __device__ __forceinline__ T Upsweep( @@ -248,12 +299,22 @@ struct BlockScanRaking // Exclusive scans //--------------------------------------------------------------------- - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. With no initial value, + * the output computed for thread0 is undefined. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] exclusive_output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op) { if (WARP_SYNCHRONOUS) { @@ -289,13 +350,25 @@ struct BlockScanRaking } } - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to \p input) + * + * @param[in] initial_value + * Initial value to seed the exclusive scan + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void + ExclusiveScan(T input, T &output, const T &initial_value, ScanOp scan_op) { if (WARP_SYNCHRONOUS) { @@ -331,14 +404,27 @@ struct BlockScanRaking } } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. 
Each thread contributes one input element. Also provides every + * thread with the block-wide \p block_aggregate of all inputs. With no initial value, + * the output computed for thread0 is undefined. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + __device__ __forceinline__ void + ExclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate) { if (WARP_SYNCHRONOUS) { @@ -382,15 +468,29 @@ struct BlockScanRaking } } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. Also provides every + * thread with the block-wide \p block_aggregate of all inputs. + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to \p input) + * + * @param[in] initial_value + * Initial value to seed the exclusive scan + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + __device__ __forceinline__ void + ExclusiveScan(T input, T &output, const T &initial_value, ScanOp scan_op, T &block_aggregate) { if (WARP_SYNCHRONOUS) { @@ -433,16 +533,32 @@ struct BlockScanRaking } } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. 
the call-back functor \p + * block_prefix_callback_op is invoked by the first warp in the block, and the value + * returned by lane0 in that warp is used as the "seed" value that + * logically prefixes the thread block's scan inputs. Also provides every thread with + * the block-wide \p block_aggregate of all inputs. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a thread + * block-wide prefix to be applied to all inputs. + */ + template + __device__ __forceinline__ void ExclusiveScan(T input, + T &output, + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { if (WARP_SYNCHRONOUS) { @@ -504,12 +620,21 @@ struct BlockScanRaking // Inclusive scans //--------------------------------------------------------------------- - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + /** + * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op) { if (WARP_SYNCHRONOUS) { @@ -545,14 +670,26 @@ struct BlockScanRaking } } - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + /** + * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. Also provides every + * thread with the block-wide \p block_aggregate of all inputs. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + __device__ __forceinline__ void + InclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate) { if (WARP_SYNCHRONOUS) { @@ -596,16 +733,32 @@ struct BlockScanRaking } } - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. 
Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + /** + * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. the call-back functor \p + * block_prefix_callback_op is invoked by the first warp in the block, and the value + * returned by lane0 in that warp is used as the "seed" value that + * logically prefixes the thread block's scan inputs. Also provides every thread with + * the block-wide \p block_aggregate of all inputs. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a thread + * block-wide prefix to be applied to all inputs. + */ + template + __device__ __forceinline__ void InclusiveScan(T input, + T &output, + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { if (WARP_SYNCHRONOUS) { diff --git a/cub/cub/block/specializations/block_scan_warp_scans.cuh b/cub/cub/block/specializations/block_scan_warp_scans.cuh index f48b6dec0a4..0eef68780d0 100644 --- a/cub/cub/block/specializations/block_scan_warp_scans.cuh +++ b/cub/cub/block/specializations/block_scan_warp_scans.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. */ @@ -48,14 +48,22 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + * @brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA + * thread block. 
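// (Illustrative aside, not part of this refactor.) The BlockScanRaking specialization above and
// the BlockScanWarpScans specialization documented here both back the public cub::BlockScan
// collective. A minimal usage sketch, assuming a 128-thread block and int data, showing how the
// warp-scans variant is selected through BlockScan's algorithm template parameter:
#include <cub/block/block_scan.cuh>

__global__ void ExampleBlockScanKernel(const int *d_in, int *d_out)
{
    using BlockScanT = cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS>;
    __shared__ typename BlockScanT::TempStorage temp_storage;

    int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];

    // Block-wide exclusive prefix sum; block_aggregate receives the sum of all 128 inputs
    int block_aggregate;
    BlockScanT(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);

    d_out[blockIdx.x * 128 + threadIdx.x] = thread_data;
}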
+ * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam BLOCK_DIM_Y + * The thread block length in threads along the Y dimension + * + * @tparam BLOCK_DIM_Z + * The thread block length in threads along the Z dimension + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct BlockScanWarpScans { //--------------------------------------------------------------------- @@ -85,9 +93,13 @@ struct BlockScanWarpScans struct __align__(32) _TempStorage { - T warp_aggregates[WARPS]; - typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans - T block_prefix; ///< Shared prefix for the entire thread block + T warp_aggregates[WARPS]; + + /// Buffer for warp-synchronous scans + typename WarpScanT::TempStorage warp_scan[WARPS]; + + /// Shared prefix for the entire thread block + T block_prefix; }; @@ -125,12 +137,21 @@ struct BlockScanWarpScans // Utility methods //--------------------------------------------------------------------- + /** + * @param[out] warp_prefix + * The calling thread's partial reduction + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void ApplyWarpAggregates( - T &warp_prefix, ///< [out] The calling thread's partial reduction - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - Int2Type /*addend_warp*/) + __device__ __forceinline__ void ApplyWarpAggregates(T &warp_prefix, + ScanOp scan_op, + T &block_aggregate, + Int2Type /*addend_warp*/) { if (warp_id == WARP) warp_prefix = block_aggregate; @@ -141,21 +162,41 @@ struct BlockScanWarpScans ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); } + /** + * @param[out] warp_prefix + * The calling thread's partial reduction + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregat + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void ApplyWarpAggregates( - T &/*warp_prefix*/, ///< [out] The calling thread's partial reduction - ScanOp /*scan_op*/, ///< [in] Binary scan operator - T &/*block_aggregate*/, ///< [out] Threadblock-wide aggregate reduction of input items - Int2Type /*addend_warp*/) + __device__ __forceinline__ void ApplyWarpAggregates(T & /*warp_prefix*/, + ScanOp /*scan_op*/, + T & /*block_aggregate*/, + Int2Type /*addend_warp*/) {} - - /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + /** + * @brief Use the warp-wide aggregates to compute the calling warp's prefix. Also returns + * block-wide aggregate in all threads. 
+ * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] warp_aggregate + * [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of + * input items + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ T ComputeWarpPrefix( - ScanOp scan_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + __device__ __forceinline__ T ComputeWarpPrefix(ScanOp scan_op, + T warp_aggregate, + T &block_aggregate) { // Last lane in each warp shares its warp-aggregate if (lane_id == WARP_THREADS - 1) @@ -187,14 +228,26 @@ struct BlockScanWarpScans return warp_prefix; } - - /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + /** + * @brief Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. + * Also returns block-wide aggregate in all threads. + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] warp_aggregate + * [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of + * input items + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + * + * @param[in] initial_value + * Initial value to seed the exclusive scan + */ template - __device__ __forceinline__ T ComputeWarpPrefix( - ScanOp scan_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - const T &initial_value) ///< [in] Initial value to seed the exclusive scan + __device__ __forceinline__ T + ComputeWarpPrefix(ScanOp scan_op, T warp_aggregate, T &block_aggregate, const T &initial_value) { T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); @@ -210,39 +263,73 @@ struct BlockScanWarpScans // Exclusive scans //--------------------------------------------------------------------- - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. With no initial value, + * the output computed for thread0 is undefined. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] exclusive_output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op) { // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. T block_aggregate; ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
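// (Illustrative aside, not part of this refactor.) The Int2Type recursion in ApplyWarpAggregates
// above unrolls, at compile time, what is conceptually the loop below: given one aggregate per
// warp, it derives each warp's exclusive prefix and the block-wide total. A minimal host-side
// reference sketch; the function name and the loop form are assumptions for illustration, not
// the shipped implementation.
#include <cstddef>
#include <vector>

template <typename T, typename ScanOp>
T ComputeWarpPrefixesReference(const std::vector<T> &warp_aggregates,
                               std::vector<T> &warp_prefixes,
                               ScanOp scan_op)
{
    T block_aggregate = warp_aggregates[0];
    warp_prefixes[0]  = block_aggregate; // warp 0's prefix is never consumed by callers

    for (std::size_t warp = 1; warp < warp_aggregates.size(); ++warp)
    {
        // Prefix of this warp = combination of all preceding warps' aggregates
        warp_prefixes[warp] = block_aggregate;
        block_aggregate     = scan_op(block_aggregate, warp_aggregates[warp]);
    }

    // Every caller also receives the block-wide aggregate of all warps
    return block_aggregate;
}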
+ /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. + * + * @param[in] input + * Calling thread's input items + * + * @param[out] exclusive_output + * Calling thread's output items (may be aliased to \p input) + * + * @param[in] initial_value + * Initial value to seed the exclusive scan + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void + ExclusiveScan(T input, T &exclusive_output, const T &initial_value, ScanOp scan_op) { T block_aggregate; ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. Also provides every + * thread with the block-wide \p block_aggregate of all inputs. With no initial value, + * the output computed for thread0 is undefined. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] exclusive_output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + __device__ __forceinline__ void + ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op, T &block_aggregate) { // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. T inclusive_output; @@ -260,15 +347,32 @@ struct BlockScanWarpScans } } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. Also provides every + * thread with the block-wide \p block_aggregate of all inputs. 
+ * + * @param[in] input + * Calling thread's input items + * + * @param[out] exclusive_output + * Calling thread's output items (may be aliased to \p input) + * + * @param[in] initial_value + * Initial value to seed the exclusive scan + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + __device__ __forceinline__ void ExclusiveScan(T input, + T &exclusive_output, + const T &initial_value, + ScanOp scan_op, + T &block_aggregate) { // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. T inclusive_output; @@ -283,16 +387,32 @@ struct BlockScanWarpScans exclusive_output = warp_prefix; } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. the call-back functor \p + * block_prefix_callback_op is invoked by the first warp in the block, and the value + * returned by lane0 in that warp is used as the "seed" value that + * logically prefixes the thread block's scan inputs. Also provides every thread with + * the block-wide \p block_aggregate of all inputs. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] exclusive_output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a thread + * block-wide prefix to be applied to all inputs. + */ + template + __device__ __forceinline__ void ExclusiveScan(T input, + T &exclusive_output, + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. T block_aggregate; @@ -327,25 +447,46 @@ struct BlockScanWarpScans // Inclusive scans //--------------------------------------------------------------------- - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
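// (Usage sketch, not part of this refactor.) The block_prefix_callback_op variants documented
// above expect a stateful functor whose operator() is invoked by the block's first warp with the
// tile's block-wide aggregate and returns the prefix that seeds the tile's scan (only lane 0's
// return value matters). A minimal multi-tile example through the public cub::BlockScan front
// end; the functor and kernel names are assumptions for illustration.
#include <cub/block/block_scan.cuh>

struct RunningPrefixOp
{
    int running_total;

    __device__ explicit RunningPrefixOp(int initial) : running_total(initial) {}

    // Called once per tile by the first warp; the value returned by lane 0 becomes the
    // prefix applied to every thread's scan result for that tile
    __device__ int operator()(int block_aggregate)
    {
        int old_prefix = running_total;
        running_total += block_aggregate;
        return old_prefix;
    }
};

__global__ void MultiTileExclusiveSum(int *d_data, int num_tiles)
{
    using BlockScanT = cub::BlockScan<int, 128>;
    __shared__ typename BlockScanT::TempStorage temp_storage;

    RunningPrefixOp prefix_op(0);

    for (int tile = 0; tile < num_tiles; ++tile)
    {
        int idx  = tile * 128 + threadIdx.x;
        int item = d_data[idx];

        // Exclusive prefix sum over this tile, seeded with the running total of all
        // previous tiles via the callback functor
        BlockScanT(temp_storage).ExclusiveSum(item, item, prefix_op);
        __syncthreads(); // temp_storage is reused across tiles

        d_data[idx] = item;
    }
}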
+ /** + * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] inclusive_output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op) { T block_aggregate; InclusiveScan(input, inclusive_output, scan_op, block_aggregate); } - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + /** + * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. Also provides every + * thread with the block-wide \p block_aggregate of all inputs. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] inclusive_output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + __device__ __forceinline__ void + InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, T &block_aggregate) { WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); @@ -359,16 +500,32 @@ struct BlockScanWarpScans } } - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + /** + * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. the call-back functor \p + * block_prefix_callback_op is invoked by the first warp in the block, and the value + * returned by lane0 in that warp is used as the "seed" value that + * logically prefixes the thread block's scan inputs. 
Also provides every thread with + * the block-wide \p block_aggregate of all inputs. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] exclusive_output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a thread + * block-wide prefix to be applied to all inputs. + */ + template + __device__ __forceinline__ void InclusiveScan(T input, + T &exclusive_output, + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { T block_aggregate; InclusiveScan(input, exclusive_output, scan_op, block_aggregate); diff --git a/cub/cub/device/device_spmv.cuh b/cub/cub/device/device_spmv.cuh index ac6fd123142..b68c70d842d 100644 --- a/cub/cub/device/device_spmv.cuh +++ b/cub/cub/device/device_spmv.cuh @@ -28,8 +28,9 @@ ******************************************************************************/ /** - * \file - * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + * @file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector + * multiplication (SpMV). */ #pragma once @@ -54,10 +55,12 @@ CUB_NAMESPACE_BEGIN /** - * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV). - * \ingroup SingleModule + * @brief DeviceSpmv provides device-wide parallel operations for performing + * sparse-matrix * dense-vector multiplication (SpMV). * - * \par Overview + * @ingroup SingleModule + * + * @par Overview * The [SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication) * performs the matrix-vector operation * y = A*x + y, @@ -67,29 +70,31 @@ CUB_NAMESPACE_BEGIN * (i.e., three arrays: values, row_offsets, and column_indices) * - x and y are dense vectors * - * \par Usage Considerations - * \cdp_class{DeviceSpmv} + * @par Usage Considerations + * @cdp_class{DeviceSpmv} * */ struct DeviceSpmv { /******************************************************************//** - * \name CSR matrix operations + * @name CSR matrix operations *********************************************************************/ //@{ /** - * \brief This function performs the matrix-vector operation y = A*x. + * @brief This function performs the matrix-vector operation + * y = A*x. * - * \par Snippet + * @par Snippet * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A * representing a 3x3 lattice (24 non-zeros). * - * \par - * \code + * @par + * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x, + * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input + * vector x, * // and output vector y * int num_rows = 9; * int num_cols = 9; @@ -126,25 +131,63 @@ struct DeviceSpmv * * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] * - * \endcode + * @endcode + * + * @tparam ValueT + * [inferred] Matrix and vector value type (e.g., @p float, @p double, etc.) + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to @p temp_storage_bytes + * and no work is done. 
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of @p d_temp_storage allocation + * + * @param[in] d_values + * Pointer to the array of @p num_nonzeros values of the corresponding nonzero elements + * of matrix A. + * + * @param[in] d_row_offsets + * Pointer to the array of @p m + 1 offsets demarcating the start of every row in + * @p d_column_indices and @p d_values (with the final entry being equal to @p num_nonzeros) + * + * @param[in] d_column_indices + * Pointer to the array of @p num_nonzeros column-indices of the corresponding nonzero + * elements of matrix A. (Indices are zero-valued.) + * + * @param[in] d_vector_x + * Pointer to the array of @p num_cols values corresponding to the dense input vector + * x + * + * @param[out] d_vector_y + * Pointer to the array of @p num_rows values corresponding to the dense output vector + * y + * + * @param[in] num_rows + * number of rows of matrix A. + * + * @param[in] num_cols + * number of columns of matrix A. + * + * @param[in] num_nonzeros + * number of nonzero elements of matrix A. * - * \tparam ValueT [inferred] Matrix and vector value type (e.g., /p float, /p double, etc.) + * @param[in] stream + * [optional] CUDA stream to launch kernels within. Default is stream0. */ - template < - typename ValueT> - CUB_RUNTIME_FUNCTION - static cudaError_t CsrMV( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. - const int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros) - const int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) - const ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x - ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y - int num_rows, ///< [in] number of rows of matrix A. - int num_cols, ///< [in] number of columns of matrix A. - int num_nonzeros, ///< [in] number of nonzero elements of matrix A. - cudaStream_t stream = 0) ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
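// (Illustration only, not part of this refactor.) A tiny concrete CSR instance clarifying the
// d_values / d_row_offsets / d_column_indices parameters documented above, for the 3x3 matrix
// A = [[1, 0, 2], [0, 3, 0], [4, 0, 5]]. Host arrays are shown for brevity; in practice these
// would be copied to device-accessible memory before calling CsrMV.
int   num_rows         = 3;
int   num_cols         = 3;
int   num_nonzeros     = 5;
float values[]         = {1, 2, 3, 4, 5}; // nonzero values in row-major order
int   column_indices[] = {0, 2, 1, 0, 2}; // zero-based column index of each nonzero
int   row_offsets[]    = {0, 2, 3, 5};    // num_rows + 1 entries; the last equals num_nonzeros
// Row i occupies values[row_offsets[i]] .. values[row_offsets[i + 1] - 1].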
+ template + CUB_RUNTIME_FUNCTION static cudaError_t CsrMV(void *d_temp_storage, + size_t &temp_storage_bytes, + const ValueT *d_values, + const int *d_row_offsets, + const int *d_column_indices, + const ValueT *d_vector_x, + ValueT *d_vector_y, + int num_rows, + int num_cols, + int num_nonzeros, + cudaStream_t stream = 0) { SpmvParams spmv_params; spmv_params.d_values = d_values; diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index aef8df7a8da..ae72a22d748 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. + * @file + * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across + * a sequence of data items residing within device-accessible memory. */ #pragma once @@ -74,26 +75,59 @@ CUB_NAMESPACE_BEGIN *****************************************************************************/ /** - * Upsweep digit-counting kernel entry point (multi-block). Computes privatized digit histograms, one per block. + * @brief Upsweep digit-counting kernel entry point (multi-block). + * Computes privatized digit histograms, one per block. + * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam ALT_DIGIT_BITS + * Whether or not to use the alternate (lower-bits) policy + * + * @tparam IS_DESCENDING + * Whether or not the sorted-order is high-to-low + * + * @tparam KeyT + * Key type + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @param[in] d_keys + * Input keys buffer + * + * @param[out] d_spine + * Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, + * then 1s counts from each block, etc.) + * + * @param[in] num_items + * Total number of input data items + * + * @param[in] current_bit + * Bit position of current radix digit + * + * @param[in] num_bits + * Number of bits of current radix digit + * + * @param[in] even_share + * Even-share descriptor for mapan equal number of tiles onto each thread block */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename OffsetT, ///< Signed integer type for global offsets - typename DecomposerT = detail::identity_decomposer_t> -__launch_bounds__ (int((ALT_DIGIT_BITS) ? - int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS) : - int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))) -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortUpsweepKernel( - const KeyT *d_keys, ///< [in] Input keys buffer - OffsetT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) 
- OffsetT /*num_items*/, ///< [in] Total number of input data items - int current_bit, ///< [in] Bit position of current radix digit - int num_bits, ///< [in] Number of bits of current radix digit - GridEvenShare even_share, ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block - DecomposerT decomposer = {}) +template +__launch_bounds__(int((ALT_DIGIT_BITS) + ? int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS) + : int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))) + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortUpsweepKernel(const KeyT *d_keys, + OffsetT *d_spine, + OffsetT /*num_items*/, + int current_bit, + int num_bits, + GridEvenShare even_share, + DecomposerT decomposer = {}) { using ActiveUpsweepPolicyT = cub::detail::conditional_t< @@ -137,17 +171,27 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortUpsweepKernel( upsweep.template ExtractCounts(d_spine, gridDim.x, blockIdx.x); } - /** - * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms + * @brief Spine scan kernel entry point (single-block). + * Computes an exclusive prefix sum over the privatized digit histograms + * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @param[in,out] d_spine + * Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, + * then 1s counts from each block, etc.) + * + * @param[in] num_counts + * Total number of bin-counts */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1) -CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel( - OffsetT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) - int num_counts) ///< [in] Total number of bin-counts +template +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), + 1) CUB_DETAIL_KERNEL_ATTRIBUTES + void RadixSortScanBinsKernel(OffsetT *d_spine, int num_counts) { // Parameterize the AgentScan type for the current configuration typedef AgentScan< @@ -183,32 +227,77 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel( } } - /** - * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place. + * @brief Downsweep pass kernel entry point (multi-block). + * Scatters keys (and values) into corresponding bins for the current digit place. + * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam ALT_DIGIT_BITS + * Whether or not to use the alternate (lower-bits) policy + * + * @tparam IS_DESCENDING + * Whether or not the sorted-order is high-to-low + * + * @tparam KeyT + * Key type + * + * @tparam ValueT + * Value type + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @param[in] d_keys_in + * Input keys buffer + * + * @param[in] d_keys_out + * Output keys buffer + * + * @param[in] d_values_in + * Input values buffer + * + * @param[in] d_values_out + * Output values buffer + * + * @param[in] d_spine + * Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, + * then 1s counts from each block, etc.) 
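// (Illustration only, not part of this refactor.) The "striped" spine layout referred to in the
// parameter above: with G upsweep/downsweep blocks and RADIX_DIGITS bins, block b's count (and,
// after the spine scan, its scatter base offset) for digit d lives at d_spine[d * G + b], i.e.
// all 0-bin counts first, then all 1-bin counts, and so on. The helper below is a sketch for
// illustration, not part of the CUB API.
__host__ __device__ inline int SpineIndex(int digit, int block, int num_blocks)
{
    return digit * num_blocks + block;
}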
+ * + * @param[in] num_items + * Total number of input data items + * + * @param[in] current_bit + * Bit position of current radix digit + * + * @param[in] num_bits + * Number of bits of current radix digit + * + * @param[in] even_share + * Even-share descriptor for mapan equal number of tiles onto each thread block */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetT, ///< Signed integer type for global offsets - typename DecomposerT = detail::identity_decomposer_t> -__launch_bounds__ (int((ALT_DIGIT_BITS) ? - int(ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS) : - int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))) -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortDownsweepKernel( - const KeyT *d_keys_in, ///< [in] Input keys buffer - KeyT *d_keys_out, ///< [in] Output keys buffer - const ValueT *d_values_in, ///< [in] Input values buffer - ValueT *d_values_out, ///< [in] Output values buffer - OffsetT *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) - OffsetT num_items, ///< [in] Total number of input data items - int current_bit, ///< [in] Bit position of current radix digit - int num_bits, ///< [in] Number of bits of current radix digit - GridEvenShare even_share, ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block - DecomposerT decomposer = {}) +template +__launch_bounds__(int((ALT_DIGIT_BITS) + ? int(ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS) + : int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))) + CUB_DETAIL_KERNEL_ATTRIBUTES + void DeviceRadixSortDownsweepKernel(const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + OffsetT *d_spine, + OffsetT num_items, + int current_bit, + int num_bits, + GridEvenShare even_share, + DecomposerT decomposer = {}) { using ActiveUpsweepPolicyT = cub::detail::conditional_t< @@ -250,27 +339,62 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortDownsweepKernel( even_share.block_end); } - /** - * Single pass kernel entry point (single-block). Fully sorts a tile of input. + * @brief Single pass kernel entry point (single-block). + * Fully sorts a tile of input. 
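// (Illustration only, not part of this refactor.) A single-threaded reference for what one radix
// digit pass computes, mirroring the division of labor among the upsweep (digit histogram),
// spine-scan (exclusive prefix sum over bins) and downsweep (stable scatter) kernels above; the
// device code additionally privatizes the histogram per thread block. Names and the sequential
// form are assumptions for the sketch.
#include <cstddef>
#include <cstdint>
#include <vector>

void RadixDigitPassReference(const std::vector<std::uint32_t> &keys_in,
                             std::vector<std::uint32_t> &keys_out,
                             int current_bit,
                             int num_bits)
{
    const std::uint32_t radix = 1u << num_bits;
    const std::uint32_t mask  = radix - 1;

    // "Upsweep": histogram of the current digit
    std::vector<std::size_t> bin_count(radix, 0);
    for (std::uint32_t key : keys_in)
    {
        ++bin_count[(key >> current_bit) & mask];
    }

    // "Spine scan": exclusive prefix sum of bin counts yields each bin's base offset
    std::vector<std::size_t> bin_offset(radix, 0);
    for (std::uint32_t bin = 1; bin < radix; ++bin)
    {
        bin_offset[bin] = bin_offset[bin - 1] + bin_count[bin - 1];
    }

    // "Downsweep": stable scatter of keys into their bins
    keys_out.resize(keys_in.size());
    for (std::uint32_t key : keys_in)
    {
        keys_out[bin_offset[(key >> current_bit) & mask]++] = key;
    }
}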
+ * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam IS_DESCENDING + * Whether or not the sorted-order is high-to-low + * + * @tparam KeyT + * Key type + * + * @tparam ValueT + * Value type + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @param[in] d_keys_in + * Input keys buffer + * + * @param[in] d_keys_out + * Output keys buffer + * + * @param[in] d_values_in + * Input values buffer + * + * @param[in] d_values_out + * Output values buffer + * + * @param[in] num_items + * Total number of input data items + * + * @param[in] current_bit + * Bit position of current radix digit + * + * @param[in] end_bit + * The past-the-end (most-significant) bit index needed for key comparison */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetT, ///< Signed integer type for global offsets - typename DecomposerT = detail::identity_decomposer_t> -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortSingleTileKernel( - const KeyT *d_keys_in, ///< [in] Input keys buffer - KeyT *d_keys_out, ///< [in] Output keys buffer - const ValueT *d_values_in, ///< [in] Input values buffer - ValueT *d_values_out, ///< [in] Output values buffer - OffsetT num_items, ///< [in] Total number of input data items - int current_bit, ///< [in] Bit position of current radix digit - int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison - DecomposerT decomposer = {}) +template +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), + 1) CUB_DETAIL_KERNEL_ATTRIBUTES + void DeviceRadixSortSingleTileKernel(const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + OffsetT num_items, + int current_bit, + int end_bit, + DecomposerT decomposer = {}) { // Constants enum @@ -370,34 +494,89 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortSingleTileKernel( } } - /** - * Segmented radix sorting pass (one block per segment) + * @brief Segmented radix sorting pass (one block per segment) + * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam ALT_DIGIT_BITS + * Whether or not to use the alternate (lower-bits) policy + * + * @tparam IS_DESCENDING + * Whether or not the sorted-order is high-to-low + * + * @tparam KeyT + * Key type + * + * @tparam ValueT + * Value type + * + * @tparam BeginOffsetIteratorT + * Random-access input iterator type for reading segment beginning offsets \iterator + * + * @tparam EndOffsetIteratorT + * Random-access input iterator type for reading segment ending offsets \iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @param[in] d_keys_in + * Input keys buffer + * + * @param[in] d_keys_out + * Output keys buffer + * + * @param[in] d_values_in + * Input values buffer + * + * @param[in] d_values_out + * Output values buffer + * + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of length @p num_segments, + * such that d_begin_offsets[i] is the first element of the ith + * data segment in d_keys_* and d_values_* + * + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length @p num_segments, + * such that d_end_offsets[i]-1 is the last element of the ith + * data 
segment in d_keys_* and d_values_*. + * If d_end_offsets[i]-1 <= d_begin_offsets[i], + * the ith is considered empty. + * + * @param[in] num_segments + * The number of segments that comprise the sorting data + * + * @param[in] current_bit + * Bit position of current radix digit + * + * @param[in] pass_bits + * Number of bits of current radix digit */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator - typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename DecomposerT = detail::identity_decomposer_t> -__launch_bounds__ (int((ALT_DIGIT_BITS) ? - ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : - ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedRadixSortKernel( - const KeyT *d_keys_in, ///< [in] Input keys buffer - KeyT *d_keys_out, ///< [in] Output keys buffer - const ValueT *d_values_in, ///< [in] Input values buffer - ValueT *d_values_out, ///< [in] Output values buffer - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data - int current_bit, ///< [in] Bit position of current radix digit - int pass_bits, ///< [in] Number of bits of current radix digit - DecomposerT decomposer = {}) +template +__launch_bounds__(int((ALT_DIGIT_BITS) + ? 
ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS + : ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) + CUB_DETAIL_KERNEL_ATTRIBUTES + void DeviceSegmentedRadixSortKernel(const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + int /*num_segments*/, + int current_bit, + int pass_bits, + DecomposerT decomposer = {}) { // // Constants @@ -695,12 +874,18 @@ template <> struct sm90_small_key_tuning<2, 16, 8> { static constexpr int thread ******************************************************************************/ /** - * Tuning policy for kernel specialization + * @brief Tuning policy for kernel specialization + * + * @tparam KeyT + * Key type + * + * @tparam ValueT + * Value type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetT> ///< Signed integer type for global offsets +template struct DeviceRadixSortPolicy { //------------------------------------------------------------------------------ @@ -1192,17 +1377,41 @@ struct DispatchRadixSort : SelectedPolicy // Problem state //------------------------------------------------------------------------------ - void *d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - OffsetT num_items; ///< [in] Number of items to sort - int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - int ptx_version; ///< [in] PTX version - bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers - DecomposerT decomposer; + /// Device-accessible allocation of temporary storage. + // When NULL, the required allocation size is written to @p temp_storage_bytes and no work is + // done. + void *d_temp_storage; + + /// Reference to size in bytes of @p d_temp_storage allocation + size_t &temp_storage_bytes; + + /// Double-buffer whose current buffer contains the unsorted input keys and, upon return, is + /// updated to point to the sorted output keys + DoubleBuffer &d_keys; + + /// Double-buffer whose current buffer contains the unsorted input values and, upon return, is + /// updated to point to the sorted output values + DoubleBuffer &d_values; + + /// Number of items to sort + OffsetT num_items; + + /// The beginning (least-significant) bit index needed for key comparison + int begin_bit; + + /// The past-the-end (most-significant) bit index needed for key comparison + int end_bit; + + /// CUDA stream to launch kernels within. Default is stream0. 
+ cudaStream_t stream; + + /// PTX version + int ptx_version; + + /// Whether is okay to overwrite source buffers + bool is_overwrite_okay; + + DecomposerT decomposer; //------------------------------------------------------------------------------ @@ -1271,13 +1480,21 @@ struct DispatchRadixSort : SelectedPolicy // Small-problem (single tile) invocation //------------------------------------------------------------------------------ - /// Invoke a single block to sort in-core - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename SingleTileKernelT> ///< Function type of cub::DeviceRadixSortSingleTileKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokeSingleTile( - SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel + /** + * @brief Invoke a single block to sort in-core + * + * @tparam ActivePolicyT + * Umbrella policy active for the target device + * + * @tparam SingleTileKernelT + * Function type of cub::DeviceRadixSortSingleTileKernel + * + * @param[in] single_tile_kernel + * Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + InvokeSingleTile(SingleTileKernelT single_tile_kernel) { cudaError error = cudaSuccess; do @@ -1786,19 +2003,47 @@ struct DispatchRadixSort : SelectedPolicy return error; } - /// Invocation (run multiple digit passes) - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename UpsweepKernelT, ///< Function type of cub::DeviceRadixSortUpsweepKernel - typename ScanKernelT, ///< Function type of cub::SpineScanKernel - typename DownsweepKernelT> ///< Function type of cub::DeviceRadixSortDownsweepKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokePasses( - UpsweepKernelT upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel - UpsweepKernelT alt_upsweep_kernel, ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel - ScanKernelT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel - DownsweepKernelT downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel - DownsweepKernelT alt_downsweep_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel + /** + * @brief Invocation (run multiple digit passes) + * + * @tparam ActivePolicyT + * Umbrella policy active for the target device + * + * @tparam UpsweepKernelT + * Function type of cub::DeviceRadixSortUpsweepKernel + * + * @tparam ScanKernelT + * Function type of cub::SpineScanKernel + * + * @tparam DownsweepKernelT + * Function type of cub::DeviceRadixSortDownsweepKernel + * + * @param[in] upsweep_kernel + * Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel + * + * @param[in] alt_upsweep_kernel + * Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel + * + * @param[in] scan_kernel + * Kernel function pointer to parameterization of cub::SpineScanKernel + * + * @param[in] downsweep_kernel + * Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel + * + * @param[in] alt_downsweep_kernel + * Alternate kernel function pointer to parameterization of + * cub::DeviceRadixSortDownsweepKernel + */ + template 
+ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + InvokePasses(UpsweepKernelT upsweep_kernel, + UpsweepKernelT alt_upsweep_kernel, + ScanKernelT scan_kernel, + DownsweepKernelT downsweep_kernel, + DownsweepKernelT alt_downsweep_kernel) { cudaError error = cudaSuccess; do @@ -1858,9 +2103,14 @@ struct DispatchRadixSort : SelectedPolicy void* allocations[3] = {}; size_t allocation_sizes[3] = { - spine_length * sizeof(OffsetT), // bytes needed for privatized block digit histograms - (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer - (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer + // bytes needed for privatized block digit histograms + spine_length * sizeof(OffsetT), + + // bytes needed for 3rd keys buffer + (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), + + // bytes needed for 3rd values buffer + (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), }; // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) @@ -2087,20 +2337,48 @@ struct DispatchRadixSort : SelectedPolicy //------------------------------------------------------------------------------ /** - * Internal dispatch routine + * @brief Internal dispatch routine + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When NULL, the required + * allocation size is written to @p temp_storage_bytes and no work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of @p d_temp_storage allocation + * + * @param[in,out] d_keys + * Double-buffer whose current buffer contains the unsorted input keys and, + * upon return, is updated to point to the sorted output keys + * + * @param[in,out] d_values + * Double-buffer whose current buffer contains the unsorted input values and, + * upon return, is updated to point to the sorted output values + * + * @param[in] num_items + * Number of items to sort + * + * @param[in] begin_bit + * The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * The past-the-end (most-significant) bit index needed for key comparison + * + * @param[in] is_overwrite_okay + * Whether is okay to overwrite source buffers + * + * @param[in] stream + * CUDA stream to launch kernels within. Default is stream0. */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - OffsetT num_items, ///< [in] Number of items to sort - int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison - bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. 
- DecomposerT decomposer = {}) + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + OffsetT num_items, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream, + DecomposerT decomposer = {}) { typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; @@ -2174,17 +2452,35 @@ struct DispatchRadixSort : SelectedPolicy ******************************************************************************/ /** - * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort + * @brief Utility class for dispatching the appropriately-tuned kernels for segmented device-wide + * radix sort + * + * @tparam IS_DESCENDING + * Whether or not the sorted-order is high-to-low + * + * @tparam KeyT + * Key type + * + * @tparam ValueT + * Value type + * + * @tparam BeginOffsetIteratorT + * Random-access input iterator type for reading segment beginning offsets \iterator + * + * @tparam EndOffsetIteratorT + * Random-access input iterator type for reading segment ending offsets \iterator + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator - typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename SelectedPolicy = DeviceRadixSortPolicy, - typename DecomposerT = detail::identity_decomposer_t> +template , + typename DecomposerT = detail::identity_decomposer_t> struct DispatchSegmentedRadixSort : SelectedPolicy { //------------------------------------------------------------------------------ @@ -2198,21 +2494,54 @@ struct DispatchSegmentedRadixSort : SelectedPolicy // Parameter members //------------------------------------------------------------------------------ - void *d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - OffsetT num_items; ///< [in] Number of items to sort - OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets; ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets; ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
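// (Usage sketch, not part of this refactor.) How the temporary-storage and DoubleBuffer members
// documented above are typically driven from user code through the public cub::DeviceRadixSort
// front end: query the storage size with a NULL d_temp_storage, allocate, sort, then read the
// result from the buffer the DoubleBuffer selector points at on return. Error handling is
// omitted and the pointer names are assumptions.
#include <cub/device/device_radix_sort.cuh>
#include <cuda_runtime.h>

void SortPairsExample(int *d_key_buf, int *d_key_alt_buf,
                      int *d_val_buf, int *d_val_alt_buf,
                      int num_items, cudaStream_t stream)
{
    cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
    cub::DoubleBuffer<int> d_values(d_val_buf, d_val_alt_buf);

    // First call: d_temp_storage is NULL, so only temp_storage_bytes is written
    void  *d_temp_storage     = nullptr;
    size_t temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
                                    d_keys, d_values, num_items,
                                    0, sizeof(int) * 8, stream);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call: performs the sort; the double buffers' selectors are updated to reference
    // the buffers holding the sorted output (the alternate buffers may be overwritten)
    cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
                                    d_keys, d_values, num_items,
                                    0, sizeof(int) * 8, stream);

    int *d_sorted_keys   = d_keys.Current();   // sorted keys
    int *d_sorted_values = d_values.Current(); // values permuted alongside the keys
    (void)d_sorted_keys;
    (void)d_sorted_values;

    cudaFree(d_temp_storage);
}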
- int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - int ptx_version; ///< [in] PTX version - bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers - DecomposerT decomposer; + /// Device-accessible allocation of temporary storage. When NULL, the required allocation size + /// is written to @p temp_storage_bytes and no work is done. + void *d_temp_storage; + + /// Reference to size in bytes of @p d_temp_storage allocation + size_t &temp_storage_bytes; + + /// Double-buffer whose current buffer contains the unsorted input keys and, upon return, is + /// updated to point to the sorted output keys + DoubleBuffer &d_keys; + + /// Double-buffer whose current buffer contains the unsorted input values and, upon return, is + /// updated to point to the sorted output values + DoubleBuffer &d_values; + + /// Number of items to sort + OffsetT num_items; + /// The number of segments that comprise the sorting data + OffsetT num_segments; + + /// Random-access input iterator to the sequence of beginning offsets of length @p num_segments, + /// such that d_begin_offsets[i] is the first element of the ith + /// data segment in d_keys_* and d_values_* + BeginOffsetIteratorT d_begin_offsets; + + /// Random-access input iterator to the sequence of ending offsets of length @p num_segments, + /// such that d_end_offsets[i]-1 is the last element of the ith + /// data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 + /// <= d_begin_offsets[i], the ith is considered empty. + EndOffsetIteratorT d_end_offsets; + + /// The beginning (least-significant) bit index needed for key comparison + int begin_bit; + + /// The past-the-end (most-significant) bit index needed for key comparison + int end_bit; + + /// CUDA stream to launch kernels within. Default is stream0. 
+ cudaStream_t stream; + + /// PTX version + int ptx_version; + + /// Whether is okay to overwrite source buffers + bool is_overwrite_okay; + + DecomposerT decomposer; //------------------------------------------------------------------------------ // Constructors @@ -2220,36 +2549,34 @@ struct DispatchSegmentedRadixSort : SelectedPolicy /// Constructor CUB_RUNTIME_FUNCTION __forceinline__ - DispatchSegmentedRadixSort( - void* d_temp_storage, - size_t &temp_storage_bytes, - DoubleBuffer &d_keys, - DoubleBuffer &d_values, - OffsetT num_items, - OffsetT num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - bool is_overwrite_okay, - cudaStream_t stream, - int ptx_version, - DecomposerT decomposer = {}) - : - d_temp_storage(d_temp_storage), - temp_storage_bytes(temp_storage_bytes), - d_keys(d_keys), - d_values(d_values), - num_items(num_items), - num_segments(num_segments), - d_begin_offsets(d_begin_offsets), - d_end_offsets(d_end_offsets), - begin_bit(begin_bit), - end_bit(end_bit), - stream(stream), - ptx_version(ptx_version), - is_overwrite_okay(is_overwrite_okay), - decomposer(decomposer) + DispatchSegmentedRadixSort(void *d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + OffsetT num_items, + OffsetT num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream, + int ptx_version, + DecomposerT decomposer = {}) + : d_temp_storage(d_temp_storage) + , temp_storage_bytes(temp_storage_bytes) + , d_keys(d_keys) + , d_values(d_values) + , num_items(num_items) + , num_segments(num_segments) + , d_begin_offsets(d_begin_offsets) + , d_end_offsets(d_end_offsets) + , begin_bit(begin_bit) + , end_bit(end_bit) + , stream(stream) + , ptx_version(ptx_version) + , is_overwrite_okay(is_overwrite_okay) + , decomposer(decomposer) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED @@ -2376,15 +2703,25 @@ struct DispatchSegmentedRadixSort : SelectedPolicy } }; - - /// Invocation (run multiple digit passes) - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename SegmentedKernelT> ///< Function type of cub::DeviceSegmentedRadixSortKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokePasses( - SegmentedKernelT segmented_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel - SegmentedKernelT alt_segmented_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel + /** + * @brief Invocation (run multiple digit passes) + * + * @tparam ActivePolicyT + * Umbrella policy active for the target device + * + * @tparam SegmentedKernelT + * Function type of cub::DeviceSegmentedRadixSortKernel + * + * @param[in] segmented_kernel + * Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel + * + * @param[in] alt_segmented_kernel + * Alternate kernel function pointer to parameterization of + * cub::DeviceSegmentedRadixSortKernel + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + InvokePasses(SegmentedKernelT segmented_kernel, SegmentedKernelT alt_segmented_kernel) { cudaError error = cudaSuccess; do @@ -2398,8 +2735,11 @@ struct DispatchSegmentedRadixSort : SelectedPolicy void* allocations[2] = {}; size_t allocation_sizes[2] = { - (is_overwrite_okay) ? 
0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer - (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer + // bytes needed for 3rd keys buffer + (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), + + // bytes needed for 3rd values buffer + (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), }; // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) @@ -2515,22 +2855,67 @@ struct DispatchSegmentedRadixSort : SelectedPolicy // Dispatch entrypoints //------------------------------------------------------------------------------ - - /// Internal dispatch routine - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - int num_items, ///< [in] Number of items to sort - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison - bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers - cudaStream_t stream) ///< [in] CUDA stream to launch kernels within. Default is stream0. + /** + * @brief Internal dispatch routine + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When NULL, the required allocation size + * is written to @p temp_storage_bytes and no work is done. 
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of @p d_temp_storage allocation + * + * @param[in,out] d_keys + * Double-buffer whose current buffer contains the unsorted input keys and, upon return, is + * updated to point to the sorted output keys + * + * @param[in,out] d_values + * Double-buffer whose current buffer contains the unsorted input values and, upon return, is + * updated to point to the sorted output values + * + * @param[in] num_items + * Number of items to sort + * + * @param[in] num_segments + * The number of segments that comprise the sorting data + * + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of length + * @p num_segments, such that d_begin_offsets[i] is the first element of the + * ith data segment in d_keys_* and d_values_* + * + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length @p num_segments, + * such that d_end_offsets[i]-1 is the last element of the ith + * data segment in d_keys_* and d_values_*. + * If d_end_offsets[i]-1 <= d_begin_offsets[i], + * the ith is considered empty. + * + * @param[in] begin_bit + * The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * The past-the-end (most-significant) bit index needed for key comparison + * + * @param[in] is_overwrite_okay + * Whether is okay to overwrite source buffers + * + * @param[in] stream + * CUDA stream to launch kernels within. Default is stream0. + */ + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + int num_items, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream) { typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index fda0d518d58..119d1e33e39 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -28,8 +28,9 @@ ******************************************************************************/ /** - * \file - * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + * @file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector + * multiplication (SpMV). */ #pragma once @@ -69,13 +70,22 @@ CUB_NAMESPACE_BEGIN *****************************************************************************/ /** - * Spmv search kernel. Identifies merge path starting coordinates for each tile. + * @brief Spmv search kernel. Identifies merge path starting coordinates for each tile. 
+ * + * @tparam AgentSpmvPolicyT + * Parameterized SpmvPolicy tuning policy type + * + * @tparam ValueT + * Matrix and vector value type + * + * @tparam OffsetT + * Signed integer type for sequence offsets + * + * @param[in] spmv_params + * SpMV input parameter bundle */ -template ///< Signed integer type for sequence offsets -CUB_DETAIL_KERNEL_ATTRIBUTES void -DeviceSpmv1ColKernel(SpmvParams spmv_params) ///< [in] SpMV input parameter bundle +template +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmv1ColKernel(SpmvParams spmv_params) { typedef CacheModifiedInputIterator VectorValueIteratorT; @@ -100,17 +110,33 @@ DeviceSpmv1ColKernel(SpmvParams spmv_params) ///< [in] SpMV inp } /** - * Spmv search kernel. Identifies merge path starting coordinates for each tile. + * @brief Spmv search kernel. Identifies merge path starting coordinates for each tile. + * + * @tparam SpmvPolicyT + * Parameterized SpmvPolicy tuning policy type + * + * @tparam OffsetT + * Signed integer type for sequence offsets + * + * @tparam CoordinateT + * Merge path coordinate type + * + * @tparam SpmvParamsT + * SpmvParams type + * + * @param[in] num_merge_tiles + * Number of SpMV merge tiles (spmv grid size) + * + * @param[out] d_tile_coordinates + * Pointer to the temporary array of tile starting coordinates + * + * @param[in] spmv_params + * SpMV input parameter bundle */ -template < - typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type - typename OffsetT, ///< Signed integer type for sequence offsets - typename CoordinateT, ///< Merge path coordinate type - typename SpmvParamsT> ///< SpmvParams type -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel( - int num_merge_tiles, ///< [in] Number of SpMV merge tiles (spmv grid size) - CoordinateT* d_tile_coordinates, ///< [out] Pointer to the temporary array of tile starting coordinates - SpmvParamsT spmv_params) ///< [in] SpMV input parameter bundle +template +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel(int num_merge_tiles, + CoordinateT *d_tile_coordinates, + SpmvParamsT spmv_params) { /// Constants enum @@ -148,26 +174,62 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel( } } - /** - * Spmv agent entry point + * @brief Spmv agent entry point + * + * @tparam SpmvPolicyT + * Parameterized SpmvPolicy tuning policy type + * + * @tparam ScanTileStateT + * Tile status interface type + * + * @tparam ValueT + * Matrix and vector value type + * + * @tparam OffsetT + * Signed integer type for sequence offsets + * + * @tparam CoordinateT + * Merge path coordinate type + * + * @tparam HAS_ALPHA + * Whether the input parameter Alpha is 1 + * + * @tparam HAS_BETA + * Whether the input parameter Beta is 0 + * + * @param[in] spmv_params + * SpMV input parameter bundle + * + * @param[in] d_tile_coordinates + * Pointer to the temporary array of tile starting coordinates + * + * @param[out] d_tile_carry_pairs + * Pointer to the temporary array carry-out dot product row-ids, one per block + * + * @param[in] num_tiles + * Number of merge tiles + * + * @param[in] tile_state + * Tile status interface for fixup reduce-by-key kernel + * + * @param[in] num_segment_fixup_tiles + * Number of reduce-by-key tiles (fixup grid size) */ -template < - typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type - typename ScanTileStateT, ///< Tile status interface type - typename ValueT, ///< Matrix and vector value type - typename OffsetT, ///< Signed integer type for sequence offsets - typename CoordinateT, ///< Merge path coordinate type - bool 
HAS_ALPHA, ///< Whether the input parameter Alpha is 1 - bool HAS_BETA> ///< Whether the input parameter Beta is 0 -__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS)) -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvKernel( - SpmvParams spmv_params, ///< [in] SpMV input parameter bundle - CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates - KeyValuePair* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block - int num_tiles, ///< [in] Number of merge tiles - ScanTileStateT tile_state, ///< [in] Tile status interface for fixup reduce-by-key kernel - int num_segment_fixup_tiles) ///< [in] Number of reduce-by-key tiles (fixup grid size) +template +__launch_bounds__(int(SpmvPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES + void DeviceSpmvKernel(SpmvParams spmv_params, + CoordinateT *d_tile_coordinates, + KeyValuePair *d_tile_carry_pairs, + int num_tiles, + ScanTileStateT tile_state, + int num_segment_fixup_tiles) { // Spmv agent type specialization typedef AgentSpmv< @@ -191,9 +253,17 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvKernel( } -template ///< Whether the input parameter Beta is 0 +/** + * @tparam ValueT + * Matrix and vector value type + * + * @tparam OffsetT + * Signed integer type for sequence offsets + * + * @tparam HAS_BETA + * Whether the input parameter Beta is 0 + */ +template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvEmptyMatrixKernel(SpmvParams spmv_params) { @@ -213,21 +283,49 @@ DeviceSpmvEmptyMatrixKernel(SpmvParams spmv_params) } /** - * Multi-block reduce-by-key sweep kernel entry point + * @brief Multi-block reduce-by-key sweep kernel entry point + * + * @tparam AgentSegmentFixupPolicyT + * Parameterized AgentSegmentFixupPolicy tuning policy type + * + * @tparam PairsInputIteratorT + * Random-access input iterator type for keys + * + * @tparam AggregatesOutputIteratorT + * Random-access output iterator type for values + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ScanTileStateT + * Tile status interface type + * + * @param[in] d_pairs_in + * Pointer to the array carry-out dot product row-ids, one per spmv block + * + * @param[in,out] d_aggregates_out + * Output value aggregates + * + * @param[in] num_items + * Total number of items to select from + * + * @param[in] num_tiles + * Total number of tiles for the entire problem + * + * @param[in] tile_state + * Tile status interface */ -template < - typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type - typename PairsInputIteratorT, ///< Random-access input iterator type for keys - typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values - typename OffsetT, ///< Signed integer type for global offsets - typename ScanTileStateT> ///< Tile status interface type -__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentFixupKernel( - PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block - AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates - OffsetT num_items, ///< [in] Total number of items to select from - int num_tiles, ///< [in] Total number of tiles for the entire problem - ScanTileStateT tile_state) ///< [in] Tile status interface +template +__launch_bounds__(int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES + void 
DeviceSegmentFixupKernel(PairsInputIteratorT d_pairs_in, + AggregatesOutputIteratorT d_aggregates_out, + OffsetT num_items, + int num_tiles, + ScanTileStateT tile_state) { // Thread block type for reducing tiles of value segments typedef AgentSegmentFixup< @@ -255,11 +353,15 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentFixupKernel( ******************************************************************************/ /** - * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv + * @brief Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv + * + * @tparam ValueT + * Matrix and vector value type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename ValueT, ///< Matrix and vector value type - typename OffsetT> ///< Signed integer type for global offsets +template struct DispatchSpmv { //--------------------------------------------------------------------- @@ -489,26 +591,74 @@ struct DispatchSpmv * * If the input is larger than a single tile, this method uses two-passes of * kernel invocations. + * + * @tparam Spmv1ColKernelT + * Function type of cub::DeviceSpmv1ColKernel + * + * @tparam SpmvSearchKernelT + * Function type of cub::AgentSpmvSearchKernel + * + * @tparam SpmvKernelT + * Function type of cub::AgentSpmvKernel + * + * @tparam SegmentFixupKernelT + * Function type of cub::DeviceSegmentFixupKernelT + * + * @tparam SpmvEmptyMatrixKernelT + * Function type of cub::DeviceSpmvEmptyMatrixKernel + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to + * @p temp_storage_bytes and no work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of \p d_temp_storage allocation + * + * @paramSpMV spmv_params + * input parameter bundle + * + * @param[in] stream + * CUDA stream to launch kernels within. Default is stream0. + * + * @param[in] spmv_1col_kernel + * Kernel function pointer to parameterization of DeviceSpmv1ColKernel + * + * @param[in] spmv_search_kernel + * Kernel function pointer to parameterization of AgentSpmvSearchKernel + * + * @param[in] spmv_kernel + * Kernel function pointer to parameterization of AgentSpmvKernel + * + * @param[in] segment_fixup_kernel + * Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel + * + * @param[in] spmv_empty_matrix_kernel + * Kernel function pointer to parameterization of cub::DeviceSpmvEmptyMatrixKernel + * + * @param[in] spmv_config + * Dispatch parameters that match the policy that @p spmv_kernel was compiled for + * + * @param[in] segment_fixup_config + * Dispatch parameters that match the policy that @p segment_fixup_kernel was compiled for */ - template < - typename Spmv1ColKernelT, ///< Function type of cub::DeviceSpmv1ColKernel - typename SpmvSearchKernelT, ///< Function type of cub::AgentSpmvSearchKernel - typename SpmvKernelT, ///< Function type of cub::AgentSpmvKernel - typename SegmentFixupKernelT, ///< Function type of cub::DeviceSegmentFixupKernelT - typename SpmvEmptyMatrixKernelT> ///< Function type of cub::DeviceSpmvEmptyMatrixKernel - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SpmvParamsT& spmv_params, ///< SpMV input parameter bundle - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - Spmv1ColKernelT spmv_1col_kernel, ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel - SpmvSearchKernelT spmv_search_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel - SpmvKernelT spmv_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel - SegmentFixupKernelT segment_fixup_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel - SpmvEmptyMatrixKernelT spmv_empty_matrix_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSpmvEmptyMatrixKernel - KernelConfig spmv_config, ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for - KernelConfig segment_fixup_config) ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for + template + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + SpmvParamsT &spmv_params, + cudaStream_t stream, + Spmv1ColKernelT spmv_1col_kernel, + SpmvSearchKernelT spmv_search_kernel, + SpmvKernelT spmv_kernel, + SegmentFixupKernelT segment_fixup_kernel, + SpmvEmptyMatrixKernelT spmv_empty_matrix_kernel, + KernelConfig spmv_config, + KernelConfig segment_fixup_config) { cudaError error = cudaSuccess; do @@ -821,14 +971,26 @@ struct DispatchSpmv } /** - * Internal dispatch routine for computing a device-wide reduction + * @brief Internal dispatch routine for computing a device-wide reduction + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to + * @p temp_storage_bytes and no work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of @p d_temp_storage allocation + * + * @param SpMV spmv_params + * input parameter bundle + * + * @param[in] stream + * [optional] CUDA stream to launch kernels within. Default is stream0. */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SpmvParamsT& spmv_params, ///< SpMV input parameter bundle - cudaStream_t stream = 0) ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
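The SpmvParams bundle threaded through the dispatch routine above carries the CSR arrays (values, row offsets, column indices), the dense x and y vectors, and the matrix extents; these are the same quantities a caller hands to the public cub::DeviceSpmv::CsrMV entry point. A minimal sketch under that assumption (pointer names are illustrative and all arrays are assumed to already reside on the device):

#include <cub/cub.cuh>

// Sketch: y = A * x for a CSR matrix with num_rows rows, num_cols columns,
// and num_nonzeros stored values.
void SpmvExample(float *d_values,          // num_nonzeros CSR values
                 int   *d_row_offsets,     // num_rows + 1 row offsets
                 int   *d_column_indices,  // num_nonzeros column indices
                 float *d_vector_x,        // dense input vector  (num_cols)
                 float *d_vector_y,        // dense output vector (num_rows)
                 int num_rows, int num_cols, int num_nonzeros)
{
  void  *d_temp_storage     = nullptr;
  size_t temp_storage_bytes = 0;

  // Size query, then the actual merge-based SpMV.
  cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
                         d_values, d_row_offsets, d_column_indices,
                         d_vector_x, d_vector_y,
                         num_rows, num_cols, num_nonzeros);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
                         d_values, d_row_offsets, d_column_indices,
                         d_vector_x, d_vector_y,
                         num_rows, num_cols, num_nonzeros);
  cudaFree(d_temp_storage);
}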
+ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + SpmvParamsT &spmv_params, + cudaStream_t stream = 0) { cudaError error = cudaSuccess; do @@ -875,5 +1037,3 @@ struct DispatchSpmv CUB_NAMESPACE_END - - diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index 36ab55be61c..c006e359044 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::DeviceSelect::UniqueByKey provides device-wide, parallel operations for selecting unique items by key from sequences of data items residing within device-accessible memory. + * @file + * cub::DeviceSelect::UniqueByKey provides device-wide, parallel operations for selecting unique + * items by key from sequences of data items residing within device-accessible memory. */ #include @@ -47,29 +48,81 @@ CUB_NAMESPACE_BEGIN *****************************************************************************/ /** - * Unique by key kernel entry point (multi-block) + * @brief Unique by key kernel entry point (multi-block) + * + * @tparam KeyInputIteratorT + * Random-access input iterator type for keys + * + * @tparam ValueInputIteratorT + * Random-access input iterator type for values + * + * @tparam KeyOutputIteratorT + * Random-access output iterator type for keys + * + * @tparam ValueOutputIteratorT + * Random-access output iterator type for values + * + * @tparam NumSelectedIteratorT + * Output iterator type for recording the number of items selected + * + * @tparam ScanTileStateT + * Tile status interface type + * + * @tparam EqualityOpT + * Equality operator type + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @param[in] d_keys_in + * Pointer to the input sequence of keys + * + * @param[in] d_values_in + * Pointer to the input sequence of values + * + * @param[out] d_keys_out + * Pointer to the output sequence of selected data items + * + * @param[out] d_values_out + * Pointer to the output sequence of selected data items + * + * @param[out] d_num_selected_out + * Pointer to the total number of items selected + * (i.e., length of @p d_keys_out or @p d_values_out) + * + * @param[in] tile_state + * Tile status interface + * + * @param[in] equality_op + * Equality operator + * + * @param[in] num_items + * Total number of input items + * (i.e., length of @p d_keys_in or @p d_values_in) + * + * @param[in] num_tiles + * Total number of tiles for the entire problem */ -template < - typename ChainedPolicyT, - typename KeyInputIteratorT, ///< Random-access input iterator type for keys - typename ValueInputIteratorT, ///< Random-access input iterator type for values - typename KeyOutputIteratorT, ///< Random-access output iterator type for keys - typename ValueOutputIteratorT, ///< Random-access output iterator type for values - typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected - typename ScanTileStateT, ///< Tile status interface type - typename EqualityOpT, ///< Equality operator type - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT::BLOCK_THREADS)) -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceUniqueByKeySweepKernel( - KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys - 
ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values - KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items - ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) - ScanTileStateT tile_state, ///< [in] Tile status interface - EqualityOpT equality_op, ///< [in] Equality operator - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) - int num_tiles) ///< [in] Total number of tiles for the entire problem +template +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT::BLOCK_THREADS)) + CUB_DETAIL_KERNEL_ATTRIBUTES + void DeviceUniqueByKeySweepKernel(KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + ScanTileStateT tile_state, + EqualityOpT equality_op, + OffsetT num_items, + int num_tiles) { using AgentUniqueByKeyPolicyT = typename ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT; @@ -98,17 +151,37 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceUniqueByKeySweepKernel( ******************************************************************************/ /** - * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect + * @brief Utility class for dispatching the appropriately-tuned kernels for DeviceSelect + * + * @tparam KeyInputIteratorT + * Random-access input iterator type for keys + * + * @tparam ValueInputIteratorT + * Random-access input iterator type for values + * + * @tparam KeyOutputIteratorT + * Random-access output iterator type for keys + * + * @tparam ValueOutputIteratorT + * Random-access output iterator type for values + * + * @tparam NumSelectedIteratorT + * Output iterator type for recording the number of items selected + * + * @tparam EqualityOpT + * Equality operator type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename KeyInputIteratorT, ///< Random-access input iterator type for keys - typename ValueInputIteratorT, ///< Random-access input iterator type for values - typename KeyOutputIteratorT, ///< Random-access output iterator type for keys - typename ValueOutputIteratorT, ///< Random-access output iterator type for values - typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected - typename EqualityOpT, ///< Equality operator type - typename OffsetT, ///< Signed integer type for global offsets - typename SelectedPolicy = DeviceUniqueByKeyPolicy> +template > struct DispatchUniqueByKey : SelectedPolicy { /****************************************************************************** @@ -127,41 +200,93 @@ struct DispatchUniqueByKey : SelectedPolicy // Tile status descriptor interface type using ScanTileStateT = ScanTileState; + /// Device-accessible allocation of temporary storage. When NULL, the required allocation size + /// is written to @p temp_storage_bytes and no work is done. + void *d_temp_storage; - void* d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t& temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeyInputIteratorT d_keys_in; ///< [in] Pointer to the input sequence of keys - ValueInputIteratorT d_values_in; ///< [in] Pointer to the input sequence of values - KeyOutputIteratorT d_keys_out; ///< [out] Pointer to the output sequence of selected data items - ValueOutputIteratorT d_values_out; ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out; ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) - EqualityOpT equality_op; ///< [in] Equality operator - OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) - cudaStream_t stream; ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + /// Reference to size in bytes of @p d_temp_storage allocation + size_t &temp_storage_bytes; - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchUniqueByKey( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys - ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values - KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items - ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) - EqualityOpT equality_op, ///< [in] Equality operator - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) - cudaStream_t stream ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - ): - d_temp_storage(d_temp_storage), - temp_storage_bytes(temp_storage_bytes), - d_keys_in(d_keys_in), - d_values_in(d_values_in), - d_keys_out(d_keys_out), - d_values_out(d_values_out), - d_num_selected_out(d_num_selected_out), - equality_op(equality_op), - num_items(num_items), - stream(stream) + /// Pointer to the input sequence of keys + KeyInputIteratorT d_keys_in; + + /// Pointer to the input sequence of values + ValueInputIteratorT d_values_in; + + /// Pointer to the output sequence of selected data items + KeyOutputIteratorT d_keys_out; + + /// Pointer to the output sequence of selected data items + ValueOutputIteratorT d_values_out; + + /// Pointer to the total number of items selected + /// (i.e., length of @p d_keys_out or @p d_values_out) + NumSelectedIteratorT d_num_selected_out; + + /// Equality operator + EqualityOpT equality_op; + + /// Total number of input items (i.e., length of @p d_keys_in or @p d_values_in) + OffsetT num_items; + + /// [optional] CUDA stream to launch kernels within. Default is stream0. + cudaStream_t stream; + + /** + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to + * @p temp_storage_bytes and no work is done. 
+ * + * @tparam temp_storage_bytes + * [in,out] Reference to size in bytes of @p d_temp_storage allocation + * + * @param[in] d_keys_in + * Pointer to the input sequence of keys + * + * @param[in] d_values_in + * Pointer to the input sequence of values + * + * @param[out] d_keys_out + * Pointer to the output sequence of selected data items + * + * @param[out] d_values_out + * Pointer to the output sequence of selected data items + * + * @param[out] d_num_selected_out + * Pointer to the total number of items selected + * (i.e., length of @p d_keys_out or @p d_values_out) + * + * @param[in] equality_op + * Equality operator + * + * @param[in] num_items + * Total number of input items (i.e., length of @p d_keys_in or @p d_values_in) + * + * @param[in] stream + * [optional] CUDA stream to launch kernels within. + * Default is stream0. + */ + CUB_RUNTIME_FUNCTION __forceinline__ DispatchUniqueByKey(void *d_temp_storage, + size_t &temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + EqualityOpT equality_op, + OffsetT num_items, + cudaStream_t stream) + : d_temp_storage(d_temp_storage) + , temp_storage_bytes(temp_storage_bytes) + , d_keys_in(d_keys_in) + , d_values_in(d_values_in) + , d_keys_out(d_keys_out) + , d_values_out(d_values_out) + , d_num_selected_out(d_num_selected_out) + , equality_op(equality_op) + , num_items(num_items) + , stream(stream) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED @@ -397,22 +522,54 @@ struct DispatchUniqueByKey : SelectedPolicy ); } - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys - ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values - KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items - ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) - EqualityOpT equality_op, ///< [in] Equality operator - OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream) ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + * @brief Internal dispatch routine + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to + * @p temp_storage_bytes and no work is done. 
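For context, the dispatch machinery above backs the public cub::DeviceSelect::UniqueByKey entry point, which keeps the first key of every run of consecutive equal keys together with its corresponding value and reports how many items were kept. A hedged host-side sketch with illustrative data:

#include <cub/cub.cuh>

// Sketch: keys   [0 2 2 9 5 5 5 8] -> d_keys_out   [0 2 9 5 8]
//         values [1 2 3 4 5 6 7 8] -> d_values_out [1 2 4 5 8], num_selected = 5
void UniqueByKeyExample(int *d_keys_in, int *d_values_in,
                        int *d_keys_out, int *d_values_out,
                        int *d_num_selected_out, int num_items /* 8 */)
{
  void  *d_temp_storage     = nullptr;
  size_t temp_storage_bytes = 0;

  // Size query
  cub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes,
                                 d_keys_in, d_values_in,
                                 d_keys_out, d_values_out,
                                 d_num_selected_out, num_items);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);

  // Selection pass
  cub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes,
                                 d_keys_in, d_values_in,
                                 d_keys_out, d_values_out,
                                 d_num_selected_out, num_items);
  cudaFree(d_temp_storage);
}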
+ * + * @param[in,out] &temp_storage_bytes + * Reference to size in bytes of @p d_temp_storage allocation + * + * @param[in] d_keys_in + * Pointer to the input sequence of keys + * + * @param[in] d_values_in + * Pointer to the input sequence of values + * + * @param[out] d_keys_out + * Pointer to the output sequence of selected data items + * + * @param[out] d_values_out + * Pointer to the output sequence of selected data items + * + * @param[out] d_num_selected_out + * Pointer to the total number of items selected + * (i.e., length of @p d_keys_out or @p d_values_out) + * + * @param[in] equality_op + * Equality operator + * + * @param[in] num_items + * Total number of input items (i.e., the length of @p d_in) + * + * @param[in] stream + * [optional] CUDA stream to launch kernels within. + * Default is stream0. + */ + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + EqualityOpT equality_op, + OffsetT num_items, + cudaStream_t stream) { using MaxPolicyT = typename DispatchUniqueByKey::MaxPolicy; diff --git a/cub/cub/grid/grid_even_share.cuh b/cub/cub/grid/grid_even_share.cuh index f86b885a5c8..bf990ecdddd 100644 --- a/cub/cub/grid/grid_even_share.cuh +++ b/cub/cub/grid/grid_even_share.cuh @@ -27,11 +27,12 @@ ******************************************************************************/ /** - * \file - * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly the same number of fixed-size work units (grains). + * @file + * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an + * "even-share" fashion. Each thread block gets roughly the same number of fixed-size work units + * (grains). */ - #pragma once #include "../config.cuh" @@ -50,17 +51,17 @@ CUB_NAMESPACE_BEGIN /** - * \addtogroup GridModule + * @addtogroup GridModule * @{ */ /** - * \brief GridEvenShare is a descriptor utility for distributing input among + * @brief GridEvenShare is a descriptor utility for distributing input among * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly * the same number of input tiles. * - * \par Overview + * @par Overview * Each thread block is assigned a consecutive sequence of input tiles. To help * preserve alignment and eliminate the overhead of guarded loads for all but the * last thread block, to GridEvenShare assigns one of three different amounts of @@ -69,7 +70,7 @@ CUB_NAMESPACE_BEGIN * last thread block may be partially-full if the input is not an even multiple of * the scheduling grain size. * - * \par + * @par * Before invoking a child grid, a parent thread will typically construct an * instance of GridEvenShare. The instance can be passed to child thread blocks * which can initialize their per-thread block offsets using \p BlockInit(). @@ -119,14 +120,22 @@ public: block_stride(0) {} - /** - * \brief Dispatch initializer. To be called prior prior to kernel launch. + * @brief Dispatch initializer. To be called prior prior to kernel launch. 
+ * + * @param num_items_ + * Total number of input items + * + * @param max_grid_size + * Maximum grid size allowable (actual grid size may be less if not warranted by the the + * number of input items) + * + * @param tile_items + * Number of data items per input tile */ - __host__ __device__ __forceinline__ void DispatchInit( - OffsetT num_items_, ///< Total number of input items - int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) - int tile_items) ///< Number of data items per input tile + __host__ __device__ __forceinline__ void DispatchInit(OffsetT num_items_, + int max_grid_size, + int tile_items) { this->block_offset = num_items_; // Initialize past-the-end this->block_end = num_items_; // Initialize past-the-end @@ -141,16 +150,14 @@ public: this->big_share_items = normal_share_items + tile_items; } - /** - * \brief Initializes ranges for the specified thread block index. Specialized - * for a "raking" access pattern in which each thread block is assigned a - * consecutive sequence of input tiles. + * @brief Initializes ranges for the specified thread block index. Specialized + * for a "raking" access pattern in which each thread block is assigned a + * consecutive sequence of input tiles. */ template - __device__ __forceinline__ void BlockInit( - int block_id, - Int2Type /*strategy_tag*/) + __device__ __forceinline__ void BlockInit(int block_id, + Int2Type /*strategy_tag*/) { block_stride = TILE_ITEMS; if (block_id < big_shares) @@ -169,46 +176,44 @@ public: // Else default past-the-end } - /** - * \brief Block-initialization, specialized for a "raking" access - * pattern in which each thread block is assigned a consecutive sequence - * of input tiles. + * @brief Block-initialization, specialized for a "raking" access + * pattern in which each thread block is assigned a consecutive sequence + * of input tiles. */ template - __device__ __forceinline__ void BlockInit( - int block_id, - Int2Type /*strategy_tag*/) + __device__ __forceinline__ void BlockInit(int block_id, + Int2Type /*strategy_tag*/) { block_stride = grid_size * TILE_ITEMS; block_offset = (block_id * TILE_ITEMS); block_end = num_items; } - /** - * \brief Block-initialization, specialized for "strip mining" access - * pattern in which the input tiles assigned to each thread block are - * separated by a stride equal to the the extent of the grid. + * @brief Block-initialization, specialized for "strip mining" access + * pattern in which the input tiles assigned to each thread block are + * separated by a stride equal to the the extent of the grid. */ - template < - int TILE_ITEMS, - GridMappingStrategy STRATEGY> + template __device__ __forceinline__ void BlockInit() { BlockInit(blockIdx.x, Int2Type()); } - /** - * \brief Block-initialization, specialized for a "raking" access - * pattern in which each thread block is assigned a consecutive sequence - * of input tiles. + * @brief Block-initialization, specialized for a "raking" access + * pattern in which each thread block is assigned a consecutive sequence + * of input tiles. 
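Putting DispatchInit and BlockInit together: the host describes the work and launches even_share.grid_size thread blocks, and each block initializes its own [block_offset, block_end) range before stepping through tiles by block_stride. A device-side sketch under those assumptions (the kernel, tile size, and per-item work are illustrative, not part of the library):

#include <cub/cub.cuh>

constexpr int TILE_ITEMS = 128;

// Illustrative kernel: one thread per item within each TILE_ITEMS-wide tile.
__global__ void EvenShareKernel(const int *d_in,
                                cub::GridEvenShare<int> even_share)
{
  // Strip-mined mapping: a block's consecutive tiles are grid_size tiles apart.
  even_share.BlockInit<TILE_ITEMS, cub::GRID_MAPPING_STRIP_MINE>();

  for (int tile_offset = even_share.block_offset;
       tile_offset < even_share.block_end;
       tile_offset += even_share.block_stride)
  {
    int idx = tile_offset + threadIdx.x;
    if (idx < even_share.num_items) // guard the ragged last tile
    {
      int item = d_in[idx];
      (void)item; // ... process item ...
    }
  }
}

// Host side (sketch):
//   cub::GridEvenShare<int> even_share;
//   even_share.DispatchInit(num_items, max_grid_size, TILE_ITEMS);
//   EvenShareKernel<<<even_share.grid_size, TILE_ITEMS>>>(d_in, even_share);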
+ * + * @param[in] block_offset + * Threadblock begin offset (inclusive) + * + * @param[in] block_end + * Threadblock end offset (exclusive) */ template - __device__ __forceinline__ void BlockInit( - OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) - OffsetT block_end) ///< [in] Threadblock end offset (exclusive) + __device__ __forceinline__ void BlockInit(OffsetT block_offset, OffsetT block_end) { this->block_offset = block_offset; this->block_end = block_end; @@ -220,8 +225,6 @@ public: - - /** @} */ // end group GridModule CUB_NAMESPACE_END diff --git a/cub/cub/grid/grid_queue.cuh b/cub/cub/grid/grid_queue.cuh index 7a8f2887cce..be6d02ed2c7 100644 --- a/cub/cub/grid/grid_queue.cuh +++ b/cub/cub/grid/grid_queue.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * cub::GridQueue is a descriptor utility for dynamic queue management. */ @@ -49,26 +49,26 @@ CUB_NAMESPACE_BEGIN /** - * \addtogroup GridModule + * @addtogroup GridModule * @{ */ /** - * \brief GridQueue is a descriptor utility for dynamic queue management. + * @brief GridQueue is a descriptor utility for dynamic queue management. * - * \par Overview + * @par Overview * GridQueue descriptors provides abstractions for "filling" or * "draining" globally-shared vectors. * - * \par + * @par * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, * returning a unique offset for the calling thread to write its items. * The GridQueue maintains the total "fill-size". The fill counter must be reset * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that * will be filling. * - * \par + * @par * Similarly, a "draining" GridQueue works by works by atomically-incrementing a * zero-initialized counter, returning a unique offset for the calling thread to * read its items. Threads can safely drain until the array's logical fill-size is @@ -77,11 +77,11 @@ CUB_NAMESPACE_BEGIN * will be filling. (For dynamic work distribution of existing data, the corresponding fill-size * is simply the number of elements in the array.) * - * \par + * @par * Iterative work management can be implemented simply with a pair of flip-flopping * work buffers, each with an associated set of fill and drain GridQueue descriptors. * - * \tparam OffsetT Signed integer type for global offsets + * @tparam OffsetT Signed integer type for global offsets */ template class GridQueue @@ -114,16 +114,20 @@ public: d_counters(NULL) {} - - /// Constructs a GridQueue descriptor around the device storage allocation - __host__ __device__ __forceinline__ GridQueue( - void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). - : - d_counters((OffsetT*) d_storage) + /** + * @brief Constructs a GridQueue descriptor around the device storage allocation + * + * @param d_storage + * Device allocation to back the GridQueue. Must be at least as big as + * AllocationSize(). + */ + __host__ __device__ __forceinline__ GridQueue(void *d_storage) + : d_counters((OffsetT *)d_storage) {} - - /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. + /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for + /// draining in the next kernel instance. 
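The fill/drain protocol described above can be sketched end to end: the host sizes the backing allocation, sets the fill-size with FillAndResetDrain prior to the draining kernel, and each thread of that kernel atomically claims offsets with Drain until the logical fill-size is exhausted. The kernel and launch parameters below are an illustrative sketch, not library code:

#include <cub/cub.cuh>

// Illustrative draining kernel: threads repeatedly claim one work item each
// until the queue's logical fill-size is exhausted.
__global__ void DrainKernel(cub::GridQueue<int> queue,
                            const int *d_work,
                            int fill_size)
{
  while (true)
  {
    int offset = queue.Drain(1); // unique offset per claim
    if (offset >= fill_size)
    {
      break; // queue logically empty
    }
    int item = d_work[offset];
    (void)item; // ... process item ...
  }
}

// Host side (sketch):
//   void *d_queue_storage = nullptr;
//   cudaMalloc(&d_queue_storage, cub::GridQueue<int>::AllocationSize());
//   cub::GridQueue<int> queue(d_queue_storage);
//   queue.FillAndResetDrain(fill_size);  // before launching the draining kernel
//   DrainKernel<<<grid_size, block_threads>>>(queue, d_work, fill_size);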
To be called by the host or by a kernel prior to that + /// which will be draining. __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain( OffsetT fill_size, cudaStream_t stream = 0) @@ -146,8 +150,8 @@ public: return result; } - - /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining. + /// This operation resets the drain so that it may advance to meet the existing fill-size. + /// To be called by the host or by a kernel prior to that which will be draining. __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; @@ -165,7 +169,8 @@ public: } - /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. + /// This operation resets the fill counter. + /// To be called by the host or by a kernel prior to that which will be filling. __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; @@ -203,14 +208,16 @@ public: } - /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from CUDA kernel. + /// Drain @p num_items from the queue. Returns offset from which to read items. + /// To be called from CUDA kernel. __device__ __forceinline__ OffsetT Drain(OffsetT num_items) { return atomicAdd(d_counters + DRAIN, num_items); } - /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from CUDA kernel. + /// Fill @p num_items into the queue. Returns offset from which to write items. + /// To be called from CUDA kernel. __device__ __forceinline__ OffsetT Fill(OffsetT num_items) { return atomicAdd(d_counters + FILL, num_items); diff --git a/cub/cub/iterator/arg_index_input_iterator.cuh b/cub/cub/iterator/arg_index_input_iterator.cuh index d895a53e72f..99ea0da98d4 100644 --- a/cub/cub/iterator/arg_index_input_iterator.cuh +++ b/cub/cub/iterator/arg_index_input_iterator.cuh @@ -58,29 +58,29 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ - /** - * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples). + * @brief A random-access input wrapper for pairing dereferenced values with their corresponding + * indices (forming \p KeyValuePair tuples). * - * \par Overview - * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT. - * Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose - * \p key field is \p i and whose \p value field is itr[i]. + * @par Overview + * - ArgIndexInputIteratorTwraps a random access input iterator @p itr of type @p InputIteratorT. + * Dereferencing an ArgIndexInputIteratorTat offset @p i produces a @p KeyValuePair value whose + * @p key field is @p i and whose @p value field is itr[i]. * - Can be used with any data type. * - Can be constructed, manipulated, and exchanged within and between host and device * functions. Wrapped host memory can only be dereferenced on the host, and wrapped * device memory can only be dereferenced on the device. * - Compatible with Thrust API v1.7 or newer. 
* - * \par Snippet - * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto + * @par Snippet + * The code snippet below illustrates the use of @p ArgIndexInputIteratorTto * dereference an array of doubles - * \par - * \code + * @par + * @code * #include // or equivalently * * // Declare, allocate, and initialize a device array @@ -102,11 +102,16 @@ CUB_NAMESPACE_BEGIN * item_offset_pair.value, * item_offset_pair.key); // 9.0 @ 6 * - * \endcode + * @endcode + * + * @tparam InputIteratorT + * The value type of the wrapped input iterator + * + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) * - * \tparam InputIteratorT The value type of the wrapped input iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - * \tparam OutputValueT The paired value type of the tuple (Default: value type of input iterator) + * @tparam OutputValueT + * The paired value type of the tuple (Default: value type of input iterator) */ template < typename InputIteratorT, @@ -117,22 +122,35 @@ class ArgIndexInputIterator public: // Required iterator traits - typedef ArgIndexInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef KeyValuePair value_type; ///< The type of the element the iterator can point to - typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to - typedef value_type reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef ArgIndexInputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef KeyValuePair value_type; + + /// The type of a pointer to an element the iterator can point to + typedef value_type *pointer; + + /// The type of a reference to an element the iterator can point to + typedef value_type reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: @@ -142,21 +160,25 @@ private: public: - /// Constructor - __host__ __device__ __forceinline__ ArgIndexInputIterator( - InputIteratorT itr, ///< Input iterator to wrap - difference_type offset = 0) ///< OffsetT (in items) from \p itr denoting the position of the iterator - : - itr(itr), - offset(offset) - {} - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - offset++; - return retval; + /** + * @param itr + * Input iterator to wrap + * + * @param offset + * OffsetT (in items) from @p itr denoting the position of the iterator + */ + __host__ __device__ __forceinline__ ArgIndexInputIterator(InputIteratorT itr, + difference_type offset = 0) + : itr(itr) + , offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + 
{ + self_type retval = *this; + offset++; + return retval; } /// Prefix increment diff --git a/cub/cub/iterator/cache_modified_input_iterator.cuh b/cub/cub/iterator/cache_modified_input_iterator.cuh index b42e5b3cb49..fab19a66b34 100644 --- a/cub/cub/iterator/cache_modified_input_iterator.cuh +++ b/cub/cub/iterator/cache_modified_input_iterator.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Random-access iterator types */ @@ -59,30 +59,30 @@ CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ - /** - * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. + * @brief A random-access input wrapper for dereferencing array values using a PTX cache load + * modifier. * - * \par Overview + * @par Overview * - CacheModifiedInputIterator is a random-access input iterator that wraps a native - * device pointer of type ValueType*. \p ValueType references are - * made by reading \p ValueType values through loads modified by \p MODIFIER. + * device pointer of type ValueType*. @p ValueType references are + * made by reading @p ValueType values through loads modified by @p MODIFIER. * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). * - Can be constructed, manipulated, and exchanged within and between host and device * functions, but can only be dereferenced within device functions. * - Compatible with Thrust API v1.7 or newer. * - * \par Snippet - * The code snippet below illustrates the use of \p CacheModifiedInputIterator to + * @par Snippet + * The code snippet below illustrates the use of @p CacheModifiedInputIterator to * dereference a device array of double using the "ldg" PTX load modifier * (i.e., load values through texture cache). 
- * \par - * \code + * @par + * @code * #include // or equivalently * * // Declare, allocate, and initialize a device array @@ -96,11 +96,16 @@ CUB_NAMESPACE_BEGIN * printf("%f\n", itr[1]); // 6.0 * printf("%f\n", itr[6]); // 9.0 * - * \endcode + * @endcode + * + * @tparam CacheLoadModifier + * The cub::CacheLoadModifier to use when accessing data + * + * @tparam ValueType + * The value type of this iterator * - * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data - * \tparam ValueType The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) */ template < CacheLoadModifier MODIFIER, @@ -111,22 +116,35 @@ class CacheModifiedInputIterator public: // Required iterator traits - typedef CacheModifiedInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to - typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef CacheModifiedInputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef ValueType value_type; + + /// The type of a pointer to an element the iterator can point to + typedef ValueType *pointer; + + /// The type of a reference to an element the iterator can point to + typedef ValueType reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::device_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION diff --git a/cub/cub/iterator/cache_modified_output_iterator.cuh b/cub/cub/iterator/cache_modified_output_iterator.cuh index 92024af04d8..0e0c1fc0c01 100644 --- a/cub/cub/iterator/cache_modified_output_iterator.cuh +++ b/cub/cub/iterator/cache_modified_output_iterator.cuh @@ -58,30 +58,30 @@ CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ /** - * \brief A random-access output wrapper for storing array values using a PTX cache-modifier. + * @brief A random-access output wrapper for storing array values using a PTX cache-modifier. * - * \par Overview + * @par Overview * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native - * device pointer of type ValueType*. \p ValueType references are - * made by writing \p ValueType values through stores modified by \p MODIFIER. + * device pointer of type ValueType*. @p ValueType references are + * made by writing @p ValueType values through stores modified by @p MODIFIER. 
* - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB", * "STORE_CG", "STORE_CS", "STORE_WT", etc.). * - Can be constructed, manipulated, and exchanged within and between host and device * functions, but can only be dereferenced within device functions. * - Compatible with Thrust API v1.7 or newer. * - * \par Snippet - * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to + * @par Snippet + * The code snippet below illustrates the use of @p CacheModifiedOutputIterator to * dereference a device array of doubles using the "wt" PTX load modifier * (i.e., write-through to system memory). - * \par - * \code + * @par + * @code * #include // or equivalently * * // Declare, allocate, and initialize a device array @@ -95,14 +95,19 @@ CUB_NAMESPACE_BEGIN * itr[1] = 66.0; * itr[55] = 24.0; * - * \endcode + * @endcode * - * \par Usage Considerations + * @par Usage Considerations * - Can only be dereferenced within device code * - * \tparam CacheStoreModifier The cub::CacheStoreModifier to use when accessing data - * \tparam ValueType The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * @tparam CacheStoreModifier + * The cub::CacheStoreModifier to use when accessing data + * + * @tparam ValueType + * The value type of this iterator + * + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) */ template < CacheStoreModifier MODIFIER, @@ -131,22 +136,35 @@ private: public: // Required iterator traits - typedef CacheModifiedOutputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef void value_type; ///< The type of the element the iterator can point to - typedef void pointer; ///< The type of a pointer to an element the iterator can point to - typedef Reference reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef CacheModifiedOutputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef void value_type; + + /// The type of a pointer to an element the iterator can point to + typedef void pointer; + + /// The type of a reference to an element the iterator can point to + typedef Reference reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::device_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: @@ -154,13 +172,13 @@ private: ValueType* ptr; public: - - /// Constructor + /** + * @param ptr + * Native pointer to wrap + */ template - __host__ __device__ __forceinline__ CacheModifiedOutputIterator( - QualifiedValueType* ptr) ///< Native pointer to wrap - : - ptr(const_cast::type *>(ptr)) + __host__ __device__ __forceinline__ CacheModifiedOutputIterator(QualifiedValueType *ptr) + : ptr(const_cast::type *>(ptr)) {} 
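As an editorial aside (not part of the patch itself), the two cache-modified wrappers are easiest to see side by side. The sketch below assumes hypothetical device buffers d_in/d_out and a hypothetical kernel name; it only illustrates that dereferencing these wrappers is valid in device code:

#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/iterator/cache_modified_output_iterator.cuh>

// Copy kernel sketch: loads go through the read-only (LDG) path, stores are written through.
__global__ void CopyThroughCache(const double *d_in, double *d_out, int num_items)
{
    cub::CacheModifiedInputIterator<cub::LOAD_LDG, double>  in(d_in);
    cub::CacheModifiedOutputIterator<cub::STORE_WT, double> out(d_out);

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < num_items)
    {
        out[i] = in[i]; // operator[] on these wrappers may only be used in device code
    }
}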
/// Postfix increment diff --git a/cub/cub/iterator/constant_input_iterator.cuh b/cub/cub/iterator/constant_input_iterator.cuh index 38816f5cbe1..86c1ce12753 100644 --- a/cub/cub/iterator/constant_input_iterator.cuh +++ b/cub/cub/iterator/constant_input_iterator.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Random-access iterator types */ @@ -58,27 +58,27 @@ CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ /** - * \brief A random-access input generator for dereferencing a sequence of homogeneous values + * @brief A random-access input generator for dereferencing a sequence of homogeneous values * - * \par Overview + * @par Overview * - Read references to a ConstantInputIteratorTiterator always return the supplied constant - * of type \p ValueType. + * of type @p ValueType. * - Can be used with any data type. * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device * functions. * - Compatible with Thrust API v1.7 or newer. * - * \par Snippet - * The code snippet below illustrates the use of \p ConstantInputIteratorTto + * @par Snippet + * The code snippet below illustrates the use of @p ConstantInputIteratorTto * dereference a sequence of homogeneous doubles. - * \par - * \code + * @par + * @code * #include // or equivalently * * cub::ConstantInputIterator itr(5.0); @@ -88,10 +88,13 @@ CUB_NAMESPACE_BEGIN * printf("%f\n", itr[2]); // 5.0 * printf("%f\n", itr[50]); // 5.0 * - * \endcode + * @endcode * - * \tparam ValueType The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * @tparam ValueType + * The value type of this iterator + * + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) */ template < typename ValueType, @@ -101,22 +104,35 @@ class ConstantInputIterator public: // Required iterator traits - typedef ConstantInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to - typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef ConstantInputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef ValueType value_type; + + /// The type of a pointer to an element the iterator can point to + typedef ValueType *pointer; + + /// The type of a reference to an element the iterator can point to + typedef ValueType reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: 
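A brief host-side sketch (an editorial addition; num_items, d_sum, and the allocation calls are assumptions, not part of the patch) showing how a ConstantInputIterator can stand in for a real input range in a device-wide algorithm, here summing num_items copies of 5.0 with cub::DeviceReduce::Sum:

#include <cub/cub.cuh>

cub::ConstantInputIterator<double> itr(5.0);

int     num_items          = 1000;
double *d_sum              = nullptr;   // device allocation holding the single result
void   *d_temp_storage     = nullptr;
size_t  temp_storage_bytes = 0;
cudaMalloc(&d_sum, sizeof(double));

// Usual two-phase CUB pattern: the first call sizes temporary storage,
// the second performs the reduction. Result: 5.0 * 1000 = 5000.0
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, itr, d_sum, num_items);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, itr, d_sum, num_items);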
@@ -124,26 +140,29 @@ private: ValueType val; OffsetT offset; #ifdef _WIN32 - OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) + // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) + OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; #endif public: - - /// Constructor - __host__ __device__ __forceinline__ ConstantInputIterator( - ValueType val, ///< Starting value for the iterator instance to report - OffsetT offset = 0) ///< Base offset - : - val(val), - offset(offset) + /** + * @param val + * Starting value for the iterator instance to report + * + * @param offset + * Base offset + */ + __host__ __device__ __forceinline__ ConstantInputIterator(ValueType val, OffsetT offset = 0) + : val(val) + , offset(offset) {} /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { - self_type retval = *this; - offset++; - return retval; + self_type retval = *this; + offset++; + return retval; } /// Prefix increment diff --git a/cub/cub/iterator/counting_input_iterator.cuh b/cub/cub/iterator/counting_input_iterator.cuh index dc0108ac6f8..25284df1cdc 100644 --- a/cub/cub/iterator/counting_input_iterator.cuh +++ b/cub/cub/iterator/counting_input_iterator.cuh @@ -57,25 +57,25 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ /** - * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. + * @brief A random-access input generator for dereferencing a sequence of incrementing integer values. * - * \par Overview - * - After initializing a CountingInputIteratorTto a certain integer \p base, read references - * at \p offset will return the value \p base + \p offset. + * @par Overview + * - After initializing a CountingInputIteratorTto a certain integer @p base, read references + * at @p offset will return the value @p base + @p offset. * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device * functions. * - Compatible with Thrust API v1.7 or newer. * - * \par Snippet - * The code snippet below illustrates the use of \p CountingInputIteratorTto + * @par Snippet + * The code snippet below illustrates the use of @p CountingInputIteratorTto * dereference a sequence of incrementing integers. 
- * \par - * \code + * @par + * @code * #include // or equivalently * * cub::CountingInputIterator itr(5); @@ -85,10 +85,13 @@ CUB_NAMESPACE_BEGIN * printf("%d\n", itr[2]); // 7 * printf("%d\n", itr[50]); // 55 * - * \endcode + * @endcode * - * \tparam ValueType The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * @tparam ValueType + * The value type of this iterator + * + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) */ template < typename ValueType, @@ -98,22 +101,35 @@ class CountingInputIterator public: // Required iterator traits - typedef CountingInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to - typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef CountingInputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef ValueType value_type; + + /// The type of a pointer to an element the iterator can point to + typedef ValueType *pointer; + + /// The type of a reference to an element the iterator can point to + typedef ValueType reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: @@ -121,20 +137,20 @@ private: ValueType val; public: - - /// Constructor - __host__ __device__ __forceinline__ CountingInputIterator( - const ValueType &val) ///< Starting value for the iterator instance to report - : - val(val) + /** + * @param val + * Starting value for the iterator instance to report + */ + __host__ __device__ __forceinline__ CountingInputIterator(const ValueType &val) + : val(val) {} /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { - self_type retval = *this; - val++; - return retval; + self_type retval = *this; + val++; + return retval; } /// Prefix increment diff --git a/cub/cub/iterator/discard_output_iterator.cuh b/cub/cub/iterator/discard_output_iterator.cuh index 66e764412b5..33048473595 100644 --- a/cub/cub/iterator/discard_output_iterator.cuh +++ b/cub/cub/iterator/discard_output_iterator.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Random-access iterator types */ @@ -55,13 +55,13 @@ CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ /** - * \brief A discard iterator + * @brief A discard iterator */ template class DiscardOutputIterator @@ -69,22 +69,35 @@ class DiscardOutputIterator public: // Required iterator traits - typedef 
DiscardOutputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef void value_type; ///< The type of the element the iterator can point to - typedef void pointer; ///< The type of a pointer to an element the iterator can point to - typedef void reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef DiscardOutputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef void value_type; + + /// The type of a pointer to an element the iterator can point to + typedef void pointer; + + /// The type of a reference to an element the iterator can point to + typedef void reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: @@ -97,20 +110,20 @@ private: #endif public: - - /// Constructor - __host__ __device__ __forceinline__ DiscardOutputIterator( - OffsetT offset = 0) ///< Base offset - : - offset(offset) + /** + * @param offset + * Base offset + */ + __host__ __device__ __forceinline__ DiscardOutputIterator(OffsetT offset = 0) + : offset(offset) {} /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { - self_type retval = *this; - offset++; - return retval; + self_type retval = *this; + offset++; + return retval; } /// Prefix increment diff --git a/cub/cub/iterator/tex_obj_input_iterator.cuh b/cub/cub/iterator/tex_obj_input_iterator.cuh index 24cdd165a34..df86618e849 100644 --- a/cub/cub/iterator/tex_obj_input_iterator.cuh +++ b/cub/cub/iterator/tex_obj_input_iterator.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Random-access iterator types */ @@ -60,16 +60,17 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ /** - * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses newer Kepler-style texture objects. + * @brief A random-access input wrapper for dereferencing array values through texture cache. + * Uses newer Kepler-style texture objects. * - * \par Overview + * @par Overview * - TexObjInputIterator wraps a native device pointer of type ValueType*. References * to elements are to be loaded through texture cache. * - Can be used to load any data type from memory through texture cache. @@ -80,11 +81,11 @@ CUB_NAMESPACE_BEGIN * created by the host thread, but can be used by any descendant kernel. * - Compatible with Thrust API v1.7 or newer. * - * \par Snippet - * The code snippet below illustrates the use of \p TexObjInputIterator to + * @par Snippet + * The code snippet below illustrates the use of @p TexObjInputIterator to * dereference a device array of doubles through texture cache. 
- * \par - * \code + * @par + * @code * #include // or equivalently * * // Declare, allocate, and initialize a device array @@ -104,10 +105,13 @@ CUB_NAMESPACE_BEGIN * ... * itr.UnbindTexture(); * - * \endcode + * @endcode * - * \tparam T The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * @tparam T + * The value type of this iterator + * + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) */ template < typename T, @@ -117,22 +121,35 @@ class TexObjInputIterator public: // Required iterator traits - typedef TexObjInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef T value_type; ///< The type of the element the iterator can point to - typedef T* pointer; ///< The type of a pointer to an element the iterator can point to - typedef T reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef TexObjInputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef T value_type; + + /// The type of a pointer to an element the iterator can point to + typedef T *pointer; + + /// The type of a reference to an element the iterator can point to + typedef T reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::device_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: @@ -161,12 +178,20 @@ public: tex_obj(0) {} - /// Use this iterator to bind \p ptr with a texture reference + /** + * @brief Use this iterator to bind @p ptr with a texture reference + * + * @param ptr + * Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + * + * @param bytes + * Number of bytes in the range + * + * @param tex_offset + * OffsetT (in items) from @p ptr denoting the position of the iterator + */ template - cudaError_t BindTexture( - QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment - size_t bytes, ///< Number of bytes in the range - size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator + cudaError_t BindTexture(QualifiedT *ptr, size_t bytes, size_t tex_offset = 0) { this->ptr = const_cast::type *>(ptr); this->tex_offset = static_cast(tex_offset); diff --git a/cub/cub/iterator/tex_ref_input_iterator.cuh b/cub/cub/iterator/tex_ref_input_iterator.cuh index 67217a0f803..5a91fd2874f 100644 --- a/cub/cub/iterator/tex_ref_input_iterator.cuh +++ b/cub/cub/iterator/tex_ref_input_iterator.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Random-access iterator types */ @@ -48,37 +48,37 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ /** - * \brief A 
random-access input wrapper for dereferencing array values through texture cache. + * @brief A random-access input wrapper for dereferencing array values through texture cache. * - * \deprecated [Since 1.13.0] The CUDA texture management APIs used by + * @deprecated [Since 1.13.0] The CUDA texture management APIs used by * TexRefInputIterator are deprecated. Use cub::TexObjInputIterator instead. * - * \par Overview + * @par Overview * - TexRefInputIterator wraps a native device pointer of type ValueType*. References * to elements are to be loaded through texture cache. * - Can be used to load any data type from memory through texture cache. * - Can be manipulated and exchanged within and between host and device * functions, can only be constructed within host functions, and can only be * dereferenced within device functions. - * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture + * - The @p UNIQUE_ID template parameter is used to statically name the underlying texture * reference. Only one TexRefInputIterator instance can be bound at any given time for a - * specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host + * specific combination of (1) data type @p T, (2) @p UNIQUE_ID, (3) host * thread, and (4) compilation .o unit. * - With regard to nested/dynamic parallelism, TexRefInputIterator iterators may only be * created by the host thread and used by a top-level kernel (i.e. the one which is launched * from the host). * - Compatible with Thrust API v1.7 or newer. * - * \par Snippet - * The code snippet below illustrates the use of \p TexRefInputIterator to + * @par Snippet + * The code snippet below illustrates the use of @p TexRefInputIterator to * dereference a device array of doubles through texture cache. - * \par - * \code + * @par + * @code * #include // or equivalently * * // Declare, allocate, and initialize a device array @@ -98,11 +98,16 @@ CUB_NAMESPACE_BEGIN * ... * itr.UnbindTexture(); * - * \endcode + * @endcode * - * \tparam T The value type of this iterator - * \tparam UNIQUE_ID A globally-unique identifier (within the compilation unit) to name the underlying texture reference - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * @tparam T + * The value type of this iterator + * + * @tparam UNIQUE_ID + * A globally-unique identifier (within the compilation unit) to name the underlying texture reference + * + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) */ template < typename T, diff --git a/cub/cub/iterator/transform_input_iterator.cuh b/cub/cub/iterator/transform_input_iterator.cuh index de69a64ae76..f0396f53081 100644 --- a/cub/cub/iterator/transform_input_iterator.cuh +++ b/cub/cub/iterator/transform_input_iterator.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Random-access iterator types */ @@ -57,29 +57,29 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ /** - * \brief A random-access input wrapper for transforming dereferenced values. + * @brief A random-access input wrapper for transforming dereferenced values. * - * \par Overview - * - TransformInputIteratorTwraps a unary conversion functor of type \p - * ConversionOp and a random-access input iterator of type InputIteratorT, - * using the former to produce references of type \p ValueType from the latter. 
+ * @par Overview + * - TransformInputIteratorTwraps a unary conversion functor of type + * @p ConversionOp and a random-access input iterator of type InputIteratorT, + * using the former to produce references of type @p ValueType from the latter. * - Can be used with any data type. * - Can be constructed, manipulated, and exchanged within and between host and device * functions. Wrapped host memory can only be dereferenced on the host, and wrapped * device memory can only be dereferenced on the device. * - Compatible with Thrust API v1.7 or newer. * - * \par Snippet - * The code snippet below illustrates the use of \p TransformInputIteratorTto + * @par Snippet + * The code snippet below illustrates the use of @p TransformInputIteratorTto * dereference an array of integers, tripling the values and converting them to doubles. - * \par - * \code + * @par + * @code * #include // or equivalently * * // Functor for tripling integer values and converting to doubles @@ -103,13 +103,20 @@ CUB_NAMESPACE_BEGIN * printf("%f\n", itr[1]); // 18.0 * printf("%f\n", itr[6]); // 27.0 * - * \endcode + * @endcode * - * \tparam ValueType The value type of this iterator - * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p ValueType. Must have member ValueType operator()(const InputType &datum). - * \tparam InputIteratorT The type of the wrapped input iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * @tparam ValueType + * The value type of this iterator * + * @tparam ConversionOp + * Unary functor type for mapping objects of type @p InputType to type @p ValueType. + * Must have member ValueType operator()(const InputType &datum). + * + * @tparam InputIteratorT + * The type of the wrapped input iterator + * + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) */ template < typename ValueType, @@ -121,22 +128,35 @@ class TransformInputIterator public: // Required iterator traits - typedef TransformInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to - typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef TransformInputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef ValueType value_type; + + /// The type of a pointer to an element the iterator can point to + typedef ValueType *pointer; + + /// The type of a reference to an element the iterator can point to + typedef ValueType reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // 
THRUST_VERSION private: @@ -145,22 +165,25 @@ private: InputIteratorT input_itr; public: - - /// Constructor - __host__ __device__ __forceinline__ TransformInputIterator( - InputIteratorT input_itr, ///< Input iterator to wrap - ConversionOp conversion_op) ///< Conversion functor to wrap - : - conversion_op(conversion_op), - input_itr(input_itr) + /** + * @param input_itr + * Input iterator to wrap + * + * @param conversion_op + * Conversion functor to wrap + */ + __host__ __device__ __forceinline__ TransformInputIterator(InputIteratorT input_itr, + ConversionOp conversion_op) + : conversion_op(conversion_op) + , input_itr(input_itr) {} /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { - self_type retval = *this; - input_itr++; - return retval; + self_type retval = *this; + input_itr++; + return retval; } /// Prefix increment diff --git a/cub/cub/thread/thread_load.cuh b/cub/cub/thread/thread_load.cuh index fc83bbe8c3a..8b4eafb6196 100644 --- a/cub/cub/thread/thread_load.cuh +++ b/cub/cub/thread/thread_load.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Thread utilities for reading memory using PTX cache modifiers. */ @@ -49,7 +49,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIo + * @addtogroup UtilIo * @{ */ @@ -58,30 +58,30 @@ CUB_NAMESPACE_BEGIN //----------------------------------------------------------------------------- /** - * \brief Enumeration of cache modifiers for memory load operations. + * @brief Enumeration of cache modifiers for memory load operations. */ enum CacheLoadModifier { - LOAD_DEFAULT, ///< Default (no modifier) - LOAD_CA, ///< Cache at all levels - LOAD_CG, ///< Cache at global level - LOAD_CS, ///< Cache streaming (likely to be accessed once) - LOAD_CV, ///< Cache as volatile (including cached system lines) - LOAD_LDG, ///< Cache as texture - LOAD_VOLATILE, ///< Volatile (any memory space) + LOAD_DEFAULT, ///< Default (no modifier) + LOAD_CA, ///< Cache at all levels + LOAD_CG, ///< Cache at global level + LOAD_CS, ///< Cache streaming (likely to be accessed once) + LOAD_CV, ///< Cache as volatile (including cached system lines) + LOAD_LDG, ///< Cache as texture + LOAD_VOLATILE, ///< Volatile (any memory space) }; - /** - * \name Thread I/O (cache modified) + * @name Thread I/O (cache modified) * @{ */ /** - * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. + * @brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. + * Can be used to load any data type. 
* - * \par Example - * \code + * @par Example + * @code * #include // or equivalently * * // 32-bit load using cache-global modifier: @@ -102,8 +102,11 @@ enum CacheLoadModifier * TestFoo val = cub::ThreadLoad(d_in + threadIdx.x); * \endcode * - * \tparam MODIFIER [inferred] CacheLoadModifier enumeration - * \tparam InputIteratorT [inferred] Input iterator type \iterator + * @tparam MODIFIER + * [inferred] CacheLoadModifier enumeration + * + * @tparam InputIteratorT + * [inferred] Input iterator type \iterator */ template @@ -339,7 +342,8 @@ __device__ __forceinline__ T ThreadLoadVolatilePointer( T *ptr, Int2Type /*is_primitive*/) { - typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying + // Word type for memcopying + typedef typename UnitWord::VolatileWord VolatileWord; constexpr int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index 8384b0cc401..5e1ba043fd9 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Thread utilities for sequential reduction over statically-sized array types */ @@ -50,19 +50,24 @@ CUB_NAMESPACE_BEGIN namespace internal { /** - * Sequential reduction over statically-sized array types + * @brief Sequential reduction over statically-sized array types + * + * @param[in] input + * Input array + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] prefix + * Prefix to seed reduction with */ -template < - int LENGTH, - typename T, - typename ReductionOp, - typename PrefixT, - typename AccumT = detail::accumulator_t> -__device__ __forceinline__ AccumT ThreadReduce( - T* input, ///< [in] Input array - ReductionOp reduction_op, ///< [in] Binary reduction operator - PrefixT prefix, ///< [in] Prefix to seed reduction with - Int2Type /*length*/) +template > +__device__ __forceinline__ AccumT +ThreadReduce(T *input, ReductionOp reduction_op, PrefixT prefix, Int2Type /*length*/) { AccumT retval = prefix; @@ -73,85 +78,122 @@ __device__ __forceinline__ AccumT ThreadReduce( return retval; } - /** - * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * @brief Perform a sequential reduction over @p LENGTH elements of the @p input array, + * seeded with the specified @p prefix. The aggregate is returned. + * + * @tparam LENGTH + * LengthT of input array + * + * @tparam T + * [inferred] The data type to be reduced. + * + * @tparam ReductionOp + * [inferred] Binary reduction operator type having member + * T operator()(const T &a, const T &b) * - * \tparam LENGTH LengthT of input array - * \tparam T [inferred] The data type to be reduced. 
- * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + * @param[in] input + * Input array + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] prefix + * Prefix to seed reduction with */ -template < - int LENGTH, - typename T, - typename ReductionOp, - typename PrefixT, - typename AccumT = detail::accumulator_t> -__device__ __forceinline__ AccumT ThreadReduce( - T* input, ///< [in] Input array - ReductionOp reduction_op, ///< [in] Binary reduction operator - PrefixT prefix) ///< [in] Prefix to seed reduction with +template > +__device__ __forceinline__ AccumT ThreadReduce(T *input, ReductionOp reduction_op, PrefixT prefix) { return ThreadReduce(input, reduction_op, prefix, Int2Type()); } - /** - * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. + * @brief Perform a sequential reduction over @p LENGTH elements of the @p input array. + * The aggregate is returned. + * + * @tparam LENGTH + * LengthT of input array + * + * @tparam T + * [inferred] The data type to be reduced. + * + * @tparam ReductionOp + * [inferred] Binary reduction operator type having member + * T operator()(const T &a, const T &b) * - * \tparam LENGTH LengthT of input array - * \tparam T [inferred] The data type to be reduced. - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + * @param[in] input + * Input array + * + * @param[in] reduction_op + * Binary reduction operator */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T* input, ///< [in] Input array - ReductionOp reduction_op) ///< [in] Binary reduction operator +template +__device__ __forceinline__ T ThreadReduce(T *input, ReductionOp reduction_op) { T prefix = input[0]; return ThreadReduce(input + 1, reduction_op, prefix); } - /** - * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * @brief Perform a sequential reduction over the statically-sized @p input array, + * seeded with the specified @p prefix. The aggregate is returned. + * + * @tparam LENGTH + * [inferred] LengthT of @p input array + * + * @tparam T + * [inferred] The data type to be reduced. + * + * @tparam ReductionOp + * [inferred] Binary reduction operator type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Input array * - * \tparam LENGTH [inferred] LengthT of \p input array - * \tparam T [inferred] The data type to be reduced. 
- * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] prefix + * Prefix to seed reduction with */ -template < - int LENGTH, - typename T, - typename ReductionOp, - typename PrefixT, - typename AccumT = detail::accumulator_t> -__device__ __forceinline__ AccumT ThreadReduce( - T (&input)[LENGTH], ///< [in] Input array - ReductionOp reduction_op, ///< [in] Binary reduction operator - PrefixT prefix) ///< [in] Prefix to seed reduction with +template > +__device__ __forceinline__ AccumT ThreadReduce(T (&input)[LENGTH], + ReductionOp reduction_op, + PrefixT prefix) { return ThreadReduce(input, reduction_op, prefix, Int2Type()); } - /** - * \brief Serial reduction with the specified operator + * @brief Serial reduction with the specified operator + * + * @tparam LENGTH + * [inferred] LengthT of @p input array + * + * @tparam T + * [inferred] The data type to be reduced. + * + * @tparam ReductionOp + * [inferred] Binary reduction operator type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Input array * - * \tparam LENGTH [inferred] LengthT of \p input array - * \tparam T [inferred] The data type to be reduced. - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + * @param[in] reduction_op + * Binary reduction operator */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T (&input)[LENGTH], ///< [in] Input array - ReductionOp reduction_op) ///< [in] Binary reduction operator +template +__device__ __forceinline__ T ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op) { return ThreadReduce((T*) input, reduction_op); } diff --git a/cub/cub/thread/thread_scan.cuh b/cub/cub/thread/thread_scan.cuh index bc3840ec9c8..c49777440db 100644 --- a/cub/cub/thread/thread_scan.cuh +++ b/cub/cub/thread/thread_scan.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Thread utilities for sequential prefix scan over statically-sized array types */ @@ -50,26 +50,32 @@ namespace internal { /** - * \addtogroup UtilModule + * @addtogroup UtilModule * @{ */ /** - * \name Sequential prefix scan over statically-sized array types + * @name Sequential prefix scan over statically-sized array types * @{ */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanExclusive( - T inclusive, - T exclusive, - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - Int2Type /*length*/) +/** + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan operator + */ +template +__device__ __forceinline__ T ThreadScanExclusive(T inclusive, + T exclusive, + T *input, + T *output, + ScanOp scan_op, + Int2Type /*length*/) { #pragma unroll for (int i = 0; i < LENGTH; ++i) @@ -82,25 +88,40 @@ __device__ __forceinline__ T ThreadScanExclusive( return inclusive; } - - /** - * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. 
+ * @brief Perform a sequential exclusive prefix scan over @p LENGTH elements of + * the @p input array, seeded with the specified @p prefix. The aggregate is returned. + * + * @tparam LENGTH + * LengthT of @p input and @p output arrays + * + * @tparam T + * [inferred] The data type to be scanned. * - * \tparam LENGTH LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + * @tparam ScanOp + * [inferred] Binary scan operator type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] prefix + * Prefix to seed scan with + * + * @param[in] apply_prefix + * Whether or not the calling thread should apply its prefix. + * If not, the first output element is undefined. + * (Handy for preventing thread-0 from applying a prefix.) */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanExclusive( - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.) +template +__device__ __forceinline__ T +ThreadScanExclusive(T *input, T *output, ScanOp scan_op, T prefix, bool apply_prefix = true) { T inclusive = input[0]; if (apply_prefix) @@ -113,46 +134,59 @@ __device__ __forceinline__ T ThreadScanExclusive( return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type()); } - /** - * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * @brief Perform a sequential exclusive prefix scan over the statically-sized + * @p input array, seeded with the specified @p prefix. The aggregate is returned. + * + * @tparam LENGTH + * [inferred] LengthT of @p input and @p output arrays + * + * @tparam T + * [inferred] The data type to be scanned. * - * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + * @tparam ScanOp + * [inferred] Binary scan operator type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] prefix + * Prefix to seed scan with + * + * @param[in] apply_prefix + * Whether or not the calling thread should apply its prefix. + * (Handy for preventing thread-0 from applying a prefix.) */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanExclusive( - T (&input)[LENGTH], ///< [in] Input array - T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. 
(Handy for preventing thread-0 from applying a prefix.) +template +__device__ __forceinline__ T ThreadScanExclusive(T (&input)[LENGTH], + T (&output)[LENGTH], + ScanOp scan_op, + T prefix, + bool apply_prefix = true) { - return ThreadScanExclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); + return ThreadScanExclusive((T *)input, (T *)output, scan_op, prefix, apply_prefix); } - - - - - - - - -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T inclusive, - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - Int2Type /*length*/) +/** + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan operator + */ +template +__device__ __forceinline__ T +ThreadScanInclusive(T inclusive, T *input, T *output, ScanOp scan_op, Int2Type /*length*/) { #pragma unroll for (int i = 0; i < LENGTH; ++i) @@ -164,22 +198,31 @@ __device__ __forceinline__ T ThreadScanInclusive( return inclusive; } - /** - * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned. + * @brief Perform a sequential inclusive prefix scan over + * @p LENGTH elements of the @p input array. The aggregate is returned. + * + * @tparam LENGTH + * LengthT of @p input and @p output arrays + * + * @tparam T + * [inferred] The data type to be scanned. + * + * @tparam ScanOp + * [inferred] Binary scan operator type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) * - * \tparam LENGTH LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + * @param[in] scan_op + * Binary scan operator */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator +template +__device__ __forceinline__ T ThreadScanInclusive(T *input, T *output, ScanOp scan_op) { T inclusive = input[0]; output[0] = inclusive; @@ -188,44 +231,71 @@ __device__ __forceinline__ T ThreadScanInclusive( return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); } - /** - * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned. + * @brief Perform a sequential inclusive prefix scan over the + * statically-sized @p input array. The aggregate is returned. + * + * @tparam LENGTH + * [inferred] LengthT of @p input and @p output arrays + * + * @tparam T + * [inferred] The data type to be scanned. + * + * @tparam ScanOp + * [inferred] Binary scan operator type having member + * T operator()(const T &a, const T &b) * - * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. 
- * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan operator */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T (&input)[LENGTH], ///< [in] Input array - T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator +template +__device__ __forceinline__ T ThreadScanInclusive(T (&input)[LENGTH], + T (&output)[LENGTH], + ScanOp scan_op) { return ThreadScanInclusive((T*) input, (T*) output, scan_op); } - /** - * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * @brief Perform a sequential inclusive prefix scan over + * @p LENGTH elements of the @p input array, seeded with the + * specified @p prefix. The aggregate is returned. * - * \tparam LENGTH LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + * @tparam LENGTH + * LengthT of @p input and @p output arrays + * + * @tparam T + * [inferred] The data type to be scanned. + * + * @tparam ScanOp + * [inferred] Binary scan operator type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] prefix + * Prefix to seed scan with + * + * @param[in] apply_prefix + * Whether or not the calling thread should apply its prefix. + * (Handy for preventing thread-0 from applying a prefix.) */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +template +__device__ __forceinline__ T +ThreadScanInclusive(T *input, T *output, ScanOp scan_op, T prefix, bool apply_prefix = true) { T inclusive = input[0]; if (apply_prefix) @@ -238,24 +308,43 @@ __device__ __forceinline__ T ThreadScanInclusive( return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); } - /** - * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * @brief Perform a sequential inclusive prefix scan over the + * statically-sized @p input array, seeded with the specified @p prefix. + * The aggregate is returned. + * + * @tparam LENGTH + * [inferred] LengthT of @p input and @p output arrays + * + * @tparam T + * [inferred] The data type to be scanned. 
+ * + * @tparam ScanOp + * [inferred] Binary scan operator type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] prefix + * Prefix to seed scan with * - * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + * @param[in] apply_prefix + * Whether or not the calling thread should apply its prefix. + * (Handy for preventing thread-0 from applying a prefix.) */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T (&input)[LENGTH], ///< [in] Input array - T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +template +__device__ __forceinline__ T ThreadScanInclusive(T (&input)[LENGTH], + T (&output)[LENGTH], + ScanOp scan_op, + T prefix, + bool apply_prefix = true) { return ThreadScanInclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); } diff --git a/cub/cub/thread/thread_search.cuh b/cub/cub/thread/thread_search.cuh index e18caf09a09..8d259aadfbe 100644 --- a/cub/cub/thread/thread_search.cuh +++ b/cub/cub/thread/thread_search.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Thread utilities for sequential search */ @@ -91,19 +91,21 @@ __host__ __device__ __forceinline__ void MergePathSearch( path_coordinate.y = diagonal - split_min; } - - /** - * \brief Returns the offset of the first value within \p input which does not compare less than \p val + * @brief Returns the offset of the first value within @p input which does not compare + * less than @p val + * + * @param[in] input + * Input sequence + * + * @param[in] num_items + * Input sequence length + * + * @param[in] val + * Search key */ -template < - typename InputIteratorT, - typename OffsetT, - typename T> -__device__ __forceinline__ OffsetT LowerBound( - InputIteratorT input, ///< [in] Input sequence - OffsetT num_items, ///< [in] Input sequence length - T val) ///< [in] Search key +template +__device__ __forceinline__ OffsetT LowerBound(InputIteratorT input, OffsetT num_items, T val) { OffsetT retval = 0; while (num_items > 0) @@ -123,18 +125,21 @@ __device__ __forceinline__ OffsetT LowerBound( return retval; } - /** - * \brief Returns the offset of the first value within \p input which compares greater than \p val + * @brief Returns the offset of the first value within @p input which compares + * greater than @p val + * + * @param[in] input + * Input sequence + * + * @param[in] num_items + * Input sequence length + * + * @param[in] val + * Search key */ -template < - typename InputIteratorT, - typename OffsetT, - typename T> -__device__ __forceinline__ OffsetT UpperBound( - InputIteratorT input, ///< [in] Input sequence - OffsetT num_items, ///< [in] Input sequence length - T val) ///< [in] Search key +template +__device__ __forceinline__ OffsetT UpperBound(InputIteratorT input, OffsetT num_items, T val) { OffsetT retval = 0; while (num_items > 0) @@ -156,13 
+161,18 @@ __device__ __forceinline__ OffsetT UpperBound( #if defined(__CUDA_FP16_TYPES_EXIST__) -template < - typename InputIteratorT, - typename OffsetT> -__device__ __forceinline__ OffsetT UpperBound( - InputIteratorT input, ///< [in] Input sequence - OffsetT num_items, ///< [in] Input sequence length - __half val) ///< [in] Search key +/** + * @param[in] input + * Input sequence + * + * @param[in] num_items + * Input sequence length + * + * @param[in] val + * Search key + */ +template +__device__ __forceinline__ OffsetT UpperBound(InputIteratorT input, OffsetT num_items, __half val) { OffsetT retval = 0; while (num_items > 0) diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh index 9d24aa54ad9..bb2b4675827 100644 --- a/cub/cub/thread/thread_store.cuh +++ b/cub/cub/thread/thread_store.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Thread utilities for writing memory using PTX cache modifiers. */ @@ -47,7 +47,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIo + * @addtogroup UtilIo * @{ */ @@ -57,29 +57,29 @@ CUB_NAMESPACE_BEGIN //----------------------------------------------------------------------------- /** - * \brief Enumeration of cache modifiers for memory store operations. + * @brief Enumeration of cache modifiers for memory store operations. */ enum CacheStoreModifier { - STORE_DEFAULT, ///< Default (no modifier) - STORE_WB, ///< Cache write-back all coherent levels - STORE_CG, ///< Cache at global level - STORE_CS, ///< Cache streaming (likely to be accessed once) - STORE_WT, ///< Cache write-through (to system memory) - STORE_VOLATILE, ///< Volatile shared (any memory space) + STORE_DEFAULT, ///< Default (no modifier) + STORE_WB, ///< Cache write-back all coherent levels + STORE_CG, ///< Cache at global level + STORE_CS, ///< Cache streaming (likely to be accessed once) + STORE_WT, ///< Cache write-through (to system memory) + STORE_VOLATILE, ///< Volatile shared (any memory space) }; - /** - * \name Thread I/O (cache modified) + * @name Thread I/O (cache modified) * @{ */ /** - * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. + * @brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. + * Can be used to store any data type. 
* - * \par Example - * \code + * @par Example + * @code * #include // or equivalently * * // 32-bit store using cache-global modifier: @@ -102,11 +102,16 @@ enum CacheStoreModifier * TestFoo *d_struct; * TestFoo val; * cub::ThreadStore(d_out + threadIdx.x, val); - * \endcode + * @endcode + * + * @tparam MODIFIER + * [inferred] CacheStoreModifier enumeration + * + * @tparam InputIteratorT + * [inferred] Output iterator type \iterator * - * \tparam MODIFIER [inferred] CacheStoreModifier enumeration - * \tparam InputIteratorT [inferred] Output iterator type \iterator - * \tparam T [inferred] Data type of output value + * @tparam T + * [inferred] Data type of output value */ template < CacheStoreModifier MODIFIER, diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh index 8172dd919f8..8e3978f18ce 100644 --- a/cub/cub/util_allocator.cuh +++ b/cub/cub/util_allocator.cuh @@ -54,7 +54,7 @@ CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilMgmt + * @addtogroup UtilMgmt * @{ */ @@ -64,40 +64,40 @@ CUB_NAMESPACE_BEGIN ******************************************************************************/ /** - * \brief A simple caching allocator for device memory allocations. + * @brief A simple caching allocator for device memory allocations. * - * \par Overview + * @par Overview * The allocator is thread-safe and stream-safe and is capable of managing cached * device allocations on multiple devices. It behaves as follows: * - * \par - * - Allocations from the allocator are associated with an \p active_stream. Once freed, - * the allocation becomes available immediately for reuse within the \p active_stream + * @par + * - Allocations from the allocator are associated with an @p active_stream. Once freed, + * the allocation becomes available immediately for reuse within the @p active_stream * with which it was associated with during allocation, and it becomes available for - * reuse within other streams when all prior work submitted to \p active_stream has completed. - * - Allocations are categorized and cached by bin size. A new allocation request of + * reuse within other streams when all prior work submitted to @p active_stream has completed. + * - Allocations are categorized and cached by bin size. A new allocation request of * a given size will only consider cached allocations within the corresponding bin. * - Bin limits progress geometrically in accordance with the growth factor - * \p bin_growth provided during construction. Unused device allocations within + * @p bin_growth provided during construction. Unused device allocations within * a larger bin cache are not reused for allocation requests that categorize to * smaller bin sizes. - * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to - * (\p bin_growth ^ \p min_bin). - * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * - Allocation requests below ( @p bin_growth ^ @p min_bin ) are rounded up to + * ( @p bin_growth ^ @p min_bin ). + * - Allocations above ( @p bin_growth ^ @p max_bin ) are not rounded up to the nearest * bin and are simply freed when they are deallocated instead of being returned * to a bin-cache. * - If the total storage of cached allocations on a given device will exceed - * \p max_cached_bytes, allocations for that device are simply freed when they are + * @p max_cached_bytes, allocations for that device are simply freed when they are * deallocated instead of being returned to their bin-cache. 
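An editorial usage sketch (not part of the original documentation; the stream and sizes are hypothetical) of how allocations are requested from and returned to the bin-cache described above:

#include <cub/util_allocator.cuh>

cub::CachingDeviceAllocator allocator;   // default-constructed: bin_growth=8, min_bin=3, max_bin=7

void        *d_scratch = nullptr;
cudaStream_t stream    = 0;              // any stream; the block becomes associated with it

// 4096 bytes = 8^4, so the request maps exactly to the 4KB bin.
allocator.DeviceAllocate(&d_scratch, 4096, stream);
// ... launch kernels on `stream` that use d_scratch ...
allocator.DeviceFree(d_scratch);         // returned to the bin-cache: immediately reusable on `stream`,
                                         // and on other streams once prior work on `stream` completes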
* - * \par + * @par * For example, the default-constructed CachingDeviceAllocator is configured with: - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = 6MB - 1B + * - @p bin_growth = 8 + * - @p min_bin = 3 + * - @p max_bin = 7 + * - @p max_cached_bytes = 6MB - 1B * - * \par + * @par * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB * and sets a maximum of 6,291,455 cached bytes per device * @@ -129,40 +129,52 @@ struct CachingDeviceAllocator */ struct BlockDescriptor { - void* d_ptr; // Device pointer - size_t bytes; // Size of allocation in bytes - unsigned int bin; // Bin enumeration - int device; // device ordinal - cudaStream_t associated_stream; // Associated associated_stream - cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed - - // Constructor (suitable for searching maps for a specific block, given its pointer and device) - BlockDescriptor(void *d_ptr, int device) : - d_ptr(d_ptr), - bytes(0), - bin(INVALID_BIN), - device(device), - associated_stream(0), - ready_event(0) + // Device pointer + void *d_ptr; + + // Size of allocation in bytes + size_t bytes; + + // Bin enumeration + unsigned int bin; + + // device ordinal + int device; + + // Associated associated_stream + cudaStream_t associated_stream; + + // Signal when associated stream has run to the point at which this block was freed + cudaEvent_t ready_event; + + // Constructor (suitable for searching maps for a specific block, given its pointer and + // device) + BlockDescriptor(void *d_ptr, int device) + : d_ptr(d_ptr) + , bytes(0) + , bin(INVALID_BIN) + , device(device) + , associated_stream(0) + , ready_event(0) {} // Constructor (suitable for searching maps for a range of suitable blocks, given a device) - BlockDescriptor(int device) : - d_ptr(NULL), - bytes(0), - bin(INVALID_BIN), - device(device), - associated_stream(0), - ready_event(0) + BlockDescriptor(int device) + : d_ptr(NULL) + , bytes(0) + , bin(INVALID_BIN) + , device(device) + , associated_stream(0) + , ready_event(0) {} // Comparison functor for comparing device pointers static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { - if (a.device == b.device) - return (a.d_ptr < b.d_ptr); - else - return (a.device < b.device); + if (a.device == b.device) + return (a.d_ptr < b.d_ptr); + else + return (a.device < b.device); } // Comparison functor for comparing allocation sizes @@ -246,27 +258,46 @@ struct CachingDeviceAllocator } } - //--------------------------------------------------------------------- // Fields //--------------------------------------------------------------------- - std::mutex mutex; /// Mutex for thread-safety + /// Mutex for thread-safety + std::mutex mutex; + + /// Geometric growth factor for bin-sizes + unsigned int bin_growth; - unsigned int bin_growth; /// Geometric growth factor for bin-sizes - unsigned int min_bin; /// Minimum bin enumeration - unsigned int max_bin; /// Maximum bin enumeration + /// Minimum bin enumeration + unsigned int min_bin; - size_t min_bin_bytes; /// Minimum bin size - size_t max_bin_bytes; /// Maximum bin size - size_t max_cached_bytes; /// Maximum aggregate cached bytes per device + /// Maximum bin enumeration + unsigned int max_bin; - const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. 
(The CUDA runtime may have already shut down for statically declared allocators) - bool debug; /// Whether or not to print (de)allocation events to stdout + /// Minimum bin size + size_t min_bin_bytes; - GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device - CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse - BusyBlocks live_blocks; /// Set of live device allocations currently in use + /// Maximum bin size + size_t max_bin_bytes; + + /// Maximum aggregate cached bytes per device + size_t max_cached_bytes; + + /// Whether or not to skip a call to FreeAllCached() when destructor is called. + /// (The CUDA runtime may have already shut down for statically declared allocators) + const bool skip_cleanup; + + /// Whether or not to print (de)allocation events to stdout + bool debug; + + /// Map of device ordinal to aggregate cached bytes on that device + GpuCachedBytes cached_bytes; + + /// Set of cached device allocations available for reuse + CachedBlocks cached_blocks; + + /// Set of live device allocations currently in use + BusyBlocks live_blocks; #endif // DOXYGEN_SHOULD_SKIP_THIS @@ -275,38 +306,55 @@ struct CachingDeviceAllocator //--------------------------------------------------------------------- /** - * \brief Constructor. + * @brief Constructor. + * + * @param bin_growth + * Geometric growth factor for bin-sizes + * + * @param min_bin + * Minimum bin (default is bin_growth ^ 1) + * + * @param max_bin + * Maximum bin (default is no max bin) + * + * @param max_cached_bytes + * Maximum aggregate cached bytes per device (default is no limit) + * + * @param skip_cleanup + * Whether or not to skip a call to @p FreeAllCached() when the destructor is called (default + * is to deallocate) + * + * @param debug + * Whether or not to print (de)allocation events to stdout (default is no stderr output) */ - CachingDeviceAllocator( - unsigned int bin_growth, ///< Geometric growth factor for bin-sizes - unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) - unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) - size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) - bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) - bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) - : - bin_growth(bin_growth), - min_bin(min_bin), - max_bin(max_bin), - min_bin_bytes(IntPow(bin_growth, min_bin)), - max_bin_bytes(IntPow(bin_growth, max_bin)), - max_cached_bytes(max_cached_bytes), - skip_cleanup(skip_cleanup), - debug(debug), - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare) + CachingDeviceAllocator(unsigned int bin_growth, + unsigned int min_bin = 1, + unsigned int max_bin = INVALID_BIN, + size_t max_cached_bytes = INVALID_SIZE, + bool skip_cleanup = false, + bool debug = false) + : bin_growth(bin_growth) + , min_bin(min_bin) + , max_bin(max_bin) + , min_bin_bytes(IntPow(bin_growth, min_bin)) + , max_bin_bytes(IntPow(bin_growth, max_bin)) + , max_cached_bytes(max_cached_bytes) + , skip_cleanup(skip_cleanup) + , debug(debug) + , cached_blocks(BlockDescriptor::SizeCompare) + , live_blocks(BlockDescriptor::PtrCompare) {} /** - * \brief Default constructor. + * @brief Default constructor. 
* * Configured with: - * \par - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * @par + * - @p bin_growth = 8 + * - @p min_bin = 3 + * - @p max_bin = 7 + * - @p max_cached_bytes = ( @p bin_growth ^ @p max_bin) * 3 ) - 1 = 6,291,455 bytes * * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and * sets a maximum of 6,291,455 cached bytes per device @@ -329,7 +377,7 @@ struct CachingDeviceAllocator /** - * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. + * @brief Sets the limit on the number bytes this allocator is allowed to cache per device. * * Changing the ceiling of cached bytes does not cause any allocations (in-use or * cached-in-reserve) to be freed. See \p FreeAllCached(). @@ -349,19 +397,29 @@ struct CachingDeviceAllocator return cudaSuccess; } - /** - * \brief Provides a suitable allocation of device memory for the given size on the specified device. + * @brief Provides a suitable allocation of device memory for the given size on the specified + * device. * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. + * Once freed, the allocation becomes available immediately for reuse within the @p + * active_stream with which it was associated with during allocation, and it becomes available + * for reuse within other streams when all prior work submitted to @p active_stream has + * completed. + * + * @param[in] device + * Device on which to place the allocation + * + * @param[out] d_ptr + * Reference to pointer to the allocation + * + * @param[in] bytes + * Minimum number of bytes for the allocation + * + * @param[in] active_stream + * The stream to be associated with this allocation */ - cudaError_t DeviceAllocate( - int device, ///< [in] Device on which to place the allocation - void **d_ptr, ///< [out] Reference to pointer to the allocation - size_t bytes, ///< [in] Minimum number of bytes for the allocation - cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + cudaError_t + DeviceAllocate(int device, void **d_ptr, size_t bytes, cudaStream_t active_stream = 0) { *d_ptr = NULL; int entrypoint_device = INVALID_DEVICE_ORDINAL; @@ -571,29 +629,37 @@ struct CachingDeviceAllocator return error; } - /** - * \brief Provides a suitable allocation of device memory for the given size on the current device. + * @brief Provides a suitable allocation of device memory for the given size on the current + * device. * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. + * Once freed, the allocation becomes available immediately for reuse within the @p + * active_stream with which it was associated with during allocation, and it becomes available + * for reuse within other streams when all prior work submitted to @p active_stream has + * completed. 
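A minimal host-side sketch of the allocate/free cycle that the DeviceAllocate overloads and the constructor above describe (the bin parameters, sizes, and stream are illustrative; error checking is omitted):

    #include <cub/util_allocator.cuh>

    void Example(cudaStream_t stream)
    {
        // Bin sizes follow bin_growth^bin: 8^3 = 512B up to 8^7 = 2MB
        cub::CachingDeviceAllocator allocator(8, 3, 7);

        void *d_buf = nullptr;

        // Rounded up to the 4KB bin (8^4) and associated with `stream`
        allocator.DeviceAllocate(&d_buf, 3000, stream);

        // ... launch kernels on `stream` that read/write d_buf ...

        // Returns the block to the 4KB bin: immediately reusable on `stream`,
        // and reusable on other streams once prior work on `stream` completes
        allocator.DeviceFree(d_buf);

        // Optionally release every cached (unused) block back to the driver
        allocator.FreeAllCached();
    }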
+ * + * @param[out] d_ptr + * Reference to pointer to the allocation + * + * @param[in] bytes + * Minimum number of bytes for the allocation + * + * @param[in] active_stream + * The stream to be associated with this allocation */ - cudaError_t DeviceAllocate( - void **d_ptr, ///< [out] Reference to pointer to the allocation - size_t bytes, ///< [in] Minimum number of bytes for the allocation - cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + cudaError_t DeviceAllocate(void **d_ptr, size_t bytes, cudaStream_t active_stream = 0) { return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); } - /** - * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. + * @brief Frees a live allocation of device memory on the specified device, returning it to the + * allocator. * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. + * Once freed, the allocation becomes available immediately for reuse within the + * @p active_stream with which it was associated with during allocation, and it becomes + * available for reuse within other streams when all prior work submitted to @p active_stream + * has completed. */ cudaError_t DeviceFree( int device, @@ -701,13 +767,14 @@ struct CachingDeviceAllocator return error; } - /** - * \brief Frees a live allocation of device memory on the current device, returning it to the allocator. + * @brief Frees a live allocation of device memory on the current device, returning it to the + * allocator. * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. + * Once freed, the allocation becomes available immediately for reuse within the @p + * active_stream with which it was associated with during allocation, and it becomes available + * for reuse within other streams when all prior work submitted to @p active_stream has + * completed. */ cudaError_t DeviceFree( void* d_ptr) @@ -717,7 +784,7 @@ struct CachingDeviceAllocator /** - * \brief Frees all cached device allocations on all devices + * @brief Frees all cached device allocations on all devices */ cudaError_t FreeAllCached() { @@ -793,7 +860,7 @@ struct CachingDeviceAllocator /** - * \brief Destructor + * @brief Destructor */ virtual ~CachingDeviceAllocator() { diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index d8bb40cdcc4..37df1a5929c 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -566,12 +566,13 @@ CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva) } // namespace detail /** - * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block. + * @brief Computes maximum SM occupancy in thread blocks for executing the given kernel function + * pointer @p kernel_ptr on the current device with @p block_threads per thread block. * - * \par Snippet + * @par Snippet * The code snippet below illustrates the use of the MaxSmOccupancy function. 
- * \par - * \code + * @par + * @code * #include // or equivalently * * template @@ -593,16 +594,25 @@ CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva) * // max_sm_occupancy <-- 8 on SM20 * // max_sm_occupancy <-- 12 on SM35 * - * \endcode + * @endcode * + * @param[out] max_sm_occupancy + * maximum number of thread blocks that can reside on a single SM + * + * @param[in] kernel_ptr + * Kernel pointer for which to compute SM occupancy + * + * @param[in] block_threads + * Number of threads per thread block + * + * @param[in] dynamic_smem_bytes + * Dynamically allocated shared memory in bytes. Default is 0. */ template -CUB_RUNTIME_FUNCTION inline -cudaError_t MaxSmOccupancy( - int& max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM - KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy - int block_threads, ///< [in] Number of threads per thread block - int dynamic_smem_bytes = 0) ///< [in] Dynamically allocated shared memory in bytes. Default is 0. +CUB_RUNTIME_FUNCTION inline cudaError_t MaxSmOccupancy(int &max_sm_occupancy, + KernelPtr kernel_ptr, + int block_threads, + int dynamic_smem_bytes = 0) { return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_sm_occupancy, diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index 4c5d01e5e54..5398adf50b4 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -514,24 +514,29 @@ __device__ __forceinline__ unsigned int LaneMaskGe() /** @} */ // end group UtilPtx - - - /** - * \brief Shuffle-up for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei-src_offset. For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png) - * \ingroup WarpModule + * @brief Shuffle-up for any data type. + * Each warp-lanei obtains the value @p input contributed by + * warp-lanei-src_offset. + * For thread lanes @e i < src_offset, the thread's own @p input is returned to the thread. + * ![](shfl_up_logo.png) * - * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. - * \tparam T [inferred] The input/output element type + * @ingroup WarpModule * - * \par + * @tparam LOGICAL_WARP_THREADS + * The number of threads per "logical" warp. Must be a power-of-two <= 32. + * + * @tparam T + * [inferred] The input/output element type + * + * @par * - Available only for SM3.0 or newer * - * \par Snippet + * @par Snippet * The code snippet below illustrates each thread obtaining a \p double value from the * predecessor of its predecessor. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -542,20 +547,27 @@ __device__ __forceinline__ unsigned int LaneMaskGe() * // Obtain item from two ranks below * double peer_data = ShuffleUp<32>(thread_data, 2, 0, 0xffffffff); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. - * The corresponding output \p peer_data will be {1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}. + * @endcode + * @par + * Suppose the set of input @p thread_data across the first warp of threads is + * {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. The corresponding output @p peer_data will be + * {1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}. 
+ * + * @param[in] input + * The value to broadcast * + * @param[in] src_offset + * The relative down-offset of the peer to read from + * + * @param[in] first_thread + * Index of first lane in logical warp (typically 0) + * + * @param[in] member_mask + * 32-bit mask of participating warp lanes */ -template < - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - typename T> -__device__ __forceinline__ T ShuffleUp( - T input, ///< [in] The value to broadcast - int src_offset, ///< [in] The relative down-offset of the peer to read from - int first_thread, ///< [in] Index of first lane in logical warp (typically 0) - unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +template +__device__ __forceinline__ T +ShuffleUp(T input, int src_offset, int first_thread, unsigned int member_mask) { /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up enum { @@ -584,22 +596,29 @@ __device__ __forceinline__ T ShuffleUp( return output; } - /** - * \brief Shuffle-down for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei+src_offset. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png) - * \ingroup WarpModule + * @brief Shuffle-down for any data type. + * Each warp-lanei obtains the value @p input contributed by + * warp-lanei+src_offset. + * For thread lanes @e i >= WARP_THREADS, the thread's own @p input is returned to the + * thread. ![](shfl_down_logo.png) * - * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. - * \tparam T [inferred] The input/output element type + * @ingroup WarpModule * - * \par + * @tparam LOGICAL_WARP_THREADS + * The number of threads per "logical" warp. Must be a power-of-two <= 32. + * + * @tparam T + * [inferred] The input/output element type + * + * @par * - Available only for SM3.0 or newer * - * \par Snippet - * The code snippet below illustrates each thread obtaining a \p double value from the + * @par Snippet + * The code snippet below illustrates each thread obtaining a @p double value from the * successor of its successor. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -610,20 +629,28 @@ __device__ __forceinline__ T ShuffleUp( * // Obtain item from two ranks below * double peer_data = ShuffleDown<32>(thread_data, 2, 31, 0xffffffff); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. - * The corresponding output \p peer_data will be {3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}. + * @endcode + * @par + * Suppose the set of input @p thread_data across the first warp of threads is + * {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output @p peer_data will be + * {3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}. 
+ * + * @param[in] input + * The value to broadcast + * + * @param[in] src_offset + * The relative up-offset of the peer to read from * + * @param[in] last_thread + * Index of last thread in logical warp (typically 31 for a 32-thread warp) + * + * @param[in] member_mask + * 32-bit mask of participating warp lanes */ -template < - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - typename T> -__device__ __forceinline__ T ShuffleDown( - T input, ///< [in] The value to broadcast - int src_offset, ///< [in] The relative up-offset of the peer to read from - int last_thread, ///< [in] Index of last thread in logical warp (typically 31 for a 32-thread warp) - unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +template +__device__ __forceinline__ T +ShuffleDown(T input, int src_offset, int last_thread, unsigned int member_mask) { /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up enum { @@ -652,25 +679,31 @@ __device__ __forceinline__ T ShuffleDown( return output; } - /** - * \brief Shuffle-broadcast for any data type. Each warp-lanei obtains the value \p input - * contributed by warp-lanesrc_lane. For \p src_lane < 0 or \p src_lane >= WARP_THREADS, - * then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) + * @brief Shuffle-broadcast for any data type. + * Each warp-lanei obtains the value @p input + * contributed by warp-lanesrc_lane. + * For @p src_lane < 0 or @p src_lane >= WARP_THREADS, + * then the thread's own @p input is returned to the thread. + * ![](shfl_broadcast_logo.png) * - * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. - * \tparam T [inferred] The input/output element type + * @tparam LOGICAL_WARP_THREADS + * The number of threads per "logical" warp. Must be a power-of-two <= 32. * - * \ingroup WarpModule + * @tparam T + * [inferred] The input/output element type * - * \par + * @ingroup WarpModule + * + * @par * - Available only for SM3.0 or newer * - * \par Snippet - * The code snippet below illustrates each thread obtaining a \p double value from warp-lane0. + * @par Snippet + * The code snippet below illustrates each thread obtaining a @p double value from + * warp-lane0. * - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -681,19 +714,24 @@ __device__ __forceinline__ T ShuffleDown( * // Obtain item from thread 0 * double peer_data = ShuffleIndex<32>(thread_data, 0, 0xffffffff); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. - * The corresponding output \p peer_data will be {1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}. + * @endcode + * @par + * Suppose the set of input @p thread_data across the first warp of threads is + * {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output @p peer_data will be + * {1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}. 
+ * + * @param[in] input + * The value to broadcast + * + * @param[in] src_lane + * Which warp lane is to do the broadcasting * + * @param[in] member_mask + * 32-bit mask of participating warp lanes */ -template < - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - typename T> -__device__ __forceinline__ T ShuffleIndex( - T input, ///< [in] The value to broadcast - int src_lane, ///< [in] Which warp lane is to do the broadcasting - unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +template +__device__ __forceinline__ T ShuffleIndex(T input, int src_lane, unsigned int member_mask) { /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up enum { diff --git a/cub/cub/util_temporary_storage.cuh b/cub/cub/util_temporary_storage.cuh index 17548600a47..c10830c3457 100644 --- a/cub/cub/util_temporary_storage.cuh +++ b/cub/cub/util_temporary_storage.cuh @@ -48,22 +48,36 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilMgmt + * @addtogroup UtilMgmt * @{ */ #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** - * \brief Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). + * @brief Alias temporaries to externally-allocated device storage (or simply return the amount of + * storage needed). + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to @p temp_storage_bytes and no work is + * done. + * + * @param[in,out] temp_storage_bytes + * Size in bytes of @p d_temp_storage allocation + * + * @param[in,out] allocations + * Pointers to device allocations needed + * + * @param[in] allocation_sizes + * Sizes in bytes of device allocations needed */ template -__host__ __device__ __forceinline__ -cudaError_t AliasTemporaries( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation - void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed - size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed +__host__ __device__ __forceinline__ cudaError_t +AliasTemporaries(void *d_temp_storage, + size_t &temp_storage_bytes, + void *(&allocations)[ALLOCATIONS], + size_t (&allocation_sizes)[ALLOCATIONS]) { constexpr int ALIGN_BYTES = 256; constexpr int ALIGN_MASK = ~(ALIGN_BYTES - 1); diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh index dd0e82bf0b2..965e0df9035 100644 --- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. */ @@ -82,16 +82,20 @@ struct reduce_max_exists : ::cu } - /** - * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. + * @brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned + * across a CUDA thread warp. 
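The AliasTemporaries helper in the util_temporary_storage.cuh hunk above is the workhorse behind CUB's usual two-phase temporary-storage convention; a minimal sketch of that convention, with hypothetical allocation sizes and names, looks like this:

    #include <cub/util_temporary_storage.cuh>

    // Hypothetical dispatch routine that needs two scratch arrays
    cudaError_t DispatchExample(void *d_temp_storage, size_t &temp_storage_bytes, int num_items)
    {
        void  *allocations[2]      = {nullptr, nullptr};
        size_t allocation_sizes[2] = {num_items * sizeof(int),    // e.g. per-tile aggregates
                                      num_items * sizeof(char)};  // e.g. per-item flags

        // Pass 1 (d_temp_storage == NULL): only writes the required temp_storage_bytes.
        // Pass 2: carves the caller's buffer into the two 256B-aligned allocations.
        cudaError_t error =
            cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);

        if (error != cudaSuccess || d_temp_storage == NULL)
        {
            return error;
        }

        int  *d_aggregates = static_cast<int *>(allocations[0]);
        char *d_flags      = static_cast<char *>(allocations[1]);
        // ... launch kernels that use d_aggregates / d_flags ...
        (void)d_aggregates;
        (void)d_flags;

        return cudaSuccess;
    }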
+ * + * @tparam T + * Data type being reduced * - * LOGICAL_WARP_THREADS must be a power-of-two + * @tparam LOGICAL_WARP_THREADS + * Number of threads per logical warp (must be a power-of-two) + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, ///< Data type being reduced - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct WarpReduceShfl { static_assert(PowerOfTwo::VALUE, @@ -167,12 +171,23 @@ struct WarpReduceShfl // Reduction steps //--------------------------------------------------------------------- - /// Reduction (specialized for summation across uint32 types) - __device__ __forceinline__ unsigned int ReduceStep( - unsigned int input, ///< [in] Calling thread's input item. - cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Reduction (specialized for summation across uint32 types) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ unsigned int + ReduceStep(unsigned int input, cub::Sum /*reduction_op*/, int last_lane, int offset) { unsigned int output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) @@ -191,13 +206,23 @@ struct WarpReduceShfl return output; } - - /// Reduction (specialized for summation across fp32 types) - __device__ __forceinline__ float ReduceStep( - float input, ///< [in] Calling thread's input item. - cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Reduction (specialized for summation across fp32 types) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ float + ReduceStep(float input, cub::Sum /*reduction_op*/, int last_lane, int offset) { float output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) @@ -216,13 +241,23 @@ struct WarpReduceShfl return output; } - - /// Reduction (specialized for summation across unsigned long long types) - __device__ __forceinline__ unsigned long long ReduceStep( - unsigned long long input, ///< [in] Calling thread's input item. 
- cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Reduction (specialized for summation across unsigned long long types) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ unsigned long long + ReduceStep(unsigned long long input, cub::Sum /*reduction_op*/, int last_lane, int offset) { unsigned long long output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) @@ -243,13 +278,23 @@ struct WarpReduceShfl return output; } - - /// Reduction (specialized for summation across long long types) - __device__ __forceinline__ long long ReduceStep( - long long input, ///< [in] Calling thread's input item. - cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Reduction (specialized for summation across long long types) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ long long + ReduceStep(long long input, cub::Sum /*reduction_op*/, int last_lane, int offset) { long long output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) @@ -271,13 +316,23 @@ struct WarpReduceShfl return output; } - - /// Reduction (specialized for summation across double types) - __device__ __forceinline__ double ReduceStep( - double input, ///< [in] Calling thread's input item. - cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Reduction (specialized for summation across double types) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ double + ReduceStep(double input, cub::Sum /*reduction_op*/, int last_lane, int offset) { double output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) @@ -301,14 +356,28 @@ struct WarpReduceShfl return output; } - - /// Reduction (specialized for swizzled ReduceByKeyOp across KeyValuePair types) + /** + * @brief Reduction (specialized for swizzled ReduceByKeyOp across + * KeyValuePair types) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ template - __device__ __forceinline__ KeyValuePair ReduceStep( - KeyValuePair input, ///< [in] Calling thread's input item. 
- SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + __device__ __forceinline__ KeyValuePair + ReduceStep(KeyValuePair input, + SwizzleScanOp> /*reduction_op*/, + int last_lane, + int offset) { KeyValuePair output; @@ -328,15 +397,28 @@ struct WarpReduceShfl return output; } - - - /// Reduction (specialized for swizzled ReduceBySegmentOp across KeyValuePair types) + /** + * @brief Reduction (specialized for swizzled ReduceBySegmentOp across + * KeyValuePair types) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ template - __device__ __forceinline__ KeyValuePair ReduceStep( - KeyValuePair input, ///< [in] Calling thread's input item. - SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + __device__ __forceinline__ KeyValuePair + ReduceStep(KeyValuePair input, + SwizzleScanOp> /*reduction_op*/, + int last_lane, + int offset) { KeyValuePair output; @@ -349,14 +431,24 @@ struct WarpReduceShfl return output; } - - /// Reduction step (generic) + /** + * @brief Reduction step (generic) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ template - __device__ __forceinline__ _T ReduceStep( - _T input, ///< [in] Calling thread's input item. - ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + __device__ __forceinline__ _T + ReduceStep(_T input, ReductionOp reduction_op, int last_lane, int offset) { _T output = input; @@ -369,28 +461,59 @@ struct WarpReduceShfl return output; } - - /// Reduction step (specialized for small unsigned integers size 32b or less) + /** + * @brief Reduction step (specialized for small unsigned integers size 32b or less) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + * + * @param[in] is_small_unsigned + * Marker type indicating whether T is a small unsigned integer + */ template - __device__ __forceinline__ _T ReduceStep( - _T input, ///< [in] Calling thread's input item. - ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer + __device__ __forceinline__ _T ReduceStep(_T input, + ReductionOp reduction_op, + int last_lane, + int offset, + Int2Type /*is_small_unsigned*/) { return ReduceStep(input, reduction_op, last_lane, offset); } - - /// Reduction step (specialized for types other than small unsigned integers size 32b or less) + /** + * @brief Reduction step (specialized for types other than small unsigned integers size + * 32b or less) + * + * @param[in] input + * Calling thread's input item. 
+ * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + * + * @param[in] is_small_unsigned + * Marker type indicating whether T is a small unsigned integer + */ template - __device__ __forceinline__ _T ReduceStep( - _T input, ///< [in] Calling thread's input item. - ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer + __device__ __forceinline__ _T ReduceStep(_T input, + ReductionOp reduction_op, + int last_lane, + int offset, + Int2Type /*is_small_unsigned*/) { return ReduceStep(input, reduction_op, last_lane, offset); } @@ -400,36 +523,62 @@ struct WarpReduceShfl // Templated reduction iteration //--------------------------------------------------------------------- + /** + * @param[in] input + * Calling thread's input item. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + */ template - __device__ __forceinline__ void ReduceStep( - T& input, ///< [in] Calling thread's input item. - ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - Int2Type /*step*/) + __device__ __forceinline__ void + ReduceStep(T &input, ReductionOp reduction_op, int last_lane, Int2Type /*step*/) { input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); ReduceStep(input, reduction_op, last_lane, Int2Type()); } + /** + * @param[in] input + * Calling thread's input item. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + */ template - __device__ __forceinline__ void ReduceStep( - T& /*input*/, ///< [in] Calling thread's input item. 
- ReductionOp /*reduction_op*/, ///< [in] Binary reduction operator - int /*last_lane*/, ///< [in] Index of last lane in segment - Int2Type /*step*/) + __device__ __forceinline__ void ReduceStep(T & /*input*/, + ReductionOp /*reduction_op*/, + int /*last_lane*/, + Int2Type /*step*/) {} //--------------------------------------------------------------------- // Reduction operations //--------------------------------------------------------------------- + + /** + * @param[in] input + * Calling thread's input + * + * @param[in] valid_items + * Total number of valid items across the logical warp + * + * @param[in] reduction_op + * Binary reduction operator + */ template - __device__ __forceinline__ T ReduceImpl( - Int2Type<0> /* all_lanes_valid */, - T input, ///< [in] Calling thread's input - int valid_items, ///< [in] Total number of valid items across the logical warp - ReductionOp reduction_op) ///< [in] Binary reduction operator + __device__ __forceinline__ T ReduceImpl(Int2Type<0> /* all_lanes_valid */, + T input, + int valid_items, + ReductionOp reduction_op) { int last_lane = valid_items - 1; @@ -441,12 +590,21 @@ struct WarpReduceShfl return output; } + /** + * @param[in] input + * Calling thread's input + * + * @param[in] valid_items + * Total number of valid items across the logical warp + * + * @param[in] reduction_op + * Binary reduction operator + */ template - __device__ __forceinline__ T ReduceImpl( - Int2Type<1> /* all_lanes_valid */, - T input, ///< [in] Calling thread's input - int /* valid_items */, ///< [in] Total number of valid items across the logical warp - ReductionOp reduction_op) ///< [in] Binary reduction operator + __device__ __forceinline__ T ReduceImpl(Int2Type<1> /* all_lanes_valid */, + T input, + int /* valid_items */, + ReductionOp reduction_op) { int last_lane = LOGICAL_WARP_THREADS - 1; @@ -524,29 +682,45 @@ struct WarpReduceShfl return output; } - /// Reduction - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - int valid_items, ///< [in] Total number of valid items across the logical warp - ReductionOp reduction_op) ///< [in] Binary reduction operator + /** + * @brief Reduction + * + * @tparam ALL_LANES_VALID + * Whether all lanes in each warp are contributing a valid fold of items + * + * @param[in] input + * Calling thread's input + * + * @param[in] valid_items + * Total number of valid items across the logical warp + * + * @param[in] reduction_op + * Binary reduction operator + */ + template + __device__ __forceinline__ T Reduce(T input, int valid_items, ReductionOp reduction_op) { return ReduceImpl( Int2Type{}, input, valid_items, reduction_op); } - - /// Segmented reduction - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op) ///< [in] Binary reduction operator + /** + * @brief Segmented reduction + * + * @tparam HEAD_SEGMENTED + * Whether flags indicate a segment-head or a segment-tail + * + * @param[in] input + * Calling thread's input + * + * @param[in] flag + * Whether or not the current lane is a segment head/tail + * + * @param[in] reduction_op + * Binary reduction operator + */ + template + 
__device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op) { // Get the start flags for each thread in the warp. int warp_flags = WARP_BALLOT(flag, member_mask); diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index a2077940c7b..242f35d64da 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. + * @file + * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned + * across a CUDA thread warp. */ #pragma once @@ -49,12 +50,19 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. + * @brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned + * across a CUDA thread warp. + * + * @tparam T + * Data type being reduced + * + * @tparam LOGICAL_WARP_THREADS + * Number of threads per logical warp + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, ///< Data type being reduced - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct WarpReduceSmem { /****************************************************************************** @@ -128,17 +136,23 @@ struct WarpReduceSmem //--------------------------------------------------------------------- /** - * Reduction step + * @brief Reduction step + * + * @tparam ALL_LANES_VALID + * Whether all lanes in each warp are contributing a valid fold of items + * + * @param[in] input + * Calling thread's input + * + * @param[in] valid_items + * Total number of valid items across the logical warp + * + * @param[in] reduction_op + * Reduction operator */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - typename ReductionOp, - int STEP> - __device__ __forceinline__ T ReduceStep( - T input, ///< [in] Calling thread's input - int valid_items, ///< [in] Total number of valid items across the logical warp - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type /*step*/) + template + __device__ __forceinline__ T + ReduceStep(T input, int valid_items, ReductionOp reduction_op, Int2Type /*step*/) { constexpr int OFFSET = 1 << STEP; @@ -159,18 +173,24 @@ struct WarpReduceSmem return ReduceStep(input, valid_items, reduction_op, Int2Type()); } - /** - * Reduction step (terminate) + * @brief Reduction step (terminate) + * + * @tparam ALL_LANES_VALID + * Whether all lanes in each warp are contributing a valid fold of items + * + * @param[in] input + * Calling thread's input + * + * @param[in] valid_items + * Total number of valid items across the logical warp + * + * @param[in] reduction_op + * Reduction operator */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - typename ReductionOp> - __device__ __forceinline__ T ReduceStep( - T input, ///< [in] Calling thread's input - int valid_items, ///< [in] Total number of valid items across the 
logical warp - ReductionOp /*reduction_op*/, ///< [in] Reduction operator - Int2Type /*step*/) + template + __device__ __forceinline__ T + ReduceStep(T input, int valid_items, ReductionOp /*reduction_op*/, Int2Type /*step*/) { return input; } @@ -180,19 +200,27 @@ struct WarpReduceSmem // Segmented reduction //--------------------------------------------------------------------- - /** - * Ballot-based segmented reduce + * @brief Ballot-based segmented reduce + * + * @tparam HEAD_SEGMENTED + * Whether flags indicate a segment-head or a segment-tail + * + * @param[in] input + * Calling thread's input + * + * @param[in] flag + * Whether or not the current lane is a segment head/tail + * + * @param[in] reduction_op + * Reduction operator + * + * @param[in] has_ballot + * Marker type for whether the target arch has ballot functionality */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + template + __device__ __forceinline__ T + SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type /*has_ballot*/) { // Get the start flags for each thread in the warp. int warp_flags = WARP_BALLOT(flag, member_mask); @@ -239,19 +267,27 @@ struct WarpReduceSmem return input; } - /** - * Smem-based segmented reduce + * @brief Smem-based segmented reduce + * + * @tparam HEAD_SEGMENTED + * Whether flags indicate a segment-head or a segment-tail + * + * @param[in] input + * Calling thread's input + * + * @param[in] flag + * Whether or not the current lane is a segment head/tail + * + * @param[in] reduction_op + * Reduction operator + * + * @param[in] has_ballot + * Marker type for whether the target arch has ballot functionality */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + template + __device__ __forceinline__ T + SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type /*has_ballot*/) { enum { @@ -331,31 +367,43 @@ struct WarpReduceSmem ******************************************************************************/ /** - * Reduction + * @brief Reduction + * + * @tparam ALL_LANES_VALID + * Whether all lanes in each warp are contributing a valid fold of items + * + * @param[in] input + * Calling thread's input + * + * @param[in] valid_items + * Total number of valid items across the logical warp + * + * @param[in] reduction_op + * Reduction operator */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - int valid_items, ///< [in] Total number of valid items across the logical warp - ReductionOp reduction_op) ///< [in] Reduction operator + template + __device__ __forceinline__ T Reduce(T 
input, int valid_items, ReductionOp reduction_op) { return ReduceStep(input, valid_items, reduction_op, Int2Type<0>()); } - /** - * Segmented reduction + * @brief Segmented reduction + * + * @tparam HEAD_SEGMENTED + * Whether flags indicate a segment-head or a segment-tail + * + * @param[in] input + * Calling thread's input + * + * @param[in] flag + * Whether or not the current lane is a segment head/tail + * + * @param[in] reduction_op + * Reduction operator */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op) ///< [in] Reduction operator + template + __device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op) { return SegmentedReduce(input, flag, reduction_op, Int2Type()); } diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh index 81db566c184..85550412d9f 100644 --- a/cub/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + * @file + * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned + * across a CUDA thread warp. */ #pragma once @@ -48,14 +49,19 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + * @brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned + * across a CUDA thread warp. * - * LOGICAL_WARP_THREADS must be a power-of-two + * @tparam T + * Data type being scanned + * + * @tparam LOGICAL_WARP_THREADS + * Number of threads per logical warp (must be a power-of-two) + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, ///< Data type being scanned - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct WarpScanShfl { //--------------------------------------------------------------------- @@ -122,12 +128,23 @@ struct WarpScanShfl // Inclusive scan steps //--------------------------------------------------------------------- - /// Inclusive prefix scan step (specialized for summation across int32 types) - __device__ __forceinline__ int InclusiveScanStep( - int input, ///< [in] Calling thread's input item. - cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Inclusive prefix scan step (specialized for summation across int32 types) + * + * @param[in] input + * Calling thread's input item. 
+ * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ int + InclusiveScanStep(int input, cub::Sum /*scan_op*/, int first_lane, int offset) { int output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) @@ -146,12 +163,23 @@ struct WarpScanShfl return output; } - /// Inclusive prefix scan step (specialized for summation across uint32 types) - __device__ __forceinline__ unsigned int InclusiveScanStep( - unsigned int input, ///< [in] Calling thread's input item. - cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Inclusive prefix scan step (specialized for summation across uint32 types) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ unsigned int + InclusiveScanStep(unsigned int input, cub::Sum /*scan_op*/, int first_lane, int offset) { unsigned int output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) @@ -170,13 +198,23 @@ struct WarpScanShfl return output; } - - /// Inclusive prefix scan step (specialized for summation across fp32 types) - __device__ __forceinline__ float InclusiveScanStep( - float input, ///< [in] Calling thread's input item. - cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Inclusive prefix scan step (specialized for summation across fp32 types) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ float + InclusiveScanStep(float input, cub::Sum /*scan_op*/, int first_lane, int offset) { float output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) @@ -195,13 +233,23 @@ struct WarpScanShfl return output; } - - /// Inclusive prefix scan step (specialized for summation across unsigned long long types) - __device__ __forceinline__ unsigned long long InclusiveScanStep( - unsigned long long input, ///< [in] Calling thread's input item. 
- cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Inclusive prefix scan step (specialized for summation across unsigned long long types) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ unsigned long long + InclusiveScanStep(unsigned long long input, cub::Sum /*scan_op*/, int first_lane, int offset) { unsigned long long output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) @@ -225,13 +273,23 @@ struct WarpScanShfl return output; } - - /// Inclusive prefix scan step (specialized for summation across long long types) - __device__ __forceinline__ long long InclusiveScanStep( - long long input, ///< [in] Calling thread's input item. - cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Inclusive prefix scan step (specialized for summation across long long types) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ long long + InclusiveScanStep(long long input, cub::Sum /*scan_op*/, int first_lane, int offset) { long long output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) @@ -255,13 +313,23 @@ struct WarpScanShfl return output; } - - /// Inclusive prefix scan step (specialized for summation across fp64 types) - __device__ __forceinline__ double InclusiveScanStep( - double input, ///< [in] Calling thread's input item. - cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Inclusive prefix scan step (specialized for summation across fp64 types) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ double + InclusiveScanStep(double input, cub::Sum /*scan_op*/, int first_lane, int offset) { double output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) @@ -307,13 +375,24 @@ struct WarpScanShfl } */ - /// Inclusive prefix scan step (generic) + /** + * @brief Inclusive prefix scan step (generic) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ template - __device__ __forceinline__ _T InclusiveScanStep( - _T input, ///< [in] Calling thread's input item. 
- ScanOpT scan_op, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from + __device__ __forceinline__ _T + InclusiveScanStep(_T input, ScanOpT scan_op, int first_lane, int offset) { _T temp = ShuffleUp(input, offset, first_lane, member_mask); @@ -325,28 +404,59 @@ struct WarpScanShfl return output; } - - /// Inclusive prefix scan step (specialized for small integers size 32b or less) + /** + * @brief Inclusive prefix scan step (specialized for small integers size 32b or less) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + * + * @param[in] is_small_unsigned + * Marker type indicating whether T is a small integer + */ template - __device__ __forceinline__ _T InclusiveScanStep( - _T input, ///< [in] Calling thread's input item. - ScanOpT scan_op, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer + __device__ __forceinline__ _T InclusiveScanStep(_T input, + ScanOpT scan_op, + int first_lane, + int offset, + Int2Type /*is_small_unsigned*/) { return InclusiveScanStep(input, scan_op, first_lane, offset); } - - /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less) + /** + * @brief Inclusive prefix scan step (specialized for types other than small integers size + * 32b or less) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + * + * @param[in] is_small_unsigned + * Marker type indicating whether T is a small integer + */ template - __device__ __forceinline__ _T InclusiveScanStep( - _T input, ///< [in] Calling thread's input item. - ScanOpT scan_op, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer + __device__ __forceinline__ _T InclusiveScanStep(_T input, + ScanOpT scan_op, + int first_lane, + int offset, + Int2Type /*is_small_unsigned*/) { return InclusiveScanStep(input, scan_op, first_lane, offset); } @@ -360,10 +470,16 @@ struct WarpScanShfl // Broadcast //--------------------------------------------------------------------- - /// Broadcast - __device__ __forceinline__ T Broadcast( - T input, ///< [in] The value to broadcast - int src_lane) ///< [in] Which warp lane is to do the broadcasting + /** + * @brief Broadcast + * + * @param[in] input + * The value to broadcast + * + * @param[in] src_lane + * Which warp lane is to do the broadcasting + */ + __device__ __forceinline__ T Broadcast(T input, int src_lane) { return ShuffleIndex(input, src_lane, member_mask); } @@ -373,12 +489,20 @@ struct WarpScanShfl // Inclusive operations //--------------------------------------------------------------------- - /// Inclusive scan + /** + * @brief Inclusive scan + * + * @param[in] input + * Calling thread's input item + * + * @param[out] inclusive_output + * Calling thread's output item. 
May be aliased with @p input + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void InclusiveScan( - _T input, ///< [in] Calling thread's input item. - _T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOpT scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void InclusiveScan(_T input, _T &inclusive_output, ScanOpT scan_op) { inclusive_output = input; @@ -399,12 +523,22 @@ struct WarpScanShfl } - /// Inclusive scan, specialized for reduce-value-by-key + /** + * @brief Inclusive scan, specialized for reduce-value-by-key + * + * @param[in] input + * Calling thread's input item + * + * @param[out] inclusive_output + * Calling thread's output item. May be aliased with @p input + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void InclusiveScan( - KeyValuePair input, ///< [in] Calling thread's input item. - KeyValuePair &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ReduceByKeyOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void InclusiveScan(KeyValuePair input, + KeyValuePair &inclusive_output, + ReduceByKeyOp scan_op) { inclusive_output = input; @@ -431,14 +565,24 @@ struct WarpScanShfl } } - - /// Inclusive scan with aggregate + /** + * @brief Inclusive scan with aggregate + * + * @param[in] input + * Calling thread's input item + * + * @param[out] inclusive_output + * Calling thread's output item. May be aliased with @p input + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] warp_aggregate + * Warp-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOpT scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
+ __device__ __forceinline__ void + InclusiveScan(T input, T &inclusive_output, ScanOpT scan_op, T &warp_aggregate) { InclusiveScan(input, inclusive_output, scan_op); @@ -451,20 +595,31 @@ struct WarpScanShfl // Get exclusive from inclusive //--------------------------------------------------------------------- - /// Update inclusive and exclusive using input and inclusive + /** + * @brief Update inclusive and exclusive using input and inclusive + * + * @param[in] input + * + * @param[out] inclusive + * + * @param[out] exclusive + * + * @param[in] scan_op + * + * @param[in] is_integer + */ template - __device__ __forceinline__ void Update( - T /*input*/, ///< [in] - T &inclusive, ///< [in, out] - T &exclusive, ///< [out] - ScanOpT /*scan_op*/, ///< [in] - IsIntegerT /*is_integer*/) ///< [in] + __device__ __forceinline__ void + Update(T /*input*/, T &inclusive, T &exclusive, ScanOpT /*scan_op*/, IsIntegerT /*is_integer*/) { // initial value unknown exclusive = ShuffleUp(inclusive, 1, 0, member_mask); } - /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) + /** + * @brief Update inclusive and exclusive using input and inclusive (specialized for summation of + * integer types) + */ __device__ __forceinline__ void Update( T input, T &inclusive, @@ -476,7 +631,10 @@ struct WarpScanShfl exclusive = inclusive - input; } - /// Update inclusive and exclusive using initial value using input, inclusive, and initial value + /** + * @brief Update inclusive and exclusive using initial value using input, inclusive, and initial + * value + */ template __device__ __forceinline__ void Update ( T /*input*/, @@ -493,7 +651,10 @@ struct WarpScanShfl exclusive = initial_value; } - /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) + /** + * @brief Update inclusive and exclusive using initial value using input and inclusive + * (specialized for summation of integer types) + */ __device__ __forceinline__ void Update ( T input, T &inclusive, @@ -506,8 +667,9 @@ struct WarpScanShfl exclusive = inclusive - input; } - - /// Update inclusive, exclusive, and warp aggregate using input and inclusive + /** + * @brief Update inclusive, exclusive, and warp aggregate using input and inclusive + */ template __device__ __forceinline__ void Update ( T input, @@ -521,7 +683,10 @@ struct WarpScanShfl Update(input, inclusive, exclusive, scan_op, is_integer); } - /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value + /** + * @brief Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial + * value + */ template __device__ __forceinline__ void Update ( T input, diff --git a/cub/cub/warp/specializations/warp_scan_smem.cuh b/cub/cub/warp/specializations/warp_scan_smem.cuh index 8f76b3c6253..fb90fe06992 100644 --- a/cub/cub/warp/specializations/warp_scan_smem.cuh +++ b/cub/cub/warp/specializations/warp_scan_smem.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + * @file + * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned + * across a CUDA thread warp. 
 */
 
 #pragma once
 
@@ -49,12 +50,19 @@ _CCCL_IMPLICIT_SYSTEM_HEADER
 CUB_NAMESPACE_BEGIN
 
 /**
- * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ * @brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned
+ * across a CUDA thread warp.
+ *
+ * @tparam T
+ * Data type being scanned
+ *
+ * @tparam LOGICAL_WARP_THREADS
+ * Number of threads per logical warp
+ *
+ * @tparam LEGACY_PTX_ARCH
+ * The PTX compute capability for which to specialize this collective
 */
-template <
- typename T, ///< Data type being scanned
- int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp
- int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective
+template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
 struct WarpScanSmem
 {
 /******************************************************************************
@@ -157,13 +165,23 @@ struct WarpScanSmem
 Int2Type /*step*/)
 {}
 
-
- /// Inclusive prefix scan (specialized for summation across primitive types)
- __device__ __forceinline__ void InclusiveScan(
- T input, ///< [in] Calling thread's input item.
- T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
- Sum scan_op, ///< [in] Binary scan operator
- Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type
+ /**
+ * @brief Inclusive prefix scan (specialized for summation across primitive types)
+ *
+ * @param[in] input
+ * Calling thread's input item
+ *
+ * @param[out] output
+ * Calling thread's output item. May be aliased with @p input
+ *
+ * @param[in] scan_op
+ * Binary scan operator
+ *
+ * @param[in] is_primitive
+ * Marker type indicating whether T is primitive type
+ */
+ __device__ __forceinline__ void
+ InclusiveScan(T input, T &output, Sum scan_op, Int2Type /*is_primitive*/)
 {
 T identity = 0;
 ThreadStore(&temp_storage[lane_id], (CellT) identity);
@@ -175,14 +193,24 @@ struct WarpScanSmem
 ScanStep(output, scan_op, Int2Type<0>());
 }
 
-
- /// Inclusive prefix scan
+ /**
+ * @brief Inclusive prefix scan
+ *
+ * @param[in] input
+ * Calling thread's input item
+ *
+ * @param[out] output
+ * Calling thread's output item. May be aliased with @p input
+ *
+ * @param[in] scan_op
+ * Binary scan operator
+ *
+ * @param[in] is_primitive
+ * Marker type indicating whether T is primitive type
+ */
 template
- __device__ __forceinline__ void InclusiveScan(
- T input, ///< [in] Calling thread's input item.
- T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
- ScanOp scan_op, ///< [in] Binary scan operator - Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type + __device__ __forceinline__ void + InclusiveScan(T input, T &output, ScanOp scan_op, Int2Type /*is_primitive*/) { // Iterate scan steps output = input; @@ -198,10 +226,16 @@ struct WarpScanSmem // Broadcast //--------------------------------------------------------------------- - /// Broadcast - __device__ __forceinline__ T Broadcast( - T input, ///< [in] The value to broadcast - unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting + /** + * @brief Broadcast + * + * @param[in] input + * The value to broadcast + * + * @param[in] src_lane + * Which warp lane is to do the broadcasting + */ + __device__ __forceinline__ T Broadcast(T input, unsigned int src_lane) { if (lane_id == src_lane) { @@ -218,24 +252,42 @@ struct WarpScanSmem // Inclusive operations //--------------------------------------------------------------------- - /// Inclusive scan + /** + * @brief Inclusive scan + * + * @param[in] input + * Calling thread's input item. + * + * @param[out] inclusive_output + * Calling thread's output item. May be aliased with @p input + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op) { InclusiveScan(input, inclusive_output, scan_op, Int2Type::PRIMITIVE>()); } - - /// Inclusive scan with aggregate + /** + * @brief Inclusive scan with aggregate + * + * @param[in] input + * Calling thread's input item + * + * @param[out] inclusive_output + * Calling thread's output item. May be aliased with @p input + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] warp_aggregate + * Warp-wide aggregate reduction of input items. + */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
+ __device__ __forceinline__ void + InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, T &warp_aggregate) { InclusiveScan(input, inclusive_output, scan_op); @@ -254,14 +306,22 @@ struct WarpScanSmem // Get exclusive from inclusive //--------------------------------------------------------------------- - /// Update inclusive and exclusive using input and inclusive + /** + * @brief Update inclusive and exclusive using input and inclusive + * + * @param[in] input + * + * @param[in, out] inclusive + * + * @param[out] exclusive + * + * @param[in] scan_op + * + * @param[in] is_integer + */ template - __device__ __forceinline__ void Update( - T /*input*/, ///< [in] - T &inclusive, ///< [in, out] - T &exclusive, ///< [out] - ScanOpT /*scan_op*/, ///< [in] - IsIntegerT /*is_integer*/) ///< [in] + __device__ __forceinline__ void + Update(T /*input*/, T &inclusive, T &exclusive, ScanOpT /*scan_op*/, IsIntegerT /*is_integer*/) { // initial value unknown ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); @@ -271,7 +331,10 @@ struct WarpScanSmem exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); } - /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) + /** + * @brief Update inclusive and exclusive using input and inclusive (specialized for summation of + * integer types) + */ __device__ __forceinline__ void Update( T input, T &inclusive, @@ -283,7 +346,10 @@ struct WarpScanSmem exclusive = inclusive - input; } - /// Update inclusive and exclusive using initial value using input, inclusive, and initial value + /** + * @brief Update inclusive and exclusive using initial value using input, inclusive, and initial + * value + */ template __device__ __forceinline__ void Update ( T /*input*/, @@ -303,7 +369,10 @@ struct WarpScanSmem exclusive = initial_value; } - /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) + /** + * @brief Update inclusive and exclusive using initial value using input and inclusive + * (specialized for summation of integer types) + */ __device__ __forceinline__ void Update ( T input, T &inclusive, @@ -316,8 +385,9 @@ struct WarpScanSmem exclusive = inclusive - input; } - - /// Update inclusive, exclusive, and warp aggregate using input and inclusive + /** + * @brief Update inclusive, exclusive, and warp aggregate using input and inclusive + */ template __device__ __forceinline__ void Update ( T /*input*/, @@ -336,7 +406,10 @@ struct WarpScanSmem warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); } - /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types) + /** + * @brief Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized + * for summation of integer types) + */ __device__ __forceinline__ void Update ( T input, T &inclusive, @@ -354,7 +427,10 @@ struct WarpScanSmem exclusive = inclusive - input; } - /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value + /** + * @brief Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial + * value + */ template __device__ __forceinline__ void Update ( T /*input*/,
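For orientation while reviewing the comment changes above: the InclusiveScanStep overloads in cub::WarpScanShfl each document one Kogge-Stone step of a shuffle-based warp scan. The sketch below shows that pattern written with plain __shfl_up_sync; it is illustrative only and not part of this patch, the kernel and buffer names (warp_inclusive_sum, d_in, d_out) are made up, and integer addition stands in for a generic scan_op.

#include <cstdio>
#include <cuda_runtime.h>

// Inclusive prefix sum across one 32-thread warp (Kogge-Stone over shuffles).
__global__ void warp_inclusive_sum(const int *in, int *out)
{
    const unsigned full_mask = 0xffffffffu;   // all 32 lanes participate
    int lane  = threadIdx.x & 31;
    int value = in[threadIdx.x];

    // log2(32) = 5 steps; each step pulls the partial from 'offset' lanes below.
    for (int offset = 1; offset < 32; offset <<= 1)
    {
        int up = __shfl_up_sync(full_mask, value, offset);
        if (lane >= offset)   // lanes below 'offset' have no source lane
        {
            value += up;      // a generic scan_op would replace '+' here
        }
    }
    out[threadIdx.x] = value; // lane i now holds in[0] + ... + in[i]
}

int main()
{
    int h_in[32], h_out[32];
    for (int i = 0; i < 32; ++i) { h_in[i] = 1; }

    int *d_in = nullptr, *d_out = nullptr;
    cudaMalloc(&d_in,  sizeof(h_in));
    cudaMalloc(&d_out, sizeof(h_out));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

    warp_inclusive_sum<<<1, 32>>>(d_in, d_out);
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);

    printf("lane 31 prefix sum = %d (expected 32)\n", h_out[31]);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}

The per-type InclusiveScanStep specializations documented above reach the same result without the explicit lane guard: the shfl_c control word (first_lane | SHFL_C) lets the shuffle itself report whether a source lane exists within the segment, and that predicate guards the add in the inline PTX instead of the branch used in this sketch.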