From 97dcbe1aa6f06c068fb1be72274b07725595fcdb Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Wed, 11 Oct 2023 21:23:34 +0000 Subject: [PATCH] Refactor inline comments --- cub/cub/agent/agent_histogram.cuh | 320 ++- cub/cub/agent/agent_radix_sort_downsweep.cuh | 104 +- cub/cub/agent/agent_radix_sort_upsweep.cuh | 79 +- cub/cub/agent/agent_rle.cuh | 238 ++- cub/cub/agent/agent_segment_fixup.cuh | 236 ++- cub/cub/agent/agent_select_if.cuh | 360 ++-- cub/cub/agent/agent_spmv_orig.cuh | 277 ++- cub/cub/agent/agent_unique_by_key.cuh | 134 +- cub/cub/agent/single_pass_scan_operators.cuh | 253 ++- cub/cub/block/block_adjacent_difference.cuh | 339 ++-- cub/cub/block/block_discontinuity.cuh | 756 ++++--- cub/cub/block/block_exchange.cuh | 598 ++++-- cub/cub/block/block_histogram.cuh | 214 +- cub/cub/block/block_load.cuh | 1183 +++++++---- cub/cub/block/block_radix_rank.cuh | 278 ++- cub/cub/block/block_radix_sort.cuh | 469 +++-- cub/cub/block/block_raking_layout.cuh | 31 +- cub/cub/block/block_reduce.cuh | 335 ++-- cub/cub/block/block_run_length_decode.cuh | 94 +- cub/cub/block/block_scan.cuh | 1776 ++++++++++------- cub/cub/block/block_shuffle.cuh | 184 +- cub/cub/block/block_store.cuh | 769 ++++--- .../block_histogram_atomic.cuh | 43 +- .../specializations/block_histogram_sort.cuh | 65 +- .../specializations/block_reduce_raking.cuh | 129 +- .../block_reduce_raking_commutative_only.cuh | 91 +- .../block_reduce_warp_reductions.cuh | 148 +- .../specializations/block_scan_raking.cuh | 351 +++- .../specializations/block_scan_warp_scans.cuh | 343 +++- cub/cub/device/device_spmv.cuh | 103 +- .../device/dispatch/dispatch_radix_sort.cuh | 849 +++++--- .../device/dispatch/dispatch_spmv_orig.cuh | 330 ++- .../dispatch/dispatch_unique_by_key.cuh | 321 ++- cub/cub/grid/grid_even_share.cuh | 89 +- cub/cub/grid/grid_queue.cuh | 49 +- cub/cub/iterator/arg_index_input_iterator.cuh | 96 +- .../cache_modified_input_iterator.cuh | 62 +- .../cache_modified_output_iterator.cuh | 72 +- cub/cub/iterator/constant_input_iterator.cuh | 81 +- cub/cub/iterator/counting_input_iterator.cuh | 72 +- cub/cub/iterator/discard_output_iterator.cuh | 51 +- cub/cub/iterator/tex_obj_input_iterator.cuh | 71 +- cub/cub/iterator/tex_ref_input_iterator.cuh | 35 +- cub/cub/iterator/transform_input_iterator.cuh | 91 +- cub/cub/thread/thread_load.cuh | 40 +- cub/cub/thread/thread_reduce.cuh | 176 +- cub/cub/thread/thread_scan.cuh | 329 +-- cub/cub/thread/thread_search.cuh | 68 +- cub/cub/thread/thread_store.cuh | 41 +- cub/cub/util_allocator.cuh | 297 +-- cub/cub/util_device.cuh | 32 +- cub/cub/util_ptx.cuh | 174 +- cub/cub/util_temporary_storage.cuh | 30 +- .../warp/specializations/warp_reduce_shfl.cuh | 410 ++-- .../warp/specializations/warp_reduce_smem.cuh | 182 +- .../warp/specializations/warp_scan_shfl.cuh | 373 +++- .../warp/specializations/warp_scan_smem.cuh | 178 +- 57 files changed, 9832 insertions(+), 5067 deletions(-) diff --git a/cub/cub/agent/agent_histogram.cuh b/cub/cub/agent/agent_histogram.cuh index c4226ba913e..44de7891d18 100644 --- a/cub/cub/agent/agent_histogram.cuh +++ b/cub/cub/agent/agent_histogram.cuh @@ -65,55 +65,122 @@ enum BlockHistogramMemoryPreference BLEND }; - /** * Parameterizable tuning policy type for AgentHistogram + * + * @tparam _BLOCK_THREADS + * Threads per thread block + * + * @tparam _PIXELS_PER_THREAD + * Pixels per thread (per tile of input) + * + * @tparam _LOAD_ALGORITHM + * The BlockLoad algorithm to use + * + * @tparam _LOAD_MODIFIER + * Cache load modifier for reading input elements + * 
+ * @tparam _RLE_COMPRESS + * Whether to perform localized RLE to compress samples before histogramming + * + * @tparam _MEM_PREFERENCE + * Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + * + * @tparam _WORK_STEALING + * Whether to dequeue tiles from a global work queue + * + * @tparam _VEC_SIZE + * Vector size for samples loading (1, 2, 4) */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming - BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) - bool _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue - int _VEC_SIZE = 4> ///< Vector size for samples loading (1, 2, 4) +template struct AgentHistogramPolicy { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - PIXELS_PER_THREAD = _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) - IS_RLE_COMPRESS = _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming - MEM_PREFERENCE = _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) - IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue - }; + enum + { + /// Threads per thread block + BLOCK_THREADS = _BLOCK_THREADS, - static constexpr int VEC_SIZE = _VEC_SIZE; ///< Vector size for samples loading (1, 2, 4) + /// Pixels per thread (per tile of input) + PIXELS_PER_THREAD = _PIXELS_PER_THREAD, - static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements -}; + /// Whether to perform localized RLE to compress samples before histogramming + IS_RLE_COMPRESS = _RLE_COMPRESS, + + /// Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + MEM_PREFERENCE = _MEM_PREFERENCE, + + /// Whether to dequeue tiles from a global work queue + IS_WORK_STEALING = _WORK_STEALING, + }; + /// Vector size for samples loading (1, 2, 4) + static constexpr int VEC_SIZE = _VEC_SIZE; + + ///< The BlockLoad algorithm to use + static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + + ///< Cache load modifier for reading input elements + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; +}; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** - * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . + * @brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating + * in device-wide histogram . + * + * @tparam AgentHistogramPolicyT + * Parameterized AgentHistogramPolicy tuning policy type + * + * @tparam PRIVATIZED_SMEM_BINS + * Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized + * counters to be maintained in device-accessible memory. 
+ * + * @tparam NUM_CHANNELS + * Number of channels interleaved in the input data. Supports up to four channels. + * + * @tparam NUM_ACTIVE_CHANNELS + * Number of channels actively being histogrammed + * + * @tparam SampleIteratorT + * Random-access input iterator type for reading samples + * + * @tparam CounterT + * Integer type for counting sample occurrences per histogram bin + * + * @tparam PrivatizedDecodeOpT + * The transform operator type for determining privatized counter indices from samples, one for + * each channel + * + * @tparam OutputDecodeOpT + * The transform operator type for determining output bin-ids from privatized counter indices, one + * for each channel + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam LEGACY_PTX_ARCH + * PTX compute capability (unused) */ -template < - typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type - int PRIVATIZED_SMEM_BINS, ///< Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized counters to be maintained in device-accessible memory. - int NUM_CHANNELS, ///< Number of channels interleaved in the input data. Supports up to four channels. - int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename SampleIteratorT, ///< Random-access input iterator type for reading samples - typename CounterT, ///< Integer type for counting sample occurrences per histogram bin - typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel - typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel - typename OffsetT, ///< Signed integer type for global offsets - int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused) +template struct AgentHistogram { //--------------------------------------------------------------------- @@ -198,16 +265,22 @@ struct AgentHistogram /// Shared memory type required by this thread block struct _TempStorage { - CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; // Smem needed for block-privatized smem histogram (with 1 word of padding) + // Smem needed for block-privatized smem histogram (with 1 word of padding) + CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; int tile_idx; // Aliasable storage layout union Aliasable { - typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples - typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels - typename BlockLoadVecT::TempStorage vec_load; // Smem needed for loading a tile of vecs + // Smem needed for loading a tile of samples + typename BlockLoadSampleT::TempStorage sample_load; + + // Smem needed for loading a tile of pixels + typename BlockLoadPixelT::TempStorage pixel_load; + + // Smem needed for loading a tile of vecs + typename BlockLoadVecT::TempStorage vec_load; } aliasable; }; @@ -575,10 +648,16 @@ struct AgentHistogram // Tile processing //--------------------------------------------------------------------- - // Consume a tile of data samples - template < - bool IS_ALIGNED, // Whether the tile offset is aligned (vec-aligned for single-channel, pixel-aligned for multi-channel) - bool IS_FULL_TILE> // Whether the tile is full + /** + * @brief Consume a tile of data samples + * + * @tparam IS_ALIGNED + * Whether the tile offset is aligned (vec-aligned for 
single-channel, pixel-aligned for multi-channel) + * + * @tparam IS_FULL_TILE + Whether the tile is full + */ + template __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples) { SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS]; @@ -610,15 +689,28 @@ struct AgentHistogram } - // Consume row tiles. Specialized for work-stealing from queue + /** + * @brief Consume row tiles. Specialized for work-stealing from queue + * + * @param num_row_pixels + * The number of multi-channel pixels per row in the region of interest + * + * @param num_rows + * The number of rows in the region of interest + * + * @param row_stride_samples + * The number of samples between starts of consecutive rows in the region of interest + * + * @param tiles_per_row + * Number of image tiles per row + */ template - __device__ __forceinline__ void ConsumeTiles( - OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< The number of rows in the region of interest - OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest - int tiles_per_row, ///< Number of image tiles per row - GridQueue tile_queue, - Int2Type is_work_stealing) + __device__ __forceinline__ void ConsumeTiles(OffsetT num_row_pixels, + OffsetT num_rows, + OffsetT row_stride_samples, + int tiles_per_row, + GridQueue tile_queue, + Int2Type is_work_stealing) { int num_tiles = num_rows * tiles_per_row; @@ -658,15 +750,28 @@ struct AgentHistogram } - // Consume row tiles. Specialized for even-share (striped across thread blocks) + /** + * @brief Consume row tiles. Specialized for even-share (striped across thread blocks) + * + * @param num_row_pixels + * The number of multi-channel pixels per row in the region of interest + * + * @param num_rows + * The number of rows in the region of interest + * + * @param row_stride_samples + * The number of samples between starts of consecutive rows in the region of interest + * + * @param tiles_per_row + * Number of image tiles per row + */ template - __device__ __forceinline__ void ConsumeTiles( - OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< The number of rows in the region of interest - OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest - int tiles_per_row, ///< Number of image tiles per row - GridQueue tile_queue, - Int2Type is_work_stealing) + __device__ __forceinline__ void ConsumeTiles(OffsetT num_row_pixels, + OffsetT num_rows, + OffsetT row_stride_samples, + int tiles_per_row, + GridQueue tile_queue, + Int2Type is_work_stealing) { for (int row = blockIdx.y; row < num_rows; row += gridDim.y) { @@ -722,31 +827,53 @@ struct AgentHistogram /** - * Constructor + * @brief Constructor + * + * @param temp_storage + * Reference to temp_storage + * + * @param d_samples + * Input data to reduce + * + * @param num_output_bins + * The number bins per final output histogram + * + * @param num_privatized_bins + * The number bins per privatized histogram + * + * @param d_output_histograms + * Reference to final output histograms + * + * @param d_privatized_histograms + * Reference to privatized histograms + * + * @param output_decode_op + * The transform operator for determining output bin-ids from privatized counter indices, one for each channel + * + * @param privatized_decode_op + * The transform operator for determining privatized counter 
indices from samples, one for each channel */ - __device__ __forceinline__ AgentHistogram( - TempStorage &temp_storage, ///< Reference to temp_storage - SampleIteratorT d_samples, ///< Input data to reduce - int (&num_output_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per final output histogram - int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per privatized histogram - CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to final output histograms - CounterT* (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to privatized histograms - OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel - PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) ///< The transform operator for determining privatized counter indices from samples, one for each channel - : - temp_storage(temp_storage.Alias()), - d_wrapped_samples(d_samples), - d_native_samples(NativePointer(d_wrapped_samples)), - num_output_bins(num_output_bins), - num_privatized_bins(num_privatized_bins), - d_output_histograms(d_output_histograms), - output_decode_op(output_decode_op), - privatized_decode_op(privatized_decode_op), - prefer_smem((MEM_PREFERENCE == SMEM) ? - true : // prefer smem privatized histograms - (MEM_PREFERENCE == GMEM) ? - false : // prefer gmem privatized histograms - blockIdx.x & 1) // prefer blended privatized histograms + __device__ __forceinline__ + AgentHistogram(TempStorage &temp_storage, + SampleIteratorT d_samples, + int (&num_output_bins)[NUM_ACTIVE_CHANNELS], + int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], + CounterT *(&d_output_histograms)[NUM_ACTIVE_CHANNELS], + CounterT *(&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], + OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], + PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) + : temp_storage(temp_storage.Alias()) + , d_wrapped_samples(d_samples) + , d_native_samples(NativePointer(d_wrapped_samples)) + , num_output_bins(num_output_bins) + , num_privatized_bins(num_privatized_bins) + , d_output_histograms(d_output_histograms) + , output_decode_op(output_decode_op) + , privatized_decode_op(privatized_decode_op) + , prefer_smem((MEM_PREFERENCE == SMEM) ? true : // prefer smem privatized histograms + (MEM_PREFERENCE == GMEM) ? 
false + : // prefer gmem privatized histograms + blockIdx.x & 1) // prefer blended privatized histograms { int blockId = (blockIdx.y * gridDim.x) + blockIdx.x; @@ -755,16 +882,29 @@ struct AgentHistogram this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]); } - /** - * Consume image + * @brief Consume image + * + * @param num_row_pixels + * The number of multi-channel pixels per row in the region of interest + * + * @param num_rows + * The number of rows in the region of interest + * + * @param row_stride_samples + * The number of samples between starts of consecutive rows in the region of interest + * + * @param tiles_per_row + * Number of image tiles per row + * + * @param tile_queue + * Queue descriptor for assigning tiles of work to thread blocks */ - __device__ __forceinline__ void ConsumeTiles( - OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< The number of rows in the region of interest - OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest - int tiles_per_row, ///< Number of image tiles per row - GridQueue tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks + __device__ __forceinline__ void ConsumeTiles(OffsetT num_row_pixels, + OffsetT num_rows, + OffsetT row_stride_samples, + int tiles_per_row, + GridQueue tile_queue) { // Check whether all row starting offsets are vec-aligned (in single-channel) or pixel-aligned (in multi-channel) int vec_mask = AlignBytes::ALIGN_BYTES - 1; diff --git a/cub/cub/agent/agent_radix_sort_downsweep.cuh b/cub/cub/agent/agent_radix_sort_downsweep.cuh index a28bbdfa597..d893d836ee1 100644 --- a/cub/cub/agent/agent_radix_sort_downsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_downsweep.cuh @@ -63,30 +63,61 @@ CUB_NAMESPACE_BEGIN ******************************************************************************/ /** - * Parameterizable tuning policy type for AgentRadixSortDownsweep + * @brief Parameterizable tuning policy type for AgentRadixSortDownsweep + * + * @tparam NOMINAL_BLOCK_THREADS_4B + * Threads per thread block + * + * @tparam NOMINAL_ITEMS_PER_THREAD_4B + * Items per thread (per tile of input) + * + * @tparam ComputeT + * Dominant compute type + * + * @tparam _LOAD_ALGORITHM + * The BlockLoad algorithm to use + * + * @tparam _LOAD_MODIFIER + * Cache load modifier for reading keys (and values) + * + * @tparam _RANK_ALGORITHM + * The radix ranking algorithm to use + * + * @tparam _SCAN_ALGORITHM + * The block scan algorithm to use + * + * @tparam _RADIX_BITS + * The number of radix bits, i.e., log2(bins) */ -template < - int NOMINAL_BLOCK_THREADS_4B, ///< Threads per thread block - int NOMINAL_ITEMS_PER_THREAD_4B, ///< Items per thread (per tile of input) - typename ComputeT, ///< Dominant compute type - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values) - RadixRankAlgorithm _RANK_ALGORITHM, ///< The radix ranking algorithm to use - BlockScanAlgorithm _SCAN_ALGORITHM, ///< The block scan algorithm to use - int _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) - typename ScalingType = RegBoundScaling > -struct AgentRadixSortDownsweepPolicy : - ScalingType +template > +struct AgentRadixSortDownsweepPolicy : ScalingType { - enum - { - RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., 
log2(bins) - }; + enum + { + /// The number of radix bits, i.e., log2(bins) + RADIX_BITS = _RADIX_BITS, + }; + + /// The BlockLoad algorithm to use + static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + + /// Cache load modifier for reading keys (and values) + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values) - static constexpr RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; ///< The radix ranking algorithm to use - static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use + /// The radix ranking algorithm to use + static constexpr RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; + + /// The BlockScan algorithm to use + static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; }; @@ -99,15 +130,30 @@ struct AgentRadixSortDownsweepPolicy : /** - * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . + * @brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in + * device-wide radix sort downsweep . + * + * @tparam AgentRadixSortDownsweepPolicy + * Parameterized AgentRadixSortDownsweepPolicy tuning policy type + * + * @tparam IS_DESCENDING + * Whether or not the sorted-order is high-to-low + * + * @tparam KeyT + * KeyT type + * + * @tparam ValueT + * ValueT type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename AgentRadixSortDownsweepPolicy, ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< KeyT type - typename ValueT, ///< ValueT type - typename OffsetT, ///< Signed integer type for global offsets - typename DecomposerT = detail::identity_decomposer_t> +template struct AgentRadixSortDownsweep { //--------------------------------------------------------------------- diff --git a/cub/cub/agent/agent_radix_sort_upsweep.cuh b/cub/cub/agent/agent_radix_sort_upsweep.cuh index da048835da9..c21f1b41baa 100644 --- a/cub/cub/agent/agent_radix_sort_upsweep.cuh +++ b/cub/cub/agent/agent_radix_sort_upsweep.cuh @@ -56,39 +56,63 @@ CUB_NAMESPACE_BEGIN ******************************************************************************/ /** - * Parameterizable tuning policy type for AgentRadixSortUpsweep + * @brief Parameterizable tuning policy type for AgentRadixSortUpsweep + * + * @tparam NOMINAL_BLOCK_THREADS_4B + * Threads per thread block + * + * @tparam NOMINAL_ITEMS_PER_THREAD_4B + * Items per thread (per tile of input) + * + * @tparam ComputeT + * Dominant compute type + * + * @tparam _LOAD_MODIFIER + * Cache load modifier for reading keys + * + * @tparam _RADIX_BITS + * The number of radix bits, i.e., log2(bins) */ -template < - int NOMINAL_BLOCK_THREADS_4B, ///< Threads per thread block - int NOMINAL_ITEMS_PER_THREAD_4B, ///< Items per thread (per tile of input) - typename ComputeT, ///< Dominant compute type - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys - int _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) - typename ScalingType = RegBoundScaling > -struct AgentRadixSortUpsweepPolicy : - ScalingType +template > +struct 
AgentRadixSortUpsweepPolicy : ScalingType { - enum - { - RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) - }; - - static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys + enum + { + /// The number of radix bits, i.e., log2(bins) + RADIX_BITS = _RADIX_BITS, + }; + + /// Cache load modifier for reading keys + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; }; - /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** - * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . + * @brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for + * participating in device-wide radix sort upsweep . + * + * @tparam AgentRadixSortUpsweepPolicy + * Parameterized AgentRadixSortUpsweepPolicy tuning policy type + * + * @tparam KeyT + * KeyT type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename AgentRadixSortUpsweepPolicy, ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type - typename KeyT, ///< KeyT type - typename OffsetT, - typename DecomposerT = detail::identity_decomposer_t> ///< Signed integer type for global offsets +template <typename AgentRadixSortUpsweepPolicy, + typename KeyT, + typename OffsetT, + typename DecomposerT = detail::identity_decomposer_t> struct AgentRadixSortUpsweep { @@ -483,11 +507,14 @@ struct AgentRadixSortUpsweep /** - * Extract counts + * @brief Extract counts + * + * @param[out] bin_count + * The exclusive prefix sum for the digits + * [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] */ template - __device__ __forceinline__ void ExtractCounts( - OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + __device__ __forceinline__ void ExtractCounts(OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) { unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; unsigned int warp_tid = LaneId(); diff --git a/cub/cub/agent/agent_rle.cuh b/cub/cub/agent/agent_rle.cuh index 15f27bd4c53..733dea60020 100644 --- a/cub/cub/agent/agent_rle.cuh +++ b/cub/cub/agent/agent_rle.cuh @@ -96,41 +96,66 @@ template > struct AgentRlePolicy { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) - }; + enum + { + /// Threads per thread block + BLOCK_THREADS = _BLOCK_THREADS, - static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use + /// Items per thread (per tile of input) + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - struct detail - { - using delay_constructor_t = DelayConstructorT; - }; -}; + /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced + /// among block-warps during any store-related data transpositions (versus each warp having its + /// own storage) + STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, + }; + /// The BlockLoad algorithm to use + static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + /// Cache load modifier for reading input elements + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + /// The BlockScan algorithm to use + static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; + struct detail + { + using delay_constructor_t = DelayConstructorT; + }; +}; /****************************************************************************** * Thread block abstractions ******************************************************************************/ /** - * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode + * @brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode + * + * @tparam AgentRlePolicyT + * Parameterized AgentRlePolicyT tuning policy type + * + * @tparam InputIteratorT + * Random-access input iterator type for data + * + * @tparam OffsetsOutputIteratorT + * Random-access output iterator type for offset values + * + * @tparam LengthsOutputIteratorT + * Random-access output iterator type for length values + * + * @tparam EqualityOpT + * T equality operator type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type - typename InputIteratorT, ///< Random-access input iterator type for data - typename OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values - typename LengthsOutputIteratorT, ///< Random-access output iterator type for length values - typename EqualityOpT, ///< T equality operator type - typename OffsetT> ///< Signed integer type 
for global offsets +template struct AgentRle { //--------------------------------------------------------------------- @@ -153,22 +178,23 @@ struct AgentRle // Constants enum { - WARP_THREADS = CUB_WARP_THREADS(0), - BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD, - WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - /// Whether or not to sync after loading data - SYNC_AFTER_LOAD = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), - - /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) - STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING, - ACTIVE_EXCHANGE_WARPS = (STORE_WARP_TIME_SLICING) ? 1 : WARPS, + WARP_THREADS = CUB_WARP_THREADS(0), + BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD, + WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + /// Whether or not to sync after loading data + SYNC_AFTER_LOAD = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), + + /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced + /// among block-warps during any store-related data transpositions (versus each warp having + /// its own storage) + STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING, + ACTIVE_EXCHANGE_WARPS = (STORE_WARP_TIME_SLICING) ? 1 : WARPS, }; - /** * Special operator that signals all out-of-bounds items are not equal to everything else, * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked @@ -248,10 +274,17 @@ struct AgentRle { struct ScanStorage { - typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection - typename WarpScanPairs::TempStorage warp_scan[WARPS]; // Smem needed for warp-synchronous scans - Uninitialized warp_aggregates; // Smem needed for sharing warp-wide aggregates - typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + // Smem needed for discontinuity detection + typename BlockDiscontinuityT::TempStorage discontinuity; + + // Smem needed for warp-synchronous scans + typename WarpScanPairs::TempStorage warp_scan[WARPS]; + + // Smem needed for sharing warp-wide aggregates + Uninitialized warp_aggregates; + + // Smem needed for cooperative prefix callback + typename TilePrefixCallbackOpT::TempStorage prefix; } scan_storage; // Smem needed for input loading @@ -268,9 +301,9 @@ struct AgentRle } aliasable; - OffsetT tile_idx; // Shared tile index - LengthOffsetPair tile_inclusive; // Inclusive tile prefix - LengthOffsetPair tile_exclusive; // Exclusive tile prefix + OffsetT tile_idx; // Shared tile index + LengthOffsetPair tile_inclusive; // Inclusive tile prefix + LengthOffsetPair tile_exclusive; // Exclusive tile prefix }; // Alias wrapper allowing storage to be unioned @@ -281,41 +314,54 @@ struct AgentRle // Per-thread fields //--------------------------------------------------------------------- - _TempStorage& temp_storage; ///< Reference to temp_storage - - WrappedInputIteratorT d_in; ///< Pointer to input sequence of data items - OffsetsOutputIteratorT d_offsets_out; ///< Input run offsets - 
LengthsOutputIteratorT d_lengths_out; ///< Output run lengths + _TempStorage &temp_storage; ///< Reference to temp_storage - EqualityOpT equality_op; ///< T equality operator - ReduceBySegmentOpT scan_op; ///< Reduce-length-by-flag scan operator - OffsetT num_items; ///< Total number of input items + WrappedInputIteratorT d_in; ///< Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out; ///< Input run offsets + LengthsOutputIteratorT d_lengths_out; ///< Output run lengths + EqualityOpT equality_op; ///< T equality operator + ReduceBySegmentOpT scan_op; ///< Reduce-length-by-flag scan operator + OffsetT num_items; ///< Total number of input items //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- - // Constructor - __device__ __forceinline__ - AgentRle( - TempStorage &temp_storage, ///< [in] Reference to temp_storage - InputIteratorT d_in, ///< [in] Pointer to input sequence of data items - OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run offsets - LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run lengths - EqualityOpT equality_op, ///< [in] T equality operator - OffsetT num_items) ///< [in] Total number of input items - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_offsets_out(d_offsets_out), - d_lengths_out(d_lengths_out), - equality_op(equality_op), - scan_op(cub::Sum()), - num_items(num_items) + /** + * @param[in] temp_storage + * Reference to temp_storage + * + * @param[in] d_in + * Pointer to input sequence of data items + * + * @param[out] d_offsets_out + * Pointer to output sequence of run offsets + * + * @param[out] d_lengths_out + * Pointer to output sequence of run lengths + * + * @param[in] equality_op + * Equality operator + * + * @param[in] num_items + * Total number of input items + */ + __device__ __forceinline__ AgentRle(TempStorage &temp_storage, + InputIteratorT d_in, + OffsetsOutputIteratorT d_offsets_out, + LengthsOutputIteratorT d_lengths_out, + EqualityOpT equality_op, + OffsetT num_items) + : temp_storage(temp_storage.Alias()) + , d_in(d_in) + , d_offsets_out(d_offsets_out) + , d_lengths_out(d_lengths_out) + , equality_op(equality_op) + , scan_op(cub::Sum()) + , num_items(num_items) {} - //--------------------------------------------------------------------- // Utility methods for initializing the selections //--------------------------------------------------------------------- @@ -683,16 +729,29 @@ struct AgentRle //--------------------------------------------------------------------- /** - * Process a tile of input (dynamic chained scan) + * @brief Process a tile of input (dynamic chained scan) + * + * @param num_items + * Total number of global input items + * + * @param num_remaining + * Number of global input items remaining (including this tile) + * + * @param tile_idx + * Tile index + * + * @param tile_offset + * Tile offset + * + * @param &tile_status + * Global list of tile status */ - template < - bool LAST_TILE> - __device__ __forceinline__ LengthOffsetPair ConsumeTile( - OffsetT num_items, ///< Total number of global input items - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT &tile_status) ///< Global list of tile status + template + __device__ __forceinline__ LengthOffsetPair ConsumeTile(OffsetT num_items, + OffsetT 
num_remaining, + int tile_idx, + OffsetT tile_offset, + ScanTileStateT &tile_status) { if (tile_idx == 0) { @@ -892,13 +951,24 @@ struct AgentRle /** - * Scan tiles of items as part of a dynamic chained scan + * @brief Scan tiles of items as part of a dynamic chained scan + * + * @param num_tiles + * Total number of input tiles + * + * @param tile_status + * Global list of tile status + * + * @param d_num_runs_out + * Output pointer for total number of runs identified + * + * @tparam NumRunsIteratorT + * Output iterator type for recording number of items selected */ - template ///< Output iterator type for recording number of items selected - __device__ __forceinline__ void ConsumeRange( - int num_tiles, ///< Total number of input tiles - ScanTileStateT& tile_status, ///< Global list of tile status - NumRunsIteratorT d_num_runs_out) ///< Output pointer for total number of runs identified + template + __device__ __forceinline__ void ConsumeRange(int num_tiles, + ScanTileStateT &tile_status, + NumRunsIteratorT d_num_runs_out) { // Blocks are launched in increasing order, so just assign one tile per block int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index diff --git a/cub/cub/agent/agent_segment_fixup.cuh b/cub/cub/agent/agent_segment_fixup.cuh index 84d8af5aa6a..205970eaef3 100644 --- a/cub/cub/agent/agent_segment_fixup.cuh +++ b/cub/cub/agent/agent_segment_fixup.cuh @@ -59,42 +59,81 @@ CUB_NAMESPACE_BEGIN ******************************************************************************/ /** - * Parameterizable tuning policy type for AgentSegmentFixup + * @brief Parameterizable tuning policy type for AgentSegmentFixup + * + * @tparam _BLOCK_THREADS + * Threads per thread block + * + * @tparam _ITEMS_PER_THREAD + * Items per thread (per tile of input) + * + * @tparam _LOAD_ALGORITHM + * The BlockLoad algorithm to use + * + * @tparam _LOAD_MODIFIER + * Cache load modifier for reading input elements + * + * @tparam _SCAN_ALGORITHM + * The BlockScan algorithm to use */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +template struct AgentSegmentFixupPolicy { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - }; + enum + { + /// Threads per thread block + BLOCK_THREADS = _BLOCK_THREADS, - static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use -}; + /// Items per thread (per tile of input) + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + }; + + /// The BlockLoad algorithm to use + static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + /// Cache load modifier for reading input elements + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + + /// The BlockScan algorithm to use + static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; +}; /****************************************************************************** * Thread block abstractions 
******************************************************************************/ /** - * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key + * @brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for + * participating in device-wide reduce-value-by-key + * + * @tparam AgentSegmentFixupPolicyT + * Parameterized AgentSegmentFixupPolicy tuning policy type + * + * @tparam PairsInputIteratorT + * Random-access input iterator type for keys + * + * @tparam AggregatesOutputIteratorT + * Random-access output iterator type for values + * + * @tparam EqualityOpT + * KeyT equality operator type + * + * @tparam ReductionOpT + * ValueT reduction operator type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type - typename PairsInputIteratorT, ///< Random-access input iterator type for keys - typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values - typename EqualityOpT, ///< KeyT equality operator type - typename ReductionOpT, ///< ValueT reduction operator type - typename OffsetT> ///< Signed integer type for global offsets +template struct AgentSegmentFixup { //--------------------------------------------------------------------- @@ -172,8 +211,11 @@ struct AgentSegmentFixup { struct ScanStorage { - typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + // Smem needed for tile scanning + typename BlockScanT::TempStorage scan; + + // Smem needed for cooperative prefix callback + typename TilePrefixCallbackOpT::TempStorage prefix; } scan_storage; // Smem needed for loading keys @@ -188,53 +230,77 @@ struct AgentSegmentFixup // Per-thread fields //--------------------------------------------------------------------- - _TempStorage& temp_storage; ///< Reference to temp_storage - WrappedPairsInputIteratorT d_pairs_in; ///< Input keys - AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates - WrappedFixupInputIteratorT d_fixup_in; ///< Fixup input values - InequalityWrapper inequality_op; ///< KeyT inequality operator - ReductionOpT reduction_op; ///< Reduction operator - ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator - + _TempStorage &temp_storage; ///< Reference to temp_storage + WrappedPairsInputIteratorT d_pairs_in; ///< Input keys + AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates + WrappedFixupInputIteratorT d_fixup_in; ///< Fixup input values + InequalityWrapper inequality_op; ///< KeyT inequality operator + ReductionOpT reduction_op; ///< Reduction operator + ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- - // Constructor - __device__ __forceinline__ - AgentSegmentFixup( - TempStorage& temp_storage, ///< Reference to temp_storage - PairsInputIteratorT d_pairs_in, ///< Input keys - AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates - EqualityOpT equality_op, ///< KeyT equality operator - ReductionOpT reduction_op) ///< ValueT reduction operator - : - temp_storage(temp_storage.Alias()), - d_pairs_in(d_pairs_in), - d_aggregates_out(d_aggregates_out), - 
d_fixup_in(d_aggregates_out), - inequality_op(equality_op), - reduction_op(reduction_op), - scan_op(reduction_op) + /** + * @param temp_storage + * Reference to temp_storage + * + * @param d_pairs_in + * Input keys + * + * @param d_aggregates_out + * Output value aggregates + * + * @param equality_op + * KeyT equality operator + * + * @param reduction_op + * ValueT reduction operator + */ + __device__ __forceinline__ AgentSegmentFixup(TempStorage &temp_storage, + PairsInputIteratorT d_pairs_in, + AggregatesOutputIteratorT d_aggregates_out, + EqualityOpT equality_op, + ReductionOpT reduction_op) + : temp_storage(temp_storage.Alias()) + , d_pairs_in(d_pairs_in) + , d_aggregates_out(d_aggregates_out) + , d_fixup_in(d_aggregates_out) + , inequality_op(equality_op) + , reduction_op(reduction_op) + , scan_op(reduction_op) {} - //--------------------------------------------------------------------- // Cooperatively scan a device-wide sequence of tiles with other CTAs //--------------------------------------------------------------------- /** - * Process input tile. Specialized for atomic-fixup + * @brief Process input tile. Specialized for atomic-fixup + * + * @param num_remaining + * Number of global input items remaining (including this tile) + * + * @param tile_idx + * Tile index + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor + * + * @param use_atomic_fixup + * Marker whether to use atomicAdd (instead of reduce-by-key) */ template - __device__ __forceinline__ void ConsumeTile( - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state, ///< Global tile state descriptor - Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) + __device__ __forceinline__ void ConsumeTile(OffsetT num_remaining, + int tile_idx, + OffsetT tile_offset, + ScanTileStateT &tile_state, + Int2Type use_atomic_fixup) { KeyValuePairT pairs[ITEMS_PER_THREAD]; @@ -264,17 +330,30 @@ struct AgentSegmentFixup atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value); } - /** - * Process input tile. Specialized for reduce-by-key fixup + * @brief Process input tile. 
Specialized for reduce-by-key fixup + * + * @param num_remaining + * Number of global input items remaining (including this tile) + * + * @param tile_idx + * Tile index + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor + * + * @param use_atomic_fixup + * Marker whether to use atomicAdd (instead of reduce-by-key) */ template - __device__ __forceinline__ void ConsumeTile( - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state, ///< Global tile state descriptor - Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) + __device__ __forceinline__ void ConsumeTile(OffsetT num_remaining, + int tile_idx, + OffsetT tile_offset, + ScanTileStateT &tile_state, + Int2Type use_atomic_fixup) { KeyValuePairT pairs[ITEMS_PER_THREAD]; KeyValuePairT scatter_pairs[ITEMS_PER_THREAD]; @@ -346,19 +425,26 @@ struct AgentSegmentFixup } } - /** - * Scan tiles of items as part of a dynamic chained scan + * @brief Scan tiles of items as part of a dynamic chained scan + * + * @param num_items + * Total number of input items + * + * @param num_tiles + * Total number of input tiles + * + * @param tile_state + * Global tile state descriptor */ - __device__ __forceinline__ void ConsumeRange( - OffsetT num_items, ///< Total number of input items - int num_tiles, ///< Total number of input tiles - ScanTileStateT& tile_state) ///< Global tile state descriptor + __device__ __forceinline__ void ConsumeRange(OffsetT num_items, + int num_tiles, + ScanTileStateT &tile_state) { // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile - OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) if (num_remaining > TILE_ITEMS) { diff --git a/cub/cub/agent/agent_select_if.cuh b/cub/cub/agent/agent_select_if.cuh index 564316b30dd..e5534605716 100644 --- a/cub/cub/agent/agent_select_if.cuh +++ b/cub/cub/agent/agent_select_if.cuh @@ -89,46 +89,77 @@ template > struct AgentSelectIfPolicy { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - }; + enum + { + /// Threads per thread block + BLOCK_THREADS = _BLOCK_THREADS, - static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use + /// Items per thread (per tile of input) + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + }; - struct detail - { - using delay_constructor_t = DelayConstructorT; - }; -}; + /// The BlockLoad algorithm to use + static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + /// Cache load modifier for reading input elements + static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + /// The 
BlockScan algorithm to use + static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; + struct detail + { + using delay_constructor_t = DelayConstructorT; + }; +}; /****************************************************************************** * Thread block abstractions ******************************************************************************/ - /** - * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection + * @brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in + * device-wide selection * * Performs functor-based selection if SelectOpT functor type != NullType * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType * Otherwise performs discontinuity selection (keep unique) + * + * @tparam AgentSelectIfPolicyT + * Parameterized AgentSelectIfPolicy tuning policy type + * + * @tparam InputIteratorT + * Random-access input iterator type for selection items + * + * @tparam FlagsInputIteratorT + * Random-access input iterator type for selections (NullType* if a selection functor or + * discontinuity flagging is to be used for selection) + * + * @tparam SelectedOutputIteratorT + * Random-access output iterator type for selection_flags items + * + * @tparam SelectOpT + * Selection operator type (NullType if selections or discontinuity flagging is to be used for + * selection) + * + * @tparam EqualityOpT + * Equality operator type (NullType if selection functor or selections is to be used for + * selection) + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam KEEP_REJECTS + * Whether or not we push rejected items to the back of the output */ -template < - typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type - typename InputIteratorT, ///< Random-access input iterator type for selection items - typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) - typename SelectedOutputIteratorT, ///< Random-access output iterator type for selection_flags items - typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) - typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) - typename OffsetT, ///< Signed integer type for global offsets - bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +template struct AgentSelectIf { //--------------------------------------------------------------------- @@ -215,9 +246,14 @@ struct AgentSelectIf { struct ScanStorage { - typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback - typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + // Smem needed for tile scanning + typename BlockScanT::TempStorage scan; + + // Smem needed for cooperative prefix callback + typename TilePrefixCallbackOpT::TempStorage prefix; + + // Smem needed for discontinuity detection + typename BlockDiscontinuityT::TempStorage discontinuity; } scan_storage; // Smem needed for loading items @@ -238,40 +274,56 @@ struct AgentSelectIf // Per-thread fields //--------------------------------------------------------------------- - 
_TempStorage& temp_storage; ///< Reference to temp_storage - WrappedInputIteratorT d_in; ///< Input items - SelectedOutputIteratorT d_selected_out; ///< Unique output items - WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable) - InequalityWrapper inequality_op; ///< T inequality operator - SelectOpT select_op; ///< Selection operator - OffsetT num_items; ///< Total number of input items - + _TempStorage &temp_storage; ///< Reference to temp_storage + WrappedInputIteratorT d_in; ///< Input items + SelectedOutputIteratorT d_selected_out; ///< Unique output items + WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable) + InequalityWrapper inequality_op; ///< T inequality operator + SelectOpT select_op; ///< Selection operator + OffsetT num_items; ///< Total number of input items //--------------------------------------------------------------------- // Constructor //--------------------------------------------------------------------- - // Constructor - __device__ __forceinline__ - AgentSelectIf( - TempStorage &temp_storage, ///< Reference to temp_storage - InputIteratorT d_in, ///< Input data - FlagsInputIteratorT d_flags_in, ///< Input selection flags (if applicable) - SelectedOutputIteratorT d_selected_out, ///< Output data - SelectOpT select_op, ///< Selection operator - EqualityOpT equality_op, ///< Equality operator - OffsetT num_items) ///< Total number of input items - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_selected_out(d_selected_out), - d_flags_in(d_flags_in), - inequality_op(equality_op), - select_op(select_op), - num_items(num_items) + /** + * @param temp_storage + * Reference to temp_storage + * + * @param d_in + * Input data + * + * @param d_flags_in + * Input selection flags (if applicable) + * + * @param d_selected_out + * Output data + * + * @param select_op + * Selection operator + * + * @param equality_op + * Equality operator + * + * @param num_items + * Total number of input items + */ + __device__ __forceinline__ AgentSelectIf(TempStorage &temp_storage, + InputIteratorT d_in, + FlagsInputIteratorT d_flags_in, + SelectedOutputIteratorT d_selected_out, + SelectOpT select_op, + EqualityOpT equality_op, + OffsetT num_items) + : temp_storage(temp_storage.Alias()) + , d_in(d_in) + , d_selected_out(d_selected_out) + , d_flags_in(d_flags_in) + , inequality_op(equality_op) + , select_op(select_op) + , num_items(num_items) {} - //--------------------------------------------------------------------- // Utility methods for initializing the selections //--------------------------------------------------------------------- @@ -401,20 +453,33 @@ struct AgentSelectIf } } - /** - * Scatter flagged items to output offsets (specialized for two-phase scattering) + * @brief Scatter flagged items to output offsets (specialized for two-phase scattering) + * + * @param num_tile_items + * Number of valid items in this tile + * + * @param num_tile_selections + * Number of selections in this tile + * + * @param num_selections_prefix + * Total number of selections prior to this tile + * + * @param num_rejected_prefix + * Total number of rejections prior to this tile + * + * @param is_keep_rejects + * Marker type indicating whether to keep rejected items in the second partition */ template - __device__ __forceinline__ void ScatterTwoPhase( - InputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - OffsetT (&selection_indices)[ITEMS_PER_THREAD], - int /*num_tile_items*/, ///< Number of valid items 
in this tile - int num_tile_selections, ///< Number of selections in this tile - OffsetT num_selections_prefix, ///< Total number of selections prior to this tile - OffsetT /*num_rejected_prefix*/, ///< Total number of rejections prior to this tile - Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition + __device__ __forceinline__ void ScatterTwoPhase(InputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int /*num_tile_items*/, + int num_tile_selections, + OffsetT num_selections_prefix, + OffsetT /*num_rejected_prefix*/, + Int2Type /*is_keep_rejects*/) { CTA_SYNC(); @@ -437,20 +502,33 @@ struct AgentSelectIf } } - /** - * Scatter flagged items to output offsets (specialized for two-phase scattering) + * @brief Scatter flagged items to output offsets (specialized for two-phase scattering) + * + * @param num_tile_items + * Number of valid items in this tile + * + * @param num_tile_selections + * Number of selections in this tile + * + * @param num_selections_prefix + * Total number of selections prior to this tile + * + * @param num_rejected_prefix + * Total number of rejections prior to this tile + * + * @param is_keep_rejects + * Marker type indicating whether to keep rejected items in the second partition */ template - __device__ __forceinline__ void ScatterTwoPhase( - InputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - OffsetT (&selection_indices)[ITEMS_PER_THREAD], - int num_tile_items, ///< Number of valid items in this tile - int num_tile_selections, ///< Number of selections in this tile - OffsetT num_selections_prefix, ///< Total number of selections prior to this tile - OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile - Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition + __device__ __forceinline__ void ScatterTwoPhase(InputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int num_tile_items, + int num_tile_selections, + OffsetT num_selections_prefix, + OffsetT num_rejected_prefix, + Int2Type /*is_keep_rejects*/) { CTA_SYNC(); @@ -492,20 +570,33 @@ struct AgentSelectIf } } - /** - * Scatter flagged items + * @brief Scatter flagged items + * + * @param num_tile_items + * Number of valid items in this tile + * + * @param num_tile_selections + * Number of selections in this tile + * + * @param num_selections_prefix + * Total number of selections prior to this tile + * + * @param num_rejected_prefix + * Total number of rejections prior to this tile + * + * @param num_selections + * Total number of selections including this tile */ template - __device__ __forceinline__ void Scatter( - InputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - OffsetT (&selection_indices)[ITEMS_PER_THREAD], - int num_tile_items, ///< Number of valid items in this tile - int num_tile_selections, ///< Number of selections in this tile - OffsetT num_selections_prefix, ///< Total number of selections prior to this tile - OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile - OffsetT num_selections) ///< Total number of selections including this tile + __device__ __forceinline__ void Scatter(InputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT 
(&selection_indices)[ITEMS_PER_THREAD], + int num_tile_items, + int num_tile_selections, + OffsetT num_selections_prefix, + OffsetT num_rejected_prefix, + OffsetT num_selections) { // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS))) @@ -536,13 +627,23 @@ struct AgentSelectIf /** - * Process first tile of input (dynamic chained scan). Returns the running count of selections (including this tile) + * @brief Process first tile of input (dynamic chained scan). + * + * @param num_tile_items + * Number of input items comprising this tile + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor + * + * @return The running count of selections (including this tile) */ template - __device__ __forceinline__ OffsetT ConsumeFirstTile( - int num_tile_items, ///< Number of input items comprising this tile - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor + __device__ __forceinline__ OffsetT ConsumeFirstTile(int num_tile_items, + OffsetT tile_offset, + ScanTileStateT &tile_state) { InputT items[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; @@ -593,16 +694,28 @@ struct AgentSelectIf return num_tile_selections; } - /** - * Process subsequent tile of input (dynamic chained scan). Returns the running count of selections (including this tile) + * @brief Process subsequent tile of input (dynamic chained scan). + * + * @param num_tile_items + * Number of input items comprising this tile + * + * @param tile_idx + * Tile index + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor + * + * @return The running count of selections (including this tile) */ template - __device__ __forceinline__ OffsetT ConsumeSubsequentTile( - int num_tile_items, ///< Number of input items comprising this tile - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor + __device__ __forceinline__ OffsetT ConsumeSubsequentTile(int num_tile_items, + int tile_idx, + OffsetT tile_offset, + ScanTileStateT &tile_state) { InputT items[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; @@ -657,14 +770,23 @@ struct AgentSelectIf /** - * Process a tile of input + * @brief Process a tile of input + * + * @param num_tile_items + * Number of input items comprising this tile + * + * @param tile_idx + * Tile index + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor */ template - __device__ __forceinline__ OffsetT ConsumeTile( - int num_tile_items, ///< Number of input items comprising this tile - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor + __device__ __forceinline__ OffsetT + ConsumeTile(int num_tile_items, int tile_idx, OffsetT tile_offset, ScanTileStateT &tile_state) { OffsetT num_selections; if (tile_idx == 0) @@ -679,19 +801,29 @@ struct AgentSelectIf return num_selections; } - /** - * Scan tiles of items as part of a dynamic chained scan + * @brief Scan tiles of items as part of a dynamic chained scan + * + * @param num_tiles + * Total number of input tiles + * + * @param tile_state + * Global tile state descriptor + * + * @param d_num_selected_out + * Output total 
number selection_flags + * + * @tparam NumSelectedIteratorT + * Output iterator type for recording number of items selection_flags */ - template ///< Output iterator type for recording number of items selection_flags - __device__ __forceinline__ void ConsumeRange( - int num_tiles, ///< Total number of input tiles - ScanTileStateT& tile_state, ///< Global tile state descriptor - NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags + template + __device__ __forceinline__ void ConsumeRange(int num_tiles, + ScanTileStateT &tile_state, + NumSelectedIteratorT d_num_selected_out) { // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile if (tile_idx < num_tiles - 1) { diff --git a/cub/cub/agent/agent_spmv_orig.cuh b/cub/cub/agent/agent_spmv_orig.cuh index 4d9ce6ce582..4db94ed6693 100644 --- a/cub/cub/agent/agent_spmv_orig.cuh +++ b/cub/cub/agent/agent_spmv_orig.cuh @@ -60,69 +60,155 @@ CUB_NAMESPACE_BEGIN ******************************************************************************/ /** - * Parameterizable tuning policy type for AgentSpmv + * @param Parameterizable tuning policy type for AgentSpmv + * + * @tparam _BLOCK_THREADS + * Threads per thread block + * + * @tparam _ITEMS_PER_THREAD + * Items per thread (per tile of input) + * + * @tparam _ROW_OFFSETS_SEARCH_LOAD_MODIFIER + * Cache load modifier for reading CSR row-offsets during search + * + * @tparam _ROW_OFFSETS_LOAD_MODIFIER + * Cache load modifier for reading CSR row-offsets + * + * @tparam _COLUMN_INDICES_LOAD_MODIFIER + * Cache load modifier for reading CSR column-indices + * + * @tparam _VALUES_LOAD_MODIFIER + * Cache load modifier for reading CSR values + * + * @tparam _VECTOR_VALUES_LOAD_MODIFIER + * Cache load modifier for reading vector values + * + * @tparam _DIRECT_LOAD_NONZEROS + * Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through + * shared memory) + * + * @tparam _SCAN_ALGORITHM + * The BlockScan algorithm to use */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - CacheLoadModifier _ROW_OFFSETS_SEARCH_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets during search - CacheLoadModifier _ROW_OFFSETS_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets - CacheLoadModifier _COLUMN_INDICES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR column-indices - CacheLoadModifier _VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR values - CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading vector values - bool _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. 
pre-staged through shared memory) - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +template struct AgentSpmvPolicy { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory) - }; + enum + { + /// Threads per thread block + BLOCK_THREADS = _BLOCK_THREADS, - static constexpr CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets - static constexpr CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets - static constexpr CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR column-indices - static constexpr CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR values - static constexpr CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading vector values - static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use + /// Items per thread (per tile of input) + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, -}; + /// Whether to load nonzeros directly from global during sequential merging (pre-staged through + /// shared memory) + DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, + }; + + /// Cache load modifier for reading CSR row-offsets + static constexpr CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = + _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; + + /// Cache load modifier for reading CSR row-offsets + static constexpr CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; + /// Cache load modifier for reading CSR column-indices + static constexpr CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; + + /// Cache load modifier for reading CSR values + static constexpr CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; + + /// Cache load modifier for reading vector values + static constexpr CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; + + /// The BlockScan algorithm to use + static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; +}; /****************************************************************************** * Thread block abstractions ******************************************************************************/ -template < - typename ValueT, ///< Matrix and vector value type - typename OffsetT> ///< Signed integer type for sequence offsets +/** + * @tparam ValueT + * Matrix and vector value type + * + * @tparam OffsetT + * Signed integer type for sequence offsets + */ +template struct SpmvParams { - const ValueT* d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. - const OffsetT* d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values - const OffsetT* d_column_indices; ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) 
- const ValueT* d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector x - ValueT* d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector y - int num_rows; ///< Number of rows of matrix A. - int num_cols; ///< Number of columns of matrix A. - int num_nonzeros; ///< Number of nonzero elements of matrix A. - ValueT alpha; ///< Alpha multiplicand - ValueT beta; ///< Beta addend-multiplicand -}; + /// Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix + /// A. + const ValueT *d_values; + + /// Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices + /// and \p d_values + const OffsetT *d_row_end_offsets; + + /// Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements + /// of matrix A. (Indices are zero-valued.) + const OffsetT *d_column_indices; + + /// Pointer to the array of \p num_cols values corresponding to the dense input vector x + const ValueT *d_vector_x; + + /// Pointer to the array of \p num_rows values corresponding to the dense output vector y + ValueT *d_vector_y; + + /// Number of rows of matrix A. + int num_rows; + + /// Number of columns of matrix A. + int num_cols; + + /// Number of nonzero elements of matrix A. + int num_nonzeros; + /// Alpha multiplicand + ValueT alpha; + + /// Beta addend-multiplicand + ValueT beta; +}; /** - * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. + * @brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. + * + * @tparam AgentSpmvPolicyT + * Parameterized AgentSpmvPolicy tuning policy type + * + * @tparam ValueT + * Matrix and vector value type + * + * @tparam OffsetT + * Signed integer type for sequence offsets + * + * @tparam HAS_ALPHA + * Whether the input parameter \p alpha is 1 + * + * @tparam HAS_BETA + * Whether the input parameter \p beta is 0 + * + * @tparam LEGACY_PTX_ARCH + * PTX compute capability (unused) */ -template < - typename AgentSpmvPolicyT, ///< Parameterized AgentSpmvPolicy tuning policy type - typename ValueT, ///< Matrix and vector value type - typename OffsetT, ///< Signed integer type for sequence offsets - bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1 - bool HAS_BETA, ///< Whether the input parameter \p beta is 0 - int LEGACY_PTX_ARCH = 0> ///< PTX compute capability (unused) +template struct AgentSpmv { //--------------------------------------------------------------------- @@ -252,49 +338,66 @@ struct AgentSpmv // Per-thread fields //--------------------------------------------------------------------- + /// Reference to temp_storage + _TempStorage &temp_storage; + + SpmvParams &spmv_params; + + /// Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements + /// of matrix A. + ValueIteratorT wd_values; - _TempStorage& temp_storage; /// Reference to temp_storage + /// Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p + /// d_column_indices and \p d_values + RowOffsetsIteratorT wd_row_end_offsets; - SpmvParams& spmv_params; + /// Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero + /// elements of matrix A. (Indices are zero-valued.) 
+ ColumnIndicesIteratorT wd_column_indices; - ValueIteratorT wd_values; ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. - RowOffsetsIteratorT wd_row_end_offsets; ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values - ColumnIndicesIteratorT wd_column_indices; ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) - VectorValueIteratorT wd_vector_x; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x - VectorValueIteratorT wd_vector_y; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x + /// Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector + /// x + VectorValueIteratorT wd_vector_x; + /// Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector + /// x + VectorValueIteratorT wd_vector_y; //--------------------------------------------------------------------- // Interface //--------------------------------------------------------------------- /** - * Constructor + * @param temp_storage + * Reference to temp_storage + * + * @param spmv_params + * SpMV input parameter bundle */ - __device__ __forceinline__ AgentSpmv( - TempStorage& temp_storage, ///< Reference to temp_storage - SpmvParams& spmv_params) ///< SpMV input parameter bundle - : - temp_storage(temp_storage.Alias()), - spmv_params(spmv_params), - wd_values(spmv_params.d_values), - wd_row_end_offsets(spmv_params.d_row_end_offsets), - wd_column_indices(spmv_params.d_column_indices), - wd_vector_x(spmv_params.d_vector_x), - wd_vector_y(spmv_params.d_vector_y) + __device__ __forceinline__ AgentSpmv(TempStorage &temp_storage, + SpmvParams &spmv_params) + : temp_storage(temp_storage.Alias()) + , spmv_params(spmv_params) + , wd_values(spmv_params.d_values) + , wd_row_end_offsets(spmv_params.d_row_end_offsets) + , wd_column_indices(spmv_params.d_column_indices) + , wd_vector_x(spmv_params.d_vector_x) + , wd_vector_y(spmv_params.d_vector_y) {} /** - * Consume a merge tile, specialized for direct-load of nonzeros + * @brief Consume a merge tile, specialized for direct-load of nonzeros + * + * @param is_direct_load + * Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch */ - __device__ __forceinline__ KeyValuePairT ConsumeTile( - int tile_idx, - CoordinateT tile_start_coord, - CoordinateT tile_end_coord, - Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch + __device__ __forceinline__ KeyValuePairT ConsumeTile(int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + Int2Type is_direct_load) { int tile_num_rows = tile_end_coord.x - tile_start_coord.x; int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; @@ -413,13 +516,15 @@ struct AgentSpmv /** - * Consume a merge tile, specialized for indirect load of nonzeros + * @brief Consume a merge tile, specialized for indirect load of nonzeros + * + * @param is_direct_load + * Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch */ - __device__ __forceinline__ KeyValuePairT ConsumeTile( - int tile_idx, - CoordinateT tile_start_coord, - CoordinateT tile_end_coord, - Int2Type is_direct_load) ///< Marker type 
indicating whether to load nonzeros directly during path-discovery or beforehand in batch + __device__ __forceinline__ KeyValuePairT ConsumeTile(int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + Int2Type is_direct_load) { int tile_num_rows = tile_end_coord.x - tile_start_coord.x; int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; @@ -601,12 +706,20 @@ struct AgentSpmv /** - * Consume input tile + * @brief Consume input tile + * + * @param[in] d_tile_coordinates + * Pointer to the temporary array of tile starting coordinates + * + * @param[out] d_tile_carry_pairs + * Pointer to the temporary array carry-out dot product row-ids, one per block + * + * @param[in] num_merge_tiles + * Number of merge tiles */ - __device__ __forceinline__ void ConsumeTile( - CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates - KeyValuePairT* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block - int num_merge_tiles) ///< [in] Number of merge tiles + __device__ __forceinline__ void ConsumeTile(CoordinateT *d_tile_coordinates, + KeyValuePairT *d_tile_carry_pairs, + int num_merge_tiles) { int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index diff --git a/cub/cub/agent/agent_unique_by_key.cuh b/cub/cub/agent/agent_unique_by_key.cuh index a7b9e0a367b..0124759e192 100644 --- a/cub/cub/agent/agent_unique_by_key.cuh +++ b/cub/cub/agent/agent_unique_by_key.cuh @@ -26,7 +26,7 @@ ******************************************************************************/ /** - * \file + * @file * cub::AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide unique-by-key. */ @@ -93,18 +93,38 @@ struct AgentUniqueByKeyPolicy * Thread block abstractions ******************************************************************************/ - /** - * \brief AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide unique-by-key + * @brief AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating + * in device-wide unique-by-key + * + * @tparam AgentUniqueByKeyPolicyT + * Parameterized AgentUniqueByKeyPolicy tuning policy type + * + * @tparam KeyInputIteratorT + * Random-access input iterator type for keys + * + * @tparam ValueInputIteratorT + * Random-access input iterator type for values + * + * @tparam KeyOutputIteratorT + * Random-access output iterator type for keys + * + * @tparam ValueOutputIteratorT + * Random-access output iterator type for values + * + * @tparam EqualityOpT + * Equality operator type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename AgentUniqueByKeyPolicyT, ///< Parameterized AgentUniqueByKeyPolicy tuning policy type - typename KeyInputIteratorT, ///< Random-access input iterator type for keys - typename ValueInputIteratorT, ///< Random-access input iterator type for values - typename KeyOutputIteratorT, ///< Random-access output iterator type for keys - typename ValueOutputIteratorT, ///< Random-access output iterator type for values - typename EqualityOpT, ///< Equality operator type - typename OffsetT> ///< Signed integer type for global offsets +template struct AgentUniqueByKey { //--------------------------------------------------------------------- @@ -295,15 +315,24 @@ struct AgentUniqueByKey // Cooperatively scan a device-wide sequence of tiles with other CTAs 
//--------------------------------------------------------------------- - /** - * Process first tile of input (dynamic chained scan). Returns the running count of selections (including this tile) + * @brief Process first tile of input (dynamic chained scan). + * + * @param num_tile_items + * Number of input items comprising this tile + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor + * + * @return The running count of selections (including this tile) */ template - __device__ __forceinline__ OffsetT ConsumeFirstTile( - int num_tile_items, ///< Number of input items comprising this tile - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor + __device__ __forceinline__ OffsetT ConsumeFirstTile(int num_tile_items, + OffsetT tile_offset, + ScanTileStateT &tile_state) { KeyT keys[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; @@ -411,14 +440,27 @@ struct AgentUniqueByKey } /** - * Process subsequent tile of input (dynamic chained scan). Returns the running count of selections (including this tile) + * @brief Process subsequent tile of input (dynamic chained scan). + * + * @param num_tile_items + * Number of input items comprising this tile + * + * @param tile_idx + * Tile index + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor + * + * @return Returns the running count of selections (including this tile) */ template - __device__ __forceinline__ OffsetT ConsumeSubsequentTile( - int num_tile_items, ///< Number of input items comprising this tile - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor + __device__ __forceinline__ OffsetT ConsumeSubsequentTile(int num_tile_items, + int tile_idx, + OffsetT tile_offset, + ScanTileStateT &tile_state) { KeyT keys[ITEMS_PER_THREAD]; OffsetT selection_flags[ITEMS_PER_THREAD]; @@ -527,16 +569,24 @@ struct AgentUniqueByKey return num_selections; } - /** - * Process a tile of input + * @brief Process a tile of input + * + * @param num_tile_items + * Number of input items comprising this tile + * + * @param tile_idx + * Tile index + * + * @param tile_offset + * Tile offset + * + * @param tile_state + * Global tile state descriptor */ template - __device__ __forceinline__ OffsetT ConsumeTile( - int num_tile_items, ///< Number of input items comprising this tile - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor + __device__ __forceinline__ OffsetT + ConsumeTile(int num_tile_items, int tile_idx, OffsetT tile_offset, ScanTileStateT &tile_state) { OffsetT num_selections; if (tile_idx == 0) @@ -552,13 +602,25 @@ struct AgentUniqueByKey } /** - * Scan tiles of items as part of a dynamic chained scan + * @brief Scan tiles of items as part of a dynamic chained scan + * + * @param num_tiles + * Total number of input tiles + * + * @param tile_state + * Global tile state descriptor + * + * @param d_num_selected_out + * Output total number selection_flags + * + * @tparam NumSelectedIteratorT + * Output iterator type for recording number of items selection_flags + * */ - template ///< Output iterator type for recording number of items selection_flags - __device__ __forceinline__ void ConsumeRange( - int num_tiles, ///< Total number of input tiles - ScanTileStateT& tile_state, ///< Global tile state descriptor - NumSelectedIteratorT 
d_num_selected_out) ///< Output total number selection_flags + template + __device__ __forceinline__ void ConsumeRange(int num_tiles, + ScanTileStateT &tile_state, + NumSelectedIteratorT d_num_selected_out) { // Blocks are launched in increasing order, so just assign one tile per block int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index ff8a4e18347..26481b48356 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -65,43 +65,47 @@ CUB_NAMESPACE_BEGIN * Stateful callback operator type for supplying BlockScan prefixes. * Maintains a running prefix that can be applied to consecutive * BlockScan operations. + * + * @tparam T + * BlockScan value type + * + * @tparam ScanOpT + * Wrapped scan operator type */ -template < - typename T, ///< BlockScan value type - typename ScanOpT> ///< Wrapped scan operator type +template struct BlockScanRunningPrefixOp { - ScanOpT op; ///< Wrapped scan operator - T running_total; ///< Running block-wide prefix - - /// Constructor - __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op) - : - op(op) - {} - - /// Constructor - __device__ __forceinline__ BlockScanRunningPrefixOp( - T starting_prefix, - ScanOpT op) - : - op(op), - running_total(starting_prefix) - {} - - /** - * Prefix callback operator. Returns the block-wide running_total in thread-0. - */ - __device__ __forceinline__ T operator()( - const T &block_aggregate) ///< The aggregate sum of the BlockScan inputs - { - T retval = running_total; - running_total = op(running_total, block_aggregate); - return retval; - } + /// Wrapped scan operator + ScanOpT op; + + /// Running block-wide prefix + T running_total; + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op) + : op(op) + {} + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp(T starting_prefix, ScanOpT op) + : op(op) + , running_total(starting_prefix) + {} + + /** + * Prefix callback operator. Returns the block-wide running_total in thread-0. + * + * @param block_aggregate + * The aggregate sum of the BlockScan inputs + */ + __device__ __forceinline__ T operator()(const T &block_aggregate) + { + T retval = running_total; + running_total = op(running_total, block_aggregate); + return retval; + } }; - /****************************************************************************** * Generic tile status interface types for block-cooperative scans ******************************************************************************/ @@ -534,32 +538,45 @@ struct ScanTileState d_tile_descriptors(NULL) {} - - /// Initializer - __host__ __device__ __forceinline__ - cudaError_t Init( - int /*num_tiles*/, ///< [in] Number of tiles - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation + /** + * @brief Initializer + * + * @param[in] num_tiles + * Number of tiles + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to \p temp_storage_bytes and no work is + * done. 
+ * + * @param[in] temp_storage_bytes + * Size in bytes of \t d_temp_storage allocation + */ + __host__ __device__ __forceinline__ cudaError_t Init(int /*num_tiles*/, + void *d_temp_storage, + size_t /*temp_storage_bytes*/) { - d_tile_descriptors = reinterpret_cast(d_temp_storage); + d_tile_descriptors = reinterpret_cast(d_temp_storage); return cudaSuccess; } - /** - * Compute device memory needed for tile status + * @brief Compute device memory needed for tile status + * + * @param[in] num_tiles + * Number of tiles + * + * @param[out] temp_storage_bytes + * Size in bytes of \t d_temp_storage allocation */ - __host__ __device__ __forceinline__ - static cudaError_t AllocationSize( - int num_tiles, ///< [in] Number of tiles - size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + __host__ __device__ __forceinline__ static cudaError_t + AllocationSize(int num_tiles, size_t &temp_storage_bytes) { - temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord); // bytes needed for tile status descriptors + // bytes needed for tile status descriptors + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord); return cudaSuccess; } - /** * Initialize (from device) */ @@ -689,13 +706,24 @@ struct ScanTileState d_tile_inclusive(NULL) {} - + /** + * @brief Initializer + * + * @param[in] num_tiles + * Number of tiles + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to \p temp_storage_bytes and no work is + * done. + * + * @param[in] temp_storage_bytes + * Size in bytes of \t d_temp_storage allocation + */ /// Initializer - __host__ __device__ __forceinline__ - cudaError_t Init( - int num_tiles, ///< [in] Number of tiles - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation + __host__ __device__ __forceinline__ cudaError_t Init(int num_tiles, + void *d_temp_storage, + size_t temp_storage_bytes) { cudaError_t error = cudaSuccess; do @@ -703,9 +731,14 @@ struct ScanTileState void* allocations[3] = {}; size_t allocation_sizes[3]; - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + // bytes needed for tile status descriptors + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); + + // bytes needed for partials + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); + + // bytes needed for inclusives + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // Compute allocation pointers into the single storage blob error = CubDebug( @@ -726,20 +759,29 @@ struct ScanTileState return error; } - /** - * Compute device memory needed for tile status + * @brief Compute device memory needed for tile status + * + * @param[in] num_tiles + * Number of tiles + * + * @param[out] temp_storage_bytes + * Size in bytes of \t d_temp_storage allocation */ - __host__ __device__ __forceinline__ - static cudaError_t AllocationSize( - int num_tiles, ///< [in] Number of tiles - size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + __host__ __device__ __forceinline__ static cudaError_t + AllocationSize(int num_tiles, size_t &temp_storage_bytes) { // Specify storage allocation requirements size_t allocation_sizes[3]; - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + + // bytes needed for tile status descriptors + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); + + // bytes needed for partials + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); + + // bytes needed for inclusives + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // Set the necessary size of the blob void* allocations[3] = {}; @@ -928,32 +970,44 @@ struct ReduceByKeyScanTileState d_tile_descriptors(NULL) {} - - /// Initializer - __host__ __device__ __forceinline__ - cudaError_t Init( - int /*num_tiles*/, ///< [in] Number of tiles - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation + /** + * @brief Initializer + * + * @param[in] num_tiles + * Number of tiles + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When NULL, the required allocation size + * is written to \p temp_storage_bytes and no work is done. 
+ * + * @param[in] temp_storage_bytes + * Size in bytes of \t d_temp_storage allocation + */ + __host__ __device__ __forceinline__ cudaError_t Init(int /*num_tiles*/, + void *d_temp_storage, + size_t /*temp_storage_bytes*/) { - d_tile_descriptors = reinterpret_cast(d_temp_storage); + d_tile_descriptors = reinterpret_cast(d_temp_storage); return cudaSuccess; } - /** - * Compute device memory needed for tile status + * @brief Compute device memory needed for tile status + * + * @param[in] num_tiles + * Number of tiles + * + * @param[out] temp_storage_bytes + * Size in bytes of \t d_temp_storage allocation */ - __host__ __device__ __forceinline__ - static cudaError_t AllocationSize( - int num_tiles, ///< [in] Number of tiles - size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + __host__ __device__ __forceinline__ static cudaError_t + AllocationSize(int num_tiles, size_t &temp_storage_bytes) { - temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord); // bytes needed for tile status descriptors + // bytes needed for tile status descriptors + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TxnWord); return cudaSuccess; } - /** * Initialize (from device) */ @@ -1096,12 +1150,12 @@ struct TilePrefixCallbackOp typedef typename ScanTileStateT::StatusWord StatusWord; // Fields - _TempStorage& temp_storage; ///< Reference to a warp-reduction instance - ScanTileStateT& tile_status; ///< Interface to tile status - ScanOpT scan_op; ///< Binary scan operator - int tile_idx; ///< The current tile index - T exclusive_prefix; ///< Exclusive prefix for the tile - T inclusive_prefix; ///< Inclusive prefix for the tile + _TempStorage &temp_storage; ///< Reference to a warp-reduction instance + ScanTileStateT &tile_status; ///< Interface to tile status + ScanOpT scan_op; ///< Binary scan operator + int tile_idx; ///< The current tile index + T exclusive_prefix; ///< Exclusive prefix for the tile + T inclusive_prefix; ///< Inclusive prefix for the tile // Constructs prefix functor for a given tile index. // Precondition: thread blocks processing all of the predecessor tiles were scheduled. 
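For reference, the AllocationSize / Init pair documented above follows CUB's usual two-phase temporary-storage pattern: query the size, let the caller allocate, then bind the state object to the allocation. A minimal host-side sketch, assuming a ScanTileState<int> specialization and a caller-supplied num_tiles (illustration only, not taken from the patched sources):

    #include <cub/agent/single_pass_scan_operators.cuh>
    #include <cuda_runtime.h>

    // Sketch: prepare tile-status storage for a decoupled look-back scan.
    inline cudaError_t PrepareTileState(cub::ScanTileState<int> &tile_state,
                                        int num_tiles,
                                        void *&d_temp_storage,
                                        size_t &temp_storage_bytes)
    {
      // First pass: compute how many bytes the tile status descriptors need
      cudaError_t error =
        cub::ScanTileState<int>::AllocationSize(num_tiles, temp_storage_bytes);
      if (error != cudaSuccess)
      {
        return error;
      }

      // Caller-owned device allocation (freed by the caller once the scan completes)
      error = cudaMalloc(&d_temp_storage, temp_storage_bytes);
      if (error != cudaSuccess)
      {
        return error;
      }

      // Second pass: bind the tile state to the allocation; the descriptors themselves
      // are reset separately on the device before the scan runs.
      return tile_state.Init(num_tiles, d_temp_storage, temp_storage_bytes);
    }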
@@ -1123,14 +1177,23 @@ struct TilePrefixCallbackOp : TilePrefixCallbackOp(tile_status, temp_storage, scan_op, blockIdx.x) {} - // Block until all predecessors within the warp-wide window have non-invalid status + /** + * @brief Block until all predecessors within the warp-wide window have non-invalid status + * + * @param predecessor_idx + * Preceding tile index to inspect + * + * @param[out] predecessor_status + * Preceding tile status + * + * @param[out] window_aggregate + * Relevant partial reduction from this window of preceding tiles + */ template > - __device__ __forceinline__ - void ProcessWindow( - int predecessor_idx, ///< Preceding tile index to inspect - StatusWord &predecessor_status, ///< [out] Preceding tile status - T &window_aggregate, ///< [out] Relevant partial reduction from this window of preceding tiles - DelayT delay = {}) + __device__ __forceinline__ void ProcessWindow(int predecessor_idx, + StatusWord &predecessor_status, + T &window_aggregate, + DelayT delay = {}) { T value; tile_status.WaitForValid(predecessor_idx, predecessor_status, value, delay); diff --git a/cub/cub/block/block_adjacent_difference.cuh b/cub/cub/block/block_adjacent_difference.cuh index b24b4bf5c5d..9dd1e096c1a 100644 --- a/cub/cub/block/block_adjacent_difference.cuh +++ b/cub/cub/block/block_adjacent_difference.cuh @@ -1069,18 +1069,26 @@ public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * + * @param[out] output + * Calling thread's discontinuity head_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[out] preds + * Calling thread's predecessor items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - CUB_DEPRECATED __device__ __forceinline__ void FlagHeads( - FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + CUB_DEPRECATED __device__ __forceinline__ void FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + T (&preds)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; @@ -1103,18 +1111,31 @@ public: } /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * + * @param[out] output + * Calling thread's discontinuity result + * + * @param[in] input + * Calling thread's input items + * + * @param[out] preds + * Calling thread's predecessor items + * + * @param[in] flag_op + * Binary boolean flag predicate + * + * @param[in] tile_predecessor_item + * [thread0 only] Item with which to compare the first tile item + * (input0 from thread0). 
*/ - template - CUB_DEPRECATED __device__ __forceinline__ void FlagHeads( - FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + template + CUB_DEPRECATED __device__ __forceinline__ void FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + T (&preds)[ITEMS_PER_THREAD], + FlagOp flag_op, + T tile_predecessor_item) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; @@ -1135,51 +1156,71 @@ public: #endif // DOXYGEN_SHOULD_SKIP_THIS /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * + * @param[out] output + * Calling thread's discontinuity result + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template - CUB_DEPRECATED __device__ __forceinline__ void - FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + CUB_DEPRECATED __device__ __forceinline__ void FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { T preds[ITEMS_PER_THREAD]; FlagHeads(output, input, preds, flag_op); } /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeads + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft instead. + * + * @param[out] output + * Calling thread's discontinuity result + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate + * + * @param[in] tile_predecessor_item + * [thread0 only] Item with which to compare the first tile item + * (input0 from thread0). */ - template - CUB_DEPRECATED __device__ __forceinline__ void - FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + template + CUB_DEPRECATED __device__ __forceinline__ void FlagHeads(FlagT (&output)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op, + T tile_predecessor_item) { T preds[ITEMS_PER_THREAD]; FlagHeads(output, input, preds, flag_op, tile_predecessor_item); } - /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagTails - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractRight instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagTails + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractRight instead. 
+ * + * @param output + * [out] Calling thread's discontinuity result + * + * @param input + * [in] Calling thread's input items + * + * @param flag_op + * [in] Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - CUB_DEPRECATED __device__ __forceinline__ void FlagTails( - FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + CUB_DEPRECATED __device__ __forceinline__ void FlagTails(FlagT (&output)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first item temp_storage.first_items[linear_tid] = input[0]; @@ -1199,20 +1240,29 @@ public: Iterate::FlagTails(linear_tid, output, input, flag_op); } - /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagTails - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractRight instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagTails + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractRight instead. + * + * @param[out] output + * Calling thread's discontinuity result + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate + * + * @param[in] tile_successor_item + * [threadBLOCK_THREADS-1 only] Item with which to compare + * the last tile item (inputITEMS_PER_THREAD-1 from + * threadBLOCK_THREADS-1). */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - CUB_DEPRECATED __device__ __forceinline__ void FlagTails( - FlagT (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity result - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + template + CUB_DEPRECATED __device__ __forceinline__ void FlagTails(FlagT (&output)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op, + T tile_successor_item) { // Share first item temp_storage.first_items[linear_tid] = input[0]; @@ -1234,21 +1284,29 @@ public: Iterate::FlagTails(linear_tid, output, input, flag_op); } - /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or - * cub::BlockAdjacentDifference::SubtractRight instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or + * cub::BlockAdjacentDifference::SubtractRight instead. 
+ * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + CUB_DEPRECATED __device__ __forceinline__ void + FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; @@ -1290,22 +1348,35 @@ public: Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } - /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or - * cub::BlockAdjacentDifference::SubtractRight instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or + * cub::BlockAdjacentDifference::SubtractRight instead. + * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] tile_successor_item + * [threadBLOCK_THREADS-1 only] Item with which to compare + * the last tile item (inputITEMS_PER_THREAD-1 from + * threadBLOCK_THREADS-1). + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + CUB_DEPRECATED __device__ __forceinline__ void + FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T tile_successor_item, + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; @@ -1349,20 +1420,33 @@ public: } /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or - * cub::BlockAdjacentDifference::SubtractRight instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or + * cub::BlockAdjacentDifference::SubtractRight instead. 
+ * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[in] tile_predecessor_item + * [thread0 only] Item with which to compare the first tile item + * (input0 from thread0). + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + CUB_DEPRECATED __device__ __forceinline__ void + FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + T tile_predecessor_item, + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; @@ -1399,23 +1483,40 @@ public: Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } - /** - * \deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails - * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or - * cub::BlockAdjacentDifference::SubtractRight instead. + * @deprecated [Since 1.14.0] The cub::BlockAdjacentDifference::FlagHeadsAndTails + * APIs are deprecated. Use cub::BlockAdjacentDifference::SubtractLeft or + * cub::BlockAdjacentDifference::SubtractRight instead. + * + * @param head_flags + * [out] Calling thread's discontinuity head_flags + * + * @param tile_predecessor_item + * [in] [thread0 only] Item with which to compare the first tile + * item (input0 from thread0). + * + * @param tail_flags + * [out] Calling thread's discontinuity tail_flags + * + * @param tile_successor_item + * [in] [threadBLOCK_THREADS-1 only] Item with which to + * compare the last tile item (inputITEMS_PER_THREAD-1 from + * threadBLOCK_THREADS-1). + * + * @param input + * [in] Calling thread's input items + * + * @param flag_op + * [in] Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - CUB_DEPRECATED __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). 
- T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + CUB_DEPRECATED __device__ __forceinline__ void + FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + T tile_predecessor_item, + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T tile_successor_item, + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; diff --git a/cub/cub/block/block_discontinuity.cuh b/cub/cub/block/block_discontinuity.cuh index b94e125e93c..af09e6d99c5 100644 --- a/cub/cub/block/block_discontinuity.cuh +++ b/cub/cub/block/block_discontinuity.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. + * @file + * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for + * flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. */ #pragma once @@ -47,32 +48,44 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png) - * \ingroup BlockModule + * @brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for + * flagging discontinuities within an ordered set of items partitioned across a CUDA thread + * block. ![](discont_logo.png) * - * \tparam T The data type to be flagged. - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. + * @ingroup BlockModule * - * \par Overview + * @tparam T + * The data type to be flagged. + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items * that differ from their predecessors (or successors). For example, head flags are convenient * for demarcating disjoint data segments as part of a segmented scan or reduction. * - \blocked * - * \par Performance Considerations + * @par Performance Considerations * - \granularity * - * \par A Simple Example + * @par A Simple Example * \blockcollective{BlockDiscontinuity} - * \par + * @par * The code snippet below illustrates the head flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) 
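The flag predicate passed to FlagHeads in these snippets, cub::Inequality(), is just a binary functor that returns true at a discontinuity. A minimal stand-in with the same shape, shown only for illustration:

    // Illustrative flag predicate: reports a discontinuity whenever two
    // neighboring items differ; cub::Inequality provides the same behavior.
    struct NotEqual
    {
      template <typename T>
      __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
      {
        return a != b;
      }
    };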
@@ -91,21 +104,22 @@ CUB_NAMESPACE_BEGIN * int head_flags[4]; * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); * - * \endcode - * \par + * @endcode + * @par * Suppose the set of input \p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. * The corresponding output \p head_flags in those threads will be * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. * - * \par Performance Considerations + * @par Performance Considerations * - Incurs zero bank conflicts for most types * - * \par Re-using dynamically allocating shared memory + * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: - * example_block_reduce_dyn_smem.cu + * example_block_reduce_dyn_smem.cu * * This example can be easily adapted to the storage required by BlockDiscontinuity. */ @@ -176,17 +190,27 @@ private: /// Templated unrolling of item comparison (inductive case) struct Iterate { - // Head flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagHeads( - int linear_tid, - FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + /** + * @brief Head flags + * + * @param[out] flags + * Calling thread's discontinuity head_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[out] preds + * Calling thread's predecessor items + * + * @param[in] flag_op + * Binary boolean flag predicate + */ + template + static __device__ __forceinline__ void FlagHeads(int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + T (&preds)[ITEMS_PER_THREAD], + FlagOp flag_op) { #pragma unroll for (int i = 1; i < ITEMS_PER_THREAD; ++i) { @@ -199,16 +223,23 @@ private: } } - // Tail flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagTails( - int linear_tid, - FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + /** + * @brief Tail flags + * + * @param[out] flags + * Calling thread's discontinuity head_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate + */ + template + static __device__ __forceinline__ void FlagTails(int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { #pragma unroll for (int i = 0; i < ITEMS_PER_THREAD - 1; ++i) { @@ -235,17 +266,18 @@ private: public: - /// \smemstorage{BlockDiscontinuity} + /// @smemstorage{BlockDiscontinuity} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. 
+ * @brief Collective constructor using a private static allocation of shared memory as temporary + * storage. */ __device__ __forceinline__ BlockDiscontinuity() : @@ -253,18 +285,17 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. + * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockDiscontinuity( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockDiscontinuity(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - //@} end member group /******************************************************************//** * \name Head flag operations @@ -274,15 +305,24 @@ public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + /** + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[out] preds + * Calling thread's predecessor items + * + * @param[in] flag_op + * Binary boolean flag predicate + */ + template + __device__ __forceinline__ void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + T (&preds)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; @@ -304,16 +344,29 @@ public: Iterate::FlagHeads(linear_tid, head_flags, input, preds, flag_op); } - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + /** + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[out] preds + * Calling thread's predecessor items + * + * @param[in] flag_op + * Binary boolean flag predicate + * + * @param[in] tile_predecessor_item + * [thread0 only] Item with which to compare the first tile item + * (input0 from thread0). 
+ */ + template + __device__ __forceinline__ void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + T (&preds)[ITEMS_PER_THREAD], + FlagOp flag_op, + T tile_predecessor_item) { // Share last item temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; @@ -333,11 +386,11 @@ public: #endif // DOXYGEN_SHOULD_SKIP_THIS - /** - * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged. + * @brief Sets head flags indicating discontinuities between items partitioned across the thread + * block, for which the first item has no reference and is always flagged. * - * \par + * @par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) @@ -348,12 +401,12 @@ public: * - \granularity * - \smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the head-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -372,35 +425,49 @@ public: * int head_flags[4]; * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); * - * \endcode - * \par + * @endcode + * @par * Suppose the set of input \p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. * The corresponding output \p head_flags in those threads will be * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true + * if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank + * of b in the aggregate tile of data. 
+ * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + __device__ __forceinline__ void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { T preds[ITEMS_PER_THREAD]; FlagHeads(head_flags, input, preds, flag_op); } - /** - * \brief Sets head flags indicating discontinuities between items partitioned across the thread block. + * @brief Sets head flags indicating discontinuities between items partitioned across the thread + * block. * - * \par + * @par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) @@ -412,12 +479,12 @@ public: * - \granularity * - \smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the head-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -441,26 +508,44 @@ public: * BlockDiscontinuity(temp_storage).FlagHeads( * head_flags, thread_data, cub::Inequality(), tile_predecessor_item); * - * \endcode - * \par + * @endcode + * @par * Suppose the set of input \p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }, - * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those threads will be - * { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those + * threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), + * and returning \p true if a discontinuity exists between \p a and \p b, + * otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[in] input + * Calling thread's input items * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @param[in] flag_op + * Binary boolean flag predicate + * + * @param[in] tile_predecessor_item + * [thread0 only] Item with which to compare the first tile item + * (input0 from thread0). 
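The @p tile_predecessor_item overload above is what keeps head flags consistent across tile boundaries when each thread block processes one tile of a larger input. A minimal sketch of that pattern (illustrative only, not part of this header; d_in, d_head_flags and the full-tile assumption are hypothetical):

    #include <cub/cub.cuh>

    // Sketch: per-tile head flags that remain correct across tile boundaries.
    // Assumes a grid of full tiles; d_in / d_head_flags are placeholder names.
    __global__ void FlagTileHeads(const int *d_in, int *d_head_flags)
    {
        constexpr int BLOCK_THREADS    = 128;
        constexpr int ITEMS_PER_THREAD = 4;
        constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

        using BlockDiscontinuityT = cub::BlockDiscontinuity<int, BLOCK_THREADS>;
        __shared__ typename BlockDiscontinuityT::TempStorage temp_storage;

        int tile_base = blockIdx.x * TILE_ITEMS;

        // Blocked arrangement: thread t owns items [t * 4, t * 4 + 4)
        int thread_data[ITEMS_PER_THREAD];
        cub::LoadDirectBlocked(threadIdx.x, d_in + tile_base, thread_data);

        int head_flags[ITEMS_PER_THREAD];
        if (tile_base == 0)
        {
            // First tile has no predecessor: the first item is always flagged.
            BlockDiscontinuityT(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
        }
        else
        {
            // thread0 compares its first item against the last item of the previous tile.
            int tile_predecessor_item = d_in[tile_base - 1];
            BlockDiscontinuityT(temp_storage).FlagHeads(
                head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
        }

        cub::StoreDirectBlocked(threadIdx.x, d_head_flags + tile_base, head_flags);
    }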
*/ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + template + __device__ __forceinline__ void FlagHeads(FlagT (&head_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op, + T tile_predecessor_item) { T preds[ITEMS_PER_THREAD]; FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); @@ -470,15 +555,15 @@ public: //@} end member group /******************************************************************//** - * \name Tail flag operations + * @name Tail flag operations *********************************************************************/ //@{ - /** - * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged. + * @brief Sets tail flags indicating discontinuities between items partitioned across the thread + * block, for which the last item has no reference and is always flagged. * - * \par + * @par * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) @@ -486,16 +571,16 @@ public: * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is always flagged. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -514,25 +599,39 @@ public: * int tail_flags[4]; * BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }. - * The corresponding output \p tail_flags in those threads will be + * The corresponding output @p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. 
+ * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true + * if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the + * rank of b in the aggregate tile of data. + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagTails( - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + __device__ __forceinline__ void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first item temp_storage.first_items[linear_tid] = input[0]; @@ -552,29 +651,29 @@ public: Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } - /** - * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block. + * @brief Sets tail flags indicating discontinuities between items partitioned across the thread + * block. * - * \par + * @par * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item + * returns @p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is compared - * against \p tile_successor_item. + * against @p tile_successor_item. * - \blocked * - \granularity * - \smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -598,26 +697,45 @@ public: * BlockDiscontinuity(temp_storage).FlagTails( * tail_flags, thread_data, cub::Inequality(), tile_successor_item); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } - * and that \p tile_successor_item is \p 125. The corresponding output \p tail_flags in those threads will be - * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * and that @p tile_successor_item is @p 125. The corresponding output @p tail_flags in those + * threads will be { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true + * if a discontinuity exists between @p a and @p b, otherwise @p false. 
@p b_index is the + * rank of b in the aggregate tile of data. + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate + * + * @param[in] tile_successor_item + * [threadBLOCK_THREADS-1 only] Item with which to + * compare the last tile item (inputITEMS_PER_THREAD-1 from + * threadBLOCK_THREADS-1). */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagTails( - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + template + __device__ __forceinline__ void FlagTails(FlagT (&tail_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op, + T tile_successor_item) { // Share first item temp_storage.first_items[linear_tid] = input[0]; @@ -642,25 +760,25 @@ public: //@} end member group /******************************************************************//** - * \name Head & tail flag operations + * @name Head & tail flag operations *********************************************************************/ //@{ - /** - * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * @brief Sets both head and tail flags indicating discontinuities between items partitioned + * across the thread block. * - * \par + * @par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item + * returns @p true (where previous-item is either the preceding item * in the same thread or the last item in the previous thread). * - For thread0, item input0 is always flagged. * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item + * returns @p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is always flagged. @@ -668,12 +786,12 @@ public: * - \granularity * - \smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the head- and tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) 
@@ -694,28 +812,45 @@ public: * BlockDiscontinuity(temp_storage).FlagTails( * head_flags, tail_flags, thread_data, cub::Inequality()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } - * and that the tile_successor_item is \p 125. The corresponding output \p head_flags + * and that the tile_successor_item is @p 125. The corresponding output @p head_flags * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * and the corresponding output \p tail_flags in those threads will be + * and the corresponding output @p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true + * if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the + * rank of b in the aggregate tile of data. + * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + __device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; @@ -757,35 +892,35 @@ public: Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } - /** - * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * @brief Sets both head and tail flags indicating discontinuities between items partitioned + * across the thread block. * - * \par + * @par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item + * returns @p true (where previous-item is either the preceding item * in the same thread or the last item in the previous thread). 
* - For thread0, item input0 is always flagged. * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item + * returns @p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is compared - * against \p tile_predecessor_item. + * against @p tile_predecessor_item. * - \blocked * - \granularity * - \smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the head- and tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -810,29 +945,51 @@ public: * BlockDiscontinuity(temp_storage).FlagTails( * head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } - * and that the tile_successor_item is \p 125. The corresponding output \p head_flags + * and that the tile_successor_item is @p 125. The corresponding output @p head_flags * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * and the corresponding output \p tail_flags in those threads will be + * and the corresponding output @p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true + * if a discontinuity exists between @p a and @p b, otherwise @p false. @p b_index is the + * rank of b in the aggregate tile of data. + * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] tile_successor_item + * [threadBLOCK_THREADS-1 only] Item with which to compare + * the last tile item (inputITEMS_PER_THREAD-1 from + * threadBLOCK_THREADS-1). 
+ * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + __device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T tile_successor_item, + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; @@ -875,22 +1032,22 @@ public: Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } - /** - * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * @brief Sets both head and tail flags indicating discontinuities between items partitioned + * across the thread block. * - * \par + * @par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item + * returns @p true (where previous-item is either the preceding item * in the same thread or the last item in the previous thread). * - For thread0, item input0 is compared - * against \p tile_predecessor_item. + * against @p tile_predecessor_item. * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item + * returns @p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is always flagged. @@ -898,12 +1055,12 @@ public: * - \granularity * - \smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the head- and tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -933,30 +1090,51 @@ public: * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, * thread_data, cub::Inequality()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, - * that the \p tile_predecessor_item is \p 0, and that the - * \p tile_successor_item is \p 125. The corresponding output \p head_flags + * that the @p tile_predecessor_item is @p 0, and that the + * @p tile_successor_item is @p 125. The corresponding output @p head_flags * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * and the corresponding output \p tail_flags in those threads will be + * and the corresponding output @p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. 
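As the class overview notes, head and tail flags usually feed a follow-up collective. A short sketch that stitches both tile edges (as in the snippet above) and then counts how many runs begin in the tile (illustrative only, not part of this header; d_in, d_num_runs and the interior-tile assumption are hypothetical):

    #include <cub/cub.cuh>

    // Sketch: stitched head/tail flags, then a per-tile run count.
    // Interior tiles only (neighboring-tile reads are unguarded); names are placeholders.
    __global__ void CountRunsKernel(const int *d_in, int *d_num_runs)
    {
        constexpr int BLOCK_THREADS    = 128;
        constexpr int ITEMS_PER_THREAD = 4;
        constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

        using DiscontinuityT = cub::BlockDiscontinuity<int, BLOCK_THREADS>;
        using ReduceT        = cub::BlockReduce<int, BLOCK_THREADS>;

        __shared__ union {
            typename DiscontinuityT::TempStorage discontinuity;
            typename ReduceT::TempStorage        reduce;
        } temp_storage;

        int tile_base = blockIdx.x * TILE_ITEMS;

        int items[ITEMS_PER_THREAD];
        cub::LoadDirectBlocked(threadIdx.x, d_in + tile_base, items);

        // Compare the tile edges against the neighboring tiles.
        int tile_predecessor_item = d_in[tile_base - 1];
        int tile_successor_item   = d_in[tile_base + TILE_ITEMS];

        int head_flags[ITEMS_PER_THREAD];
        int tail_flags[ITEMS_PER_THREAD];
        DiscontinuityT(temp_storage.discontinuity)
          .FlagHeadsAndTails(head_flags, tile_predecessor_item,
                             tail_flags, tile_successor_item,
                             items, cub::Inequality());

        // Each head flag marks the first item of a run, so their sum is the
        // number of runs that begin in this tile.
        int thread_heads = 0;
        #pragma unroll
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        {
            thread_heads += head_flags[i];
        }

        __syncthreads(); // required before re-using the union'd shared memory

        int tile_runs = ReduceT(temp_storage.reduce).Sum(thread_heads);
        if (threadIdx.x == 0)
        {
            atomicAdd(d_num_runs, tile_runs);
        }
    }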
* - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true + * if a discontinuity exists between @p a and @p b, otherwise @p false. @p b_index is the rank + * of b in the aggregate tile of data. + * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[in] tile_predecessor_item + * [thread0 only] Item with which to compare the first tile item + * (input0 from thread0). + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + __device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + T tile_predecessor_item, + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; @@ -993,36 +1171,36 @@ public: Iterate::FlagTails(linear_tid, tail_flags, input, flag_op); } - /** - * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * @brief Sets both head and tail flags indicating discontinuities between items partitioned + * across the thread block. * - * \par + * @par * - The flag head_flagsi is set for item * inputi when * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item + * returns @p true (where previous-item is either the preceding item * in the same thread or the last item in the previous thread). * - For thread0, item input0 is compared - * against \p tile_predecessor_item. + * against @p tile_predecessor_item. * - The flag tail_flagsi is set for item * inputi when * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item + * returns @p true (where next-item is either the next item * in the same thread or the first item in the next thread). * - For threadBLOCK_THREADS-1, item * inputITEMS_PER_THREAD-1 is compared - * against \p tile_successor_item. - * - \blocked - * - \granularity - * - \smemreuse + * against @p tile_successor_item. 
+ * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the head- and tail-flagging of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1052,31 +1230,57 @@ public: * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, * thread_data, cub::Inequality()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, - * that the \p tile_predecessor_item is \p 0, and that the - * \p tile_successor_item is \p 125. The corresponding output \p head_flags + * that the @p tile_predecessor_item is @p 0, and that the + * @p tile_successor_item is @p 125. The corresponding output @p head_flags * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * and the corresponding output \p tail_flags in those threads will be + * and the corresponding output @p tail_flags in those threads will be * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam FlagT + * [inferred] The flag type (must be an integer type) + * + * @tparam FlagOp + * [inferred] Binary predicate functor type having member + * T operator()(const T &a, const T &b) or member + * T operator()(const T &a, const T &b, unsigned int b_index), and returning @p true + * if a discontinuity exists between @p a and @p b, otherwise @p false. @p b_index is the rank + * of b in the aggregate tile of data. + * + * @param[out] head_flags + * Calling thread's discontinuity head_flags + * + * @param[in] tile_predecessor_item + * [thread0 only] Item with which to compare the first tile item + * (input0 from thread0). + * + * @param[out] tail_flags + * Calling thread's discontinuity tail_flags + * + * @param[in] tile_successor_item + * [threadBLOCK_THREADS-1 only] Item with which to compare + * the last tile item (inputITEMS_PER_THREAD-1 from + * threadBLOCK_THREADS-1). + * + * @param[in] input + * Calling thread's input items + * + * @param[in] flag_op + * Binary boolean flag predicate */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). 
- FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate + template + __device__ __forceinline__ void FlagHeadsAndTails(FlagT (&head_flags)[ITEMS_PER_THREAD], + T tile_predecessor_item, + FlagT (&tail_flags)[ITEMS_PER_THREAD], + T tile_successor_item, + T (&input)[ITEMS_PER_THREAD], + FlagOp flag_op) { // Share first and last items temp_storage.first_items[linear_tid] = input[0]; diff --git a/cub/cub/block/block_exchange.cuh b/cub/cub/block/block_exchange.cuh index d6a7cfbbbe8..259027dbc61 100644 --- a/cub/cub/block/block_exchange.cuh +++ b/cub/cub/block/block_exchange.cuh @@ -202,15 +202,20 @@ private: return private_storage; } - /** - * Transposes data items from blocked arrangement to striped arrangement. Specialized for no timeslicing. + * @brief Transposes data items from blocked arrangement to striped + * arrangement. Specialized for no timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. */ template - __device__ __forceinline__ void BlockedToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) + __device__ __forceinline__ void BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -232,15 +237,20 @@ private: } } - /** - * Transposes data items from blocked arrangement to striped arrangement. Specialized for warp-timeslicing. + * @brief Transposes data items from blocked arrangement to striped + * arrangement. Specialized for warp-timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. */ template - __device__ __forceinline__ void BlockedToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) + __device__ __forceinline__ void BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { InputT temp_items[ITEMS_PER_THREAD]; @@ -293,15 +303,20 @@ private: } } - /** - * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for no timeslicing + * @brief Transposes data items from blocked arrangement to warp-striped + * arrangement. Specialized for no timeslicing + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. 
*/ template - __device__ __forceinline__ void BlockedToWarpStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) + __device__ __forceinline__ void BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -324,13 +339,19 @@ private: } /** - * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for warp-timeslicing + * @brief Transposes data items from blocked arrangement to warp-striped + * arrangement. Specialized for warp-timeslicing + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. */ template - __device__ __forceinline__ void BlockedToWarpStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) + __device__ __forceinline__ void BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { if (warp_id == 0) { @@ -383,15 +404,20 @@ private: } } - /** - * Transposes data items from striped arrangement to blocked arrangement. Specialized for no timeslicing. + * @brief Transposes data items from striped arrangement to blocked + * arrangement. Specialized for no timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. */ template - __device__ __forceinline__ void StripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) + __device__ __forceinline__ void StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -414,15 +440,20 @@ private: } } - /** - * Transposes data items from striped arrangement to blocked arrangement. Specialized for warp-timeslicing. + * @brief Transposes data items from striped arrangement to blocked + * arrangement. Specialized for warp-timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. */ template - __device__ __forceinline__ void StripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- Int2Type /*time_slicing*/) + __device__ __forceinline__ void StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { // Warp time-slicing InputT temp_items[ITEMS_PER_THREAD]; @@ -477,15 +508,20 @@ private: } } - /** - * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for no timeslicing + * @brief Transposes data items from warp-striped arrangement to blocked + * arrangement. Specialized for no timeslicing + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. */ template - __device__ __forceinline__ void WarpStripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) + __device__ __forceinline__ void WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -508,15 +544,20 @@ private: } } - /** - * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for warp-timeslicing + * @brief Transposes data items from warp-striped arrangement to blocked + * arrangement. Specialized for warp-timeslicing + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. */ template - __device__ __forceinline__ void WarpStripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) + __device__ __forceinline__ void WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) @@ -547,16 +588,24 @@ private: } } - /** - * Exchanges data items annotated by rank into blocked arrangement. Specialized for no timeslicing. + * @brief Exchanges data items annotated by rank into blocked arrangement. Specialized + * for no timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks */ template - __device__ __forceinline__ void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) + __device__ __forceinline__ void ScatterToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -579,14 +628,23 @@ private: } /** - * Exchanges data items annotated by rank into blocked arrangement. Specialized for warp-timeslicing. + * @brief Exchanges data items annotated by rank into blocked arrangement. Specialized + * for warp-timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks */ template - __device__ __forceinline__ void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) + __device__ __forceinline__ void ScatterToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT ranks[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { InputT temp_items[ITEMS_PER_THREAD]; @@ -631,16 +689,24 @@ private: } } - /** - * Exchanges data items annotated by rank into striped arrangement. Specialized for no timeslicing. + * @brief Exchanges data items annotated by rank into striped arrangement. Specialized + * for no timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks */ template - __device__ __forceinline__ void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) + __device__ __forceinline__ void ScatterToStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -662,16 +728,24 @@ private: } } - /** - * Exchanges data items annotated by rank into striped arrangement. Specialized for warp-timeslicing. + * @brief Exchanges data items annotated by rank into striped arrangement. Specialized + * for warp-timeslicing. + * + * @param[in] input_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[out] output_items + * Items to exchange, converting between blocked and striped arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks */ template - __device__ __forceinline__ void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. 
- OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) + __device__ __forceinline__ void ScatterToStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD], + Int2Type /*time_slicing*/) { InputT temp_items[ITEMS_PER_THREAD]; @@ -728,12 +802,12 @@ private: public: /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockExchange() : @@ -744,38 +818,39 @@ public: warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. + * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockExchange( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), - lane_id(LaneId()), - warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), - warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + __device__ __forceinline__ BlockExchange(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + , lane_id(LaneId()) + , warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS) + , warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) {} //@} end member group /******************************************************************//** - * \name Structured exchanges + * @name Structured exchanges *********************************************************************/ //@{ /** - * \brief Transposes data items from striped arrangement to blocked arrangement. + * @brief Transposes data items from striped arrangement to blocked + * arrangement. * - * \par - * - \smemreuse + * @par + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -793,34 +868,38 @@ public: * // Collectively exchange data into a blocked arrangement across threads * BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); * - * \endcode - * \par - * Suppose the set of striped input \p thread_data across the block of threads is - * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from device-accessible memory. - * The corresponding output \p thread_data in those threads will be + * @endcode + * @par + * Suppose the set of striped input @p thread_data across the block of threads is + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from + * device-accessible memory. 
The corresponding output @p thread_data in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. * + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. */ template - __device__ __forceinline__ void StripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + __device__ __forceinline__ void StripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD]) { StripedToBlocked(input_items, output_items, Int2Type()); } - /** - * \brief Transposes data items from blocked arrangement to striped arrangement. + * @brief Transposes data items from blocked arrangement to striped + * arrangement. * - * \par - * - \smemreuse + * @par + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -841,36 +920,40 @@ public: * // Store data striped across block threads into an ordered tile * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); * - * \endcode - * \par - * Suppose the set of blocked input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of blocked input @p thread_data across the block of threads is * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * The corresponding output \p thread_data in those threads will be + * The corresponding output @p thread_data in those threads will be * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in * preparation for storing to device-accessible memory. * + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. */ template - __device__ __forceinline__ void BlockedToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + __device__ __forceinline__ void BlockedToStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD]) { BlockedToStriped(input_items, output_items, Int2Type()); } - - /** - * \brief Transposes data items from warp-striped arrangement to blocked arrangement. + * @brief Transposes data items from warp-striped arrangement to blocked + * arrangement. * - * \par - * - \smemreuse + * @par + * - @smemreuse * - * \par Snippet - * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code + * @par Snippet + * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" + * arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 + * items. 
+ * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -888,37 +971,41 @@ public: * // Collectively exchange data into a blocked arrangement across threads * BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); * - * \endcode - * \par - * Suppose the set of warp-striped input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of warp-striped input @p thread_data across the block of threads is * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } * after loading from device-accessible memory. (The first 128 items are striped across * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) - * The corresponding output \p thread_data in those threads will be + * The corresponding output @p thread_data in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. * + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. */ template - __device__ __forceinline__ void WarpStripedToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + __device__ __forceinline__ void WarpStripedToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD]) { WarpStripedToBlocked(input_items, output_items, Int2Type()); } - - /** - * \brief Transposes data items from blocked arrangement to warp-striped arrangement. + * @brief Transposes data items from blocked arrangement to warp-striped + * arrangement. * - * \par - * - \smemreuse + * @par + * - @smemreuse * - * \par Snippet - * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code + * @par Snippet + * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" + * arrangement of 512 integer items partitioned across 128 threads where each thread owns 4 + * items. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -939,20 +1026,25 @@ public: * // Store data striped across warp threads into an ordered tile * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); * - * \endcode - * \par - * Suppose the set of blocked input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of blocked input @p thread_data across the block of threads is * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * The corresponding output \p thread_data in those threads will be + * The corresponding output @p thread_data in those threads will be * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } - * in preparation for storing to device-accessible memory. (The first 128 items are striped across - * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + * in preparation for storing to device-accessible memory. (The first 128 items are striped + * across the first warp of 32 threads, the second 128 items are striped across the second warp, + * etc.) 
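The warp-striped layout described above reduces to simple index arithmetic: in a blocked arrangement, item i of thread t sits at offset t * ITEMS_PER_THREAD + i, while in a warp-striped arrangement it sits at warp * (WARP_THREADS * ITEMS_PER_THREAD) + i * WARP_THREADS + lane. The host-side sketch below is purely illustrative (constants mirror the 128-thread, 4-item configuration of the snippets above) and reproduces the values quoted in this passage; BlockedToWarpStriped moves data from the first column to the second, and WarpStripedToBlocked is its inverse.

#include <cstdio>

int main()
{
    const int ITEMS_PER_THREAD = 4;
    const int WARP_THREADS     = 32;
    const int WARP_ITEMS       = WARP_THREADS * ITEMS_PER_THREAD;  // items owned by one warp

    // Spot-check a few threads of a 128-thread block
    const int tids[] = {0, 1, 2, 127};
    for (int tid : tids)
    {
        int lane = tid % WARP_THREADS;
        int warp = tid / WARP_THREADS;

        std::printf("thread %3d  blocked:", tid);
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            std::printf(" %3d", tid * ITEMS_PER_THREAD + i);                   // blocked offset

        std::printf("   warp-striped:");
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            std::printf(" %3d", warp * WARP_ITEMS + i * WARP_THREADS + lane);  // warp-striped offset

        std::printf("\n");
    }
    return 0;
}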
+ * + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. * + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. */ template - __device__ __forceinline__ void BlockedToWarpStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + __device__ __forceinline__ void BlockedToWarpStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD]) { BlockedToWarpStriped(input_items, output_items, Int2Type()); } @@ -961,62 +1053,86 @@ public: //@} end member group /******************************************************************//** - * \name Scatter exchanges + * @name Scatter exchanges *********************************************************************/ //@{ - /** - * \brief Exchanges data items annotated by rank into blocked arrangement. + * @brief Exchanges data items annotated by rank into blocked arrangement. * - * \par - * - \smemreuse + * @par + * - @smemreuse * - * \tparam OffsetT [inferred] Signed integer type for local offsets + * @tparam OffsetT + * [inferred] Signed integer type for local offsets + * + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks */ template - __device__ __forceinline__ void ScatterToBlocked( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + __device__ __forceinline__ void ScatterToBlocked(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToBlocked(input_items, output_items, ranks, Int2Type()); } - - /** - * \brief Exchanges data items annotated by rank into striped arrangement. + * @brief Exchanges data items annotated by rank into striped arrangement. + * + * @par + * - @smemreuse * - * \par - * - \smemreuse + * @tparam OffsetT + * [inferred] Signed integer type for local offsets * - * \tparam OffsetT [inferred] Signed integer type for local offsets + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks */ template - __device__ __forceinline__ void ScatterToStriped( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. 
- OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + __device__ __forceinline__ void ScatterToStriped(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStriped(input_items, output_items, ranks, Int2Type()); } - - /** - * \brief Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged. + * @brief Exchanges data items annotated by rank into striped arrangement. + * Items with rank -1 are not exchanged. + * + * @par + * - @smemreuse + * + * @tparam OffsetT + * [inferred] Signed integer type for local offsets * - * \par - * - \smemreuse + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. * - * \tparam OffsetT [inferred] Signed integer type for local offsets + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks */ template - __device__ __forceinline__ void ScatterToStripedGuarded( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + __device__ __forceinline__ void + ScatterToStripedGuarded(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -1038,24 +1154,36 @@ public: } } - - - /** - * \brief Exchanges valid data items annotated by rank into striped arrangement. + * @brief Exchanges valid data items annotated by rank into striped arrangement. + * + * @par + * - @smemreuse + * + * @tparam OffsetT + * [inferred] Signed integer type for local offsets * - * \par - * - \smemreuse + * @tparam ValidFlag + * [inferred] FlagT type denoting which items are valid * - * \tparam OffsetT [inferred] Signed integer type for local offsets - * \tparam ValidFlag [inferred] FlagT type denoting which items are valid + * @param[in] input_items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[out] output_items + * Items from exchange, converting between striped and blocked arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks + * + * @param[in] is_valid + * Corresponding flag denoting item validity */ template - __device__ __forceinline__ void ScatterToStripedFlagged( - InputT (&input_items)[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT (&output_items)[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. 
- OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - ValidFlag (&is_valid)[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity + __device__ __forceinline__ void + ScatterToStripedFlagged(InputT (&input_items)[ITEMS_PER_THREAD], + OutputT (&output_items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD], + ValidFlag (&is_valid)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -1084,60 +1212,98 @@ public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - __device__ __forceinline__ void StripedToBlocked( - InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. + */ + __device__ __forceinline__ void StripedToBlocked(InputT (&items)[ITEMS_PER_THREAD]) { StripedToBlocked(items, items); } - __device__ __forceinline__ void BlockedToStriped( - InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. + */ + __device__ __forceinline__ void BlockedToStriped(InputT (&items)[ITEMS_PER_THREAD]) { BlockedToStriped(items, items); } - __device__ __forceinline__ void WarpStripedToBlocked( - InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. + */ + __device__ __forceinline__ void WarpStripedToBlocked(InputT (&items)[ITEMS_PER_THREAD]) { WarpStripedToBlocked(items, items); } - __device__ __forceinline__ void BlockedToWarpStriped( - InputT (&items)[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. + */ + __device__ __forceinline__ void BlockedToWarpStriped(InputT (&items)[ITEMS_PER_THREAD]) { BlockedToWarpStriped(items, items); } + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks + */ template - __device__ __forceinline__ void ScatterToBlocked( - InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + __device__ __forceinline__ void ScatterToBlocked(InputT (&items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToBlocked(items, items, ranks); } + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks + */ template - __device__ __forceinline__ void ScatterToStriped( - InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + __device__ __forceinline__ void ScatterToStriped(InputT (&items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks); } + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. 
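The scatter exchanges documented above carry no snippet of their own. As a purely illustrative aside (hypothetical kernel name; identity ranks stand in for a real ranking step such as cub::BlockRadixRank), the sketch below shows the call shape of ScatterToStriped for a 128-thread block owning 4 integer items per thread; with identity ranks the result is equivalent to BlockedToStriped.

#include <cub/block/block_exchange.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>

__global__ void ExampleScatterKernel(int *d_data)
{
    // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
    typedef cub::BlockExchange<int, 128, 4> BlockExchange;
    __shared__ typename BlockExchange::TempStorage temp_storage;

    // Load a blocked segment of items
    int thread_data[4];
    cub::LoadDirectBlocked(threadIdx.x, d_data, thread_data);

    // Identity ranks: item i of thread t keeps its original block-wide position
    int ranks[4];
    for (int i = 0; i < 4; ++i)
        ranks[i] = threadIdx.x * 4 + i;

    // Scatter items to the striped positions given by ranks (in-place is allowed)
    BlockExchange(temp_storage).ScatterToStriped(thread_data, thread_data, ranks);

    // Store the resulting striped arrangement back to device memory
    cub::StoreDirectStriped<128>(threadIdx.x, d_data, thread_data);
}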
+ * + * @param[in] ranks + * Corresponding scatter ranks + */ template - __device__ __forceinline__ void ScatterToStripedGuarded( - InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + __device__ __forceinline__ void ScatterToStripedGuarded(InputT (&items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStripedGuarded(items, items, ranks); } + /** + * @param[in-out] items + * Items to exchange, converting between striped and blocked arrangements. + * + * @param[in] ranks + * Corresponding scatter ranks + * + * @param[in] is_valid + * Corresponding flag denoting item validity + */ template - __device__ __forceinline__ void ScatterToStripedFlagged( - InputT (&items)[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - OffsetT (&ranks)[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - ValidFlag (&is_valid)[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity + __device__ __forceinline__ void ScatterToStripedFlagged(InputT (&items)[ITEMS_PER_THREAD], + OffsetT (&ranks)[ITEMS_PER_THREAD], + ValidFlag (&is_valid)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks, is_valid); } diff --git a/cub/cub/block/block_histogram.cuh b/cub/cub/block/block_histogram.cuh index 949bd2bcb34..ee750077915 100644 --- a/cub/cub/block/block_histogram.cuh +++ b/cub/cub/block/block_histogram.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + * @file + * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for + * constructing block-wide histograms from data samples partitioned across a CUDA thread block. */ #pragma once @@ -53,28 +54,29 @@ CUB_NAMESPACE_BEGIN ******************************************************************************/ /** - * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms. + * @brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of + * block-wide histograms. */ enum BlockHistogramAlgorithm { /** - * \par Overview + * @par Overview * Sorting followed by differentiation. Execution is comprised of two phases: * -# Sort the data using efficient radix sort * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts. * - * \par Performance Considerations + * @par Performance Considerations * Delivers consistent throughput regardless of sample bin distribution. 
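As a hedged illustration of how the two histogramming strategies are selected in practice, the sketch below (hypothetical kernel name; 256 bins, 128 threads, and 4 unsigned-char samples per thread, matching this file's other snippets) passes the enumerator as BlockHistogram's ALGORITHM template parameter; swapping in BLOCK_HISTO_ATOMIC is a one-line change.

#include <cub/block/block_histogram.cuh>
#include <cub/block/block_load.cuh>

__global__ void HistogramAlgorithmExample(unsigned char *d_samples, unsigned int *d_histogram)
{
    // Sort-then-differentiate: consistent throughput regardless of sample distribution
    typedef cub::BlockHistogram<unsigned char, 128, 4, 256, cub::BLOCK_HISTO_SORT> SortHistogram;

    // Atomic updates: fastest for uniformly distributed samples, sensitive to bin contention
    // typedef cub::BlockHistogram<unsigned char, 128, 4, 256, cub::BLOCK_HISTO_ATOMIC> AtomicHistogram;

    __shared__ typename SortHistogram::TempStorage temp_storage;
    __shared__ unsigned int smem_histogram[256];

    // Each thread contributes 4 consecutive samples
    unsigned char thread_samples[4];
    cub::LoadDirectBlocked(threadIdx.x, d_samples, thread_samples);

    // Build the block-wide histogram in shared memory
    SortHistogram(temp_storage).Histogram(thread_samples, smem_histogram);

    __syncthreads();

    // Copy the shared-memory counts out to global memory, two bins per thread
    for (int bin = threadIdx.x; bin < 256; bin += 128)
        d_histogram[bin] = smem_histogram[bin];
}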
*/ BLOCK_HISTO_SORT, /** - * \par Overview + * @par Overview * Use atomic addition to update byte counts directly * - * \par Performance Considerations + * @par Performance Considerations * Performance is strongly tied to the hardware implementation of atomic * addition, and may be significantly degraded for non uniformly-random * input distributions where many concurrent updates are likely to be @@ -89,47 +91,68 @@ enum BlockHistogramAlgorithm * Block histogram ******************************************************************************/ - /** - * \brief The BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png) - * \ingroup BlockModule + * @brief The BlockHistogram class provides [collective](index.html#sec0) methods for + * constructing block-wide histograms from data samples partitioned across a CUDA thread + * block. ![](histogram_logo.png) + * + * @ingroup BlockModule + * + * @tparam T + * The sample type being histogrammed (must be castable to an integer bin identifier) + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam ITEMS_PER_THREAD + * The number of items per thread + * + * @tparam BINS + * The number bins within the histogram * - * \tparam T The sample type being histogrammed (must be castable to an integer bin identifier) - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ITEMS_PER_THREAD The number of items per thread - * \tparam BINS The number bins within the histogram - * \tparam ALGORITHM [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. + * @tparam ALGORITHM + * [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm + * to use (default: cub::BLOCK_HISTO_SORT) * - * \par Overview + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview * - A histogram - * counts the number of observations that fall into each of the disjoint categories (known as bins). + * counts the number of observations that fall into each of the disjoint categories (known as + * bins). * - The `T` type must be implicitly castable to an integer type. * - BlockHistogram expects each integral `input[i]` value to satisfy * `0 <= input[i] < BINS`. Values outside of this range result in undefined * behavior. * - BlockHistogram can be optionally specialized to use different algorithms: - * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm) - * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm) + * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref + * cub::BlockHistogramAlgorithm) + * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly. 
+ * [More...](\ref cub::BlockHistogramAlgorithm) * - * \par Performance Considerations - * - \granularity + * @par Performance Considerations + * - @granularity * - * \par A Simple Example - * \blockcollective{BlockHistogram} - * \par + * @par A Simple Example + * @blockcollective{BlockHistogram} + * @par * The code snippet below illustrates a 256-bin histogram of 512 integer samples that * are partitioned across 128 threads where each thread owns 4 samples. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) * { - * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character + * samples each typedef cub::BlockHistogram BlockHistogram; * * // Allocate shared memory for BlockHistogram * __shared__ typename BlockHistogram::TempStorage temp_storage; @@ -144,18 +167,19 @@ enum BlockHistogramAlgorithm * // Compute the block-wide histogram * BlockHistogram(temp_storage).Histogram(data, smem_histogram); * - * \endcode + * @endcode * - * \par Performance and Usage Considerations + * @par Performance and Usage Considerations * - All input values must fall between [0, BINS), or behavior is undefined. * - The histogram output can be constructed in shared or device-accessible memory * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives * - * \par Re-using dynamically allocating shared memory + * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: - * example_block_reduce_dyn_smem.cu + * example_block_reduce_dyn_smem.cu * * This example can be easily adapted to the storage required by BlockHistogram. */ @@ -223,17 +247,17 @@ private: public: - /// \smemstorage{BlockHistogram} + /// @smemstorage{BlockHistogram} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockHistogram() : @@ -241,34 +265,34 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. 
+ * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockHistogram( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockHistogram(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Histogram operations + * @name Histogram operations *********************************************************************/ //@{ /** - * \brief Initialize the shared histogram counters to zero. + * @brief Initialize the shared histogram counters to zero. * - * \par Snippet + * @par Snippet * The code snippet below illustrates a the initialization and update of a * histogram of 512 integer samples that are partitioned across 128 threads * where each thread owns 4 samples. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -292,12 +316,13 @@ public: * // Update the block-wide histogram * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); * - * \endcode + * @endcode * - * \tparam CounterT [inferred] Histogram counter type + * @tparam CounterT + * [inferred] Histogram counter type */ - template - __device__ __forceinline__ void InitHistogram(CounterT histogram[BINS]) + template + __device__ __forceinline__ void InitHistogram(CounterT histogram[BINS]) { // Initialize histogram bin counts to zeros int histo_offset = 0; @@ -314,25 +339,26 @@ public: } } - /** - * \brief Constructs a block-wide histogram in shared/device-accessible memory. Each thread contributes an array of input elements. + * @brief Constructs a block-wide histogram in shared/device-accessible memory. + * Each thread contributes an array of input elements. * - * \par - * - \granularity - * - \smemreuse + * @par + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a 256-bin histogram of 512 integer samples that * are partitioned across 128 threads where each thread owns 4 samples. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) 
* { - * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 + * character samples each typedef cub::BlockHistogram + * BlockHistogram; * * // Allocate shared memory for BlockHistogram * __shared__ typename BlockHistogram::TempStorage temp_storage; @@ -347,15 +373,20 @@ public: * // Compute the block-wide histogram * BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram); * - * \endcode + * @endcode + * + * @tparam CounterT + * [inferred] Histogram counter type + * + * @param[in] items + * Calling thread's input values to histogram * - * \tparam CounterT [inferred] Histogram counter type + * @param[out] histogram + * Reference to shared/device-accessible memory histogram */ - template < - typename CounterT > - __device__ __forceinline__ void Histogram( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + template + __device__ __forceinline__ void Histogram(T (&items)[ITEMS_PER_THREAD], + CounterT histogram[BINS]) { // Initialize histogram bin counts to zeros InitHistogram(histogram); @@ -366,27 +397,27 @@ public: InternalBlockHistogram(temp_storage).Composite(items, histogram); } - - /** - * \brief Updates an existing block-wide histogram in shared/device-accessible memory. Each thread composites an array of input elements. + * @brief Updates an existing block-wide histogram in shared/device-accessible memory. + * Each thread composites an array of input elements. * - * \par - * - \granularity - * - \smemreuse + * @par + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a the initialization and update of a * histogram of 512 integer samples that are partitioned across 128 threads * where each thread owns 4 samples. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) 
* { - * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 + * character samples each typedef cub::BlockHistogram + * BlockHistogram; * * // Allocate shared memory for BlockHistogram * __shared__ typename BlockHistogram::TempStorage temp_storage; @@ -404,15 +435,20 @@ public: * // Update the block-wide histogram * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); * - * \endcode + * @endcode + * + * @tparam CounterT + * [inferred] Histogram counter type + * + * @param[in] items + * Calling thread's input values to histogram * - * \tparam CounterT [inferred] Histogram counter type + * @param[out] histogram + * Reference to shared/device-accessible memory histogram */ - template < - typename CounterT > - __device__ __forceinline__ void Composite( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + template + __device__ __forceinline__ void Composite(T (&items)[ITEMS_PER_THREAD], + CounterT histogram[BINS]) { InternalBlockHistogram(temp_storage).Composite(items, histogram); } diff --git a/cub/cub/block/block_load.cuh b/cub/cub/block/block_load.cuh index 5af8abca9e1..3c8bb154c1c 100644 --- a/cub/cub/block/block_load.cuh +++ b/cub/cub/block/block_load.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Operations for reading linear tiles of data into the CUDA thread block. */ @@ -52,34 +52,44 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIo + * @addtogroup UtilIo * @{ */ /******************************************************************//** - * \name Blocked arrangement I/O (direct) + * @name Blocked arrangement I/O (direct) *********************************************************************/ //@{ - /** - * \brief Load a linear segment of items into a blocked arrangement across the thread block. + * @brief Load a linear segment of items into a blocked arrangement across the thread block. + * + * @blocked + * + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. * - * \blocked + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
+ * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +template +__device__ __forceinline__ void LoadDirectBlocked(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { // Load directly in thread-blocked order #pragma unroll @@ -89,25 +99,39 @@ __device__ __forceinline__ void LoadDirectBlocked( } } - /** - * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. + * @brief Load a linear segment of items into a blocked arrangement across the thread block, guarded + * by range. + * + * @blocked + * + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from * - * \blocked + * @param[out] items + * Data to load * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @param[in] valid_items + * Number of valid items to load */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load +template +__device__ __forceinline__ void LoadDirectBlocked(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { #pragma unroll @@ -120,27 +144,43 @@ __device__ __forceinline__ void LoadDirectBlocked( } } - /** - * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.. + * @brief Load a linear segment of items into a blocked arrangement across the thread block, guarded + * by range, with a fall-back assignment of out-of-bound elements.. + * + * @blocked + * + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. 
+ * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) * - * \blocked + * @param[in] block_itr + * The thread block's base input iterator for loading from * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items */ -template < - typename InputT, - typename DefaultT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +template +__device__ __forceinline__ void LoadDirectBlocked(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -153,16 +193,22 @@ __device__ __forceinline__ void LoadDirectBlocked( #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** - * Internal implementation for load vectorization + * @brief Internal implementation for load vectorization + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_ptr + * Input pointer for loading from + * + * @param[out] items + * Data to load */ -template < - CacheLoadModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD> -__device__ __forceinline__ void InternalLoadDirectBlockedVectorized( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - T *block_ptr, ///< [in] Input pointer for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +template +__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(int linear_tid, + T *block_ptr, + T (&items)[ITEMS_PER_THREAD]) { // Biggest memory access word that T is a whole multiple of typedef typename UnitWord::DeviceWord DeviceWord; @@ -206,28 +252,39 @@ __device__ __forceinline__ void InternalLoadDirectBlockedVectorized( #endif // DOXYGEN_SHOULD_SKIP_THIS - /** - * \brief Load a linear segment of items into a blocked arrangement across the thread block. + * @brief Load a linear segment of items into a blocked arrangement across the thread block. + * + * @blocked + * + * The input offset (@p block_ptr + @p block_offset) must be quad-item aligned + * + * The following conditions will prevent vectorization and loading will fall back to + * cub::BLOCK_LOAD_DIRECT: + * - @p ITEMS_PER_THREAD is odd + * - The data type @p T is not a built-in primitive or CUDA vector type + * (e.g., @p short, @p int2, @p double, @p float2, etc.) * - * \blocked + * @tparam T + * [inferred] The data type to load. 
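As an illustrative aside (hypothetical kernel name; d_in assumed to be 16-byte aligned, as a cudaMalloc allocation would be), the sketch below shows a call that satisfies the vectorization conditions listed above, so each thread can issue one four-wide load; an odd ITEMS_PER_THREAD or an iterator in place of the raw pointer would instead have to use the scalar cub::LoadDirectBlocked path.

#include <cub/block/block_load.cuh>

__global__ void VectorizedLoadExample(int *d_in, int *d_out)
{
    const int TILE_ITEMS = 128 * 4;          // 128 threads, 4 items per thread
    int tile_offset = blockIdx.x * TILE_ITEMS;

    // Vectorizable: 4 consecutive ints per thread from an aligned, quad-item-aligned base
    int thread_data[4];
    cub::LoadDirectBlockedVectorized(threadIdx.x, d_in + tile_offset, thread_data);

    // Write the items back out in blocked order
    for (int i = 0; i < 4; ++i)
        d_out[tile_offset + threadIdx.x * 4 + i] = thread_data[i];
}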
* - * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. * - * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * @param[in] block_ptr + * Input pointer for loading from + * + * @param[out] items + * Data to load */ -template < - typename T, - int ITEMS_PER_THREAD> -__device__ __forceinline__ void LoadDirectBlockedVectorized( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - T *block_ptr, ///< [in] Input pointer for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +template +__device__ __forceinline__ void LoadDirectBlockedVectorized(int linear_tid, + T *block_ptr, + T (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } @@ -235,30 +292,41 @@ __device__ __forceinline__ void LoadDirectBlockedVectorized( //@} end member group /******************************************************************//** - * \name Striped arrangement I/O (direct) + * @name Striped arrangement I/O (direct) *********************************************************************/ //@{ - /** - * \brief Load a linear segment of items into a striped arrangement across the thread block. + * @brief Load a linear segment of items into a striped arrangement across the thread block. * - * \striped + * @striped * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @tparam BLOCK_THREADS + * The thread block size in threads + * + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. 
+ * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load */ -template < - int BLOCK_THREADS, - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +template +__device__ __forceinline__ void LoadDirectStriped(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -267,27 +335,42 @@ __device__ __forceinline__ void LoadDirectStriped( } } - /** - * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range + * @brief Load a linear segment of items into a striped arrangement across the thread block, guarded + * by range + * + * @striped + * + * @tparam BLOCK_THREADS + * The thread block size in threads + * + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) * - * \striped + * @param[in] block_itr + * The thread block's base input iterator for loading from * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load */ -template < - int BLOCK_THREADS, - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load +template +__device__ __forceinline__ void LoadDirectStriped(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -299,29 +382,50 @@ __device__ __forceinline__ void LoadDirectStriped( } } - /** - * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. + * @brief Load a linear segment of items into a striped arrangement across the thread block, guarded + * by range, with a fall-back assignment of out-of-bound elements. 
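A minimal sketch of the guarded overloads in use (hypothetical kernel and tile bookkeeping; 128 threads, 4 items per thread): full tiles take the unguarded path, while the last, partial tile supplies valid_items and an out-of-bounds default so that no out-of-range reads occur.

#include <cub/block/block_load.cuh>

__global__ void GuardedStripedLoadExample(const int *d_in, int num_items)
{
    const int TILE_ITEMS = 128 * 4;
    int tile_offset = blockIdx.x * TILE_ITEMS;
    int valid_items = num_items - tile_offset;   // may be < TILE_ITEMS for the last tile

    int thread_data[4];
    if (valid_items >= TILE_ITEMS)
    {
        // Full tile: unguarded striped load
        cub::LoadDirectStriped<128>(threadIdx.x, d_in + tile_offset, thread_data);
    }
    else
    {
        // Partial tile: out-of-range items receive the default value 0 instead of being read
        cub::LoadDirectStriped<128>(threadIdx.x, d_in + tile_offset, thread_data, valid_items, 0);
    }

    // ... consume thread_data ...
}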
+ * + * @striped + * + * @tparam BLOCK_THREADS + * The thread block size in threads + * + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. * - * \striped + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items */ -template < - int BLOCK_THREADS, - typename InputT, - typename DefaultT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +template +__device__ __forceinline__ void LoadDirectStriped(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) @@ -334,31 +438,41 @@ __device__ __forceinline__ void LoadDirectStriped( //@} end member group /******************************************************************//** - * \name Warp-striped arrangement I/O (direct) + * @name Warp-striped arrangement I/O (direct) *********************************************************************/ //@{ - /** - * \brief Load a linear segment of items into a warp-striped arrangement across the thread block. + * @brief Load a linear segment of items into a warp-striped arrangement across the thread block. * - * \warpstriped + * @warpstriped * - * \par Usage Considerations + * @par Usage Considerations * The number of threads in the thread block must be a multiple of the architecture's warp size. * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. 
+ * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +template +__device__ __forceinline__ void LoadDirectWarpStriped(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; @@ -372,28 +486,42 @@ __device__ __forceinline__ void LoadDirectWarpStriped( } } - /** - * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range + * @brief Load a linear segment of items into a warp-striped arrangement across the thread block, + * guarded by range * - * \warpstriped + * @warpstriped * - * \par Usage Considerations + * @par Usage Considerations * The number of threads in the thread block must be a multiple of the architecture's warp size. * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load +template +__device__ __forceinline__ void LoadDirectWarpStriped(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; @@ -410,30 +538,46 @@ __device__ __forceinline__ void LoadDirectWarpStriped( } } - /** - * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. 
+ * @brief Load a linear segment of items into a warp-striped arrangement across the thread block, + * guarded by range, with a fall-back assignment of out-of-bound elements. * - * \warpstriped + * @warpstriped * - * \par Usage Considerations + * @par Usage Considerations * The number of threads in the thread block must be a multiple of the architecture's warp size. * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + * @tparam T + * [inferred] The data type to load. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam InputIteratorT + * [inferred] The random-access iterator type for input \iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items */ -template < - typename InputT, - typename DefaultT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +template +__device__ __forceinline__ void LoadDirectWarpStriped(int linear_tid, + InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { // Load directly in warp-striped order #pragma unroll @@ -456,64 +600,65 @@ __device__ __forceinline__ void LoadDirectWarpStriped( //----------------------------------------------------------------------------- /** - * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. + * @brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a + * linear segment of data from memory into a blocked arrangement across a CUDA thread block. */ enum BlockLoadAlgorithm { /** - * \par Overview + * @par Overview * * A [blocked arrangement](index.html#sec5sec3) of data is read * directly from memory. * - * \par Performance Considerations + * @par Performance Considerations * The utilization of memory transactions (coalescing) decreases as the * access stride between threads increases (i.e., the number items per thread). */ BLOCK_LOAD_DIRECT, /** - * \par Overview + * @par Overview * * A [striped arrangement](index.html#sec5sec3) of data is read * directly from memory. * - * \par Performance Considerations + * @par Performance Considerations * The utilization of memory transactions (coalescing) doesn't depend on * the number of items per thread. 
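As a hedged illustration of when BLOCK_LOAD_STRIPED is the natural choice, the sketch below (hypothetical kernel name) feeds a commutative block-wide reduction: the per-thread ordering of items is irrelevant there, so the striped load stays coalesced without paying for a shared-memory transpose.

#include <cub/block/block_load.cuh>
#include <cub/block/block_reduce.cuh>

__global__ void StripedLoadReduceExample(const int *d_in, int *d_block_sums)
{
    typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_STRIPED> BlockLoad;
    typedef cub::BlockReduce<int, 128> BlockReduce;

    // The load and the reduction never use shared memory at the same time,
    // so their temporary storage can share one allocation
    __shared__ union
    {
        typename BlockLoad::TempStorage   load;
        typename BlockReduce::TempStorage reduce;
    } temp_storage;

    // Striped load of this block's tile (order across threads does not matter here)
    int items[4];
    BlockLoad(temp_storage.load).Load(d_in + blockIdx.x * 128 * 4, items);

    __syncthreads();  // conservative barrier before re-using the union'd shared memory

    // Commutative block-wide sum of all 512 items
    int block_sum = BlockReduce(temp_storage.reduce).Sum(items);
    if (threadIdx.x == 0)
        d_block_sums[blockIdx.x] = block_sum;
}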
*/ BLOCK_LOAD_STRIPED, /** - * \par Overview + * @par Overview * * A [blocked arrangement](index.html#sec5sec3) of data is read * from memory using CUDA's built-in vectorized loads as a coalescing optimization. * For example, ld.global.v4.s32 instructions will be generated - * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * when @p T = @p int and @p ITEMS_PER_THREAD % 4 == 0. * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high until the the * access stride between threads (i.e., the number items per thread) exceeds the * maximum vector load width (typically 4 items or 64B, whichever is lower). * - The following conditions will prevent vectorization and loading will fall * back to cub::BLOCK_LOAD_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The \p InputIteratorT is not a simple pointer type + * - @p ITEMS_PER_THREAD is odd + * - The @p InputIteratorT is not a simple pointer type * - The block input offset is not quadword-aligned - * - The data type \p T is not a built-in primitive or CUDA vector type - * (e.g., \p short, \p int2, \p double, \p float2, etc.) + * - The data type @p T is not a built-in primitive or CUDA vector type + * (e.g., @p short, @p int2, @p double, @p float2, etc.) */ BLOCK_LOAD_VECTORIZE, /** - * \par Overview + * @par Overview * * A [striped arrangement](index.html#sec5sec3) of data is read * efficiently from memory and then locally transposed into a * [blocked arrangement](index.html#sec5sec3). * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items loaded per thread. * - The local reordering incurs slightly longer latencies and throughput than the @@ -522,16 +667,16 @@ enum BlockLoadAlgorithm BLOCK_LOAD_TRANSPOSE, /** - * \par Overview + * @par Overview * * A [warp-striped arrangement](index.html#sec5sec3) of data is * read efficiently from memory and then locally transposed into a * [blocked arrangement](index.html#sec5sec3). * - * \par Usage Considerations + * @par Usage Considerations * - BLOCK_THREADS must be a multiple of WARP_THREADS * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items loaded per thread. * - The local reordering incurs slightly larger latencies than the @@ -542,18 +687,18 @@ enum BlockLoadAlgorithm BLOCK_LOAD_WARP_TRANSPOSE, /** - * \par Overview + * @par Overview * - * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [warp-striped arrangement](index.html#sec5sec3) + * Like @p BLOCK_LOAD_WARP_TRANSPOSE, a [warp-striped arrangement](index.html#sec5sec3) * of data is read directly from memory and then is locally transposed into a * [blocked arrangement](index.html#sec5sec3). To reduce the shared memory * requirement, only one warp's worth of shared memory is provisioned and is * subsequently time-sliced among warps. * - * \par Usage Considerations + * @par Usage Considerations * - BLOCK_THREADS must be a multiple of WARP_THREADS * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items loaded per thread. 
* - Provisions less shared memory temporary storage, but incurs larger @@ -564,27 +709,49 @@ enum BlockLoadAlgorithm /** - * \brief The BlockLoad class provides [collective](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [blocked arrangement](index.html#sec5sec3) across a CUDA thread block. ![](block_load_logo.png) - * \ingroup BlockModule - * \ingroup UtilIo - * - * \tparam InputT The data type to read into (which must be convertible from the input iterator's value type). - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. - * \tparam ALGORITHM [optional] cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT. - * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. - * - * \par Overview + * @brief The BlockLoad class provides [collective](index.html#sec0) + * data movement methods for loading a linear segment of items from memory + * into a [blocked arrangement](index.html#sec5sec3) across a + * CUDA thread block. ![](block_load_logo.png) + * + * @ingroup BlockModule + * + * @ingroup UtilIo + * + * @tparam InputT + * The data type to read into (which must be convertible from the input iterator's value type). + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam ITEMS_PER_THREAD + * The number of consecutive items partitioned onto each thread. + * + * @tparam ALGORITHM + * [optional] cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT. + * + * @tparam WARP_TIME_SLICING + * [optional] Whether or not only one warp's worth of shared memory should be + * allocated and time-sliced among block-warps during any load-related data transpositions + * (versus each warp having its own storage). (default: false) + * + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview * - The BlockLoad class provides a single data movement abstraction that can be specialized * to implement different cub::BlockLoadAlgorithm strategies. This facilitates different * performance policies for different architectures, data types, granularity sizes, etc. * - BlockLoad can be optionally specialized by different data movement strategies: * -# cub::BLOCK_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec3) * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) -* -# cub::BLOCK_LOAD_STRIPED,. A [striped arrangement](index.html#sec5sec3) + * -# cub::BLOCK_LOAD_STRIPED,. A [striped arrangement](index.html#sec5sec3) * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) * -# cub::BLOCK_LOAD_VECTORIZE. 
A [blocked arrangement](index.html#sec5sec3) * of data is read directly from memory using CUDA's built-in vectorized loads as a @@ -600,16 +767,16 @@ enum BlockLoadAlgorithm * [blocked arrangement](index.html#sec5sec3) one warp at a time. [More...](\ref cub::BlockLoadAlgorithm) * - \rowmajor * - * \par A Simple Example - * \blockcollective{BlockLoad} - * \par + * @par A Simple Example + * @blockcollective{BlockLoad} + * @par * The code snippet below illustrates the loading of a linear * segment of 512 integers into a "blocked" arrangement across 128 threads where each * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, * meaning memory references are efficiently coalesced using a warp-striped access * pattern (after which items are locally reordered among threads). - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -624,13 +791,13 @@ enum BlockLoadAlgorithm * int thread_data[4]; * BlockLoad(temp_storage).Load(d_data, thread_data); * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... - * The set of \p thread_data across the block of threads in those threads will be + * @endcode + * @par + * Suppose the input @p d_data is 0, 1, 2, 3, 4, 5, .... + * The set of @p thread_data across the block of threads in those threads will be * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. * - * \par Re-using dynamically allocating shared memory + * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: @@ -691,32 +858,63 @@ private: linear_tid(linear_tid) {} - /// Load a linear segment of items from memory + /** + * @brief Load a linear segment of items from memory + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectBlocked(linear_tid, block_itr, items); } - /// Load a linear segment of items from memory, guarded by range + /** + * @brief Load a linear segment of items from memory, guarded by range + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + /** + * @brief Load a linear segment of items from memory, guarded by range, with a fall-back + * assignment of out-of-bound elements + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * 
Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); } @@ -744,32 +942,63 @@ private: linear_tid(linear_tid) {} - /// Load a linear segment of items from memory + /** + * @brief Load a linear segment of items from memory + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectStriped(linear_tid, block_itr, items); } - /// Load a linear segment of items from memory, guarded by range + /** + * @brief Load a linear segment of items from memory, guarded by range + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { LoadDirectStriped(linear_tid, block_itr, items, valid_items); } - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + /** + * @brief Load a linear segment of items from memory, guarded by range, with a fall-back + * assignment of out-of-bound elements + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); } @@ -797,62 +1026,115 @@ private: linear_tid(linear_tid) {} - /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + /** + * @brief Load a linear segment of items from memory, specialized for native pointer types 
+ * (attempts vectorization) + * + * @param[in] block_ptr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ template - __device__ __forceinline__ void Load( - InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + __device__ __forceinline__ void Load(InputT *block_ptr, InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } - /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + /** + * @brief Load a linear segment of items from memory, specialized for native pointer types + * (attempts vectorization) + * + * @param[in] block_ptr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ template - __device__ __forceinline__ void Load( - const InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + __device__ __forceinline__ void Load(const InputT *block_ptr, + InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); } - /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) - template < - CacheLoadModifier MODIFIER, - typename ValueType, - typename OffsetT> - __device__ __forceinline__ void Load( - CacheModifiedInputIterator block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + /** + * @brief Load a linear segment of items from memory, specialized for native pointer types + * (attempts vectorization) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ + template + __device__ __forceinline__ void + Load(CacheModifiedInputIterator block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); } - /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization) + /** + * @brief Load a linear segment of items from memory, specialized for opaque input iterators + * (skips vectorization) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ template - __device__ __forceinline__ void Load( - _InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + __device__ __forceinline__ void Load(_InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectBlocked(linear_tid, block_itr, items); } - /// Load a linear segment of items from memory, guarded by range (skips vectorization) + /** + * @brief Load a linear segment of items from memory, guarded by range (skips vectorization) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load + __device__ 
__forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items); } - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization) + /** + * @brief Load a linear segment of items from memory, guarded by range, with a fall-back + * assignment of out-of-bound elements (skips vectorization) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); } @@ -891,34 +1173,65 @@ private: linear_tid(linear_tid) {} - /// Load a linear segment of items from memory + /** + * @brief Load a linear segment of items from memory + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectStriped(linear_tid, block_itr, items); BlockExchange(temp_storage).StripedToBlocked(items, items); } - /// Load a linear segment of items from memory, guarded by range + /** + * @brief Load a linear segment of items from memory, guarded by range + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { LoadDirectStriped(linear_tid, block_itr, items, valid_items); BlockExchange(temp_storage).StripedToBlocked(items, items); } - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + /** + * @brief Load a linear segment of items from memory, guarded by range, with a fall-back + * assignment of out-of-bound elements + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input 
iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); BlockExchange(temp_storage).StripedToBlocked(items, items); @@ -966,35 +1279,65 @@ private: linear_tid(linear_tid) {} - /// Load a linear segment of items from memory + /** + * @brief Load a linear segment of items from memory + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectWarpStriped(linear_tid, block_itr, items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } - /// Load a linear segment of items from memory, guarded by range + /** + * @brief Load a linear segment of items from memory, guarded by range + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + /** + * @brief Load a linear segment of items from memory, guarded by range, with a fall-back + * assignment of out-of-bound elements + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); @@ -1041,35 +1384,65 @@ private: linear_tid(linear_tid) {} - /// Load a linear segment of items from memory + /** + * @brief Load a linear segment of items from memory + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items 
+ * Data to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { LoadDirectWarpStriped(linear_tid, block_itr, items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } - /// Load a linear segment of items from memory, guarded by range + /** + * @brief Load a linear segment of items from memory, guarded by range + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); } - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + /** + * @brief Load a linear segment of items from memory, guarded by range, with a fall-back + * assignment of out-of-bound elements + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load + * + * @param[in] oob_default + * Default value to assign out-of-bound items + */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); BlockExchange(temp_storage).WarpStripedToBlocked(items, items); @@ -1113,17 +1486,18 @@ private: public: - /// \smemstorage{BlockLoad} + /// @smemstorage{BlockLoad} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary + * storage. */ __device__ __forceinline__ BlockLoad() : @@ -1131,15 +1505,15 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. 
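 The TempStorage-taking constructor above is what enables the shared-memory reuse described in the class overview; a minimal sketch, assuming illustrative kernel and buffer names not taken from this patch, of aliasing one shared allocation between a BlockLoad and a BlockStore:

 #include <cub/block/block_load.cuh>
 #include <cub/block/block_store.cuh>

 __global__ void ReuseSmemKernel(const int *d_in, int *d_out)
 {
     using BlockLoadT  = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
     using BlockStoreT = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE>;

     // One shared allocation serves both collectives; the union makes the reuse explicit
     __shared__ union
     {
         typename BlockLoadT::TempStorage  load;
         typename BlockStoreT::TempStorage store;
     } temp_storage;

     int thread_data[4];
     BlockLoadT(temp_storage.load).Load(d_in, thread_data);

     __syncthreads(); // required before repurposing the shared memory

     BlockStoreT(temp_storage.store).Store(d_out, thread_data);
 }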
+ * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockLoad( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockLoad(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} @@ -1147,26 +1521,25 @@ public: //@} end member group /******************************************************************//** - * \name Data movement + * @name Data movement *********************************************************************/ //@{ - /** - * \brief Load a linear segment of items from memory. + * @brief Load a linear segment of items from memory. * - * \par - * - \blocked - * - \smemreuse + * @par + * - @blocked + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the loading of a linear * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * thread owns 4 consecutive items. The load is specialized for @p BLOCK_LOAD_WARP_TRANSPOSE, * meaning memory references are efficiently coalesced using a warp-striped access * pattern (after which items are locally reordered among threads). - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -1181,37 +1554,40 @@ public: * int thread_data[4]; * BlockLoad(temp_storage).Load(d_data, thread_data); * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... - * The set of \p thread_data across the block of threads in those threads will be + * @endcode + * @par + * Suppose the input @p d_data is 0, 1, 2, 3, 4, 5, .... + * The set of @p thread_data across the block of threads in those threads will be * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD]) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items); } - /** - * \brief Load a linear segment of items from memory, guarded by range. + * @brief Load a linear segment of items from memory, guarded by range. * - * \par - * - \blocked - * - \smemreuse + * @par + * - @blocked + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the guarded loading of a linear * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * thread owns 4 consecutive items. The load is specialized for @p BLOCK_LOAD_WARP_TRANSPOSE, * meaning memory references are efficiently coalesced using a warp-striped access * pattern (after which items are locally reordered among threads). - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, int valid_items, ...) 
@@ -1226,39 +1602,46 @@ public: * int thread_data[4]; * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6... and \p valid_items is \p 5. - * The set of \p thread_data across the block of threads in those threads will be + * @endcode + * @par + * Suppose the input @p d_data is 0, 1, 2, 3, 4, 5, 6... and @p valid_items is @p 5. + * The set of @p thread_data across the block of threads in those threads will be * { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }, with only the first two threads * being unmasked to load portions of valid data (and other items remaining unassigned). * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); } - /** - * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + * @brief Load a linear segment of items from memory, guarded by range, with a fall-back + * assignment of out-of-bound elements * - * \par - * - \blocked - * - \smemreuse + * @par + * - @blocked + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the guarded loading of a linear * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * thread owns 4 consecutive items. The load is specialized for @p BLOCK_LOAD_WARP_TRANSPOSE, * meaning memory references are efficiently coalesced using a warp-striped access * pattern (after which items are locally reordered among threads). - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, int valid_items, ...) @@ -1273,21 +1656,31 @@ public: * int thread_data[4]; * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6..., - * \p valid_items is \p 5, and the out-of-bounds default is \p -1. - * The set of \p thread_data across the block of threads in those threads will be + * @endcode + * @par + * Suppose the input @p d_data is 0, 1, 2, 3, 4, 5, 6..., + * @p valid_items is @p 5, and the out-of-bounds default is @p -1. 
+ * The set of @p thread_data across the block of threads in those threads will be * { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }, with only the first two threads - * being unmasked to load portions of valid data (and other items are assigned \p -1) + * being unmasked to load portions of valid data (and other items are assigned @p -1) + * + * @param[in] block_itr + * The thread block's base input iterator for loading from + * + * @param[out] items + * Data to load + * + * @param[in] valid_items + * Number of valid items to load * + * @param[in] oob_default + * Default value to assign out-of-bound items */ template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + __device__ __forceinline__ void Load(InputIteratorT block_itr, + InputT (&items)[ITEMS_PER_THREAD], + int valid_items, + DefaultT oob_default) { InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); } diff --git a/cub/cub/block/block_radix_rank.cuh b/cub/cub/block/block_radix_rank.cuh index 8495947789c..13655a5548c 100644 --- a/cub/cub/block/block_radix_rank.cuh +++ b/cub/cub/block/block_radix_rank.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block */ @@ -54,11 +54,10 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN - /** - * \brief Radix ranking algorithm, the algorithm used to implement stable ranking of the - * keys from a single tile. Note that different ranking algorithms require different - * initial arrangements of keys to function properly. + * @brief Radix ranking algorithm, the algorithm used to implement stable ranking of the + * keys from a single tile. Note that different ranking algorithms require different + * initial arrangements of keys to function properly. */ enum RadixRankAlgorithm { @@ -130,31 +129,53 @@ struct warp_in_block_matcher_t } // namespace detail #endif // DOXYGEN_SHOULD_SKIP_THIS - /** - * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block. - * \ingroup BlockModule + * @brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread + * block. + * + * @ingroup BlockModule + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam RADIX_BITS + * The number of radix bits per digit place + * + * @tparam IS_DESCENDING + * Whether or not the sorted-order is high-to-low + * + * @tparam MEMOIZE_OUTER_SCAN + * [optional] Whether or not to buffer outer raking scan + * partials to incur fewer shared memory reads at the expense of higher register pressure + * (default: true for architectures SM35 and newer, false otherwise). + * See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. 
* - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam RADIX_BITS The number of radix bits per digit place - * \tparam IS_DESCENDING Whether or not the sorted-order is high-to-low - * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. - * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) - * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. + * @tparam INNER_SCAN_ALGORITHM + * [optional] The cub::BlockScanAlgorithm algorithm to use (default: + * cub::BLOCK_SCAN_WARP_SCANS) * - * \par Overview + * @tparam SMEM_CONFIG + * [optional] Shared memory bank mode (default: @p cudaSharedMemBankSizeFourByte) + * + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview * Blah... * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits). - * - \blocked + * - @blocked * - * \par Performance Considerations - * - \granularity + * @par Performance Considerations + * - @granularity * - * \par - * \code + * @par + * @code * #include * * __global__ void ExampleKernel(...) @@ -163,6 +184,7 @@ struct warp_in_block_matcher_t * constexpr int radix_bits = 5; * * // Specialize BlockRadixRank for a 1D block of 2 threads + * // Specialize BlockRadixRank for a 1D block of 2 threads * using block_radix_rank = cub::BlockRadixRank; * using storage_t = typename block_radix_rank::TempStorage; * @@ -178,11 +200,11 @@ struct warp_in_block_matcher_t * block_radix_rank(temp_storage).RankKeys(keys, ranks, extractor); * * ... - * \endcode + * @endcode * Suppose the set of input `keys` across the block of threads is `{ [16,10], [9,11] }`. * The corresponding output `ranks` in those threads will be `{ [3,1], [0,2] }`. 
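 The line-wrapped snippet above loses the template arguments on the block_radix_rank alias; a self-contained sketch of the same two-thread ranking example follows. The BFEDigitExtractor spelling and its (bit_start, num_bits) constructor are assumptions about the digit-extractor API rather than something stated in this patch.

 #include <cstdint>
 #include <cub/cub.cuh>

 __global__ void RankKernel()
 {
     constexpr int block_threads = 2;
     constexpr int radix_bits    = 5;

     // Specialize BlockRadixRank for a 1D block of 2 threads
     using block_radix_rank = cub::BlockRadixRank<block_threads, radix_bits>;
     using storage_t        = typename block_radix_rank::TempStorage;

     __shared__ storage_t temp_storage;

     // Input across the block is { [16,10], [9,11] }, as in the documentation above
     std::uint16_t keys[2];
     keys[0] = (threadIdx.x == 0) ? 16 : 9;
     keys[1] = (threadIdx.x == 0) ? 10 : 11;

     int ranks[2];

     // Assumed digit extractor: take the lowest radix_bits bits of each key
     cub::BFEDigitExtractor<std::uint16_t> extractor(0, radix_bits);

     block_radix_rank(temp_storage).RankKeys(keys, ranks, extractor);

     // Expected ranks across the block: { [3,1], [0,2] }
 }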
* - * \par Re-using dynamically allocating shared memory + * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: @@ -236,7 +258,8 @@ private: PACKING_RATIO = static_cast(sizeof(PackedCounter) / sizeof(DigitCounter)), LOG_PACKING_RATIO = Log2::VALUE, - LOG_COUNTER_LANES = CUB_MAX((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0), // Always at least one lane + // Always at least one lane + LOG_COUNTER_LANES = CUB_MAX((int(RADIX_BITS) - int(LOG_PACKING_RATIO)), 0), COUNTER_LANES = 1 << LOG_COUNTER_LANES, // The number of packed counters per thread (plus one for padding) @@ -414,17 +437,18 @@ private: public: - /// \smemstorage{BlockScan} + /// @smemstorage{BlockScan} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary + * storage. */ __device__ __forceinline__ BlockRadixRank() : @@ -432,35 +456,40 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. + * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockRadixRank( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockRadixRank(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Raking + * @name Raking *********************************************************************/ //@{ /** - * \brief Rank keys. + * @brief Rank keys. + * + * @param[in] keys + * Keys for this tile + * + * @param[out] ranks + * For each key, the local rank within the tile + * + * @param[in] digit_extractor + * The digit extractor */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD, - typename DigitExtractorT> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile - DigitExtractorT digit_extractor) ///< [in] The digit extractor + template + __device__ __forceinline__ void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor) { static_assert(BLOCK_THREADS * KEYS_PER_THREAD <= max_tile_size, "DigitCounter type is too small to hold this number of keys"); @@ -515,19 +544,30 @@ public: } } - /** - * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + * @brief Rank keys. 
For the lower @p RADIX_DIGITS threads, digit counts for each digit are + * provided for the corresponding thread. + * + * @param[in] keys + * Keys for this tile + * + * @param[out] ranks + * For each key, the local rank within the tile (out parameter) + * + * @param[in] digit_extractor + * The digit extractor + * + * @param[out] exclusive_digit_prefix + * The exclusive prefix sum for the digits + * [(threadIdx.x * BINS_TRACKED_PER_THREAD) + * ... + * (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD, - typename DigitExtractorT> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) - DigitExtractorT digit_extractor, ///< [in] The digit extractor - int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + template + __device__ __forceinline__ void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor, + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) { static_assert(BLOCK_THREADS * KEYS_PER_THREAD <= max_tile_size, "DigitCounter type is too small to hold this number of keys"); @@ -654,36 +694,37 @@ private: public: - /// \smemstorage{BlockScan} + /// @smemstorage{BlockScan} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. + * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockRadixRankMatch( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockRadixRankMatch(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Raking + * @name Raking *********************************************************************/ //@{ - /** \brief Computes the count of keys for each digit value, and calls the - * callback with the array of key counts. - + /** + * @brief Computes the count of keys for each digit value, and calls the + * callback with the array of key counts. + * * @tparam CountsCallback The callback type. It should implement an instance * overload of operator()(int (&bins)[BINS_TRACKED_PER_THREAD]), where bins * is an array of key counts for each digit value distributed in block @@ -726,18 +767,25 @@ public: } /** - * \brief Rank keys. + * @brief Rank keys. 
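 The CountsCallback contract described for BlockRadixRankMatch above is easier to see as code; a minimal sketch of a functor that satisfies it, with an illustrative name and a plain copy as its action:

 // Records the per-thread slice of the digit histogram that
 // BlockRadixRankMatch passes to its callback during ranking
 template <int BINS_TRACKED_PER_THREAD>
 struct StoreCountsCallback
 {
     int (&counts)[BINS_TRACKED_PER_THREAD];

     __device__ explicit StoreCountsCallback(int (&counts)[BINS_TRACKED_PER_THREAD])
         : counts(counts)
     {}

     // Required overload: bins holds this thread's digit counts
     __device__ void operator()(int (&bins)[BINS_TRACKED_PER_THREAD])
     {
         #pragma unroll
         for (int i = 0; i < BINS_TRACKED_PER_THREAD; ++i)
         {
             counts[i] = bins[i];
         }
     }
 };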
+ * + * @param[in] keys + * Keys for this tile + * + * @param[out] ranks + * For each key, the local rank within the tile + * + * @param[in] digit_extractor + * The digit extractor */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD, - typename DigitExtractorT, - typename CountsCallback> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile - DigitExtractorT digit_extractor, ///< [in] The digit extractor - CountsCallback callback) + template + __device__ __forceinline__ void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor, + CountsCallback callback) { // Initialize shared digit counters @@ -840,19 +888,33 @@ public: } /** - * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + * @brief Rank keys. For the lower @p RADIX_DIGITS threads, digit counts for each digit are + * provided for the corresponding thread. + * + * @param[in] keys + * Keys for this tile + * + * @param[out] ranks + * For each key, the local rank within the tile (out parameter) + * + * @param[in] digit_extractor + * The digit extractor + * + * @param[out] exclusive_digit_prefix + * The exclusive prefix sum for the digits + * [(threadIdx.x * BINS_TRACKED_PER_THREAD) + * ... + * (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD, - typename DigitExtractorT, - typename CountsCallback> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) - DigitExtractorT digit_extractor, ///< [in] The digit extractor - int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD], ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] - CountsCallback callback) + template + __device__ __forceinline__ void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor, + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD], + CountsCallback callback) { RankKeys(keys, ranks, digit_extractor, callback); @@ -872,15 +934,24 @@ public: } } - template < - typename UnsignedBits, - int KEYS_PER_THREAD, - typename DigitExtractorT> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) - DigitExtractorT digit_extractor, - int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + /** + * @param[in] keys + * Keys for this tile + * + * @param[out] ranks + * For each key, the local rank within the tile (out parameter) + * + * @param[out] exclusive_digit_prefix + * The exclusive prefix sum for the digits + * [(threadIdx.x * BINS_TRACKED_PER_THREAD) + * ... 
+ * (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + */ + template + __device__ __forceinline__ void RankKeys(UnsignedBits (&keys)[KEYS_PER_THREAD], + int (&ranks)[KEYS_PER_THREAD], + DigitExtractorT digit_extractor, + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) { RankKeys(keys, ranks, digit_extractor, exclusive_digit_prefix, BlockRadixRankEmptyCallback()); @@ -1158,7 +1229,8 @@ struct BlockRadixRankMatchEarlyCounts (TempStorage& temp_storage) : temp_storage(temp_storage) {} /** - * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + * @brief Rank keys. For the lower @p RADIX_DIGITS threads, digit counts for each digit are + * provided for the corresponding thread. */ template diff --git a/cub/cub/block/block_radix_sort.cuh b/cub/cub/block/block_radix_sort.cuh index 1a98a66f646..7b1522a6743 100644 --- a/cub/cub/block/block_radix_sort.cuh +++ b/cub/cub/block/block_radix_sort.cuh @@ -27,11 +27,11 @@ ******************************************************************************/ /** - * \file - * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block. + * @file + * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix + * sorting of items partitioned across a CUDA thread block. */ - #pragma once #include "../config.cuh" @@ -376,16 +376,35 @@ private: Int2Type /*is_blocked*/) {} - /// Sort blocked arrangement + /** + * @brief Sort blocked arrangement + * + * @param keys + * Keys to sort + * + * @param values + * Values to sort + * + * @param begin_bit + * The beginning (least-significant) bit index needed for key comparison + * + * @param end_bit + * The past-the-end (most-significant) bit index needed for key comparison + * + * @param is_descending + * Tag whether is a descending-order sort + * + * @param is_keys_only + * Tag whether is keys-only sort + */ template - __device__ __forceinline__ void SortBlocked( - KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort - int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison - Int2Type is_descending, ///< Tag whether is a descending-order sort - Int2Type is_keys_only, ///< Tag whether is keys-only sort - DecomposerT decomposer = {}) + __device__ __forceinline__ void SortBlocked(KeyT (&keys)[ITEMS_PER_THREAD], + ValueT (&values)[ITEMS_PER_THREAD], + int begin_bit, + int end_bit, + Int2Type is_descending, + Int2Type is_keys_only, + DecomposerT decomposer = {}) { bit_ordered_type (&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast(keys); @@ -433,16 +452,35 @@ public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - /// Sort blocked -> striped arrangement + /** + * @brief Sort blocked -> striped arrangement + * + * @param keys + * Keys to sort + * + * @param values + * Values to sort + * + * @param begin_bit + * The beginning (least-significant) bit index needed for key comparison + * + * @param end_bit + * The past-the-end (most-significant) bit index needed for key comparison + * + * @param is_descending + * Tag whether is a descending-order sort + * + * @param is_keys_only + * Tag whether is keys-only sort + */ template - __device__ __forceinline__ void SortBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort - 
ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort - int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison - Int2Type is_descending, ///< Tag whether is a descending-order sort - Int2Type is_keys_only, ///< Tag whether is keys-only sort - DecomposerT decomposer = {}) + __device__ __forceinline__ void SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], + ValueT (&values)[ITEMS_PER_THREAD], + int begin_bit, + int end_bit, + Int2Type is_descending, + Int2Type is_keys_only, + DecomposerT decomposer = {}) { bit_ordered_type (&unsigned_keys)[ITEMS_PER_THREAD] = reinterpret_cast(keys); @@ -498,17 +536,17 @@ public: #endif // DOXYGEN_SHOULD_SKIP_THIS - /// \smemstorage{BlockRadixSort} + /// @smemstorage{BlockRadixSort} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockRadixSort() : @@ -516,37 +554,38 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. + * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockRadixSort( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockRadixSort(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Sorting (blocked arrangements) + * @name Sorting (blocked arrangements) *********************************************************************/ //@{ /** - * \brief Performs an ascending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. + * @brief Performs an ascending block-wide radix sort over a [blocked + * arrangement](index.html#sec5sec3) of keys. * - * \par - * - \granularity - * - \smemreuse + * @par + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive keys. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -564,17 +603,25 @@ public: * // Collectively sort the keys * BlockRadixSort(temp_storage).Sort(thread_keys); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. 
- * The corresponding output \p thread_keys in those threads will be + * The corresponding output @p thread_keys in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + * @param[in-out] keys + * Keys to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void Sort( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { NullType values[ITEMS_PER_THREAD]; @@ -712,29 +759,30 @@ public: } /** - * \brief Performs an ascending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * @brief Performs an ascending block-wide radix sort across a + * [blocked arrangement](index.html#sec5sec3) of keys and values. * - * \par + * @par * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" * more than one tile of values, simply perform a key-value sort of the keys paired * with a temporary value array that enumerates the key indices. The reordered indices * can then be used as a gather-vector for exchanging other associated tile data through * shared memory. - * - \granularity - * - \smemreuse + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys and values that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive pairs. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and + * values each typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; @@ -747,19 +795,29 @@ public: * // Collectively sort the keys and values among block threads * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be + * corresponding output @p thread_keys in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. 
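 The wrapped snippet above drops the template arguments from the BlockRadixSort typedef and folds them into the comment; a self-contained version of the same 128-thread key-value sort, with illustrative kernel and buffer names:

 #include <cub/cub.cuh>

 __global__ void KeyValueSortKernel(int *d_keys, int *d_values)
 {
     // Specialize BlockRadixSort for a 1D block of 128 threads owning
     // 4 integer keys and 4 integer values each
     using BlockRadixSortT = cub::BlockRadixSort<int, 128, 4, int>;

     // Allocate shared memory for BlockRadixSort
     __shared__ typename BlockRadixSortT::TempStorage temp_storage;

     // Obtain a blocked arrangement of consecutive keys and values per thread
     int thread_keys[4];
     int thread_values[4];
     cub::LoadDirectBlocked(threadIdx.x, d_keys, thread_keys);
     cub::LoadDirectBlocked(threadIdx.x, d_values, thread_values);

     // Collectively sort the keys and values among block threads
     BlockRadixSortT(temp_storage).Sort(thread_keys, thread_values);
 }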
* + * @param[in-out] keys + * Keys to sort + * + * @param[in-out] values + * Values to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void Sort( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void Sort(KeyT (&keys)[ITEMS_PER_THREAD], + ValueT (&values)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } @@ -919,18 +977,19 @@ public: } /** - * \brief Performs a descending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. + * @brief Performs a descending block-wide radix sort over a + * [blocked arrangement](index.html#sec5sec3) of keys. * - * \par - * - \granularity - * - \smemreuse + * @par + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive keys. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -948,17 +1007,25 @@ public: * // Collectively sort the keys * BlockRadixSort(temp_storage).Sort(thread_keys); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. - * The corresponding output \p thread_keys in those threads will be + * The corresponding output @p thread_keys in those threads will be * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. + * + * @param[in-out] keys + * Keys to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void SortDescending( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { NullType values[ITEMS_PER_THREAD]; @@ -1107,29 +1174,30 @@ public: } /** - * \brief Performs a descending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * @brief Performs a descending block-wide radix sort across a [blocked + * arrangement](index.html#sec5sec3) of keys and values. * - * \par + * @par * - BlockRadixSort can only accommodate one associated tile of values. 
To "truck along" * more than one tile of values, simply perform a key-value sort of the keys paired * with a temporary value array that enumerates the key indices. The reordered indices * can then be used as a gather-vector for exchanging other associated tile data through * shared memory. - * - \granularity - * - \smemreuse + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys and values that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive pairs. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and + * values each typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; @@ -1142,19 +1210,29 @@ public: * // Collectively sort the keys and values among block threads * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be + * corresponding output @p thread_keys in those threads will be * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. * + * @param[in-out] keys + * Keys to sort + * + * @param[in-out] values + * Values to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void SortDescending( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void SortDescending(KeyT (&keys)[ITEMS_PER_THREAD], + ValueT (&values)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } @@ -1317,24 +1395,25 @@ public: //@} end member group /******************************************************************//** - * \name Sorting (blocked arrangement -> striped arrangement) + * @name Sorting (blocked arrangement -> striped arrangement) *********************************************************************/ //@{ - /** - * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * @brief Performs an ascending radix sort across a + * [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a + * [striped arrangement](index.html#sec5sec3). 
* - * \par - * - \granularity - * - \smemreuse + * @par + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive keys. The final partitioning is striped. - * \par - * \code + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 + * threads where each thread owns 4 consecutive keys. The final partitioning is striped. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1352,18 +1431,25 @@ public: * // Collectively sort the keys * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be + * corresponding output @p thread_keys in those threads will be * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. * + * @param[in-out] keys + * Keys to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void SortBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { NullType values[ITEMS_PER_THREAD]; @@ -1514,29 +1600,31 @@ public: } /** - * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * @brief Performs an ascending radix sort across a + * [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them + * in a [striped arrangement](index.html#sec5sec3). * - * \par + * @par * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" * more than one tile of values, simply perform a key-value sort of the keys paired * with a temporary value array that enumerates the key indices. The reordered indices * can then be used as a gather-vector for exchanging other associated tile data through * shared memory. - * - \granularity - * - \smemreuse + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys and values that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive pairs. The final partitioning is striped. - * \par - * \code + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 + * threads where each thread owns 4 consecutive pairs. The final partitioning is striped. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) 
* { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and + * values each typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; @@ -1549,19 +1637,29 @@ public: * // Collectively sort the keys and values among block threads * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be + * corresponding output @p thread_keys in those threads will be * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. * + * @param[in-out] keys + * Keys to sort + * + * @param[in-out] values + * Values to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void SortBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void SortBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], + ValueT (&values)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } @@ -1715,18 +1813,20 @@ public: } /** - * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * @brief Performs a descending radix sort across a + * [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a + * [striped arrangement](index.html#sec5sec3). * - * \par - * - \granularity - * - \smemreuse + * @par + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive keys. The final partitioning is striped. - * \par - * \code + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 + * threads where each thread owns 4 consecutive keys. The final partitioning is striped. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1744,18 +1844,25 @@ public: * // Collectively sort the keys * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. 
The - * corresponding output \p thread_keys in those threads will be + * corresponding output @p thread_keys in those threads will be * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. * + * @param[in-out] keys + * Keys to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void SortDescendingBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { NullType values[ITEMS_PER_THREAD]; @@ -1906,29 +2013,31 @@ public: } /** - * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * @brief Performs a descending radix sort across a + * [blocked arrangement](index.html#sec5sec3) of keys and values, + * leaving them in a [striped arrangement](index.html#sec5sec3). * - * \par + * @par * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" * more than one tile of values, simply perform a key-value sort of the keys paired * with a temporary value array that enumerates the key indices. The reordered indices * can then be used as a gather-vector for exchanging other associated tile data through * shared memory. - * - \granularity - * - \smemreuse + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sort of 512 integer keys and values that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive pairs. The final partitioning is striped. - * \par - * \code + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 + * threads where each thread owns 4 consecutive pairs. The final partitioning is striped. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and + * values each typedef cub::BlockRadixSort BlockRadixSort; * * // Allocate shared memory for BlockRadixSort * __shared__ typename BlockRadixSort::TempStorage temp_storage; @@ -1941,19 +2050,30 @@ public: * // Collectively sort the keys and values among block threads * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_keys across the block of threads is * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. 
The - * corresponding output \p thread_keys in those threads will be + * corresponding output @p thread_keys in those threads will be * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. * + * @param[in-out] keys + * Keys to sort + * + * @param[in-out] values + * Values to sort + * + * @param[in] begin_bit + * [optional] The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * [optional] The past-the-end (most-significant) bit index needed for key comparison */ - __device__ __forceinline__ void SortDescendingBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + __device__ __forceinline__ void + SortDescendingBlockedToStriped(KeyT (&keys)[ITEMS_PER_THREAD], + ValueT (&values)[ITEMS_PER_THREAD], + int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8) { SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); } @@ -2115,4 +2235,3 @@ public: */ CUB_NAMESPACE_END - diff --git a/cub/cub/block/block_raking_layout.cuh b/cub/cub/block/block_raking_layout.cuh index fbe332c9e07..0f7588d8881 100644 --- a/cub/cub/block/block_raking_layout.cuh +++ b/cub/cub/block/block_raking_layout.cuh @@ -27,11 +27,11 @@ ******************************************************************************/ /** - * \file - * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. + * @file + * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking + * across thread block data. */ - #pragma once #include "../config.cuh" @@ -47,19 +47,26 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) - * \ingroup BlockModule + * @brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking + * across thread block data. ![](raking.png) + * + * @ingroup BlockModule * - * \par Overview + * @par Overview * This type facilitates a shared memory usage pattern where a block of CUDA * threads places elements into shared memory and then reduces the active * parallelism to one "raking" warp of threads for serially aggregating consecutive * sequences of shared items. Padding is inserted to eliminate bank conflicts * (for most data types). * - * \tparam T The data type to be exchanged. - * \tparam BLOCK_THREADS The thread block size in threads. - * \tparam LEGACY_PTX_ARCH [optional] Unused. + * @tparam T + * The data type to be exchanged. + * + * @tparam BLOCK_THREADS + * The thread block size in threads. + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. 
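A minimal sketch of the raking pattern described above, assuming the layout exposes RAKING_THREADS and SEGMENT_LENGTH constants and that a 128-thread block yields full segments (no guarding needed); the kernel name and pointers are illustrative, and a warp-level reduction of the per-thread partials would normally follow:

#include <cub/cub.cuh>

__global__ void RakingPartialsKernel(int *d_in, int *d_partials)
{
    // Conflict-free layout for 128 integers, raked by a single warp
    typedef cub::BlockRakingLayout<int, 128> BlockRakingLayout;
    __shared__ typename BlockRakingLayout::TempStorage temp_storage;

    int tid = threadIdx.x;

    // Every thread places one element into the raking grid
    *BlockRakingLayout::PlacementPtr(temp_storage, tid) = d_in[tid];
    __syncthreads();

    // Only the raking threads stay active; each serially reduces its segment
    if (tid < BlockRakingLayout::RAKING_THREADS)
    {
        int *segment = BlockRakingLayout::RakingPtr(temp_storage, tid);
        int partial  = segment[0];
        for (int i = 1; i < BlockRakingLayout::SEGMENT_LENGTH; ++i)
        {
            partial += segment[i];
        }
        d_partials[tid] = partial;
    }
}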
*/ template < typename T, @@ -105,7 +112,7 @@ struct BlockRakingLayout /** - * \brief Shared memory storage type + * @brief Shared memory storage type */ struct __align__(16) _TempStorage { @@ -117,7 +124,7 @@ struct BlockRakingLayout /** - * \brief Returns the location for the calling thread to place data into the grid + * @brief Returns the location for the calling thread to place data into the grid */ static __device__ __forceinline__ T* PlacementPtr( TempStorage &temp_storage, @@ -138,7 +145,7 @@ struct BlockRakingLayout /** - * \brief Returns the location for the calling thread to begin sequential raking + * @brief Returns the location for the calling thread to begin sequential raking */ static __device__ __forceinline__ T* RakingPtr( TempStorage &temp_storage, diff --git a/cub/cub/block/block_reduce.cuh b/cub/cub/block/block_reduce.cuh index 8e0814604e0..6c7a4352ee7 100644 --- a/cub/cub/block/block_reduce.cuh +++ b/cub/cub/block/block_reduce.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. + * @file + * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing + * a parallel reduction of items partitioned across a CUDA thread block. */ #pragma once @@ -64,11 +65,11 @@ enum BlockReduceAlgorithm { /** - * \par Overview + * @par Overview * An efficient "raking" reduction algorithm that only supports commutative * reduction operators (true for most operations, e.g., addition). * - * \par + * @par * Execution is comprised of three phases: * -# Upsweep sequential reduction in registers (if threads contribute more * than one input each). Threads in warps other than the first warp place @@ -77,11 +78,11 @@ enum BlockReduceAlgorithm * warp continue to accumulate by raking across segments of shared partial reductions * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. * - * \par - * \image html block_reduce.png + * @par + * @image html block_reduce.png *
\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
* - * \par Performance Considerations + * @par Performance Considerations * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE * and is preferable when the reduction operator is commutative. This variant * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall @@ -93,12 +94,12 @@ enum BlockReduceAlgorithm /** - * \par Overview + * @par Overview * An efficient "raking" reduction algorithm that supports commutative * (e.g., addition) and non-commutative (e.g., string concatenation) reduction * operators. \blocked. * - * \par + * @par * Execution is comprised of three phases: * -# Upsweep sequential reduction in registers (if threads contribute more * than one input each). Each thread then places the partial reduction @@ -107,11 +108,11 @@ enum BlockReduceAlgorithm * single warp rake across segments of shared partial reductions. * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. * - * \par - * \image html block_reduce.png + * @par + * @image html block_reduce.png *
\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
* - * \par Performance Considerations + * @par Performance Considerations * - This variant performs more communication than BLOCK_REDUCE_RAKING * and is only preferable when the reduction operator is non-commutative. This variant * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall @@ -123,12 +124,12 @@ enum BlockReduceAlgorithm /** - * \par Overview + * @par Overview * A quick "tiled warp-reductions" reduction algorithm that supports commutative * (e.g., addition) and non-commutative (e.g., string concatenation) reduction * operators. * - * \par + * @par * Execution is comprised of four phases: * -# Upsweep sequential reduction in registers (if threads contribute more * than one input each). Each thread then places the partial reduction @@ -138,11 +139,11 @@ enum BlockReduceAlgorithm * -# A propagation phase where the warp reduction outputs in each warp are * updated with the aggregate from each preceding warp. * - * \par - * \image html block_scan_warpscans.png + * @par + * @image html block_scan_warpscans.png *
\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
* - * \par Performance Considerations + * @par Performance Considerations * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING * or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall * throughput across the GPU. However turn-around latency may be lower and @@ -157,43 +158,67 @@ enum BlockReduceAlgorithm ******************************************************************************/ /** - * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png) - * \ingroup BlockModule + * @brief The BlockReduce class provides [collective](index.html#sec0) + * methods for computing a parallel reduction of items partitioned across + * a CUDA thread block. ![](reduce_logo.png) * - * \tparam T Data type being reduced - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. + * @ingroup BlockModule * - * \par Overview - * - A reduction (or fold) - * uses a binary combining operator to compute a single aggregate from a list of input elements. - * - \rowmajor - * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles: - * -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY. An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) - * -# cub::BLOCK_REDUCE_RAKING. An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) - * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * @tparam T + * Data type being reduced * - * \par Performance Considerations - * - \granularity + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam ALGORITHM + * [optional] cub::BlockReduceAlgorithm enumerator specifying + * the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) + * + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview + * - A reduction + * (or fold) uses a binary combining operator to compute a single aggregate from a list of + * input elements. + * - @rowmajor + * - BlockReduce can be optionally specialized by algorithm to accommodate different + * latency/throughput workload profiles: + * -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY. + * An efficient "raking" reduction algorithm that only + * supports commutative reduction operators. + * [More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_RAKING. + * An efficient "raking" reduction algorithm that supports commutative and + * non-commutative reduction operators. 
[More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. + * A quick "tiled warp-reductions" reduction algorithm that supports commutative and + * non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * + * @par Performance Considerations + * - @granularity * - Very efficient (only one synchronization barrier). * - Incurs zero bank conflicts for most types * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: * - Summation (vs. generic reduction) - * - \p BLOCK_THREADS is a multiple of the architecture's warp size + * - @p BLOCK_THREADS is a multiple of the architecture's warp size * - Every thread has a valid input (i.e., full vs. partial-tiles) * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives * - * \par A Simple Example - * \blockcollective{BlockReduce} - * \par + * @par A Simple Example + * @blockcollective{BlockReduce} + * @par * The code snippet below illustrates a sum reduction of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -211,9 +236,9 @@ enum BlockReduceAlgorithm * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data); * - * \endcode + * @endcode * - * \par Re-using dynamically allocating shared memory + * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: @@ -282,17 +307,17 @@ private: public: - /// \smemstorage{BlockReduce} + /// @smemstorage{BlockReduce} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockReduce() : @@ -300,38 +325,38 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. + * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockReduce( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockReduce(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Generic reductions + * @name Generic reductions *********************************************************************/ //@{ - /** - * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. 
Each thread contributes one input element. + * @brief Computes a block-wide reduction for thread0 using the specified binary + * reduction functor. Each thread contributes one input element. * - * \par + * @par * - The return value is undefined in threads other than thread0. - * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a max reduction of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -349,33 +374,39 @@ public: * // Compute the block-wide max for thread0 * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); * - * \endcode + * @endcode + * + * @tparam ReductionOp + * [inferred] Binary reduction functor type having member + * T operator()(const T &a, const T &b) * - * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * @param[in] input + * Calling thread's input + * + * @param[in] reduction_op + * Binary reduction functor */ template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op) ///< [in] Binary reduction functor + __device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op) { return InternalBlockReduce(temp_storage).template Reduce(input, BLOCK_THREADS, reduction_op); } - /** - * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements. + * @brief Computes a block-wide reduction for thread0 using the specified binary + * reduction functor. Each thread contributes an array of consecutive input elements. * - * \par + * @par * - The return value is undefined in threads other than thread0. - * - \granularity - * - \smemreuse + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a max reduction of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -393,37 +424,43 @@ public: * // Compute the block-wide max for thread0 * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); * - * \endcode + * @endcode + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * @tparam ReductionOp + * [inferred] Binary reduction functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] inputs + * Calling thread's input segment + * + * @param[in] reduction_op + * Binary reduction functor */ - template < - int ITEMS_PER_THREAD, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment - ReductionOp reduction_op) ///< [in] Binary reduction functor + template + __device__ __forceinline__ T Reduce(T (&inputs)[ITEMS_PER_THREAD], ReductionOp reduction_op) { // Reduce partials T partial = internal::ThreadReduce(inputs, reduction_op); return Reduce(partial, reduction_op); } - /** - * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first \p num_valid threads each contribute one input element. + * @brief Computes a block-wide reduction for thread0 using the specified binary + * reduction functor. The first @p num_valid threads each contribute one input element. * - * \par + * @par * - The return value is undefined in threads other than thread0. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a max reduction of a partially-full tile of integer items that - * are partitioned across 128 threads. - * \par - * \code + * - @rowmajor + * - @smemreuse + * + * @par Snippet + * The code snippet below illustrates a max reduction of a partially-full tile of integer items + * that are partitioned across 128 threads. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int num_valid, ...) @@ -441,15 +478,23 @@ public: * // Compute the block-wide max for thread0 * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid); * - * \endcode + * @endcode + * + * @tparam ReductionOp + * [inferred] Binary reduction functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input * - * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * @param[in] reduction_op + * Binary reduction functor + * + * @param[in] num_valid + * Number of threads containing valid elements (may be less than BLOCK_THREADS) */ template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op, ///< [in] Binary reduction functor - int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + __device__ __forceinline__ T Reduce(T input, ReductionOp reduction_op, int num_valid) { // Determine if we skip bounds checking if (num_valid >= BLOCK_THREADS) @@ -465,24 +510,24 @@ public: //@} end member group /******************************************************************//** - * \name Summation reductions + * @name Summation reductions *********************************************************************/ //@{ - /** - * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element. + * @brief Computes a block-wide reduction for thread0 using addition (+) + * as the reduction operator. Each thread contributes one input element. * - * \par + * @par * - The return value is undefined in threads other than thread0. 
- * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sum reduction of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -500,29 +545,32 @@ public: * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data); * - * \endcode + * @endcode * + * @param[in] input + * Calling thread's input */ - __device__ __forceinline__ T Sum( - T input) ///< [in] Calling thread's input + __device__ __forceinline__ T Sum(T input) { return InternalBlockReduce(temp_storage).template Sum(input, BLOCK_THREADS); } /** - * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements. + * @brief Computes a block-wide reduction for thread0 using addition (+) + * as the reduction operator. Each thread contributes an array of consecutive input + * elements. * - * \par + * @par * - The return value is undefined in threads other than thread0. - * - \granularity - * - \smemreuse + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a sum reduction of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -540,33 +588,37 @@ public: * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data); * - * \endcode + * @endcode * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @param[in] inputs + * Calling thread's input segment */ template - __device__ __forceinline__ T Sum( - T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment + __device__ __forceinline__ T Sum(T (&inputs)[ITEMS_PER_THREAD]) { // Reduce partials T partial = internal::ThreadReduce(inputs, cub::Sum()); return Sum(partial); } - /** - * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element. + * @brief Computes a block-wide reduction for thread0 using addition (+) + * as the reduction operator. The first @p num_valid threads each contribute one input + * element. * - * \par + * @par * - The return value is undefined in threads other than thread0. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that - * are partitioned across 128 threads. - * \par - * \code + * - @rowmajor + * - @smemreuse + * + * @par Snippet + * The code snippet below illustrates a sum reduction of a partially-full tile of integer items + * that are partitioned across 128 threads. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int num_valid, ...) 
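A minimal sketch of a partially-full-tile sum built on this num_valid overload; the kernel name, the pointers, and the choice of the commutative-only raking algorithm are illustrative:

#include <cub/cub.cuh>

__global__ void PartialTileSumKernel(int *d_in, int *d_block_sums, int num_valid)
{
    // Specialize BlockReduce for 128 threads; the commutative-only raking
    // variant is sufficient for integer addition
    typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    const int tid = static_cast<int>(threadIdx.x);

    // Only the first num_valid threads contribute a valid element
    int thread_data = (tid < num_valid) ? d_in[tid] : 0;

    // Compute the block-wide sum for thread0
    int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);

    // The aggregate is only defined in thread0
    if (tid == 0)
    {
        d_block_sums[blockIdx.x] = aggregate;
    }
}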
@@ -585,12 +637,15 @@ public: * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid); * - * \endcode + * @endcode + * + * @param[in] input + * Calling thread's input * + * @param[in] num_valid + * Number of threads containing valid elements (may be less than BLOCK_THREADS) */ - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input - int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + __device__ __forceinline__ T Sum(T input, int num_valid) { // Determine if we skip bounds checking if (num_valid >= BLOCK_THREADS) @@ -608,7 +663,7 @@ public: }; /** - * \example example_block_reduce.cu + * @example example_block_reduce.cu */ CUB_NAMESPACE_END diff --git a/cub/cub/block/block_run_length_decode.cuh b/cub/cub/block/block_run_length_decode.cuh index c14d36fa015..4e26f641f2f 100644 --- a/cub/cub/block/block_run_length_decode.cuh +++ b/cub/cub/block/block_run_length_decode.cuh @@ -47,19 +47,20 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \brief The BlockRunLengthDecode class supports decoding a run-length encoded array of items. That is, given - * the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] many times in the output - * array. - * Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded - * array is runtime-dependent and potentially without any upper bound. To address this, BlockRunLengthDecode allows - * retrieving a "window" from the run-length decoded array. The window's offset can be specified and BLOCK_THREADS * - * DECODED_ITEMS_PER_THREAD (i.e., referred to as window_size) decoded items from the specified window will be returned. + * @brief The BlockRunLengthDecode class supports decoding a run-length encoded array of items. That + * is, given the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] + * many times in the output array. Due to the nature of the run-length decoding algorithm + * ("decompression"), the output size of the run-length decoded array is runtime-dependent and + * potentially without any upper bound. To address this, BlockRunLengthDecode allows retrieving a + * "window" from the run-length decoded array. The window's offset can be specified and + * BLOCK_THREADS * DECODED_ITEMS_PER_THREAD (i.e., referred to as window_size) decoded items from + * the specified window will be returned. * - * \note: Trailing runs of length 0 are supported (i.e., they may only appear at the end of the run_lengths array). - * A run of length zero may not be followed by a run length that is not zero. + * @note: Trailing runs of length 0 are supported (i.e., they may only appear at the end of the + * run_lengths array). A run of length zero may not be followed by a run length that is not zero. * - * \par - * \code + * @par + * @code * __global__ void ExampleKernel(...) * { * // Specialising BlockRunLengthDecode to run-length decode items of type uint64_t @@ -107,23 +108,38 @@ CUB_NAMESPACE_BEGIN * ... * } * } - * \endcode - * \par - * Suppose the set of input \p run_values across the block of threads is + * @endcode + * @par + * Suppose the set of input @p run_values across the block of threads is * { [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] } and - * \p run_lengths is { [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }. 
- * The corresponding output \p decoded_items in those threads will be { [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4], - * [4, 4, 4, 5], ..., [169, 169, 170, 171] } and \p relative_offsets will be { [0, 0, 1, 0], [1, 2, 0, 1], [2, - * 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] } during the first iteration of the while loop. + * @p run_lengths is { [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }. + * The corresponding output @p decoded_items in those threads will be + * { [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4], [4, 4, 4, 5], ..., [169, 169, 170, 171] } + * and @p relative_offsets will be + * { [0, 0, 1, 0], [1, 2, 0, 1], [2, 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] } during the + * first iteration of the while loop. * - * \tparam ItemT The data type of the items being run-length decoded - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam RUNS_PER_THREAD The number of consecutive runs that each thread contributes - * \tparam DECODED_ITEMS_PER_THREAD The maximum number of decoded items that each thread holds - * \tparam DecodedOffsetT Type used to index into the block's decoded items (large enough to hold the sum over all the - * runs' lengths) - * \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension - * \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension + * @tparam ItemT + * The data type of the items being run-length decoded + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam RUNS_PER_THREAD + * The number of consecutive runs that each thread contributes + * + * @tparam DECODED_ITEMS_PER_THREAD + * The maximum number of decoded items that each thread holds + * + * @tparam DecodedOffsetT + * Type used to index into the block's decoded items (large enough to hold the sum over all the + * runs' lengths) + * + * @tparam BLOCK_DIM_Y + * The thread block length in threads along the Y dimension + * + * @tparam BLOCK_DIM_Z + * The thread block length in threads along the Z dimension */ template - __device__ __forceinline__ OffsetT StaticUpperBound(InputIteratorT input, ///< [in] Input sequence - OffsetT num_items, ///< [in] Input sequence length - T val) ///< [in] Search key + template + __device__ __forceinline__ OffsetT StaticUpperBound(InputIteratorT input, + OffsetT num_items, + T val) { OffsetT lower_bound = 0; OffsetT upper_bound = num_items; diff --git a/cub/cub/block/block_scan.cuh b/cub/cub/block/block_scan.cuh index 6629fc8b087..90f60e5ea24 100644 --- a/cub/cub/block/block_scan.cuh +++ b/cub/cub/block/block_scan.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. + * @file + * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a + * parallel prefix sum/scan of items partitioned across a CUDA thread block. */ #pragma once @@ -54,13 +55,14 @@ CUB_NAMESPACE_BEGIN ******************************************************************************/ /** - * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block. + * @brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a + * parallel prefix scan across a CUDA thread block. 
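The enumerator below is supplied as BlockScan's third (ALGORITHM) template argument; a small sketch, with the element type and block size chosen purely for illustration:

#include <cub/cub.cuh>

// Latency-oriented specialization: tiled warp-scans
typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS> LowLatencyBlockScan;

// Throughput-oriented specialization: the default raking variant
typedef cub::BlockScan<int, 128> RakingBlockScan;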
*/ enum BlockScanAlgorithm { /** - * \par Overview + * @par Overview * An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases: * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions. @@ -68,11 +70,11 @@ enum BlockScanAlgorithm * -# Downsweep sequential exclusive scan in shared memory. Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output. * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. * - * \par - * \image html block_scan_raking.png + * @par + * @image html block_scan_raking.png *
\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
* - * \par Performance Considerations + * @par Performance Considerations * - Although this variant may suffer longer turnaround latencies when the * GPU is under-occupied, it can often provide higher overall throughput * across the GPU when suitably occupied. @@ -81,7 +83,7 @@ enum BlockScanAlgorithm /** - * \par Overview + * @par Overview * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at * the expense of higher register pressure. Raking threads preserve their * "upsweep" segment of values in registers while performing warp-synchronous @@ -91,18 +93,18 @@ enum BlockScanAlgorithm /** - * \par Overview + * @par Overview * A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases: * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp. * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp. * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. * - * \par - * \image html block_scan_warpscans.png + * @par + * @image html block_scan_warpscans.png *
\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
* - * \par Performance Considerations + * @par Performance Considerations * - Although this variant may suffer lower overall throughput across the * GPU because due to a heavy reliance on inefficient warpscans, it can * often provide lower turnaround latencies when the GPU is under-occupied. @@ -116,50 +118,70 @@ enum BlockScanAlgorithm ******************************************************************************/ /** - * \brief The BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png) - * \ingroup BlockModule + * @brief The BlockScan class provides [collective](index.html#sec0) methods for + * computing a parallel prefix sum/scan of items partitioned across a + * CUDA thread block. ![](block_scan_logo.png) * - * \tparam T Data type being scanned - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ALGORITHM [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. + * @ingroup BlockModule * - * \par Overview + * @tparam T + * Data type being scanned + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam ALGORITHM + * [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) + * + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) * produces an output list where each element is computed to be the reduction * of the elements occurring earlier in the input list. Prefix sum - * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * connotes a prefix scan with the addition operator. The term @em inclusive indicates * that the ith output reduction incorporates the ith input. - * The term \em exclusive indicates the ith input is not incorporated into + * The term @em exclusive indicates the ith input is not incorporated into * the ith output reduction. * - \rowmajor * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles: - * -# cub::BLOCK_SCAN_RAKING. An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) - * -# cub::BLOCK_SCAN_RAKING_MEMOIZE. Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm) - * -# cub::BLOCK_SCAN_WARP_SCANS. A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_RAKING. + * An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. + * [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_RAKING_MEMOIZE. 
+ * Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional + * register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_WARP_SCANS. + * A quick (low latency) "tiled warpscans" prefix scan algorithm. + * [More...](\ref cub::BlockScanAlgorithm) * - * \par Performance Considerations - * - \granularity - * - Uses special instructions when applicable (e.g., warp \p SHFL) + * @par Performance Considerations + * - @granularity + * - Uses special instructions when applicable (e.g., warp @p SHFL) * - Uses synchronization-free communication between warp lanes when applicable * - Invokes a minimal number of minimal block-wide synchronization barriers (only * one or two depending on algorithm selection) * - Incurs zero bank conflicts for most types * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: * - Prefix sum variants (vs. generic scan) - * - \blocksize + * - @blocksize * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives * - * \par A Simple Example - * \blockcollective{BlockScan} - * \par + * @par A Simple Example + * @blockcollective{BlockScan} + * @par * The code snippet below illustrates an exclusive prefix sum of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -177,14 +199,14 @@ enum BlockScanAlgorithm * // Collectively compute the block-wide exclusive prefix sum * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * {[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}. - * The corresponding output \p thread_data in those threads will be + * The corresponding output @p thread_data in those threads will be * {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. * - * \par Re-using dynamically allocating shared memory + * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: @@ -265,17 +287,17 @@ private: ******************************************************************************/ public: - /// \smemstorage{BlockScan} + /// @smemstorage{BlockScan} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockScan() : @@ -283,39 +305,41 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. 
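Passing an explicit TempStorage is what allows one shared allocation to back several collectives in turn; a sketch of that pattern, assuming a union of the two storage types and a barrier between uses (kernel name and data are illustrative):

#include <cub/cub.cuh>

__global__ void ReusedStorageKernel(int *d_data)
{
    typedef cub::BlockScan<int, 128>   BlockScan;
    typedef cub::BlockReduce<int, 128> BlockReduce;

    // One shared allocation serves both collectives, one after the other
    __shared__ union
    {
        typename BlockScan::TempStorage   scan;
        typename BlockReduce::TempStorage reduce;
    } temp_storage;

    int thread_data = d_data[threadIdx.x];

    // First collective: block-wide exclusive prefix sum
    BlockScan(temp_storage.scan).ExclusiveSum(thread_data, thread_data);

    // The storage may only be repurposed once all threads are done with it
    __syncthreads();

    // Second collective: block-wide sum of the scanned values (valid in thread0 only)
    int aggregate = BlockReduce(temp_storage.reduce).Sum(thread_data);

    if (threadIdx.x == 0)
    {
        d_data[0] = aggregate;
    }
}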
+ * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockScan( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockScan(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Exclusive prefix sum operations + * @name Exclusive prefix sum operations *********************************************************************/ //@{ - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. + * @brief Computes an exclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes one input element. + * The value of 0 is applied as the initial value, and is assigned to + * @p output in thread0. * - * \par - * - \identityzero - * - \rowmajor - * - \smemreuse + * @par + * - @identityzero + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an exclusive prefix sum of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -333,35 +357,42 @@ public: * // Collectively compute the block-wide exclusive prefix sum * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 1, 1, ..., 1. The corresponding output @p thread_data in those + * threads will be 0, 1, ..., 127. + * + * @param[in] input + * Calling thread's input item * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + __device__ __forceinline__ void ExclusiveSum(T input, T &output) { T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum()); } - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - \rowmajor - * - \smemreuse - * - * \par Snippet + * @brief Computes an exclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes one input element. + * The value of 0 is applied as the initial value, and is assigned to + * @p output in thread0. Also provides every thread + * with the block-wide @p block_aggregate of all inputs. 
+ * + * @par + * - @identityzero + * - @rowmajor + * - @smemreuse + * + * @par Snippet * The code snippet below illustrates an exclusive prefix sum of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -380,43 +411,56 @@ public: * int block_aggregate; * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. - * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 1, 1, ..., 1. The corresponding output @p thread_data in those + * threads will be 0, 1, ..., 127. Furthermore the value @p 128 will + * be stored in @p block_aggregate for all threads. + * + * @param[in] input + * Calling thread's input item * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &block_aggregate) { T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); } - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \rowmajor - * - \smemreuse - * - * \par Snippet + * @brief Computes an exclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes one input element. + * Instead of using 0 as the block-wide prefix, the call-back functor + * @p block_prefix_callback_op is invoked by the first warp in the block, + * and the value returned by lane0 in that warp is used + * as the "seed" value that logically prefixes the thread block's scan inputs. + * Also provides every thread with the block-wide @p block_aggregate of all inputs. + * + * @par + * - @identityzero + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). The functor's input parameter + * @p block_aggregate is the same value also returned by the scan operation. 
+ * The functor will be invoked by the first warp of threads in the block, + * however only the return value from lane0 is applied + * as the block-wide prefix. Can be stateful. + * - @rowmajor + * - @smemreuse + * + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an exclusive prefix sum over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -464,19 +508,30 @@ public: * // Store scanned items to output segment * d_data[block_offset] = thread_data; * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * @endcode + * @par + * Suppose the input @p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... * The corresponding output for the first segment will be 0, 1, ..., 127. * The output for the second segment will be 128, 129, ..., 255. * - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a + * block-wide prefix to be applied to the logical input sequence. */ template - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + __device__ __forceinline__ void ExclusiveSum(T input, + T &output, + BlockPrefixCallbackOp &block_prefix_callback_op) { ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); } @@ -484,26 +539,28 @@ public: //@} end member group /******************************************************************//** - * \name Exclusive prefix sum operations (multiple data per thread) + * @name Exclusive prefix sum operations (multiple data per thread) *********************************************************************/ //@{ - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. - * - * \par - * - \identityzero - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet + * @brief Computes an exclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes an array of consecutive + * input elements. The value of 0 is applied as the initial value, and is + * assigned to @p output[0] in thread0. + * + * @par + * - @identityzero + * - @blocked + * - @granularity + * - @smemreuse + * + * @par Snippet * The code snippet below illustrates an exclusive prefix sum of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. 
- * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -521,39 +578,50 @@ public: * // Collectively compute the block-wide exclusive prefix sum * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The corresponding output + * @p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to @p input) */ template - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + __device__ __forceinline__ void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD]) { T initial_value{}; ExclusiveScan(input, output, initial_value, cub::Sum()); } - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet + * @brief Computes an exclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes an array of consecutive + * input elements. The value of 0 is applied as the initial value, and is + * assigned to @p output[0] in thread0. Also provides + * every thread with the block-wide @p block_aggregate of all inputs. + * + * @par + * - @identityzero + * - @blocked + * - @granularity + * - @smemreuse + * + * @par Snippet * The code snippet below illustrates an exclusive prefix sum of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -572,19 +640,30 @@ public: * int block_aggregate; * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output @p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. 
+ * Furthermore the value @p 512 will be stored in @p block_aggregate for all threads. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @param[in] input + * Calling thread's input items * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * @param[out] output + * Calling thread's output items (may be aliased to \p input) + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ template - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + __device__ __forceinline__ void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + T &block_aggregate) { // Reduce consecutive thread items in registers T initial_value{}; @@ -592,28 +671,38 @@ public: ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); } - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet + * @brief Computes an exclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes an array of consecutive + * input elements. Instead of using 0 as the block-wide prefix, the + * call-back functor @p block_prefix_callback_op is invoked by the first warp + * in the block, and the value returned by lane0 in that + * warp is used as the "seed" value that logically prefixes the thread block's + * scan inputs. Also provides every thread with the block-wide + * @p block_aggregate of all inputs. + * + * @par + * - @identityzero + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). + * The functor's input parameter @p block_aggregate is the same value also returned + * by the scan operation. The functor will be invoked by the first warp of threads in + * the block, however only the return value from + * lane0 is applied as the block-wide prefix. + * Can be stateful. + * - @blocked + * - @granularity + * - @smemreuse + * + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an exclusive prefix sum over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. 
Each tile consists - * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) - * across 128 threads where each thread owns 4 consecutive items. - * \par - * \code + * of 512 integer items that are partitioned in a [blocked + * arrangement](index.html#sec5sec3) across 128 threads where each thread owns 4 + * consecutive items. + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -671,22 +760,34 @@ public: * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); * CTA_SYNC(); * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 0, 1, 2, 3, ..., 510, 511. - * The output for the second segment will be 512, 513, 514, 515, ..., 1022, 1023. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @endcode + * @par + * Suppose the input @p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be + * 0, 1, 2, 3, ..., 510, 511. The output for the second segment + * will be 512, 513, 514, 515, ..., 1022, 1023. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to \p input) + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a + * block-wide prefix to be applied to the logical input sequence. */ - template < - int ITEMS_PER_THREAD, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + template + __device__ __forceinline__ void ExclusiveSum(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + BlockPrefixCallbackOp &block_prefix_callback_op) { ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); } @@ -695,24 +796,24 @@ public: //@} end member group // Exclusive prefix sums /******************************************************************//** - * \name Exclusive prefix scan operations + * @name Exclusive prefix scan operations *********************************************************************/ //@{ - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + * @brief Computes an exclusive block-wide prefix scan using the specified binary + * @p scan_op functor. Each thread contributes one input element. * - * \par + * @par * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that * are partitioned across 128 threads. 
- * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -730,37 +831,51 @@ public: * // Collectively compute the block-wide exclusive prefix max scan * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 0, -1, 2, -3, ..., 126, -127. The corresponding output @p thread_data + * in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[in] initial_value + * Initial value to seed the exclusive scan (and is assigned to @p output[0] in + * thread0) + * + * @param[in] scan_op + * Binary scan functor */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op) ///< [in] Binary scan functor + __device__ __forceinline__ void + ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); } - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * @brief Computes an exclusive block-wide prefix scan using the specified + * binary @p scan_op functor. Each thread contributes one input element. + * Also provides every thread with the block-wide @p block_aggregate of all inputs. * - * \par + * @par * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -779,45 +894,66 @@ public: * int block_aggregate; * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. - * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 0, -1, 2, -3, ..., 126, -127. The corresponding output + * @p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * Furthermore the value @p 126 will be stored in @p block_aggregate for all threads. 
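+ *
+ * @par
+ * The scan operator does not have to be one of the functors shipped with CUB.
+ * A minimal sketch using a user-defined operator (illustrative only; @p CustomMin
+ * and @p d_out are placeholders defined by the example):
+ * @code
+ * #include <cub/cub.cuh>
+ * #include <climits>
+ *
+ * // User-defined associative binary operator
+ * struct CustomMin
+ * {
+ *     __device__ __forceinline__ int operator()(const int &a, const int &b) const
+ *     {
+ *         return (b < a) ? b : a;
+ *     }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_out)
+ * {
+ *     using BlockScan = cub::BlockScan<int, 128>;
+ *     __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ *     int thread_data = 128 - static_cast<int>(threadIdx.x);
+ *     int block_aggregate;
+ *
+ *     // Exclusive prefix min, seeded with INT_MAX
+ *     BlockScan(temp_storage).ExclusiveScan(
+ *         thread_data, thread_data, INT_MAX, CustomMin(), block_aggregate);
+ *
+ *     // block_aggregate is the minimum of all 128 inputs (1 for this input)
+ *     if (threadIdx.x == 0) { *d_out = block_aggregate; }
+ * }
+ * @endcode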
+ * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[in] initial_value + * Initial value to seed the exclusive scan (and is assigned to + * @p output[0] in thread0) * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[in] scan_op + * Binary scan functor + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + __device__ __forceinline__ void + ExclusiveScan(T input, T &output, T initial_value, ScanOp scan_op, T &block_aggregate) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); } - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes one input element. + * the call-back functor @p block_prefix_callback_op is invoked by the first warp + * in the block, and the value returned by lane0 in that warp + * is used as the "seed" value that logically prefixes the thread block's scan + * inputs. Also provides every thread with the block-wide @p block_aggregate of + * all inputs. + * + * @par + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). The functor's input parameter @p block_aggregate + * is the same value also returned by the scan operation. The functor will be invoked by the + * first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an exclusive prefix max scan over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. 
- * \par - * \code + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -865,23 +1001,39 @@ public: * // Store scanned items to output segment * d_data[block_offset] = thread_data; * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 124, 126. - * The output for the second segment will be 126, 128, 128, 130, ..., 252, 254. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @endcode + * @par + * Suppose the input @p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be + * INT_MIN, 0, 0, 2, ..., 124, 126. The output for the second segment + * will be 126, 128, 128, 130, ..., 252, 254. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a block-wide + * prefix to be applied to the logical input sequence. */ - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + template + __device__ __forceinline__ void ExclusiveScan(T input, + T &output, + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); } @@ -889,26 +1041,27 @@ public: //@} end member group // Inclusive prefix sums /******************************************************************//** - * \name Exclusive prefix scan operations (multiple data per thread) + * @name Exclusive prefix scan operations (multiple data per thread) *********************************************************************/ //@{ - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes an + * array of consecutive input elements. * - * \par + * @par * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet - * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. 
- * \par - * \code + * @par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 512 integer + * items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + * across 128 threads where each thread owns 4 consecutive items. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -926,24 +1079,38 @@ public: * // Collectively compute the block-wide exclusive prefix max scan * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. - * The corresponding output \p thread_data in those threads will be + * The corresponding output @p thread_data in those threads will be * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[in] initial_value + * Initial value to seed the exclusive scan (and is assigned to @p output[0] in + * thread0) + * + * @param[in] scan_op + * Binary scan functor */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op) ///< [in] Binary scan functor + template + __device__ __forceinline__ void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + T initial_value, + ScanOp scan_op) { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); @@ -955,22 +1122,24 @@ public: internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); } - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes an + * array of consecutive input elements. Also provides every thread + * with the block-wide @p block_aggregate of all inputs. * - * \par + * @par * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. 
- * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -989,24 +1158,43 @@ public: * int block_aggregate; * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The - * corresponding output \p thread_data in those threads will be { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. - * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output @p thread_data in those threads will be + * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. + * Furthermore the value @p 510 will be stored in @p block_aggregate for all threads. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param input + * [in] Calling thread's input items + * + * @param output + * [out] Calling thread's output items (may be aliased to @p input) + * + * @param initial_value + * [in] Initial value to seed the exclusive scan + * (and is assigned to @p output[0] in thread0) + * + * @param scan_op + * [in] Binary scan functor + * + * @param block_aggregate + * [out] block-wide aggregate reduction of input items */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + template + __device__ __forceinline__ void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + T initial_value, + ScanOp scan_op, + T &block_aggregate) { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); @@ -1018,27 +1206,33 @@ public: internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); } - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. 
- * The functor will be invoked by the first warp of threads in the block, however only the return value from + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes an + * array of consecutive input elements. The call-back functor + * @p block_prefix_callback_op is invoked by the first warp in the block, + * and the value returned by lane0 in that warp is used as + * the "seed" value that logically prefixes the thread block's scan inputs. + * Also provides every thread with the block-wide @p block_aggregate of all inputs. + * + * @par + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). The functor's input parameter @p block_aggregate + * is the same value also returned by the scan operation. The functor will be invoked by the + * first warp of threads in the block, however only the return value from * lane0 is applied as the block-wide prefix. Can be stateful. * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an exclusive prefix max scan over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -1095,25 +1289,43 @@ public: * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); * CTA_SYNC(); * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. - * The output for the second segment will be 510, 512, 512, 514, 514, 516, ..., 1020, 1022. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @endcode + * @par + * Suppose the input @p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be + * INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. + * The output for the second segment will be + * 510, 512, 512, 514, 514, 516, ..., 1020, 1022. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param input + * [in] Calling thread's input items + * + * @param output + * [out] Calling thread's output items (may be aliased to @p input) + * + * @param scan_op + * [in] Binary scan functor + * + * @param block_prefix_callback_op + * [in-out] [warp0 only] Call-back functor for + * specifying a block-wide prefix to be applied to the logical input sequence. 
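+ *
+ * @par
+ * A sketch of how the single-block kernel shown above might be driven from the
+ * host (illustrative only; the sizes, names, and input pattern are assumptions
+ * of this example):
+ * @code
+ * #include <vector>
+ *
+ * int main()
+ * {
+ *     int num_items = 4096;                      // a multiple of the 512-item tile size
+ *     std::vector<int> h_data(num_items);
+ *     for (int i = 0; i < num_items; ++i)        // 0, -1, 2, -3, ... as assumed above
+ *         h_data[i] = (i % 2) ? -i : i;
+ *
+ *     int *d_data;
+ *     cudaMalloc(&d_data, num_items * sizeof(int));
+ *     cudaMemcpy(d_data, h_data.data(), num_items * sizeof(int), cudaMemcpyHostToDevice);
+ *
+ *     // A single block of 128 threads cooperatively scans all tiles in order
+ *     ExampleKernel<<<1, 128>>>(d_data, num_items);
+ *     cudaDeviceSynchronize();
+ *
+ *     cudaFree(d_data);
+ *     return 0;
+ * }
+ * @endcode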
*/ - template < - int ITEMS_PER_THREAD, - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + template + __device__ __forceinline__ void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { // Reduce consecutive thread items in registers T thread_prefix = internal::ThreadReduce(input, scan_op); @@ -1130,77 +1342,113 @@ public: #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans /******************************************************************//** - * \name Exclusive prefix scan operations (no initial value, single datum per thread) + * @name Exclusive prefix scan operations (no initial value, single datum per thread) *********************************************************************/ //@{ - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes + * one input element. With no initial value, the output computed + * for thread0 is undefined. * - * \par + * @par * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[in] scan_op + * Binary scan functor */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor + __device__ __forceinline__ void ExclusiveScan(T input, T &output, ScanOp scan_op) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); } - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes + * one input element. Also provides every thread with the block-wide + * @p block_aggregate of all inputs. With no initial value, the output + * computed for thread0 is undefined. * - * \par + * @par * - Supports non-commutative scan operators. 
- * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input item * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[out] output + * Calling thread's output item (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + __device__ __forceinline__ void + ExclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate) { InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); } //@} end member group /******************************************************************//** - * \name Exclusive prefix scan operations (no initial value, multiple data per thread) + * @name Exclusive prefix scan operations (no initial value, multiple data per thread) *********************************************************************/ //@{ - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no initial value, the output computed for thread0 is undefined. + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes an + * array of consecutive input elements. With no initial value, the + * output computed for thread0 is undefined. * - * \par + * @par * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor + template + __device__ __forceinline__ void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + ScanOp scan_op) { // Reduce consecutive thread items in registers T thread_partial = internal::ThreadReduce(input, scan_op); @@ -1212,27 +1460,44 @@ public: internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); } - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. 
Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. - * - * \par + * @brief Computes an exclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes an + * array of consecutive input elements. Also provides every thread + * with the block-wide @p block_aggregate of all inputs. + * With no initial value, the output computed for + * thread0 is undefined. + * + * @par * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan functor + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + template + __device__ __forceinline__ void ExclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + ScanOp scan_op, + T &block_aggregate) { // Reduce consecutive thread items in registers T thread_partial = internal::ThreadReduce(input, scan_op); @@ -1249,23 +1514,23 @@ public: #endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans /******************************************************************//** - * \name Inclusive prefix sum operations + * @name Inclusive prefix sum operations *********************************************************************/ //@{ - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. + * @brief Computes an inclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes one input element. * - * \par - * - \rowmajor - * - \smemreuse + * @par + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an inclusive prefix sum of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1283,32 +1548,37 @@ public: * // Collectively compute the block-wide inclusive prefix sum * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. 
+ * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 1, 1, ..., 1. The corresponding output @p thread_data in those threads + * will be 1, 2, ..., 128. * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + __device__ __forceinline__ void InclusiveSum(T input, T &output) { InclusiveScan(input, output, cub::Sum()); } - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * @brief Computes an inclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes one input element. + * Also provides every thread with the block-wide @p block_aggregate of all inputs. * - * \par - * - \rowmajor - * - \smemreuse + * @par + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an inclusive prefix sum of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1327,41 +1597,54 @@ public: * int block_aggregate; * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. - * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 1, 1, ..., 1. The corresponding output @p thread_data in those + * threads will be 1, 2, ..., 128. Furthermore the value @p 128 will + * be stored in @p block_aggregate for all threads. + * + * @param[in] input + * Calling thread's input item * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + __device__ __forceinline__ void InclusiveSum(T input, T &output, T &block_aggregate) { InclusiveScan(input, output, cub::Sum(), block_aggregate); } - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. 
- * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \rowmajor - * - \smemreuse - * - * \par Snippet + * @brief Computes an inclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes one input element. + * Instead of using 0 as the block-wide prefix, the call-back functor + * @p block_prefix_callback_op is invoked by the first warp in the block, + * and the value returned by lane0 in that warp is + * used as the "seed" value that logically prefixes the thread block's + * scan inputs. Also provides every thread with the block-wide + * @p block_aggregate of all inputs. + * + * @par + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). The functor's input parameter + * @p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, + * however only the return value from lane0 is applied + * as the block-wide prefix. Can be stateful. + * - @rowmajor + * - @smemreuse + * + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an inclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code + * prefix functor to maintain a running total between block-wide scans. + * Each tile consists of 128 integer items that are partitioned across 128 threads. + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -1409,19 +1692,30 @@ public: * // Store scanned items to output segment * d_data[block_offset] = thread_data; * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * @endcode + * @par + * Suppose the input @p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... * The corresponding output for the first segment will be 1, 2, ..., 128. * The output for the second segment will be 129, 130, ..., 256. * - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a + * block-wide prefix to be applied to the logical input sequence. */ template - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ __device__ __forceinline__ void InclusiveSum(T input, + T &output, + BlockPrefixCallbackOp &block_prefix_callback_op) { InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); } @@ -1429,25 +1723,26 @@ public: //@} end member group /******************************************************************//** - * \name Inclusive prefix sum operations (multiple data per thread) + * @name Inclusive prefix sum operations (multiple data per thread) *********************************************************************/ //@{ - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. + * @brief Computes an inclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes an array of + * consecutive input elements. * - * \par - * - \blocked - * - \granularity - * - \smemreuse + * @par + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an inclusive prefix sum of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1465,17 +1760,25 @@ public: * // Collectively compute the block-wide inclusive prefix sum * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The corresponding output + * @p thread_data in those threads will be + * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @param[in] input + * Calling thread's input items * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * @param[out] output + * Calling thread's output items (may be aliased to @p input) */ template - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + __device__ __forceinline__ void InclusiveSum(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD]) { if (ITEMS_PER_THREAD == 1) { @@ -1495,21 +1798,23 @@ public: } } - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * @brief Computes an inclusive block-wide prefix scan using addition (+) + * as the scan operator. Each thread contributes an array of consecutive + * input elements. Also provides every thread with the block-wide + * @p block_aggregate of all inputs. 
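+ *
+ * @par
+ * A minimal sketch highlighting that, for an inclusive sum, the aggregate equals
+ * the last item produced by the last thread (illustrative only; @p d_check is a
+ * placeholder output):
+ * @code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_check)
+ * {
+ *     using BlockScan = cub::BlockScan<int, 128>;
+ *     __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ *     int thread_data[4] = {1, 1, 1, 1};
+ *     int block_aggregate;
+ *
+ *     // Inclusive prefix sum over 128 threads x 4 items
+ *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
+ *
+ *     // Both values below are 512 for this all-ones input
+ *     if (threadIdx.x == 127)
+ *     {
+ *         d_check[0] = thread_data[3];
+ *         d_check[1] = block_aggregate;
+ *     }
+ * }
+ * @endcode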
* - * \par - * - \blocked - * - \granularity - * - \smemreuse + * @par + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an inclusive prefix sum of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1528,22 +1833,34 @@ public: * int block_aggregate; * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be + * corresponding output @p thread_data in those threads will be * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. - * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * Furthermore the value @p 512 will be stored in @p block_aggregate for all threads. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ template - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + __device__ __forceinline__ void InclusiveSum(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + T &block_aggregate) { if (ITEMS_PER_THREAD == 1) { @@ -1563,27 +1880,36 @@ public: } } - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet + * @brief Computes an inclusive block-wide prefix scan using addition (+) + * as the scan operator. 
Each thread contributes an array of consecutive + * input elements. Instead of using 0 as the block-wide prefix, the + * call-back functor @p block_prefix_callback_op is invoked by the first + * warp in the block, and the value returned by lane0 + * in that warp is used as the "seed" value that logically prefixes the + * thread block's scan inputs. Also provides every thread with the + * block-wide @p block_aggregate of all inputs. + * + * @par + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). The functor's input parameter + * @p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, + * however only the return value from lane0 is applied + * as the block-wide prefix. Can be stateful. + * - @blocked + * - @granularity + * - @smemreuse + * + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an inclusive prefix sum over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) - * across 128 threads where each thread owns 4 consecutive items. - * \par - * \code + * of 512 integer items that are partitioned in a + * [blocked arrangement](index.html#sec5sec3) across 128 threads where each thread + * owns 4 consecutive items. + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -1640,22 +1966,34 @@ public: * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); * CTA_SYNC(); * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 1, 2, 3, 4, ..., 511, 512. - * The output for the second segment will be 513, 514, 515, 516, ..., 1023, 1024. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @endcode + * @par + * Suppose the input @p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be + * 1, 2, 3, 4, ..., 511, 512. The output for the second segment will be + * 513, 514, 515, 516, ..., 1023, 1024. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a + * block-wide prefix to be applied to the logical input sequence. */ - template < - int ITEMS_PER_THREAD, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ template + __device__ __forceinline__ void InclusiveSum(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + BlockPrefixCallbackOp &block_prefix_callback_op) { if (ITEMS_PER_THREAD == 1) { @@ -1678,24 +2016,25 @@ public: //@} end member group /******************************************************************//** - * \name Inclusive prefix scan operations + * @name Inclusive prefix scan operations *********************************************************************/ //@{ - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + * @brief Computes an inclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes + * one input element. * - * \par + * @par * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that * are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1713,36 +2052,47 @@ public: * // Collectively compute the block-wide inclusive prefix max scan * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 0, -1, 2, -3, ..., 126, -127. The corresponding output @p thread_data + * in those threads will be 0, 0, 2, 2, ..., 126, 126. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param input + * [in] Calling thread's input item * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param output + * [out] Calling thread's output item (may be aliased to @p input) + * + * @param scan_op + * [in] Binary scan functor */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor + __device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op) { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); } - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * @brief Computes an inclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes + * one input element. Also provides every thread with the block-wide + * @p block_aggregate of all inputs. * - * \par + * @par * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. 
- * \par - * \code + * - @rowmajor + * - @smemreuse + * + * @par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 128 + * integer items that are partitioned across 128 threads. + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1761,44 +2111,63 @@ public: * int block_aggregate; * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. - * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * 0, -1, 2, -3, ..., 126, -127. The corresponding output @p thread_data + * in those threads will be 0, 0, 2, 2, ..., 126, 126. Furthermore the value + * @p 126 will be stored in @p block_aggregate for all threads. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + __device__ __forceinline__ void + InclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate) { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); } - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. + * @brief Computes an inclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes + * one input element. The call-back functor @p block_prefix_callback_op + * is invoked by the first warp in the block, and the value returned by + * lane0 in that warp is used as the "seed" value + * that logically prefixes the thread block's scan inputs. + * Also provides every thread with the block-wide @p block_aggregate of all inputs. 
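For readers skimming the refactored comments, a minimal sketch of a functor satisfying the T operator()(T block_aggregate) contract described above may help; the struct name BlockPrefixCallbackOp and the use of int are illustrative assumptions, mirroring the stateful-functor pattern referenced by the snippets in this file:

// Illustrative sketch only: a stateful callback functor that keeps a running
// total between consecutive tile-wide scans.
struct BlockPrefixCallbackOp
{
    // Running total carried across tiles
    int running_total;

    __device__ BlockPrefixCallbackOp(int running_total)
        : running_total(running_total)
    {}

    // Invoked by the first warp of the block once per tile. block_aggregate
    // is the aggregate of the current tile's inputs; only the value returned
    // by lane0 is applied as the block-wide prefix for that tile's scan.
    __device__ int operator()(int block_aggregate)
    {
        int old_prefix = running_total;
        running_total += block_aggregate;
        return old_prefix;
    }
};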
+ * + * @par + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). The functor's input parameter + * @p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, + * however only the return value from lane0 is applied + * as the block-wide prefix. Can be stateful. * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse + * - @rowmajor + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an inclusive prefix max scan over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -1846,23 +2215,39 @@ public: * // Store scanned items to output segment * d_data[block_offset] = thread_data; * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 126, 126. - * The output for the second segment will be 128, 128, 130, 130, ..., 254, 254. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @endcode + * @par + * Suppose the input @p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be + * 0, 0, 2, 2, ..., 126, 126. The output for the second segment + * will be 128, 128, 130, 130, ..., 254, 254. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a + * block-wide prefix to be applied to the logical input sequence. */ - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ template + __device__ __forceinline__ void InclusiveScan(T input, + T &output, + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op); } @@ -1870,26 +2255,27 @@ public: //@} end member group /******************************************************************//** - * \name Inclusive prefix scan operations (multiple data per thread) + * @name Inclusive prefix scan operations (multiple data per thread) *********************************************************************/ //@{ - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * @brief Computes an inclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes + * an array of consecutive input elements. * - * \par + * @par * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1907,21 +2293,33 @@ public: * // Collectively compute the block-wide inclusive prefix max scan * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The - * corresponding output \p thread_data in those threads will be { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output @p thread_data in those threads will be + * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input items * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor + template + __device__ __forceinline__ void InclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + ScanOp scan_op) { if (ITEMS_PER_THREAD == 1) { @@ -1940,22 +2338,24 @@ public: } } - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. 
Also provides every thread with the block-wide \p block_aggregate of all inputs. + * @brief Computes an inclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes + * an array of consecutive input elements. Also provides every thread + * with the block-wide @p block_aggregate of all inputs. * - * \par + * @par * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -1972,27 +2372,41 @@ public: * * // Collectively compute the block-wide inclusive prefix max scan * int block_aggregate; - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), + * block_aggregate); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of input @p thread_data across the block of threads is * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. - * The corresponding output \p thread_data in those threads will be + * The corresponding output @p thread_data in those threads will be * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. - * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * Furthermore the value @p 510 will be stored in @p block_aggregate for all threads. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Calling thread's input items * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor + * + * @param[out] block_aggregate + * block-wide aggregate reduction of input items */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + template + __device__ __forceinline__ void InclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + ScanOp scan_op, + T &block_aggregate) { if (ITEMS_PER_THREAD == 1) { @@ -2011,27 +2425,34 @@ public: } } - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. 
the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. + * @brief Computes an inclusive block-wide prefix scan using the + * specified binary @p scan_op functor. Each thread contributes an + * array of consecutive input elements. The call-back functor + * @p block_prefix_callback_op is invoked by the first warp in the block, + * and the value returned by lane0 in that warp is used + * as the "seed" value that logically prefixes the thread block's scan inputs. + * Also provides every thread with the block-wide @p block_aggregate of all inputs. + * + * @par + * - The @p block_prefix_callback_op functor must implement a member function + * T operator()(T block_aggregate). The functor's input parameter + * @p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, + * however only the return value from lane0 is applied + * as the block-wide prefix. Can be stateful. * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse + * - @blocked + * - @granularity + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates a single thread block that progressively * computes an inclusive prefix max scan over multiple "tiles" of input using a * prefix functor to maintain a running total between block-wide scans. Each tile consists * of 128 integer items that are partitioned across 128 threads. - * \par - * \code + * @par + * @code * #include // or equivalently * * // A stateful callback functor that maintains a running prefix to be applied @@ -2088,25 +2509,42 @@ public: * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); * CTA_SYNC(); * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be 0, 0, 2, 2, 4, 4, ..., 510, 510. - * The output for the second segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + * @endcode + * @par + * Suppose the input @p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be + * 0, 0, 2, 2, 4, 4, ..., 510, 510. The output for the second + * segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. 
+ * + * @tparam ScanOp + * [inferred] Binary scan functor type having member + * T operator()(const T &a, const T &b) + * + * @tparam BlockPrefixCallbackOp + * [inferred] Call-back functor type having member + * T operator()(T block_aggregate) + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan functor + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a + * block-wide prefix to be applied to the logical input sequence. */ - template < - int ITEMS_PER_THREAD, - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + template + __device__ __forceinline__ void InclusiveScan(T (&input)[ITEMS_PER_THREAD], + T (&output)[ITEMS_PER_THREAD], + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { if (ITEMS_PER_THREAD == 1) { diff --git a/cub/cub/block/block_shuffle.cuh b/cub/cub/block/block_shuffle.cuh index 4118344df7a..3a15499370e 100644 --- a/cub/cub/block/block_shuffle.cuh +++ b/cub/cub/block/block_shuffle.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * The cub::BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. + * @file + * The cub::BlockShuffle class provides [collective](index.html#sec0) methods for shuffling + * data partitioned across a CUDA thread block. */ #pragma once @@ -47,16 +48,27 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \brief The BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. - * \ingroup BlockModule + * @brief The BlockShuffle class provides [collective](index.html#sec0) + * methods for shuffling data partitioned across a CUDA thread block. * - * \tparam T The data type to be exchanged. - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. + * @ingroup BlockModule * - * \par Overview + * @tparam T + * The data type to be exchanged. + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview * It is commonplace for blocks of threads to rearrange data items between * threads. The BlockShuffle abstraction allows threads to efficiently shift items * either (a) up to their successor or (b) down to their predecessor. 
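Since the BlockShuffle comments below carry no usage snippet of their own, a brief illustrative kernel is sketched here; the kernel name, the 128-thread block, and the int data type are assumptions, and only members documented in this patch (the TempStorage constructor and Offset) are used:

#include <cub/block/block_shuffle.cuh>

__global__ void ExampleKernel(int *d_data)
{
    // Specialize BlockShuffle for a 1D block of 128 threads owning int items
    typedef cub::BlockShuffle<int, 128> BlockShuffle;

    // Allocate shared memory for BlockShuffle
    __shared__ typename BlockShuffle::TempStorage temp_storage;

    // Each thread obtains one item
    int thread_data = d_data[threadIdx.x];

    // Each thread i receives the item contributed by thread i+1; threads for
    // which i + distance falls outside the block keep their original output
    int successor_item = thread_data;
    BlockShuffle(temp_storage).Offset(thread_data, successor_item, 1);
}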
@@ -127,12 +139,13 @@ private: public: /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of + * shared memory as temporary storage. */ __device__ __forceinline__ BlockShuffle() : @@ -140,35 +153,46 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation + * as temporary storage. + * + * @param[in] temp_storage + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockShuffle( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockShuffle(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Shuffle movement + * @name Shuffle movement *********************************************************************/ //@{ - /** - * \brief Each threadi obtains the \p input provided by threadi+distance. The offset \p distance may be negative. + * @brief Each threadi obtains the @p input provided by + * threadi+distance. + * The offset @p distance may be negative. * - * \par - * - \smemreuse + * @par + * - @smemreuse + * + * @param[in] input + * The input item from the calling thread (threadi) + * + * @param[out] output + * The @p input item from the successor (or predecessor) thread + * threadi+distance (may be aliased to @p input). + * This value is only updated for for threadi when + * 0 <= (i + \p distance) < BLOCK_THREADS-1 + * + * @param[in] distance + * Offset distance (may be negative) */ - __device__ __forceinline__ void Offset( - T input, ///< [in] The input item from the calling thread (threadi) - T& output, ///< [out] The \p input item from the successor (or predecessor) thread threadi+distance (may be aliased to \p input). This value is only updated for for threadi when 0 <= (i + \p distance) < BLOCK_THREADS-1 - int distance = 1) ///< [in] Offset distance (may be negative) + __device__ __forceinline__ void Offset(T input, T &output, int distance = 1) { temp_storage[linear_tid] = input; @@ -181,17 +205,26 @@ public: } } - /** - * \brief Each threadi obtains the \p input provided by threadi+distance. + * @brief Each threadi obtains the @p input + * provided by threadi+distance. * - * \par - * - \smemreuse + * @par + * - @smemreuse + * + * @param[in] input + * The calling thread's input item + * + * @param[out] output + * The @p input item from thread + * thread(i+distance>)%BLOCK_THREADS + * (may be aliased to @p input). 
This value is not updated for + * threadBLOCK_THREADS-1 + * + * @param[in] distance + * Offset distance (0 < @p distance < BLOCK_THREADS) */ - __device__ __forceinline__ void Rotate( - T input, ///< [in] The calling thread's input item - T& output, ///< [out] The \p input item from thread thread(i+distance>)%BLOCK_THREADS (may be aliased to \p input). This value is not updated for threadBLOCK_THREADS-1 - unsigned int distance = 1) ///< [in] Offset distance (0 < \p distance < BLOCK_THREADS) + __device__ __forceinline__ void Rotate(T input, T &output, unsigned int distance = 1) { temp_storage[linear_tid] = input; @@ -204,19 +237,25 @@ public: output = temp_storage[offset]; } - /** - * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item + * @brief The thread block rotates its + * [blocked arrangement](index.html#sec5sec3) of + * @p input items, shifting it up by one item. * - * \par - * - \blocked - * - \granularity - * - \smemreuse + * @par + * - @blocked + * - @granularity + * - @smemreuse + * + * @param[in] input + * The calling thread's input items + * + * @param[out] prev + * The corresponding predecessor items (may be aliased to @p input). + * The item @p prev[0] is not updated for thread0. */ template - __device__ __forceinline__ void Up( - T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items - T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. + __device__ __forceinline__ void Up(T (&input)[ITEMS_PER_THREAD], T (&prev)[ITEMS_PER_THREAD]) { temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1]; @@ -249,19 +288,25 @@ public: block_suffix = temp_storage[BLOCK_THREADS - 1]; } - /** - * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it down by one item + * @brief The thread block rotates its + * [blocked arrangement](index.html#sec5sec3) of + * @p input items, shifting it down by one item. * - * \par - * - \blocked - * - \granularity - * - \smemreuse + * @par + * - @blocked + * - @granularity + * - @smemreuse + * + * @param[in] input + * The calling thread's input items + * + * @param[out] prev + * The corresponding predecessor items (may be aliased to @p input). + * The value @p prev[0] is not updated for threadBLOCK_THREADS-1. */ template - __device__ __forceinline__ void Down( - T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items - T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. + __device__ __forceinline__ void Down(T (&input)[ITEMS_PER_THREAD], T (&prev)[ITEMS_PER_THREAD]) { temp_storage[linear_tid] = input[0]; @@ -275,20 +320,31 @@ public: prev[ITEMS_PER_THREAD - 1] = temp_storage[linear_tid + 1]; } - /** - * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of input items, shifting it down by one item. All threads receive \p input[0] provided by thread0. + * @brief The thread block rotates its + * [blocked arrangement](index.html#sec5sec3) of input items, + * shifting it down by one item. All threads receive @p input[0] + * provided by thread0. 
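As a companion to the single-item Offset above, here is a short sketch of the blocked-arrangement Up method just described, used to form adjacent differences; the kernel name, the 128-thread block, and four items per thread are assumptions:

#include <cub/block/block_shuffle.cuh>

__global__ void AdjacentDifferenceKernel(int *d_in, int *d_out)
{
    typedef cub::BlockShuffle<int, 128> BlockShuffle;
    __shared__ typename BlockShuffle::TempStorage temp_storage;

    // Blocked arrangement: each thread owns 4 consecutive items of the tile
    int items[4];
    for (int i = 0; i < 4; ++i)
        items[i] = d_in[threadIdx.x * 4 + i];

    // prev[i] receives the item immediately preceding items[i] in the tile;
    // prev[0] of thread0 is not updated because it has no predecessor
    int prev[4];
    prev[0] = items[0];
    BlockShuffle(temp_storage).Up(items, prev);

    for (int i = 0; i < 4; ++i)
        d_out[threadIdx.x * 4 + i] = items[i] - prev[i];
}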
* - * \par - * - \blocked - * - \granularity - * - \smemreuse + * @par + * - @blocked + * - @granularity + * - @smemreuse + * + * @param[in] input + * The calling thread's input items + * + * @param[out] prev + * The corresponding predecessor items (may be aliased to @p input). + * The value @p prev[0] is not updated for threadBLOCK_THREADS-1. + * + * @param[out] block_prefix + * The item @p input[0] from thread0, provided to all threads */ template - __device__ __forceinline__ void Down( - T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items - T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. - T &block_prefix) ///< [out] The item \p input[0] from thread0, provided to all threads + __device__ __forceinline__ void Down(T (&input)[ITEMS_PER_THREAD], + T (&prev)[ITEMS_PER_THREAD], + T &block_prefix) { Down(input, prev); block_prefix = temp_storage[0]; diff --git a/cub/cub/block/block_store.cuh b/cub/cub/block/block_store.cuh index 2c7c93347ea..2ed823ed8be 100644 --- a/cub/cub/block/block_store.cuh +++ b/cub/cub/block/block_store.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Operations for writing linear segments of data from the CUDA thread block */ @@ -51,33 +51,44 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIo + * @addtogroup UtilIo * @{ */ /******************************************************************//** - * \name Blocked arrangement I/O (direct) + * @name Blocked arrangement I/O (direct) *********************************************************************/ //@{ /** - * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * @brief Store a blocked arrangement of items across a thread block into a linear segment of items. * - * \blocked + * @blocked * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + * @tparam T + * [inferred] The data type to store. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam OutputIteratorT + * [inferred] The random-access iterator type for output \iterator. 
+ * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +template +__device__ __forceinline__ void StoreDirectBlocked(int linear_tid, + OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); @@ -89,25 +100,40 @@ __device__ __forceinline__ void StoreDirectBlocked( } } - /** - * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range + * @brief Store a blocked arrangement of items across a + * thread block into a linear segment of items, guarded by range + * + * @blocked + * + * @tparam T + * [inferred] The data type to store. * - * \blocked + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam OutputIteratorT + * [inferred] The random-access iterator type for output \iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write +template +__device__ __forceinline__ void StoreDirectBlocked(int linear_tid, + OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); @@ -122,31 +148,42 @@ __device__ __forceinline__ void StoreDirectBlocked( } } - /** - * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * @brief Store a blocked arrangement of items across a + * thread block into a linear segment of items. 
+ * + * @blocked + * + * The output offset (@p block_ptr + @p block_offset) must be quad-item aligned, + * which is the default starting offset returned by @p cudaMalloc() * - * \blocked + * @par + * The following conditions will prevent vectorization and storing will + * fall back to cub::BLOCK_STORE_DIRECT: + * - @p ITEMS_PER_THREAD is odd + * - The data type @p T is not a built-in primitive or CUDA vector type + * (e.g., \p short, \p int2, \p double, \p float2, etc.) * - * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned, - * which is the default starting offset returned by \p cudaMalloc() + * @tparam T + * [inferred] The data type to store. * - * \par - * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) * + * @param[in] block_ptr + * Input pointer for storing from + * + * @param[in] items + * Data to store */ -template < - typename T, - int ITEMS_PER_THREAD> -__device__ __forceinline__ void StoreDirectBlockedVectorized( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - T *block_ptr, ///< [in] Input pointer for storing from - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +template +__device__ __forceinline__ void StoreDirectBlockedVectorized(int linear_tid, + T *block_ptr, + T (&items)[ITEMS_PER_THREAD]) { enum { @@ -186,30 +223,42 @@ __device__ __forceinline__ void StoreDirectBlockedVectorized( //@} end member group /******************************************************************//** - * \name Striped arrangement I/O (direct) + * @name Striped arrangement I/O (direct) *********************************************************************/ //@{ - /** - * \brief Store a striped arrangement of data across the thread block into a linear segment of items. + * @brief Store a striped arrangement of data across the thread block into a + * linear segment of items. + * + * @striped + * + * @tparam BLOCK_THREADS + * The thread block size in threads + * + * @tparam T + * [inferred] The data type to store. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam OutputIteratorT + * [inferred] The random-access iterator type for output @iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) * - * \striped + * @param[in] block_itr + * The thread block's base output iterator for storing to * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ * @param[in] items + * Data to store */ -template < - int BLOCK_THREADS, - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +template +__device__ __forceinline__ void StoreDirectStriped(int linear_tid, + OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { OutputIteratorT thread_itr = block_itr + linear_tid; @@ -221,27 +270,42 @@ __device__ __forceinline__ void StoreDirectStriped( } } - /** - * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range + * @brief Store a striped arrangement of data across the thread block into + * a linear segment of items, guarded by range + * + * @striped + * + * @tparam BLOCK_THREADS + * The thread block size in threads + * + * @tparam T + * [inferred] The data type to store. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam OutputIteratorT + * [inferred] The random-access iterator type for output \iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base output iterator for storing to * - * \striped + * @param[in] items + * Data to store * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + * @param[in] valid_items + * Number of valid items to write */ -template < - int BLOCK_THREADS, - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write +template +__device__ __forceinline__ void StoreDirectStriped(int linear_tid, + OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { OutputIteratorT thread_itr = block_itr + linear_tid; @@ -260,31 +324,42 @@ __device__ __forceinline__ void StoreDirectStriped( //@} end member group /******************************************************************//** - * \name Warp-striped arrangement I/O (direct) + * @name Warp-striped arrangement I/O (direct) *********************************************************************/ //@{ - /** - * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items. + * @brief Store a warp-striped arrangement of data across the + * thread block into a linear segment of items. * - * \warpstriped + * @warpstriped * - * \par Usage Considerations + * @par Usage Considerations * The number of threads in the thread block must be a multiple of the architecture's warp size. 
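A compact sketch contrasting the blocked, striped, and range-guarded store helpers documented in this hunk; the kernel name, the 128-thread block, four items per thread, and the d_out / valid_items parameters are assumptions, and in practice a kernel would pick one arrangement rather than issuing all three:

#include <cub/block/block_store.cuh>

__global__ void ExampleStoreKernel(int *d_out, int valid_items)
{
    // Per-thread data to be written (placeholder values)
    int items[4] = {0, 1, 2, 3};

    // Blocked arrangement: thread t writes d_out[t*4 + 0 .. t*4 + 3]
    cub::StoreDirectBlocked(threadIdx.x, d_out, items);

    // Striped arrangement: thread t writes d_out[t], d_out[t + 128], ...
    cub::StoreDirectStriped<128>(threadIdx.x, d_out, items);

    // Guarded variant: only the first valid_items outputs of the tile are written
    cub::StoreDirectBlocked(threadIdx.x, d_out, items, valid_items);
}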
* - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + * @tparam T + * [inferred] The data type to store. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam OutputIteratorT + * [inferred] The random-access iterator type for output \iterator. + * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[out] items + * Data to load */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +template +__device__ __forceinline__ void StoreDirectWarpStriped(int linear_tid, + OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; @@ -300,28 +375,42 @@ __device__ __forceinline__ void StoreDirectWarpStriped( } } - /** - * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range + * @brief Store a warp-striped arrangement of data across the thread block into a + * linear segment of items, guarded by range * - * \warpstriped + * @warpstriped * - * \par Usage Considerations + * @par Usage Considerations * The number of threads in the thread block must be a multiple of the architecture's warp size. * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + * @tparam T + * [inferred] The data type to store. + * + * @tparam ITEMS_PER_THREAD + * [inferred] The number of consecutive items partitioned onto each thread. + * + * @tparam OutputIteratorT + * [inferred] The random-access iterator type for output \iterator. 
+ * + * @param[in] linear_tid + * A suitable 1D thread-identifier for the calling thread + * (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write +template +__device__ __forceinline__ void StoreDirectWarpStriped(int linear_tid, + OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; @@ -352,59 +441,62 @@ __device__ __forceinline__ void StoreDirectWarpStriped( //----------------------------------------------------------------------------- /** - * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. + * @brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a + * blocked arrangement of items across a CUDA thread block to a linear segment of memory. */ enum BlockStoreAlgorithm { /** - * \par Overview + * @par Overview * * A [blocked arrangement](index.html#sec5sec3) of data is written * directly to memory. * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) decreases as the * access stride between threads increases (i.e., the number items per thread). */ BLOCK_STORE_DIRECT, /** - * \par Overview + * @par Overview * A [striped arrangement](index.html#sec5sec3) of data is written * directly to memory. * - * \par Performance Considerations + * @par Performance Considerations * The utilization of memory transactions (coalescing) remains high regardless * of items written per thread. */ BLOCK_STORE_STRIPED, /** - * \par Overview + * @par Overview * * A [blocked arrangement](index.html#sec5sec3) of data is written directly * to memory using CUDA's built-in vectorized stores as a coalescing optimization. * For example, st.global.v4.s32 instructions will be generated - * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * when @p T = @p int and @p ITEMS_PER_THREAD % 4 == 0. * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high until the the * access stride between threads (i.e., the number items per thread) exceeds the * maximum vector store width (typically 4 items or 64B, whichever is lower). 
* - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The \p OutputIteratorT is not a simple pointer type + * - @p ITEMS_PER_THREAD is odd + * - The @p OutputIteratorT is not a simple pointer type * - The block output offset is not quadword-aligned - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * - The data type @p T is not a built-in primitive or CUDA vector type + * (e.g., @p short, @p int2, @p double, @p float2, etc.) */ BLOCK_STORE_VECTORIZE, /** - * \par Overview + * @par Overview * A [blocked arrangement](index.html#sec5sec3) is locally - * transposed and then efficiently written to memory as a [striped arrangement](index.html#sec5sec3). + * transposed and then efficiently written to memory as a + * [striped arrangement](index.html#sec5sec3). * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items written per thread. * - The local reordering incurs slightly longer latencies and throughput than the @@ -413,15 +505,15 @@ enum BlockStoreAlgorithm BLOCK_STORE_TRANSPOSE, /** - * \par Overview + * @par Overview * A [blocked arrangement](index.html#sec5sec3) is locally * transposed and then efficiently written to memory as a * [warp-striped arrangement](index.html#sec5sec3) * - * \par Usage Considerations + * @par Usage Considerations * - BLOCK_THREADS must be a multiple of WARP_THREADS * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items written per thread. * - The local reordering incurs slightly longer latencies and throughput than the @@ -430,17 +522,17 @@ enum BlockStoreAlgorithm BLOCK_STORE_WARP_TRANSPOSE, /** - * \par Overview + * @par Overview * A [blocked arrangement](index.html#sec5sec3) is locally * transposed and then efficiently written to memory as a * [warp-striped arrangement](index.html#sec5sec3) * To reduce the shared memory requirement, only one warp's worth of shared * memory is provisioned and is subsequently time-sliced among warps. * - * \par Usage Considerations + * @par Usage Considerations * - BLOCK_THREADS must be a multiple of WARP_THREADS * - * \par Performance Considerations + * @par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless * of items written per thread. * - Provisions less shared memory temporary storage, but incurs larger @@ -451,19 +543,38 @@ enum BlockStoreAlgorithm /** - * \brief The BlockStore class provides [collective](index.html#sec0) data movement methods for writing a [blocked arrangement](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory. ![](block_store_logo.png) - * \ingroup BlockModule - * \ingroup UtilIo - * - * \tparam T The type of data to be written. - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. - * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. 
- * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam LEGACY_PTX_ARCH [optional] Unused. - * - * \par Overview + * @brief The BlockStore class provides [collective](index.html#sec0) data movement + * methods for writing a [blocked arrangement](index.html#sec5sec3) of items + * partitioned across a CUDA thread block to a linear segment of memory. + * ![](block_store_logo.png) + * + * @ingroup BlockModule + * + * @ingroup UtilIo + * + * @tparam T + * The type of data to be written. + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam ITEMS_PER_THREAD + * The number of consecutive items partitioned onto each thread. + * + * @tparam ALGORITHM + * [optional] cub::BlockStoreAlgorithm tuning policy enumeration. + * default: cub::BLOCK_STORE_DIRECT. + * + * @tparam BLOCK_DIM_Y + * [optional] The thread block length in threads along the Y dimension (default: 1) + * + * @tparam BLOCK_DIM_Z + * [optional] The thread block length in threads along the Z dimension (default: 1) + * + * @tparam LEGACY_PTX_ARCH + * [optional] Unused. + * + * @par Overview * - The BlockStore class provides a single data movement abstraction that can be specialized * to implement different cub::BlockStoreAlgorithm strategies. This facilitates different * performance policies for different architectures, data types, granularity sizes, etc. @@ -487,16 +598,16 @@ enum BlockStoreAlgorithm * memory is provisioned and is subsequently time-sliced among warps. [More...](\ref cub::BlockStoreAlgorithm) * - \rowmajor * - * \par A Simple Example + * @par A Simple Example * \blockcollective{BlockStore} - * \par + * @par * The code snippet below illustrates the storing of a "blocked" arrangement * of 512 integers across 128 threads (where each thread owns 4 consecutive items) - * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * into a linear segment of memory. The store is specialized for @p BLOCK_STORE_WARP_TRANSPOSE, * meaning items are locally reordered among threads so that memory references will be * efficiently coalesced using a warp-striped access pattern. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -514,13 +625,13 @@ enum BlockStoreAlgorithm * // Store items to linear memory * BlockStore(temp_storage).Store(d_data, thread_data); * - * \endcode - * \par - * Suppose the set of \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of @p thread_data across the block of threads is * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... + * The output @p d_data will be 0, 1, 2, 3, 4, 5, .... 
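For comparison with the warp-transpose snippet above, here is a minimal sketch (illustrative only, not part of this patch; kernel name is hypothetical) that specializes the same store for BLOCK_STORE_VECTORIZE. Because the destination is a raw int pointer and ITEMS_PER_THREAD is a multiple of 4, the vectorization path described for that algorithm can apply:

#include <cub/block/block_store.cuh>   // or equivalently <cub/cub.cuh>

__global__ void VectorizedStoreKernel(int *d_data)
{
    // Specialize BlockStore for 128 threads, each owning 4 consecutive ints
    using BlockStoreT = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_VECTORIZE>;

    // Allocate shared memory for BlockStore
    __shared__ typename BlockStoreT::TempStorage temp_storage;

    // Each thread obtains its 4 consecutive items (computation omitted in this sketch)
    int thread_data[4] = {0, 0, 0, 0};

    // Store items; with a raw, suitably aligned pointer this attempts vectorized stores
    BlockStoreT(temp_storage).Store(d_data, thread_data);
}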
* - * \par Re-using dynamically allocating shared memory + * @par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: @@ -580,21 +691,38 @@ private: linear_tid(linear_tid) {} - /// Store items into a linear segment of memory + /** + * @brief Store items into a linear segment of memory + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { StoreDirectBlocked(linear_tid, block_itr, items); } - /// Store items into a linear segment of memory, guarded by range + /** + * @brief Store items into a linear segment of memory, guarded by range + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { StoreDirectBlocked(linear_tid, block_itr, items, valid_items); } @@ -621,21 +749,38 @@ private: linear_tid(linear_tid) {} - /// Store items into a linear segment of memory + /** + * @brief Store items into a linear segment of memory + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { StoreDirectStriped(linear_tid, block_itr, items); } - /// Store items into a linear segment of memory, guarded by range + /** + * @brief Store items into a linear segment of memory, guarded by range + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { StoreDirectStriped(linear_tid, block_itr, items, valid_items); } @@ -662,29 +807,54 @@ private: linear_tid(linear_tid) {} - /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization) - __device__ __forceinline__ void Store( - T *block_ptr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + /** + * @brief Store 
items into a linear segment of memory, + * specialized for native pointer types (attempts vectorization) + * + * @param[in] block_ptr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + */ + __device__ __forceinline__ void Store(T *block_ptr, T (&items)[ITEMS_PER_THREAD]) { StoreDirectBlockedVectorized(linear_tid, block_ptr, items); } - /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) + /** + * @brief Store items into a linear segment of memory, + * specialized for opaque input iterators (skips vectorization) + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { StoreDirectBlocked(linear_tid, block_itr, items); } - /// Store items into a linear segment of memory, guarded by range + /** + * @brief Store items into a linear segment of memory, guarded by range + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { StoreDirectBlocked(linear_tid, block_itr, items, valid_items); } @@ -725,26 +895,47 @@ private: linear_tid(linear_tid) {} - /// Store items into a linear segment of memory + /** + * @brief Store items into a linear segment of memory + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { BlockExchange(temp_storage).BlockedToStriped(items); StoreDirectStriped(linear_tid, block_itr, items); } - /// Store items into a linear segment of memory, guarded by range + /** + * @brief Store items into a linear segment of memory, guarded by range + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { BlockExchange(temp_storage).BlockedToStriped(items); if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads 
+ { + // Move through volatile smem as a workaround to prevent RF spilling on + // subsequent loads + temp_storage.valid_items = valid_items; + } CTA_SYNC(); StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); } @@ -793,26 +984,47 @@ private: linear_tid(linear_tid) {} - /// Store items into a linear segment of memory + /** + * @brief Store items into a linear segment of memory + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { BlockExchange(temp_storage).BlockedToWarpStriped(items); StoreDirectWarpStriped(linear_tid, block_itr, items); } - /// Store items into a linear segment of memory, guarded by range + /** + * @brief Store items into a linear segment of memory, guarded by range + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { BlockExchange(temp_storage).BlockedToWarpStriped(items); if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + { + // Move through volatile smem as a workaround to prevent RF spilling on + // subsequent loads + temp_storage.valid_items = valid_items; + } CTA_SYNC(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } @@ -861,26 +1073,47 @@ private: linear_tid(linear_tid) {} - /// Store items into a linear segment of memory + /** + * @brief Store items into a linear segment of memory + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD]) { BlockExchange(temp_storage).BlockedToWarpStriped(items); StoreDirectWarpStriped(linear_tid, block_itr, items); } - /// Store items into a linear segment of memory, guarded by range + /** + * @brief Store items into a linear segment of memory, guarded by range + * + * @param[in] block_itr + * The thread block's base output iterator for storing to + * + * @param[in] items + * Data to store + * + * @param[in] valid_items + * Number of valid items to write + */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T 
(&items)[ITEMS_PER_THREAD], + int valid_items) { BlockExchange(temp_storage).BlockedToWarpStriped(items); if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + { + // Move through volatile smem as a workaround to prevent RF spilling on + // subsequent loads + temp_storage.valid_items = valid_items; + } CTA_SYNC(); StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } @@ -924,17 +1157,17 @@ private: public: - /// \smemstorage{BlockStore} + /// @smemstorage{BlockStore} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** - * \name Collective constructors + * @name Collective constructors *********************************************************************/ //@{ /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + * @brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockStore() : @@ -942,40 +1175,39 @@ public: linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. + * @brief Collective constructor using the specified memory allocation as temporary storage. + * + * @param temp_storage[in] + * Reference to memory allocation having layout type TempStorage */ - __device__ __forceinline__ BlockStore( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + __device__ __forceinline__ BlockStore(TempStorage &temp_storage) + : temp_storage(temp_storage.Alias()) + , linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** - * \name Data movement + * @name Data movement *********************************************************************/ //@{ - /** - * \brief Store items into a linear segment of memory. + * @brief Store items into a linear segment of memory. * - * \par - * - \blocked - * - \smemreuse + * @par + * - @blocked + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the storing of a "blocked" arrangement * of 512 integers across 128 threads (where each thread owns 4 consecutive items) - * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * into a linear segment of memory. The store is specialized for @p BLOCK_STORE_WARP_TRANSPOSE, * meaning items are locally reordered among threads so that memory references will be * efficiently coalesced using a warp-striped access pattern. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) @@ -994,36 +1226,40 @@ public: * int thread_data[4]; * BlockStore(temp_storage).Store(d_data, thread_data); * - * \endcode - * \par - * Suppose the set of \p thread_data across the block of threads is + * @endcode + * @par + * Suppose the set of @p thread_data across the block of threads is * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... + * The output @p d_data will be 0, 1, 2, 3, 4, 5, .... 
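Related to the shared-memory re-use note in this class's documentation, a minimal sketch (illustrative only; it uses statically allocated shared memory rather than the dynamic allocation shown in the examples/block sample, and the kernel name is hypothetical) that places BlockLoad and BlockStore temporary storage in one union so the same region serves both collectives:

#include <cub/cub.cuh>

__global__ void LoadStoreReuseKernel(int *d_in, int *d_out)
{
    using BlockLoadT  = cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
    using BlockStoreT = cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE>;

    // The two collectives are used in sequence, so their storage may alias
    __shared__ union
    {
        typename BlockLoadT::TempStorage  load;
        typename BlockStoreT::TempStorage store;
    } temp_storage;

    int thread_data[4];
    BlockLoadT(temp_storage.load).Load(d_in, thread_data);

    __syncthreads();  // required before re-using the aliased shared memory

    BlockStoreT(temp_storage.store).Store(d_out, thread_data);
}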
+ * + * @param block_itr[out] + * The thread block's base output iterator for storing to + * + * @param items[in] + * Data to store * */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [out] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + __device__ __forceinline__ void Store(OutputIteratorT block_itr, T (&items)[ITEMS_PER_THREAD]) { InternalStore(temp_storage, linear_tid).Store(block_itr, items); } /** - * \brief Store items into a linear segment of memory, guarded by range. + * @brief Store items into a linear segment of memory, guarded by range. * - * \par - * - \blocked - * - \smemreuse + * @par + * - @blocked + * - @smemreuse * - * \par Snippet + * @par Snippet * The code snippet below illustrates the guarded storing of a "blocked" arrangement * of 512 integers across 128 threads (where each thread owns 4 consecutive items) - * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * into a linear segment of memory. The store is specialized for @p BLOCK_STORE_WARP_TRANSPOSE, * meaning items are locally reordered among threads so that memory references will be * efficiently coalesced using a warp-striped access pattern. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, int valid_items, ...) @@ -1042,19 +1278,26 @@ public: * int thread_data[4]; * BlockStore(temp_storage).Store(d_data, thread_data, valid_items); * - * \endcode - * \par - * Suppose the set of \p thread_data across the block of threads is - * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and \p valid_items is \p 5. - * The output \p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with + * @endcode + * @par + * Suppose the set of @p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and @p valid_items is @p 5. + * The output @p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with * only the first two threads being unmasked to store portions of valid data. * + * @param block_itr[out] + * The thread block's base output iterator for storing to + * + * @param items[in] + * Data to store + * + * @param valid_items[in] + * Number of valid items to write */ template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [out] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write + __device__ __forceinline__ void Store(OutputIteratorT block_itr, + T (&items)[ITEMS_PER_THREAD], + int valid_items) { InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); } diff --git a/cub/cub/block/specializations/block_histogram_atomic.cuh b/cub/cub/block/specializations/block_histogram_atomic.cuh index 367599b3dc2..3360ad0ed55 100644 --- a/cub/cub/block/specializations/block_histogram_atomic.cuh +++ b/cub/cub/block/specializations/block_histogram_atomic.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + * @file + * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide + * histograms from data samples partitioned across a CUDA thread block. 
*/ #pragma once @@ -43,9 +44,9 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN - /** - * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + * @brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide + * histograms from data samples partitioned across a CUDA thread block. */ template struct BlockHistogramAtomic @@ -59,24 +60,26 @@ struct BlockHistogramAtomic TempStorage &temp_storage) {} - - /// Composite data onto an existing histogram - template < - typename T, - typename CounterT, - int ITEMS_PER_THREAD> - __device__ __forceinline__ void Composite( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + /** + * @brief Composite data onto an existing histogram + * + * @param[in] items + * Calling thread's input values to histogram + * + * @param[out] histogram + * Reference to shared/device-accessible memory histogram + */ + template + __device__ __forceinline__ void Composite(T (&items)[ITEMS_PER_THREAD], + CounterT histogram[BINS]) { - // Update histogram - #pragma unroll - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { - atomicAdd(histogram + items[i], 1); - } + // Update histogram + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { + atomicAdd(histogram + items[i], 1); + } } - }; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_histogram_sort.cuh b/cub/cub/block/specializations/block_histogram_sort.cuh index 4ae46fc9af4..38dc70315a0 100644 --- a/cub/cub/block/specializations/block_histogram_sort.cuh +++ b/cub/cub/block/specializations/block_histogram_sort.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + * @file + * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide + * histograms from data samples partitioned across a CUDA thread block. */ #pragma once @@ -47,19 +48,38 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN - - /** - * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + * @brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide + * histograms from data samples partitioned across a CUDA thread block. 
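Before moving on to the sorting-based specialization, a minimal sketch (illustrative only; kernel name is hypothetical, and the load of samples is omitted) showing how the atomic Composite shown above is typically reached through the public cub::BlockHistogram front-end with BLOCK_HISTO_ATOMIC:

#include <cub/cub.cuh>

__global__ void AtomicHistogramKernel(unsigned char *d_samples, unsigned int *d_histogram)
{
    // 128 threads, 4 samples per thread, 256 bins, atomic specialization
    using BlockHistogramT =
        cub::BlockHistogram<unsigned char, 128, 4, 256, cub::BLOCK_HISTO_ATOMIC>;

    __shared__ typename BlockHistogramT::TempStorage temp_storage;
    __shared__ unsigned int smem_histogram[256];

    // Placeholder samples; real code would load them from d_samples
    unsigned char thread_samples[4] = {0, 0, 0, 0};

    BlockHistogramT block_histogram(temp_storage);

    // Zero the bins, then composite the samples (one atomicAdd per item)
    block_histogram.InitHistogram(smem_histogram);
    __syncthreads();
    block_histogram.Composite(thread_samples, smem_histogram);
    __syncthreads();

    // The shared bins could now be flushed to d_histogram (omitted)
}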
+ * + * @tparam T + * Sample type + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam ITEMS_PER_THREAD + * The number of samples per thread + * + * @tparam BINS + * The number of bins into which histogram samples may fall + * + * @tparam BLOCK_DIM_Y + * The thread block length in threads along the Y dimension + * + * @tparam BLOCK_DIM_Z + * The thread block length in threads along the Z dimension + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective (unused) */ -template < - typename T, ///< Sample type - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int ITEMS_PER_THREAD, ///< The number of samples per thread - int BINS, ///< The number of bins into which histogram samples may fall - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective (unused) +template struct BlockHistogramSort { /// Constants @@ -156,13 +176,18 @@ struct BlockHistogramSort } }; - - // Composite data onto an existing histogram - template < - typename CounterT > - __device__ __forceinline__ void Composite( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + /** + * @brief Composite data onto an existing histogram + * + * @param[in] items + * Calling thread's input values to histogram + * + * @param[out] histogram + * Reference to shared/device-accessible memory histogram + */ + template + __device__ __forceinline__ void Composite(T (&items)[ITEMS_PER_THREAD], + CounterT histogram[BINS]) { enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; diff --git a/cub/cub/block/specializations/block_reduce_raking.cuh b/cub/cub/block/specializations/block_reduce_raking.cuh index 98ab45c794d..f8db9326fdb 100644 --- a/cub/cub/block/specializations/block_reduce_raking.cuh +++ b/cub/cub/block/specializations/block_reduce_raking.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + * @file + * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread + * block. Supports non-commutative reduction operators. */ #pragma once @@ -48,9 +49,9 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN - /** - * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + * @brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread + * block. Supports non-commutative reduction operators. * * Supports non-commutative binary reduction operators. Unlike commutative * reduction operators (e.g., addition), the application of a non-commutative @@ -61,13 +62,23 @@ CUB_NAMESPACE_BEGIN * Compared to the implementation of BlockReduceRakingCommutativeOnly (which * does not support non-commutative operators), this implementation requires a * few extra rounds of inter-thread communication. 
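Since this specialization is normally reached through the public cub::BlockReduce front-end rather than used directly, a minimal sketch (illustrative only; kernel name is hypothetical) of a block-wide reduction that selects the raking algorithm:

#include <cub/cub.cuh>

__global__ void RakingReduceKernel(int *d_out)
{
    // Specialize the public front-end for 128 threads and the raking specialization
    using BlockReduceT = cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING>;

    __shared__ typename BlockReduceT::TempStorage temp_storage;

    int thread_data = threadIdx.x;  // each thread contributes one partial

    // Reduce with a binary operator; the result is valid in thread0 only
    int block_max = BlockReduceT(temp_storage).Reduce(thread_data, cub::Max());

    if (threadIdx.x == 0)
    {
        *d_out = block_max;
    }
}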
+ * + * @tparam T + * Data type being reduced + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam BLOCK_DIM_Y + * The thread block length in threads along the Y dimension + * + * @tparam BLOCK_DIM_Z + * The thread block length in threads along the Z dimension + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, ///< Data type being reduced - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct BlockReduceRaking { /// Constants @@ -103,14 +114,15 @@ struct BlockReduceRaking }; - /// Shared memory storage layout type union _TempStorage { - typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid - }; + /// Storage for warp-synchronous reduction + typename WarpReduce::TempStorage warp_storage; + /// Padded thread block raking grid + typename BlockRakingLayout::TempStorage raking_grid; + }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; @@ -129,14 +141,22 @@ struct BlockReduceRaking linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - + /** + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] partial + * [lane0 only] Warp-wide aggregate reduction of input items + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ template - __device__ __forceinline__ T RakingReduction( - ReductionOp reduction_op, ///< [in] Binary reduction operator - T *raking_segment, - T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*iteration*/) + __device__ __forceinline__ T RakingReduction(ReductionOp reduction_op, + T *raking_segment, + T partial, + int num_valid, + Int2Type /*iteration*/) { // Update partial if addend is in range if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) @@ -147,27 +167,42 @@ struct BlockReduceRaking return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); } + /** + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] partial + * [lane0 only] Warp-wide aggregate reduction of input items + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ template - __device__ __forceinline__ T RakingReduction( - ReductionOp /*reduction_op*/, ///< [in] Binary reduction operator - T * /*raking_segment*/, - T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*iteration*/) + __device__ __forceinline__ T RakingReduction(ReductionOp /*reduction_op*/, + T * /*raking_segment*/, + T partial, + int /*num_valid*/, + Int2Type /*iteration*/) { return partial; } - - - /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. 
The return value is only valid for thread0. - template < - bool IS_FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - ReductionOp reduction_op) ///< [in] Binary reduction operator + /** + * @brief Computes a thread block-wide reduction using the specified reduction operator. The + * first num_valid threads each contribute one reduction partial. The return value is + * only valid for thread0. + * + * @param[in] partial + * Calling thread's input partial reductions + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + * + * @param[in] reduction_op + * Binary reduction operator + */ + template + __device__ __forceinline__ T Reduce(T partial, int num_valid, ReductionOp reduction_op) { if (WARP_SYNCHRONOUS) { @@ -208,20 +243,24 @@ struct BlockReduceRaking return partial; } - - /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + /** + * @brief Computes a thread block-wide reduction using addition (+) as the reduction operator. + * The first num_valid threads each contribute one reduction partial. The return value is + * only valid for thread0. + * + * @param[in] partial + * Calling thread's input partial reductions + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ template - __device__ __forceinline__ T Sum( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + __device__ __forceinline__ T Sum(T partial, int num_valid) { cub::Sum reduction_op; return Reduce(partial, num_valid, reduction_op); } - - - }; CUB_NAMESPACE_END diff --git a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh index f4178f31edd..34a4e3ff236 100644 --- a/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh +++ b/cub/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. + * @file + * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across + * a CUDA thread block. Does not support non-commutative reduction operators. */ #pragma once @@ -48,16 +49,27 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN - /** - * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size. + * @brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction + * across a CUDA thread block. Does not support non-commutative reduction operators. Does not + * support block sizes that are not a multiple of the warp size. 
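For contrast with the general raking specialization, a minimal sketch (illustrative only; kernel name is hypothetical) that reaches the commutative-only specialization through the public front-end. It is restricted to commutative operators such as addition, and the block size must be a multiple of the warp size:

#include <cub/cub.cuh>

__global__ void CommutativeOnlyReduceKernel(int *d_out)
{
    // 128 threads is a multiple of the warp size, as this specialization requires
    using BlockReduceT =
        cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY>;

    __shared__ typename BlockReduceT::TempStorage temp_storage;

    int thread_data = threadIdx.x;

    // Sum is commutative, so the cheaper communication pattern applies;
    // the aggregate is valid in thread0 only
    int block_sum = BlockReduceT(temp_storage).Sum(thread_data);

    if (threadIdx.x == 0)
    {
        *d_out = block_sum;
    }
}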
+ * + * @tparam T + * Data type being reduced + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam BLOCK_DIM_Y + * The thread block length in threads along the Y dimension + * + * @tparam BLOCK_DIM_Z + * The thread block length in threads along the Z dimension + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, ///< Data type being reduced - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct BlockReduceRakingCommutativeOnly { /// Constants @@ -98,15 +110,18 @@ struct BlockReduceRakingCommutativeOnly /// Shared memory storage layout type union _TempStorage { - struct DefaultStorage - { - typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid - } default_storage; + struct DefaultStorage + { + /// Storage for warp-synchronous reduction + typename WarpReduce::TempStorage warp_storage; - typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block reduction - }; + /// Padded thread block raking grid + typename BlockRakingLayout::TempStorage raking_grid; + } default_storage; + /// Fall-back storage for non-commutative block reduction + typename FallBack::TempStorage fallback_storage; + }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; @@ -125,12 +140,19 @@ struct BlockReduceRakingCommutativeOnly linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} - - /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + /** + * @brief Computes a thread block-wide reduction using addition (+) as the reduction operator. + * The first num_valid threads each contribute one reduction partial. + * The return value is only valid for thread0. + * + * @param[in] partial + * Calling thread's input partial reductions + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ template - __device__ __forceinline__ T Sum( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + __device__ __forceinline__ T Sum(T partial, int num_valid) { if (USE_FALLBACK || !FULL_TILE) { @@ -159,15 +181,22 @@ struct BlockReduceRakingCommutativeOnly return partial; } - - /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template < - bool FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - ReductionOp reduction_op) ///< [in] Binary reduction operator + /** + * @brief Computes a thread block-wide reduction using the specified reduction operator. 
+ * The first num_valid threads each contribute one reduction partial. + * The return value is only valid for thread0. + * + * @param[in] partial + * Calling thread's input partial reductions + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + * + * @param[in] reduction_op + * Binary reduction operator + */ + template + __device__ __forceinline__ T Reduce(T partial, int num_valid, ReductionOp reduction_op) { if (USE_FALLBACK || !FULL_TILE) { diff --git a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh index 3b68a283bcb..b9653366b00 100644 --- a/cub/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/cub/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + * @file + * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction + * across a CUDA thread block. Supports non-commutative reduction operators. */ #pragma once @@ -47,16 +48,25 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN - /** - * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + * @brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction + * across a CUDA thread block. Supports non-commutative reduction operators. + * @tparam T + * Data type being reduced + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam BLOCK_DIM_Y + * The thread block length in threads along the Y dimension + * + * @tparam BLOCK_DIM_Z + * The thread block length in threads along the Z dimension + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, ///< Data type being reduced - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct BlockReduceWarpReductions { /// Constants @@ -82,13 +92,17 @@ struct BlockReduceWarpReductions /// WarpReduce utility type typedef typename WarpReduce::InternalWarpReduce WarpReduce; - /// Shared memory storage layout type struct _TempStorage { - typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous reduction - T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous reduction - T block_prefix; ///< Shared prefix for the entire thread block + /// Buffer for warp-synchronous reduction + typename WarpReduce::TempStorage warp_reduce[WARPS]; + + /// Shared totals from each warp-synchronous reduction + T warp_aggregates[WARPS]; + + /// Shared prefix for the entire thread block + T block_prefix; }; /// Alias wrapper allowing storage to be unioned @@ -112,13 +126,21 @@ struct BlockReduceWarpReductions lane_id(LaneId()) {} - + /** + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] warp_aggregate + * [lane0 only] Warp-wide aggregate 
reduction of input items + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ template - __device__ __forceinline__ T ApplyWarpAggregates( - ReductionOp reduction_op, ///< [in] Binary reduction operator - T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*successor_warp*/) + __device__ __forceinline__ T ApplyWarpAggregates(ReductionOp reduction_op, + T warp_aggregate, + int num_valid, + Int2Type /*successor_warp*/) { if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) { @@ -128,25 +150,41 @@ struct BlockReduceWarpReductions return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type()); } + /** + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] warp_aggregate + * [lane0 only] Warp-wide aggregate reduction of input items + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ template - __device__ __forceinline__ T ApplyWarpAggregates( - ReductionOp /*reduction_op*/, ///< [in] Binary reduction operator - T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*successor_warp*/) + __device__ __forceinline__ T ApplyWarpAggregates(ReductionOp /*reduction_op*/, + T warp_aggregate, + int /*num_valid*/, + Int2Type /*successor_warp*/) { return warp_aggregate; } - - /// Returns block-wide aggregate in thread0. - template < - bool FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T ApplyWarpAggregates( - ReductionOp reduction_op, ///< [in] Binary reduction operator - T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + /** + * @brief Returns block-wide aggregate in thread0. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] warp_aggregate + * [lane0 only] Warp-wide aggregate reduction of input items + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ + template + __device__ __forceinline__ T ApplyWarpAggregates(ReductionOp reduction_op, + T warp_aggregate, + int num_valid) { // Share lane aggregates if (lane_id == 0) @@ -166,12 +204,19 @@ struct BlockReduceWarpReductions return warp_aggregate; } - - /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + /** + * @brief Computes a thread block-wide reduction using addition (+) as the reduction operator. + * The first num_valid threads each contribute one reduction partial. The return value is + * only valid for thread0. 
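As with the other reduction specializations, these entry points are reached through cub::BlockReduce; a minimal sketch (illustrative only; kernel name is hypothetical) selecting the warp-reductions algorithm:

#include <cub/cub.cuh>

__global__ void WarpReductionsSumKernel(int *d_out)
{
    using BlockReduceT = cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_WARP_REDUCTIONS>;

    __shared__ typename BlockReduceT::TempStorage temp_storage;

    // Each warp reduces its own partials, then the warp aggregates are combined;
    // the block-wide sum is valid in thread0 only
    int block_sum = BlockReduceT(temp_storage).Sum(static_cast<int>(threadIdx.x));

    if (threadIdx.x == 0)
    {
        *d_out = block_sum;
    }
}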
+ * + * @param[in] input + * Calling thread's input partial reductions + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + */ template - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input partial reductions - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + __device__ __forceinline__ T Sum(T input, int num_valid) { cub::Sum reduction_op; int warp_offset = (warp_id * LOGICAL_WARP_SIZE); @@ -189,15 +234,22 @@ struct BlockReduceWarpReductions return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); } - - /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template < - bool FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input partial reductions - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - ReductionOp reduction_op) ///< [in] Binary reduction operator + /** + * @brief Computes a thread block-wide reduction using the specified reduction operator. + * The first num_valid threads each contribute one reduction partial. + * The return value is only valid for thread0. + * + * @param[in] input + * Calling thread's input partial reductions + * + * @param[in] num_valid + * Number of valid elements (may be less than BLOCK_THREADS) + * + * @param[in] reduction_op + * Binary reduction operator + */ + template + __device__ __forceinline__ T Reduce(T input, int num_valid, ReductionOp reduction_op) { int warp_offset = warp_id * LOGICAL_WARP_SIZE; int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? diff --git a/cub/cub/block/specializations/block_scan_raking.cuh b/cub/cub/block/specializations/block_scan_raking.cuh index b1aad44e04e..e3e57aa0107 100644 --- a/cub/cub/block/specializations/block_scan_raking.cuh +++ b/cub/cub/block/specializations/block_scan_raking.cuh @@ -26,10 +26,10 @@ * ******************************************************************************/ - /** - * \file - * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. + * @file + * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a + * CUDA thread block. */ #pragma once @@ -51,17 +51,35 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN - /** - * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. + * @brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA + * thread block. 
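BlockScanRaking is selected through the public cub::BlockScan front-end; a minimal sketch (illustrative only; kernel name is hypothetical) of a block-wide exclusive prefix sum using the raking algorithm (BLOCK_SCAN_RAKING_MEMOIZE instead enables the MEMOIZE buffering described below):

#include <cub/cub.cuh>

__global__ void RakingScanKernel(int *d_data)
{
    // Raking scan specialization of the public front-end
    using BlockScanT = cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING>;

    __shared__ typename BlockScanT::TempStorage temp_storage;

    int thread_data = d_data[threadIdx.x];

    // Exclusive prefix sum across the thread block
    BlockScanT(temp_storage).ExclusiveSum(thread_data, thread_data);

    d_data[threadIdx.x] = thread_data;
}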
+ * + * @tparam T + * Data type being scanned + * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam BLOCK_DIM_Y + * The thread block length in threads along the Y dimension + * + * @tparam BLOCK_DIM_Z + * The thread block length in threads along the Z dimension + * + * @tparam MEMOIZE + * Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the + * expense of higher register pressure + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, ///< Data type being scanned - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - bool MEMOIZE, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct BlockScanRaking { //--------------------------------------------------------------------- @@ -97,9 +115,14 @@ struct BlockScanRaking /// Shared memory storage layout type struct _TempStorage { - typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid - T block_aggregate; ///< Block aggregate + /// Buffer for warp-synchronous scan + typename WarpScan::TempStorage warp_scan; + + /// Padded thread block raking grid + typename BlockRakingLayout::TempStorage raking_grid; + + /// Block aggregate + T block_aggregate; }; @@ -121,13 +144,23 @@ struct BlockScanRaking // Utility methods //--------------------------------------------------------------------- - /// Templated reduction + /** + * @brief Templated reduction + * + * @param[in] raking_ptr + * Input array + * + * @param[in] scan_op + * Binary reduction operator + * + * @param[in] raking_partial + * Prefix to seed reduction with + */ template - __device__ __forceinline__ T GuardedReduce( - T* raking_ptr, ///< [in] Input array - ScanOp scan_op, ///< [in] Binary reduction operator - T raking_partial, ///< [in] Prefix to seed reduction with - Int2Type /*iteration*/) + __device__ __forceinline__ T GuardedReduce(T *raking_ptr, + ScanOp scan_op, + T raking_partial, + Int2Type /*iteration*/) { if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS)) { @@ -138,39 +171,57 @@ struct BlockScanRaking return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type()); } - - /// Templated reduction (base case) + /** + * @brief Templated reduction (base case) + * + * @param[in] raking_ptr + * Input array + * + * @param[in] scan_op + * Binary reduction operator + * + * @param[in] raking_partial + * Prefix to seed reduction with + */ template - __device__ __forceinline__ T GuardedReduce( - T* /*raking_ptr*/, ///< [in] Input array - ScanOp /*scan_op*/, ///< [in] Binary reduction operator - T raking_partial, ///< [in] Prefix to seed reduction with - Int2Type /*iteration*/) + __device__ __forceinline__ T GuardedReduce(T * /*raking_ptr*/, + ScanOp /*scan_op*/, + T raking_partial, + Int2Type /*iteration*/) { return raking_partial; } - - /// Templated copy + /** + * @brief Templated copy + * + * @param out + * [out] Out array + * + * @param in + * [in] Input array + */ 
template - __device__ __forceinline__ void CopySegment( - T* out, ///< [out] Out array - T* in, ///< [in] Input array - Int2Type /*iteration*/) + __device__ __forceinline__ void CopySegment(T *out, T *in, Int2Type /*iteration*/) { out[ITERATION] = in[ITERATION]; CopySegment(out, in, Int2Type()); } - - /// Templated copy (base case) - __device__ __forceinline__ void CopySegment( - T* /*out*/, ///< [out] Out array - T* /*in*/, ///< [in] Input array - Int2Type /*iteration*/) + /** + * @brief Templated copy (base case) + * + * @param[out] out + * Out array + * + * @param[in] in + * Input array + */ + __device__ __forceinline__ void CopySegment(T * /*out*/, + T * /*in*/, + Int2Type /*iteration*/) {} - /// Performs upsweep raking reduction, returning the aggregate template __device__ __forceinline__ T Upsweep( @@ -248,12 +299,22 @@ struct BlockScanRaking // Exclusive scans //--------------------------------------------------------------------- - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. With no initial value, + * the output computed for thread0 is undefined. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] exclusive_output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op) { if (WARP_SYNCHRONOUS) { @@ -289,13 +350,25 @@ struct BlockScanRaking } } - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to \p input) + * + * @param[in] initial_value + * Initial value to seed the exclusive scan + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void + ExclusiveScan(T input, T &output, const T &initial_value, ScanOp scan_op) { if (WARP_SYNCHRONOUS) { @@ -331,14 +404,27 @@ struct BlockScanRaking } } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. 
Each thread contributes one input element. Also provides every + * thread with the block-wide \p block_aggregate of all inputs. With no initial value, + * the output computed for thread0 is undefined. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + __device__ __forceinline__ void + ExclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate) { if (WARP_SYNCHRONOUS) { @@ -382,15 +468,29 @@ struct BlockScanRaking } } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. Also provides every + * thread with the block-wide \p block_aggregate of all inputs. + * + * @param[in] input + * Calling thread's input items + * + * @param[out] output + * Calling thread's output items (may be aliased to \p input) + * + * @param[in] initial_value + * Initial value to seed the exclusive scan + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + __device__ __forceinline__ void + ExclusiveScan(T input, T &output, const T &initial_value, ScanOp scan_op, T &block_aggregate) { if (WARP_SYNCHRONOUS) { @@ -433,16 +533,32 @@ struct BlockScanRaking } } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. 
the call-back functor \p + * block_prefix_callback_op is invoked by the first warp in the block, and the value + * returned by lane0 in that warp is used as the "seed" value that + * logically prefixes the thread block's scan inputs. Also provides every thread with + * the block-wide \p block_aggregate of all inputs. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a thread + * block-wide prefix to be applied to all inputs. + */ + template + __device__ __forceinline__ void ExclusiveScan(T input, + T &output, + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { if (WARP_SYNCHRONOUS) { @@ -504,12 +620,21 @@ struct BlockScanRaking // Inclusive scans //--------------------------------------------------------------------- - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + /** + * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void InclusiveScan(T input, T &output, ScanOp scan_op) { if (WARP_SYNCHRONOUS) { @@ -545,14 +670,26 @@ struct BlockScanRaking } } - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + /** + * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. Also provides every + * thread with the block-wide \p block_aggregate of all inputs. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + __device__ __forceinline__ void + InclusiveScan(T input, T &output, ScanOp scan_op, T &block_aggregate) { if (WARP_SYNCHRONOUS) { @@ -596,16 +733,32 @@ struct BlockScanRaking } } - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. 
Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + /** + * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. the call-back functor \p + * block_prefix_callback_op is invoked by the first warp in the block, and the value + * returned by lane0 in that warp is used as the "seed" value that + * logically prefixes the thread block's scan inputs. Also provides every thread with + * the block-wide \p block_aggregate of all inputs. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a thread + * block-wide prefix to be applied to all inputs. + */ + template + __device__ __forceinline__ void InclusiveScan(T input, + T &output, + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { if (WARP_SYNCHRONOUS) { diff --git a/cub/cub/block/specializations/block_scan_warp_scans.cuh b/cub/cub/block/specializations/block_scan_warp_scans.cuh index f48b6dec0a4..0eef68780d0 100644 --- a/cub/cub/block/specializations/block_scan_warp_scans.cuh +++ b/cub/cub/block/specializations/block_scan_warp_scans.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. */ @@ -48,14 +48,22 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + * @brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA + * thread block. 
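// (Illustrative aside, not part of this refactor.) The BlockScanRaking specialization above and
// the BlockScanWarpScans specialization documented here both back the public cub::BlockScan
// collective. A minimal usage sketch, assuming a 128-thread block and int data, showing how the
// warp-scans variant is selected through BlockScan's algorithm template parameter:
#include <cub/block/block_scan.cuh>

__global__ void ExampleBlockScanKernel(const int *d_in, int *d_out)
{
    using BlockScanT = cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS>;
    __shared__ typename BlockScanT::TempStorage temp_storage;

    int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];

    // Block-wide exclusive prefix sum; block_aggregate receives the sum of all 128 inputs
    int block_aggregate;
    BlockScanT(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);

    d_out[blockIdx.x * 128 + threadIdx.x] = thread_data;
}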
+ * + * @tparam BLOCK_DIM_X + * The thread block length in threads along the X dimension + * + * @tparam BLOCK_DIM_Y + * The thread block length in threads along the Y dimension + * + * @tparam BLOCK_DIM_Z + * The thread block length in threads along the Z dimension + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct BlockScanWarpScans { //--------------------------------------------------------------------- @@ -85,9 +93,13 @@ struct BlockScanWarpScans struct __align__(32) _TempStorage { - T warp_aggregates[WARPS]; - typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans - T block_prefix; ///< Shared prefix for the entire thread block + T warp_aggregates[WARPS]; + + /// Buffer for warp-synchronous scans + typename WarpScanT::TempStorage warp_scan[WARPS]; + + /// Shared prefix for the entire thread block + T block_prefix; }; @@ -125,12 +137,21 @@ struct BlockScanWarpScans // Utility methods //--------------------------------------------------------------------- + /** + * @param[out] warp_prefix + * The calling thread's partial reduction + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void ApplyWarpAggregates( - T &warp_prefix, ///< [out] The calling thread's partial reduction - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - Int2Type /*addend_warp*/) + __device__ __forceinline__ void ApplyWarpAggregates(T &warp_prefix, + ScanOp scan_op, + T &block_aggregate, + Int2Type /*addend_warp*/) { if (warp_id == WARP) warp_prefix = block_aggregate; @@ -141,21 +162,41 @@ struct BlockScanWarpScans ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); } + /** + * @param[out] warp_prefix + * The calling thread's partial reduction + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregat + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void ApplyWarpAggregates( - T &/*warp_prefix*/, ///< [out] The calling thread's partial reduction - ScanOp /*scan_op*/, ///< [in] Binary scan operator - T &/*block_aggregate*/, ///< [out] Threadblock-wide aggregate reduction of input items - Int2Type /*addend_warp*/) + __device__ __forceinline__ void ApplyWarpAggregates(T & /*warp_prefix*/, + ScanOp /*scan_op*/, + T & /*block_aggregate*/, + Int2Type /*addend_warp*/) {} - - /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + /** + * @brief Use the warp-wide aggregates to compute the calling warp's prefix. Also returns + * block-wide aggregate in all threads. 
+ * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] warp_aggregate + * [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of + * input items + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ T ComputeWarpPrefix( - ScanOp scan_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + __device__ __forceinline__ T ComputeWarpPrefix(ScanOp scan_op, + T warp_aggregate, + T &block_aggregate) { // Last lane in each warp shares its warp-aggregate if (lane_id == WARP_THREADS - 1) @@ -187,14 +228,26 @@ struct BlockScanWarpScans return warp_prefix; } - - /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + /** + * @brief Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. + * Also returns block-wide aggregate in all threads. + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] warp_aggregate + * [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of + * input items + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + * + * @param[in] initial_value + * Initial value to seed the exclusive scan + */ template - __device__ __forceinline__ T ComputeWarpPrefix( - ScanOp scan_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - const T &initial_value) ///< [in] Initial value to seed the exclusive scan + __device__ __forceinline__ T + ComputeWarpPrefix(ScanOp scan_op, T warp_aggregate, T &block_aggregate, const T &initial_value) { T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); @@ -210,39 +263,73 @@ struct BlockScanWarpScans // Exclusive scans //--------------------------------------------------------------------- - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. With no initial value, + * the output computed for thread0 is undefined. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] exclusive_output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op) { // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. T block_aggregate; ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
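// (Illustrative aside, not part of this refactor.) The Int2Type recursion in ApplyWarpAggregates
// above unrolls, at compile time, what is conceptually the loop below: given one aggregate per
// warp, it derives each warp's exclusive prefix and the block-wide total. A minimal host-side
// reference sketch; the function name and the loop form are assumptions for illustration, not
// the shipped implementation.
#include <cstddef>
#include <vector>

template <typename T, typename ScanOp>
T ComputeWarpPrefixesReference(const std::vector<T> &warp_aggregates,
                               std::vector<T> &warp_prefixes,
                               ScanOp scan_op)
{
    T block_aggregate = warp_aggregates[0];
    warp_prefixes[0]  = block_aggregate; // warp 0's prefix is never consumed by callers

    for (std::size_t warp = 1; warp < warp_aggregates.size(); ++warp)
    {
        // Prefix of this warp = combination of all preceding warps' aggregates
        warp_prefixes[warp] = block_aggregate;
        block_aggregate     = scan_op(block_aggregate, warp_aggregates[warp]);
    }

    // Every caller also receives the block-wide aggregate of all warps
    return block_aggregate;
}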
+ /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. + * + * @param[in] input + * Calling thread's input items + * + * @param[out] exclusive_output + * Calling thread's output items (may be aliased to \p input) + * + * @param[in] initial_value + * Initial value to seed the exclusive scan + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void + ExclusiveScan(T input, T &exclusive_output, const T &initial_value, ScanOp scan_op) { T block_aggregate; ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. Also provides every + * thread with the block-wide \p block_aggregate of all inputs. With no initial value, + * the output computed for thread0 is undefined. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] exclusive_output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + __device__ __forceinline__ void + ExclusiveScan(T input, T &exclusive_output, ScanOp scan_op, T &block_aggregate) { // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. T inclusive_output; @@ -260,15 +347,32 @@ struct BlockScanWarpScans } } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. Also provides every + * thread with the block-wide \p block_aggregate of all inputs. 
+ * + * @param[in] input + * Calling thread's input items + * + * @param[out] exclusive_output + * Calling thread's output items (may be aliased to \p input) + * + * @param[in] initial_value + * Initial value to seed the exclusive scan + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + __device__ __forceinline__ void ExclusiveScan(T input, + T &exclusive_output, + const T &initial_value, + ScanOp scan_op, + T &block_aggregate) { // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. T inclusive_output; @@ -283,16 +387,32 @@ struct BlockScanWarpScans exclusive_output = warp_prefix; } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + /** + * @brief Computes an exclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. the call-back functor \p + * block_prefix_callback_op is invoked by the first warp in the block, and the value + * returned by lane0 in that warp is used as the "seed" value that + * logically prefixes the thread block's scan inputs. Also provides every thread with + * the block-wide \p block_aggregate of all inputs. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] exclusive_output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a thread + * block-wide prefix to be applied to all inputs. + */ + template + __device__ __forceinline__ void ExclusiveScan(T input, + T &exclusive_output, + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. T block_aggregate; @@ -327,25 +447,46 @@ struct BlockScanWarpScans // Inclusive scans //--------------------------------------------------------------------- - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
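// (Usage sketch, not part of this refactor.) The block_prefix_callback_op variants documented
// above expect a stateful functor whose operator() is invoked by the block's first warp with the
// tile's block-wide aggregate and returns the prefix that seeds the tile's scan (only lane 0's
// return value matters). A minimal multi-tile example through the public cub::BlockScan front
// end; the functor and kernel names are assumptions for illustration.
#include <cub/block/block_scan.cuh>

struct RunningPrefixOp
{
    int running_total;

    __device__ explicit RunningPrefixOp(int initial) : running_total(initial) {}

    // Called once per tile by the first warp; the value returned by lane 0 becomes the
    // prefix applied to every thread's scan result for that tile
    __device__ int operator()(int block_aggregate)
    {
        int old_prefix = running_total;
        running_total += block_aggregate;
        return old_prefix;
    }
};

__global__ void MultiTileExclusiveSum(int *d_data, int num_tiles)
{
    using BlockScanT = cub::BlockScan<int, 128>;
    __shared__ typename BlockScanT::TempStorage temp_storage;

    RunningPrefixOp prefix_op(0);

    for (int tile = 0; tile < num_tiles; ++tile)
    {
        int idx  = tile * 128 + threadIdx.x;
        int item = d_data[idx];

        // Exclusive prefix sum over this tile, seeded with the running total of all
        // previous tiles via the callback functor
        BlockScanT(temp_storage).ExclusiveSum(item, item, prefix_op);
        __syncthreads(); // temp_storage is reused across tiles

        d_data[idx] = item;
    }
}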
+ /** + * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] inclusive_output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op) { T block_aggregate; InclusiveScan(input, inclusive_output, scan_op, block_aggregate); } - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + /** + * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. Also provides every + * thread with the block-wide \p block_aggregate of all inputs. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] inclusive_output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] block_aggregate + * Threadblock-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + __device__ __forceinline__ void + InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, T &block_aggregate) { WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); @@ -359,16 +500,32 @@ struct BlockScanWarpScans } } - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + /** + * @brief Computes an inclusive thread block-wide prefix scan using the specified binary \p + * scan_op functor. Each thread contributes one input element. the call-back functor \p + * block_prefix_callback_op is invoked by the first warp in the block, and the value + * returned by lane0 in that warp is used as the "seed" value that + * logically prefixes the thread block's scan inputs. 
Also provides every thread with + * the block-wide \p block_aggregate of all inputs. + * + * @param[in] input + * Calling thread's input item + * + * @param[out] exclusive_output + * Calling thread's output item (may be aliased to \p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in-out] block_prefix_callback_op + * [warp0 only] Call-back functor for specifying a thread + * block-wide prefix to be applied to all inputs. + */ + template + __device__ __forceinline__ void InclusiveScan(T input, + T &exclusive_output, + ScanOp scan_op, + BlockPrefixCallbackOp &block_prefix_callback_op) { T block_aggregate; InclusiveScan(input, exclusive_output, scan_op, block_aggregate); diff --git a/cub/cub/device/device_spmv.cuh b/cub/cub/device/device_spmv.cuh index ac6fd123142..b68c70d842d 100644 --- a/cub/cub/device/device_spmv.cuh +++ b/cub/cub/device/device_spmv.cuh @@ -28,8 +28,9 @@ ******************************************************************************/ /** - * \file - * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + * @file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector + * multiplication (SpMV). */ #pragma once @@ -54,10 +55,12 @@ CUB_NAMESPACE_BEGIN /** - * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV). - * \ingroup SingleModule + * @brief DeviceSpmv provides device-wide parallel operations for performing + * sparse-matrix * dense-vector multiplication (SpMV). * - * \par Overview + * @ingroup SingleModule + * + * @par Overview * The [SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication) * performs the matrix-vector operation * y = A*x + y, @@ -67,29 +70,31 @@ CUB_NAMESPACE_BEGIN * (i.e., three arrays: values, row_offsets, and column_indices) * - x and y are dense vectors * - * \par Usage Considerations - * \cdp_class{DeviceSpmv} + * @par Usage Considerations + * @cdp_class{DeviceSpmv} * */ struct DeviceSpmv { /******************************************************************//** - * \name CSR matrix operations + * @name CSR matrix operations *********************************************************************/ //@{ /** - * \brief This function performs the matrix-vector operation y = A*x. + * @brief This function performs the matrix-vector operation + * y = A*x. * - * \par Snippet + * @par Snippet * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A * representing a 3x3 lattice (24 non-zeros). * - * \par - * \code + * @par + * @code * #include // or equivalently * - * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x, + * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input + * vector x, * // and output vector y * int num_rows = 9; * int num_cols = 9; @@ -126,25 +131,63 @@ struct DeviceSpmv * * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] * - * \endcode + * @endcode + * + * @tparam ValueT + * [inferred] Matrix and vector value type (e.g., @p float, @p double, etc.) + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to @p temp_storage_bytes + * and no work is done. 
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of @p d_temp_storage allocation + * + * @param[in] d_values + * Pointer to the array of @p num_nonzeros values of the corresponding nonzero elements + * of matrix A. + * + * @param[in] d_row_offsets + * Pointer to the array of @p m + 1 offsets demarcating the start of every row in + * @p d_column_indices and @p d_values (with the final entry being equal to @p num_nonzeros) + * + * @param[in] d_column_indices + * Pointer to the array of @p num_nonzeros column-indices of the corresponding nonzero + * elements of matrix A. (Indices are zero-valued.) + * + * @param[in] d_vector_x + * Pointer to the array of @p num_cols values corresponding to the dense input vector + * x + * + * @param[out] d_vector_y + * Pointer to the array of @p num_rows values corresponding to the dense output vector + * y + * + * @param[in] num_rows + * number of rows of matrix A. + * + * @param[in] num_cols + * number of columns of matrix A. + * + * @param[in] num_nonzeros + * number of nonzero elements of matrix A. * - * \tparam ValueT [inferred] Matrix and vector value type (e.g., /p float, /p double, etc.) + * @param[in] stream + * [optional] CUDA stream to launch kernels within. Default is stream0. */ - template < - typename ValueT> - CUB_RUNTIME_FUNCTION - static cudaError_t CsrMV( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. - const int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros) - const int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) - const ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x - ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y - int num_rows, ///< [in] number of rows of matrix A. - int num_cols, ///< [in] number of columns of matrix A. - int num_nonzeros, ///< [in] number of nonzero elements of matrix A. - cudaStream_t stream = 0) ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
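// (Illustration only, not part of this refactor.) A tiny concrete CSR instance clarifying the
// d_values / d_row_offsets / d_column_indices parameters documented above, for the 3x3 matrix
// A = [[1, 0, 2], [0, 3, 0], [4, 0, 5]]. Host arrays are shown for brevity; in practice these
// would be copied to device-accessible memory before calling CsrMV.
int   num_rows         = 3;
int   num_cols         = 3;
int   num_nonzeros     = 5;
float values[]         = {1, 2, 3, 4, 5}; // nonzero values in row-major order
int   column_indices[] = {0, 2, 1, 0, 2}; // zero-based column index of each nonzero
int   row_offsets[]    = {0, 2, 3, 5};    // num_rows + 1 entries; the last equals num_nonzeros
// Row i occupies values[row_offsets[i]] .. values[row_offsets[i + 1] - 1].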
+ template + CUB_RUNTIME_FUNCTION static cudaError_t CsrMV(void *d_temp_storage, + size_t &temp_storage_bytes, + const ValueT *d_values, + const int *d_row_offsets, + const int *d_column_indices, + const ValueT *d_vector_x, + ValueT *d_vector_y, + int num_rows, + int num_cols, + int num_nonzeros, + cudaStream_t stream = 0) { SpmvParams spmv_params; spmv_params.d_values = d_values; diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index aef8df7a8da..ae72a22d748 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. + * @file + * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across + * a sequence of data items residing within device-accessible memory. */ #pragma once @@ -74,26 +75,59 @@ CUB_NAMESPACE_BEGIN *****************************************************************************/ /** - * Upsweep digit-counting kernel entry point (multi-block). Computes privatized digit histograms, one per block. + * @brief Upsweep digit-counting kernel entry point (multi-block). + * Computes privatized digit histograms, one per block. + * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam ALT_DIGIT_BITS + * Whether or not to use the alternate (lower-bits) policy + * + * @tparam IS_DESCENDING + * Whether or not the sorted-order is high-to-low + * + * @tparam KeyT + * Key type + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @param[in] d_keys + * Input keys buffer + * + * @param[out] d_spine + * Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, + * then 1s counts from each block, etc.) + * + * @param[in] num_items + * Total number of input data items + * + * @param[in] current_bit + * Bit position of current radix digit + * + * @param[in] num_bits + * Number of bits of current radix digit + * + * @param[in] even_share + * Even-share descriptor for mapan equal number of tiles onto each thread block */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename OffsetT, ///< Signed integer type for global offsets - typename DecomposerT = detail::identity_decomposer_t> -__launch_bounds__ (int((ALT_DIGIT_BITS) ? - int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS) : - int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))) -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortUpsweepKernel( - const KeyT *d_keys, ///< [in] Input keys buffer - OffsetT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) 
- OffsetT /*num_items*/, ///< [in] Total number of input data items - int current_bit, ///< [in] Bit position of current radix digit - int num_bits, ///< [in] Number of bits of current radix digit - GridEvenShare even_share, ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block - DecomposerT decomposer = {}) +template +__launch_bounds__(int((ALT_DIGIT_BITS) + ? int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS) + : int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))) + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortUpsweepKernel(const KeyT *d_keys, + OffsetT *d_spine, + OffsetT /*num_items*/, + int current_bit, + int num_bits, + GridEvenShare even_share, + DecomposerT decomposer = {}) { using ActiveUpsweepPolicyT = cub::detail::conditional_t< @@ -137,17 +171,27 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortUpsweepKernel( upsweep.template ExtractCounts(d_spine, gridDim.x, blockIdx.x); } - /** - * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms + * @brief Spine scan kernel entry point (single-block). + * Computes an exclusive prefix sum over the privatized digit histograms + * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @param[in,out] d_spine + * Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, + * then 1s counts from each block, etc.) + * + * @param[in] num_counts + * Total number of bin-counts */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1) -CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel( - OffsetT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) - int num_counts) ///< [in] Total number of bin-counts +template +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), + 1) CUB_DETAIL_KERNEL_ATTRIBUTES + void RadixSortScanBinsKernel(OffsetT *d_spine, int num_counts) { // Parameterize the AgentScan type for the current configuration typedef AgentScan< @@ -183,32 +227,77 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel( } } - /** - * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place. + * @brief Downsweep pass kernel entry point (multi-block). + * Scatters keys (and values) into corresponding bins for the current digit place. + * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam ALT_DIGIT_BITS + * Whether or not to use the alternate (lower-bits) policy + * + * @tparam IS_DESCENDING + * Whether or not the sorted-order is high-to-low + * + * @tparam KeyT + * Key type + * + * @tparam ValueT + * Value type + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @param[in] d_keys_in + * Input keys buffer + * + * @param[in] d_keys_out + * Output keys buffer + * + * @param[in] d_values_in + * Input values buffer + * + * @param[in] d_values_out + * Output values buffer + * + * @param[in] d_spine + * Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, + * then 1s counts from each block, etc.) 
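// (Illustration only, not part of this refactor.) The "striped" spine layout referred to in the
// parameter above: with G upsweep/downsweep blocks and RADIX_DIGITS bins, block b's count (and,
// after the spine scan, its scatter base offset) for digit d lives at d_spine[d * G + b], i.e.
// all 0-bin counts first, then all 1-bin counts, and so on. The helper below is a sketch for
// illustration, not part of the CUB API.
__host__ __device__ inline int SpineIndex(int digit, int block, int num_blocks)
{
    return digit * num_blocks + block;
}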
+ * + * @param[in] num_items + * Total number of input data items + * + * @param[in] current_bit + * Bit position of current radix digit + * + * @param[in] num_bits + * Number of bits of current radix digit + * + * @param[in] even_share + * Even-share descriptor for mapan equal number of tiles onto each thread block */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetT, ///< Signed integer type for global offsets - typename DecomposerT = detail::identity_decomposer_t> -__launch_bounds__ (int((ALT_DIGIT_BITS) ? - int(ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS) : - int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))) -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortDownsweepKernel( - const KeyT *d_keys_in, ///< [in] Input keys buffer - KeyT *d_keys_out, ///< [in] Output keys buffer - const ValueT *d_values_in, ///< [in] Input values buffer - ValueT *d_values_out, ///< [in] Output values buffer - OffsetT *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) - OffsetT num_items, ///< [in] Total number of input data items - int current_bit, ///< [in] Bit position of current radix digit - int num_bits, ///< [in] Number of bits of current radix digit - GridEvenShare even_share, ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block - DecomposerT decomposer = {}) +template +__launch_bounds__(int((ALT_DIGIT_BITS) + ? int(ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS) + : int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))) + CUB_DETAIL_KERNEL_ATTRIBUTES + void DeviceRadixSortDownsweepKernel(const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + OffsetT *d_spine, + OffsetT num_items, + int current_bit, + int num_bits, + GridEvenShare even_share, + DecomposerT decomposer = {}) { using ActiveUpsweepPolicyT = cub::detail::conditional_t< @@ -250,27 +339,62 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortDownsweepKernel( even_share.block_end); } - /** - * Single pass kernel entry point (single-block). Fully sorts a tile of input. + * @brief Single pass kernel entry point (single-block). + * Fully sorts a tile of input. 
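// (Illustration only, not part of this refactor.) A single-threaded reference for what one radix
// digit pass computes, mirroring the division of labor among the upsweep (digit histogram),
// spine-scan (exclusive prefix sum over bins) and downsweep (stable scatter) kernels above; the
// device code additionally privatizes the histogram per thread block. Names and the sequential
// form are assumptions for the sketch.
#include <cstddef>
#include <cstdint>
#include <vector>

void RadixDigitPassReference(const std::vector<std::uint32_t> &keys_in,
                             std::vector<std::uint32_t> &keys_out,
                             int current_bit,
                             int num_bits)
{
    const std::uint32_t radix = 1u << num_bits;
    const std::uint32_t mask  = radix - 1;

    // "Upsweep": histogram of the current digit
    std::vector<std::size_t> bin_count(radix, 0);
    for (std::uint32_t key : keys_in)
    {
        ++bin_count[(key >> current_bit) & mask];
    }

    // "Spine scan": exclusive prefix sum of bin counts yields each bin's base offset
    std::vector<std::size_t> bin_offset(radix, 0);
    for (std::uint32_t bin = 1; bin < radix; ++bin)
    {
        bin_offset[bin] = bin_offset[bin - 1] + bin_count[bin - 1];
    }

    // "Downsweep": stable scatter of keys into their bins
    keys_out.resize(keys_in.size());
    for (std::uint32_t key : keys_in)
    {
        keys_out[bin_offset[(key >> current_bit) & mask]++] = key;
    }
}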
+ * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam IS_DESCENDING + * Whether or not the sorted-order is high-to-low + * + * @tparam KeyT + * Key type + * + * @tparam ValueT + * Value type + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @param[in] d_keys_in + * Input keys buffer + * + * @param[in] d_keys_out + * Output keys buffer + * + * @param[in] d_values_in + * Input values buffer + * + * @param[in] d_values_out + * Output values buffer + * + * @param[in] num_items + * Total number of input data items + * + * @param[in] current_bit + * Bit position of current radix digit + * + * @param[in] end_bit + * The past-the-end (most-significant) bit index needed for key comparison */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetT, ///< Signed integer type for global offsets - typename DecomposerT = detail::identity_decomposer_t> -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortSingleTileKernel( - const KeyT *d_keys_in, ///< [in] Input keys buffer - KeyT *d_keys_out, ///< [in] Output keys buffer - const ValueT *d_values_in, ///< [in] Input values buffer - ValueT *d_values_out, ///< [in] Output values buffer - OffsetT num_items, ///< [in] Total number of input data items - int current_bit, ///< [in] Bit position of current radix digit - int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison - DecomposerT decomposer = {}) +template +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), + 1) CUB_DETAIL_KERNEL_ATTRIBUTES + void DeviceRadixSortSingleTileKernel(const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + OffsetT num_items, + int current_bit, + int end_bit, + DecomposerT decomposer = {}) { // Constants enum @@ -370,34 +494,89 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortSingleTileKernel( } } - /** - * Segmented radix sorting pass (one block per segment) + * @brief Segmented radix sorting pass (one block per segment) + * + * @tparam ChainedPolicyT + * Chained tuning policy + * + * @tparam ALT_DIGIT_BITS + * Whether or not to use the alternate (lower-bits) policy + * + * @tparam IS_DESCENDING + * Whether or not the sorted-order is high-to-low + * + * @tparam KeyT + * Key type + * + * @tparam ValueT + * Value type + * + * @tparam BeginOffsetIteratorT + * Random-access input iterator type for reading segment beginning offsets \iterator + * + * @tparam EndOffsetIteratorT + * Random-access input iterator type for reading segment ending offsets \iterator + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @param[in] d_keys_in + * Input keys buffer + * + * @param[in] d_keys_out + * Output keys buffer + * + * @param[in] d_values_in + * Input values buffer + * + * @param[in] d_values_out + * Output values buffer + * + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of length @p num_segments, + * such that d_begin_offsets[i] is the first element of the ith + * data segment in d_keys_* and d_values_* + * + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length @p num_segments, + * such that d_end_offsets[i]-1 is the last element of the ith + * data 
segment in d_keys_* and d_values_*. + * If d_end_offsets[i]-1 <= d_begin_offsets[i], + * the ith is considered empty. + * + * @param[in] num_segments + * The number of segments that comprise the sorting data + * + * @param[in] current_bit + * Bit position of current radix digit + * + * @param[in] pass_bits + * Number of bits of current radix digit */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator - typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename DecomposerT = detail::identity_decomposer_t> -__launch_bounds__ (int((ALT_DIGIT_BITS) ? - ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : - ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedRadixSortKernel( - const KeyT *d_keys_in, ///< [in] Input keys buffer - KeyT *d_keys_out, ///< [in] Output keys buffer - const ValueT *d_values_in, ///< [in] Input values buffer - ValueT *d_values_out, ///< [in] Output values buffer - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data - int current_bit, ///< [in] Bit position of current radix digit - int pass_bits, ///< [in] Number of bits of current radix digit - DecomposerT decomposer = {}) +template +__launch_bounds__(int((ALT_DIGIT_BITS) + ? 
ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS + : ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) + CUB_DETAIL_KERNEL_ATTRIBUTES + void DeviceSegmentedRadixSortKernel(const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + int /*num_segments*/, + int current_bit, + int pass_bits, + DecomposerT decomposer = {}) { // // Constants @@ -695,12 +874,18 @@ template <> struct sm90_small_key_tuning<2, 16, 8> { static constexpr int thread ******************************************************************************/ /** - * Tuning policy for kernel specialization + * @brief Tuning policy for kernel specialization + * + * @tparam KeyT + * Key type + * + * @tparam ValueT + * Value type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetT> ///< Signed integer type for global offsets +template struct DeviceRadixSortPolicy { //------------------------------------------------------------------------------ @@ -1192,17 +1377,41 @@ struct DispatchRadixSort : SelectedPolicy // Problem state //------------------------------------------------------------------------------ - void *d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - OffsetT num_items; ///< [in] Number of items to sort - int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - int ptx_version; ///< [in] PTX version - bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers - DecomposerT decomposer; + /// Device-accessible allocation of temporary storage. + // When NULL, the required allocation size is written to @p temp_storage_bytes and no work is + // done. + void *d_temp_storage; + + /// Reference to size in bytes of @p d_temp_storage allocation + size_t &temp_storage_bytes; + + /// Double-buffer whose current buffer contains the unsorted input keys and, upon return, is + /// updated to point to the sorted output keys + DoubleBuffer &d_keys; + + /// Double-buffer whose current buffer contains the unsorted input values and, upon return, is + /// updated to point to the sorted output values + DoubleBuffer &d_values; + + /// Number of items to sort + OffsetT num_items; + + /// The beginning (least-significant) bit index needed for key comparison + int begin_bit; + + /// The past-the-end (most-significant) bit index needed for key comparison + int end_bit; + + /// CUDA stream to launch kernels within. Default is stream0. 
+ cudaStream_t stream; + + /// PTX version + int ptx_version; + + /// Whether is okay to overwrite source buffers + bool is_overwrite_okay; + + DecomposerT decomposer; //------------------------------------------------------------------------------ @@ -1271,13 +1480,21 @@ struct DispatchRadixSort : SelectedPolicy // Small-problem (single tile) invocation //------------------------------------------------------------------------------ - /// Invoke a single block to sort in-core - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename SingleTileKernelT> ///< Function type of cub::DeviceRadixSortSingleTileKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokeSingleTile( - SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel + /** + * @brief Invoke a single block to sort in-core + * + * @tparam ActivePolicyT + * Umbrella policy active for the target device + * + * @tparam SingleTileKernelT + * Function type of cub::DeviceRadixSortSingleTileKernel + * + * @param[in] single_tile_kernel + * Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + InvokeSingleTile(SingleTileKernelT single_tile_kernel) { cudaError error = cudaSuccess; do @@ -1786,19 +2003,47 @@ struct DispatchRadixSort : SelectedPolicy return error; } - /// Invocation (run multiple digit passes) - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename UpsweepKernelT, ///< Function type of cub::DeviceRadixSortUpsweepKernel - typename ScanKernelT, ///< Function type of cub::SpineScanKernel - typename DownsweepKernelT> ///< Function type of cub::DeviceRadixSortDownsweepKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokePasses( - UpsweepKernelT upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel - UpsweepKernelT alt_upsweep_kernel, ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel - ScanKernelT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel - DownsweepKernelT downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel - DownsweepKernelT alt_downsweep_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel + /** + * @brief Invocation (run multiple digit passes) + * + * @tparam ActivePolicyT + * Umbrella policy active for the target device + * + * @tparam UpsweepKernelT + * Function type of cub::DeviceRadixSortUpsweepKernel + * + * @tparam ScanKernelT + * Function type of cub::SpineScanKernel + * + * @tparam DownsweepKernelT + * Function type of cub::DeviceRadixSortDownsweepKernel + * + * @param[in] upsweep_kernel + * Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel + * + * @param[in] alt_upsweep_kernel + * Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel + * + * @param[in] scan_kernel + * Kernel function pointer to parameterization of cub::SpineScanKernel + * + * @param[in] downsweep_kernel + * Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel + * + * @param[in] alt_downsweep_kernel + * Alternate kernel function pointer to parameterization of + * cub::DeviceRadixSortDownsweepKernel + */ + template 
+ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + InvokePasses(UpsweepKernelT upsweep_kernel, + UpsweepKernelT alt_upsweep_kernel, + ScanKernelT scan_kernel, + DownsweepKernelT downsweep_kernel, + DownsweepKernelT alt_downsweep_kernel) { cudaError error = cudaSuccess; do @@ -1858,9 +2103,14 @@ struct DispatchRadixSort : SelectedPolicy void* allocations[3] = {}; size_t allocation_sizes[3] = { - spine_length * sizeof(OffsetT), // bytes needed for privatized block digit histograms - (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer - (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer + // bytes needed for privatized block digit histograms + spine_length * sizeof(OffsetT), + + // bytes needed for 3rd keys buffer + (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), + + // bytes needed for 3rd values buffer + (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), }; // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) @@ -2087,20 +2337,48 @@ struct DispatchRadixSort : SelectedPolicy //------------------------------------------------------------------------------ /** - * Internal dispatch routine + * @brief Internal dispatch routine + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When NULL, the required + * allocation size is written to @p temp_storage_bytes and no work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of @p d_temp_storage allocation + * + * @param[in,out] d_keys + * Double-buffer whose current buffer contains the unsorted input keys and, + * upon return, is updated to point to the sorted output keys + * + * @param[in,out] d_values + * Double-buffer whose current buffer contains the unsorted input values and, + * upon return, is updated to point to the sorted output values + * + * @param[in] num_items + * Number of items to sort + * + * @param[in] begin_bit + * The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * The past-the-end (most-significant) bit index needed for key comparison + * + * @param[in] is_overwrite_okay + * Whether is okay to overwrite source buffers + * + * @param[in] stream + * CUDA stream to launch kernels within. Default is stream0. */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - OffsetT num_items, ///< [in] Number of items to sort - int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison - bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. 
- DecomposerT decomposer = {}) + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + OffsetT num_items, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream, + DecomposerT decomposer = {}) { typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; @@ -2174,17 +2452,35 @@ struct DispatchRadixSort : SelectedPolicy ******************************************************************************/ /** - * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort + * @brief Utility class for dispatching the appropriately-tuned kernels for segmented device-wide + * radix sort + * + * @tparam IS_DESCENDING + * Whether or not the sorted-order is high-to-low + * + * @tparam KeyT + * Key type + * + * @tparam ValueT + * Value type + * + * @tparam BeginOffsetIteratorT + * Random-access input iterator type for reading segment beginning offsets \iterator + * + * @tparam EndOffsetIteratorT + * Random-access input iterator type for reading segment ending offsets \iterator + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename BeginOffsetIteratorT, ///< Random-access input iterator type for reading segment beginning offsets \iterator - typename EndOffsetIteratorT, ///< Random-access input iterator type for reading segment ending offsets \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename SelectedPolicy = DeviceRadixSortPolicy, - typename DecomposerT = detail::identity_decomposer_t> +template , + typename DecomposerT = detail::identity_decomposer_t> struct DispatchSegmentedRadixSort : SelectedPolicy { //------------------------------------------------------------------------------ @@ -2198,21 +2494,54 @@ struct DispatchSegmentedRadixSort : SelectedPolicy // Parameter members //------------------------------------------------------------------------------ - void *d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - OffsetT num_items; ///< [in] Number of items to sort - OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets; ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets; ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
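// (Usage sketch, not part of this refactor.) How the temporary-storage and DoubleBuffer members
// documented above are typically driven from user code through the public cub::DeviceRadixSort
// front end: query the storage size with a NULL d_temp_storage, allocate, sort, then read the
// result from the buffer the DoubleBuffer selector points at on return. Error handling is
// omitted and the pointer names are assumptions.
#include <cub/device/device_radix_sort.cuh>
#include <cuda_runtime.h>

void SortPairsExample(int *d_key_buf, int *d_key_alt_buf,
                      int *d_val_buf, int *d_val_alt_buf,
                      int num_items, cudaStream_t stream)
{
    cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
    cub::DoubleBuffer<int> d_values(d_val_buf, d_val_alt_buf);

    // First call: d_temp_storage is NULL, so only temp_storage_bytes is written
    void  *d_temp_storage     = nullptr;
    size_t temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
                                    d_keys, d_values, num_items,
                                    0, sizeof(int) * 8, stream);

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call: performs the sort; the double buffers' selectors are updated to reference
    // the buffers holding the sorted output (the alternate buffers may be overwritten)
    cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
                                    d_keys, d_values, num_items,
                                    0, sizeof(int) * 8, stream);

    int *d_sorted_keys   = d_keys.Current();   // sorted keys
    int *d_sorted_values = d_values.Current(); // values permuted alongside the keys
    (void)d_sorted_keys;
    (void)d_sorted_values;

    cudaFree(d_temp_storage);
}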
- int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - int ptx_version; ///< [in] PTX version - bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers - DecomposerT decomposer; + /// Device-accessible allocation of temporary storage. When NULL, the required allocation size + /// is written to @p temp_storage_bytes and no work is done. + void *d_temp_storage; + + /// Reference to size in bytes of @p d_temp_storage allocation + size_t &temp_storage_bytes; + + /// Double-buffer whose current buffer contains the unsorted input keys and, upon return, is + /// updated to point to the sorted output keys + DoubleBuffer &d_keys; + + /// Double-buffer whose current buffer contains the unsorted input values and, upon return, is + /// updated to point to the sorted output values + DoubleBuffer &d_values; + + /// Number of items to sort + OffsetT num_items; + /// The number of segments that comprise the sorting data + OffsetT num_segments; + + /// Random-access input iterator to the sequence of beginning offsets of length @p num_segments, + /// such that d_begin_offsets[i] is the first element of the ith + /// data segment in d_keys_* and d_values_* + BeginOffsetIteratorT d_begin_offsets; + + /// Random-access input iterator to the sequence of ending offsets of length @p num_segments, + /// such that d_end_offsets[i]-1 is the last element of the ith + /// data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 + /// <= d_begin_offsets[i], the ith is considered empty. + EndOffsetIteratorT d_end_offsets; + + /// The beginning (least-significant) bit index needed for key comparison + int begin_bit; + + /// The past-the-end (most-significant) bit index needed for key comparison + int end_bit; + + /// CUDA stream to launch kernels within. Default is stream0. 
+ cudaStream_t stream; + + /// PTX version + int ptx_version; + + /// Whether is okay to overwrite source buffers + bool is_overwrite_okay; + + DecomposerT decomposer; //------------------------------------------------------------------------------ // Constructors @@ -2220,36 +2549,34 @@ struct DispatchSegmentedRadixSort : SelectedPolicy /// Constructor CUB_RUNTIME_FUNCTION __forceinline__ - DispatchSegmentedRadixSort( - void* d_temp_storage, - size_t &temp_storage_bytes, - DoubleBuffer &d_keys, - DoubleBuffer &d_values, - OffsetT num_items, - OffsetT num_segments, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - int begin_bit, - int end_bit, - bool is_overwrite_okay, - cudaStream_t stream, - int ptx_version, - DecomposerT decomposer = {}) - : - d_temp_storage(d_temp_storage), - temp_storage_bytes(temp_storage_bytes), - d_keys(d_keys), - d_values(d_values), - num_items(num_items), - num_segments(num_segments), - d_begin_offsets(d_begin_offsets), - d_end_offsets(d_end_offsets), - begin_bit(begin_bit), - end_bit(end_bit), - stream(stream), - ptx_version(ptx_version), - is_overwrite_okay(is_overwrite_okay), - decomposer(decomposer) + DispatchSegmentedRadixSort(void *d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + OffsetT num_items, + OffsetT num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream, + int ptx_version, + DecomposerT decomposer = {}) + : d_temp_storage(d_temp_storage) + , temp_storage_bytes(temp_storage_bytes) + , d_keys(d_keys) + , d_values(d_values) + , num_items(num_items) + , num_segments(num_segments) + , d_begin_offsets(d_begin_offsets) + , d_end_offsets(d_end_offsets) + , begin_bit(begin_bit) + , end_bit(end_bit) + , stream(stream) + , ptx_version(ptx_version) + , is_overwrite_okay(is_overwrite_okay) + , decomposer(decomposer) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED @@ -2376,15 +2703,25 @@ struct DispatchSegmentedRadixSort : SelectedPolicy } }; - - /// Invocation (run multiple digit passes) - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename SegmentedKernelT> ///< Function type of cub::DeviceSegmentedRadixSortKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokePasses( - SegmentedKernelT segmented_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel - SegmentedKernelT alt_segmented_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel + /** + * @brief Invocation (run multiple digit passes) + * + * @tparam ActivePolicyT + * Umbrella policy active for the target device + * + * @tparam SegmentedKernelT + * Function type of cub::DeviceSegmentedRadixSortKernel + * + * @param[in] segmented_kernel + * Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel + * + * @param[in] alt_segmented_kernel + * Alternate kernel function pointer to parameterization of + * cub::DeviceSegmentedRadixSortKernel + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + InvokePasses(SegmentedKernelT segmented_kernel, SegmentedKernelT alt_segmented_kernel) { cudaError error = cudaSuccess; do @@ -2398,8 +2735,11 @@ struct DispatchSegmentedRadixSort : SelectedPolicy void* allocations[2] = {}; size_t allocation_sizes[2] = { - (is_overwrite_okay) ? 
0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer - (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer + // bytes needed for 3rd keys buffer + (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), + + // bytes needed for 3rd values buffer + (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), }; // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) @@ -2515,22 +2855,67 @@ struct DispatchSegmentedRadixSort : SelectedPolicy // Dispatch entrypoints //------------------------------------------------------------------------------ - - /// Internal dispatch routine - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - int num_items, ///< [in] Number of items to sort - int num_segments, ///< [in] The number of segments that comprise the sorting data - BeginOffsetIteratorT d_begin_offsets, ///< [in] Random-access input iterator to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - EndOffsetIteratorT d_end_offsets, ///< [in] Random-access input iterator to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison - bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers - cudaStream_t stream) ///< [in] CUDA stream to launch kernels within. Default is stream0. + /** + * @brief Internal dispatch routine + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. When NULL, the required allocation size + * is written to @p temp_storage_bytes and no work is done. 
+ * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of @p d_temp_storage allocation + * + * @param[in,out] d_keys + * Double-buffer whose current buffer contains the unsorted input keys and, upon return, is + * updated to point to the sorted output keys + * + * @param[in,out] d_values + * Double-buffer whose current buffer contains the unsorted input values and, upon return, is + * updated to point to the sorted output values + * + * @param[in] num_items + * Number of items to sort + * + * @param[in] num_segments + * The number of segments that comprise the sorting data + * + * @param[in] d_begin_offsets + * Random-access input iterator to the sequence of beginning offsets of length + * @p num_segments, such that d_begin_offsets[i] is the first element of the + * ith data segment in d_keys_* and d_values_* + * + * @param[in] d_end_offsets + * Random-access input iterator to the sequence of ending offsets of length @p num_segments, + * such that d_end_offsets[i]-1 is the last element of the ith + * data segment in d_keys_* and d_values_*. + * If d_end_offsets[i]-1 <= d_begin_offsets[i], + * the ith is considered empty. + * + * @param[in] begin_bit + * The beginning (least-significant) bit index needed for key comparison + * + * @param[in] end_bit + * The past-the-end (most-significant) bit index needed for key comparison + * + * @param[in] is_overwrite_okay + * Whether is okay to overwrite source buffers + * + * @param[in] stream + * CUDA stream to launch kernels within. Default is stream0. + */ + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + int num_items, + int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream) { typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index fda0d518d58..119d1e33e39 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -28,8 +28,9 @@ ******************************************************************************/ /** - * \file - * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + * @file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector + * multiplication (SpMV). */ #pragma once @@ -69,13 +70,22 @@ CUB_NAMESPACE_BEGIN *****************************************************************************/ /** - * Spmv search kernel. Identifies merge path starting coordinates for each tile. + * @brief Spmv search kernel. Identifies merge path starting coordinates for each tile. 
+ * + * @tparam AgentSpmvPolicyT + * Parameterized SpmvPolicy tuning policy type + * + * @tparam ValueT + * Matrix and vector value type + * + * @tparam OffsetT + * Signed integer type for sequence offsets + * + * @param[in] spmv_params + * SpMV input parameter bundle */ -template ///< Signed integer type for sequence offsets -CUB_DETAIL_KERNEL_ATTRIBUTES void -DeviceSpmv1ColKernel(SpmvParams spmv_params) ///< [in] SpMV input parameter bundle +template +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmv1ColKernel(SpmvParams spmv_params) { typedef CacheModifiedInputIterator VectorValueIteratorT; @@ -100,17 +110,33 @@ DeviceSpmv1ColKernel(SpmvParams spmv_params) ///< [in] SpMV inp } /** - * Spmv search kernel. Identifies merge path starting coordinates for each tile. + * @brief Spmv search kernel. Identifies merge path starting coordinates for each tile. + * + * @tparam SpmvPolicyT + * Parameterized SpmvPolicy tuning policy type + * + * @tparam OffsetT + * Signed integer type for sequence offsets + * + * @tparam CoordinateT + * Merge path coordinate type + * + * @tparam SpmvParamsT + * SpmvParams type + * + * @param[in] num_merge_tiles + * Number of SpMV merge tiles (spmv grid size) + * + * @param[out] d_tile_coordinates + * Pointer to the temporary array of tile starting coordinates + * + * @param[in] spmv_params + * SpMV input parameter bundle */ -template < - typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type - typename OffsetT, ///< Signed integer type for sequence offsets - typename CoordinateT, ///< Merge path coordinate type - typename SpmvParamsT> ///< SpmvParams type -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel( - int num_merge_tiles, ///< [in] Number of SpMV merge tiles (spmv grid size) - CoordinateT* d_tile_coordinates, ///< [out] Pointer to the temporary array of tile starting coordinates - SpmvParamsT spmv_params) ///< [in] SpMV input parameter bundle +template +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel(int num_merge_tiles, + CoordinateT *d_tile_coordinates, + SpmvParamsT spmv_params) { /// Constants enum @@ -148,26 +174,62 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel( } } - /** - * Spmv agent entry point + * @brief Spmv agent entry point + * + * @tparam SpmvPolicyT + * Parameterized SpmvPolicy tuning policy type + * + * @tparam ScanTileStateT + * Tile status interface type + * + * @tparam ValueT + * Matrix and vector value type + * + * @tparam OffsetT + * Signed integer type for sequence offsets + * + * @tparam CoordinateT + * Merge path coordinate type + * + * @tparam HAS_ALPHA + * Whether the input parameter Alpha is 1 + * + * @tparam HAS_BETA + * Whether the input parameter Beta is 0 + * + * @param[in] spmv_params + * SpMV input parameter bundle + * + * @param[in] d_tile_coordinates + * Pointer to the temporary array of tile starting coordinates + * + * @param[out] d_tile_carry_pairs + * Pointer to the temporary array carry-out dot product row-ids, one per block + * + * @param[in] num_tiles + * Number of merge tiles + * + * @param[in] tile_state + * Tile status interface for fixup reduce-by-key kernel + * + * @param[in] num_segment_fixup_tiles + * Number of reduce-by-key tiles (fixup grid size) */ -template < - typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type - typename ScanTileStateT, ///< Tile status interface type - typename ValueT, ///< Matrix and vector value type - typename OffsetT, ///< Signed integer type for sequence offsets - typename CoordinateT, ///< Merge path coordinate type - bool 
HAS_ALPHA, ///< Whether the input parameter Alpha is 1 - bool HAS_BETA> ///< Whether the input parameter Beta is 0 -__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS)) -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvKernel( - SpmvParams spmv_params, ///< [in] SpMV input parameter bundle - CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates - KeyValuePair* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block - int num_tiles, ///< [in] Number of merge tiles - ScanTileStateT tile_state, ///< [in] Tile status interface for fixup reduce-by-key kernel - int num_segment_fixup_tiles) ///< [in] Number of reduce-by-key tiles (fixup grid size) +template +__launch_bounds__(int(SpmvPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES + void DeviceSpmvKernel(SpmvParams spmv_params, + CoordinateT *d_tile_coordinates, + KeyValuePair *d_tile_carry_pairs, + int num_tiles, + ScanTileStateT tile_state, + int num_segment_fixup_tiles) { // Spmv agent type specialization typedef AgentSpmv< @@ -191,9 +253,17 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvKernel( } -template ///< Whether the input parameter Beta is 0 +/** + * @tparam ValueT + * Matrix and vector value type + * + * @tparam OffsetT + * Signed integer type for sequence offsets + * + * @tparam HAS_BETA + * Whether the input parameter Beta is 0 + */ +template CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvEmptyMatrixKernel(SpmvParams spmv_params) { @@ -213,21 +283,49 @@ DeviceSpmvEmptyMatrixKernel(SpmvParams spmv_params) } /** - * Multi-block reduce-by-key sweep kernel entry point + * @brief Multi-block reduce-by-key sweep kernel entry point + * + * @tparam AgentSegmentFixupPolicyT + * Parameterized AgentSegmentFixupPolicy tuning policy type + * + * @tparam PairsInputIteratorT + * Random-access input iterator type for keys + * + * @tparam AggregatesOutputIteratorT + * Random-access output iterator type for values + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @tparam ScanTileStateT + * Tile status interface type + * + * @param[in] d_pairs_in + * Pointer to the array carry-out dot product row-ids, one per spmv block + * + * @param[in,out] d_aggregates_out + * Output value aggregates + * + * @param[in] num_items + * Total number of items to select from + * + * @param[in] num_tiles + * Total number of tiles for the entire problem + * + * @param[in] tile_state + * Tile status interface */ -template < - typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type - typename PairsInputIteratorT, ///< Random-access input iterator type for keys - typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values - typename OffsetT, ///< Signed integer type for global offsets - typename ScanTileStateT> ///< Tile status interface type -__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentFixupKernel( - PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block - AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates - OffsetT num_items, ///< [in] Total number of items to select from - int num_tiles, ///< [in] Total number of tiles for the entire problem - ScanTileStateT tile_state) ///< [in] Tile status interface +template +__launch_bounds__(int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) CUB_DETAIL_KERNEL_ATTRIBUTES + void 
DeviceSegmentFixupKernel(PairsInputIteratorT d_pairs_in, + AggregatesOutputIteratorT d_aggregates_out, + OffsetT num_items, + int num_tiles, + ScanTileStateT tile_state) { // Thread block type for reducing tiles of value segments typedef AgentSegmentFixup< @@ -255,11 +353,15 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentFixupKernel( ******************************************************************************/ /** - * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv + * @brief Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv + * + * @tparam ValueT + * Matrix and vector value type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename ValueT, ///< Matrix and vector value type - typename OffsetT> ///< Signed integer type for global offsets +template struct DispatchSpmv { //--------------------------------------------------------------------- @@ -489,26 +591,74 @@ struct DispatchSpmv * * If the input is larger than a single tile, this method uses two-passes of * kernel invocations. + * + * @tparam Spmv1ColKernelT + * Function type of cub::DeviceSpmv1ColKernel + * + * @tparam SpmvSearchKernelT + * Function type of cub::AgentSpmvSearchKernel + * + * @tparam SpmvKernelT + * Function type of cub::AgentSpmvKernel + * + * @tparam SegmentFixupKernelT + * Function type of cub::DeviceSegmentFixupKernelT + * + * @tparam SpmvEmptyMatrixKernelT + * Function type of cub::DeviceSpmvEmptyMatrixKernel + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to + * @p temp_storage_bytes and no work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of \p d_temp_storage allocation + * + * @paramSpMV spmv_params + * input parameter bundle + * + * @param[in] stream + * CUDA stream to launch kernels within. Default is stream0. + * + * @param[in] spmv_1col_kernel + * Kernel function pointer to parameterization of DeviceSpmv1ColKernel + * + * @param[in] spmv_search_kernel + * Kernel function pointer to parameterization of AgentSpmvSearchKernel + * + * @param[in] spmv_kernel + * Kernel function pointer to parameterization of AgentSpmvKernel + * + * @param[in] segment_fixup_kernel + * Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel + * + * @param[in] spmv_empty_matrix_kernel + * Kernel function pointer to parameterization of cub::DeviceSpmvEmptyMatrixKernel + * + * @param[in] spmv_config + * Dispatch parameters that match the policy that @p spmv_kernel was compiled for + * + * @param[in] segment_fixup_config + * Dispatch parameters that match the policy that @p segment_fixup_kernel was compiled for */ - template < - typename Spmv1ColKernelT, ///< Function type of cub::DeviceSpmv1ColKernel - typename SpmvSearchKernelT, ///< Function type of cub::AgentSpmvSearchKernel - typename SpmvKernelT, ///< Function type of cub::AgentSpmvKernel - typename SegmentFixupKernelT, ///< Function type of cub::DeviceSegmentFixupKernelT - typename SpmvEmptyMatrixKernelT> ///< Function type of cub::DeviceSpmvEmptyMatrixKernel - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SpmvParamsT& spmv_params, ///< SpMV input parameter bundle - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - Spmv1ColKernelT spmv_1col_kernel, ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel - SpmvSearchKernelT spmv_search_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel - SpmvKernelT spmv_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel - SegmentFixupKernelT segment_fixup_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel - SpmvEmptyMatrixKernelT spmv_empty_matrix_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSpmvEmptyMatrixKernel - KernelConfig spmv_config, ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for - KernelConfig segment_fixup_config) ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for + template + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + SpmvParamsT &spmv_params, + cudaStream_t stream, + Spmv1ColKernelT spmv_1col_kernel, + SpmvSearchKernelT spmv_search_kernel, + SpmvKernelT spmv_kernel, + SegmentFixupKernelT segment_fixup_kernel, + SpmvEmptyMatrixKernelT spmv_empty_matrix_kernel, + KernelConfig spmv_config, + KernelConfig segment_fixup_config) { cudaError error = cudaSuccess; do @@ -821,14 +971,26 @@ struct DispatchSpmv } /** - * Internal dispatch routine for computing a device-wide reduction + * @brief Internal dispatch routine for computing a device-wide reduction + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to + * @p temp_storage_bytes and no work is done. + * + * @param[in,out] temp_storage_bytes + * Reference to size in bytes of @p d_temp_storage allocation + * + * @param SpMV spmv_params + * input parameter bundle + * + * @param[in] stream + * [optional] CUDA stream to launch kernels within. Default is stream0. */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SpmvParamsT& spmv_params, ///< SpMV input parameter bundle - cudaStream_t stream = 0) ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
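The SpmvParams bundle threaded through the dispatch routine above carries the CSR arrays (values, row offsets, column indices), the dense x and y vectors, and the matrix extents; these are the same quantities a caller hands to the public cub::DeviceSpmv::CsrMV entry point. A minimal sketch under that assumption (pointer names are illustrative and all arrays are assumed to already reside on the device):

#include <cub/cub.cuh>

// Sketch: y = A * x for a CSR matrix with num_rows rows, num_cols columns,
// and num_nonzeros stored values.
void SpmvExample(float *d_values,          // num_nonzeros CSR values
                 int   *d_row_offsets,     // num_rows + 1 row offsets
                 int   *d_column_indices,  // num_nonzeros column indices
                 float *d_vector_x,        // dense input vector  (num_cols)
                 float *d_vector_y,        // dense output vector (num_rows)
                 int num_rows, int num_cols, int num_nonzeros)
{
  void  *d_temp_storage     = nullptr;
  size_t temp_storage_bytes = 0;

  // Size query, then the actual merge-based SpMV.
  cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
                         d_values, d_row_offsets, d_column_indices,
                         d_vector_x, d_vector_y,
                         num_rows, num_cols, num_nonzeros);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
                         d_values, d_row_offsets, d_column_indices,
                         d_vector_x, d_vector_y,
                         num_rows, num_cols, num_nonzeros);
  cudaFree(d_temp_storage);
}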
+ CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + SpmvParamsT &spmv_params, + cudaStream_t stream = 0) { cudaError error = cudaSuccess; do @@ -875,5 +1037,3 @@ struct DispatchSpmv CUB_NAMESPACE_END - - diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index 36ab55be61c..c006e359044 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::DeviceSelect::UniqueByKey provides device-wide, parallel operations for selecting unique items by key from sequences of data items residing within device-accessible memory. + * @file + * cub::DeviceSelect::UniqueByKey provides device-wide, parallel operations for selecting unique + * items by key from sequences of data items residing within device-accessible memory. */ #include @@ -47,29 +48,81 @@ CUB_NAMESPACE_BEGIN *****************************************************************************/ /** - * Unique by key kernel entry point (multi-block) + * @brief Unique by key kernel entry point (multi-block) + * + * @tparam KeyInputIteratorT + * Random-access input iterator type for keys + * + * @tparam ValueInputIteratorT + * Random-access input iterator type for values + * + * @tparam KeyOutputIteratorT + * Random-access output iterator type for keys + * + * @tparam ValueOutputIteratorT + * Random-access output iterator type for values + * + * @tparam NumSelectedIteratorT + * Output iterator type for recording the number of items selected + * + * @tparam ScanTileStateT + * Tile status interface type + * + * @tparam EqualityOpT + * Equality operator type + * + * @tparam OffsetT + * Signed integer type for global offsets + * + * @param[in] d_keys_in + * Pointer to the input sequence of keys + * + * @param[in] d_values_in + * Pointer to the input sequence of values + * + * @param[out] d_keys_out + * Pointer to the output sequence of selected data items + * + * @param[out] d_values_out + * Pointer to the output sequence of selected data items + * + * @param[out] d_num_selected_out + * Pointer to the total number of items selected + * (i.e., length of @p d_keys_out or @p d_values_out) + * + * @param[in] tile_state + * Tile status interface + * + * @param[in] equality_op + * Equality operator + * + * @param[in] num_items + * Total number of input items + * (i.e., length of @p d_keys_in or @p d_values_in) + * + * @param[in] num_tiles + * Total number of tiles for the entire problem */ -template < - typename ChainedPolicyT, - typename KeyInputIteratorT, ///< Random-access input iterator type for keys - typename ValueInputIteratorT, ///< Random-access input iterator type for values - typename KeyOutputIteratorT, ///< Random-access output iterator type for keys - typename ValueOutputIteratorT, ///< Random-access output iterator type for values - typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected - typename ScanTileStateT, ///< Tile status interface type - typename EqualityOpT, ///< Equality operator type - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT::BLOCK_THREADS)) -CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceUniqueByKeySweepKernel( - KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys - 
ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values - KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items - ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) - ScanTileStateT tile_state, ///< [in] Tile status interface - EqualityOpT equality_op, ///< [in] Equality operator - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) - int num_tiles) ///< [in] Total number of tiles for the entire problem +template +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT::BLOCK_THREADS)) + CUB_DETAIL_KERNEL_ATTRIBUTES + void DeviceUniqueByKeySweepKernel(KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + ScanTileStateT tile_state, + EqualityOpT equality_op, + OffsetT num_items, + int num_tiles) { using AgentUniqueByKeyPolicyT = typename ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT; @@ -98,17 +151,37 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceUniqueByKeySweepKernel( ******************************************************************************/ /** - * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect + * @brief Utility class for dispatching the appropriately-tuned kernels for DeviceSelect + * + * @tparam KeyInputIteratorT + * Random-access input iterator type for keys + * + * @tparam ValueInputIteratorT + * Random-access input iterator type for values + * + * @tparam KeyOutputIteratorT + * Random-access output iterator type for keys + * + * @tparam ValueOutputIteratorT + * Random-access output iterator type for values + * + * @tparam NumSelectedIteratorT + * Output iterator type for recording the number of items selected + * + * @tparam EqualityOpT + * Equality operator type + * + * @tparam OffsetT + * Signed integer type for global offsets */ -template < - typename KeyInputIteratorT, ///< Random-access input iterator type for keys - typename ValueInputIteratorT, ///< Random-access input iterator type for values - typename KeyOutputIteratorT, ///< Random-access output iterator type for keys - typename ValueOutputIteratorT, ///< Random-access output iterator type for values - typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected - typename EqualityOpT, ///< Equality operator type - typename OffsetT, ///< Signed integer type for global offsets - typename SelectedPolicy = DeviceUniqueByKeyPolicy> +template > struct DispatchUniqueByKey : SelectedPolicy { /****************************************************************************** @@ -127,41 +200,93 @@ struct DispatchUniqueByKey : SelectedPolicy // Tile status descriptor interface type using ScanTileStateT = ScanTileState; + /// Device-accessible allocation of temporary storage. When NULL, the required allocation size + /// is written to @p temp_storage_bytes and no work is done. + void *d_temp_storage; - void* d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t& temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeyInputIteratorT d_keys_in; ///< [in] Pointer to the input sequence of keys - ValueInputIteratorT d_values_in; ///< [in] Pointer to the input sequence of values - KeyOutputIteratorT d_keys_out; ///< [out] Pointer to the output sequence of selected data items - ValueOutputIteratorT d_values_out; ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out; ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) - EqualityOpT equality_op; ///< [in] Equality operator - OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) - cudaStream_t stream; ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + /// Reference to size in bytes of @p d_temp_storage allocation + size_t &temp_storage_bytes; - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchUniqueByKey( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys - ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values - KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items - ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) - EqualityOpT equality_op, ///< [in] Equality operator - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) - cudaStream_t stream ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - ): - d_temp_storage(d_temp_storage), - temp_storage_bytes(temp_storage_bytes), - d_keys_in(d_keys_in), - d_values_in(d_values_in), - d_keys_out(d_keys_out), - d_values_out(d_values_out), - d_num_selected_out(d_num_selected_out), - equality_op(equality_op), - num_items(num_items), - stream(stream) + /// Pointer to the input sequence of keys + KeyInputIteratorT d_keys_in; + + /// Pointer to the input sequence of values + ValueInputIteratorT d_values_in; + + /// Pointer to the output sequence of selected data items + KeyOutputIteratorT d_keys_out; + + /// Pointer to the output sequence of selected data items + ValueOutputIteratorT d_values_out; + + /// Pointer to the total number of items selected + /// (i.e., length of @p d_keys_out or @p d_values_out) + NumSelectedIteratorT d_num_selected_out; + + /// Equality operator + EqualityOpT equality_op; + + /// Total number of input items (i.e., length of @p d_keys_in or @p d_values_in) + OffsetT num_items; + + /// [optional] CUDA stream to launch kernels within. Default is stream0. + cudaStream_t stream; + + /** + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to + * @p temp_storage_bytes and no work is done. 
+ * + * @tparam temp_storage_bytes + * [in,out] Reference to size in bytes of @p d_temp_storage allocation + * + * @param[in] d_keys_in + * Pointer to the input sequence of keys + * + * @param[in] d_values_in + * Pointer to the input sequence of values + * + * @param[out] d_keys_out + * Pointer to the output sequence of selected data items + * + * @param[out] d_values_out + * Pointer to the output sequence of selected data items + * + * @param[out] d_num_selected_out + * Pointer to the total number of items selected + * (i.e., length of @p d_keys_out or @p d_values_out) + * + * @param[in] equality_op + * Equality operator + * + * @param[in] num_items + * Total number of input items (i.e., length of @p d_keys_in or @p d_values_in) + * + * @param[in] stream + * [optional] CUDA stream to launch kernels within. + * Default is stream0. + */ + CUB_RUNTIME_FUNCTION __forceinline__ DispatchUniqueByKey(void *d_temp_storage, + size_t &temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + EqualityOpT equality_op, + OffsetT num_items, + cudaStream_t stream) + : d_temp_storage(d_temp_storage) + , temp_storage_bytes(temp_storage_bytes) + , d_keys_in(d_keys_in) + , d_values_in(d_values_in) + , d_keys_out(d_keys_out) + , d_values_out(d_values_out) + , d_num_selected_out(d_num_selected_out) + , equality_op(equality_op) + , num_items(num_items) + , stream(stream) {} CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED @@ -397,22 +522,54 @@ struct DispatchUniqueByKey : SelectedPolicy ); } - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys - ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values - KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items - ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) - EqualityOpT equality_op, ///< [in] Equality operator - OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream) ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + * @brief Internal dispatch routine + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to + * @p temp_storage_bytes and no work is done. 
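For context, the dispatch machinery above backs the public cub::DeviceSelect::UniqueByKey entry point, which keeps the first key of every run of consecutive equal keys together with its corresponding value and reports how many items were kept. A hedged host-side sketch with illustrative data:

#include <cub/cub.cuh>

// Sketch: keys   [0 2 2 9 5 5 5 8] -> d_keys_out   [0 2 9 5 8]
//         values [1 2 3 4 5 6 7 8] -> d_values_out [1 2 4 5 8], num_selected = 5
void UniqueByKeyExample(int *d_keys_in, int *d_values_in,
                        int *d_keys_out, int *d_values_out,
                        int *d_num_selected_out, int num_items /* 8 */)
{
  void  *d_temp_storage     = nullptr;
  size_t temp_storage_bytes = 0;

  // Size query
  cub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes,
                                 d_keys_in, d_values_in,
                                 d_keys_out, d_values_out,
                                 d_num_selected_out, num_items);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);

  // Selection pass
  cub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes,
                                 d_keys_in, d_values_in,
                                 d_keys_out, d_values_out,
                                 d_num_selected_out, num_items);
  cudaFree(d_temp_storage);
}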
+ * + * @param[in,out] &temp_storage_bytes + * Reference to size in bytes of @p d_temp_storage allocation + * + * @param[in] d_keys_in + * Pointer to the input sequence of keys + * + * @param[in] d_values_in + * Pointer to the input sequence of values + * + * @param[out] d_keys_out + * Pointer to the output sequence of selected data items + * + * @param[out] d_values_out + * Pointer to the output sequence of selected data items + * + * @param[out] d_num_selected_out + * Pointer to the total number of items selected + * (i.e., length of @p d_keys_out or @p d_values_out) + * + * @param[in] equality_op + * Equality operator + * + * @param[in] num_items + * Total number of input items (i.e., the length of @p d_in) + * + * @param[in] stream + * [optional] CUDA stream to launch kernels within. + * Default is stream0. + */ + CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t + Dispatch(void *d_temp_storage, + size_t &temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + EqualityOpT equality_op, + OffsetT num_items, + cudaStream_t stream) { using MaxPolicyT = typename DispatchUniqueByKey::MaxPolicy; diff --git a/cub/cub/grid/grid_even_share.cuh b/cub/cub/grid/grid_even_share.cuh index f86b885a5c8..bf990ecdddd 100644 --- a/cub/cub/grid/grid_even_share.cuh +++ b/cub/cub/grid/grid_even_share.cuh @@ -27,11 +27,12 @@ ******************************************************************************/ /** - * \file - * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly the same number of fixed-size work units (grains). + * @file + * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an + * "even-share" fashion. Each thread block gets roughly the same number of fixed-size work units + * (grains). */ - #pragma once #include "../config.cuh" @@ -50,17 +51,17 @@ CUB_NAMESPACE_BEGIN /** - * \addtogroup GridModule + * @addtogroup GridModule * @{ */ /** - * \brief GridEvenShare is a descriptor utility for distributing input among + * @brief GridEvenShare is a descriptor utility for distributing input among * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly * the same number of input tiles. * - * \par Overview + * @par Overview * Each thread block is assigned a consecutive sequence of input tiles. To help * preserve alignment and eliminate the overhead of guarded loads for all but the * last thread block, to GridEvenShare assigns one of three different amounts of @@ -69,7 +70,7 @@ CUB_NAMESPACE_BEGIN * last thread block may be partially-full if the input is not an even multiple of * the scheduling grain size. * - * \par + * @par * Before invoking a child grid, a parent thread will typically construct an * instance of GridEvenShare. The instance can be passed to child thread blocks * which can initialize their per-thread block offsets using \p BlockInit(). @@ -119,14 +120,22 @@ public: block_stride(0) {} - /** - * \brief Dispatch initializer. To be called prior prior to kernel launch. + * @brief Dispatch initializer. To be called prior prior to kernel launch. 
+ * + * @param num_items_ + * Total number of input items + * + * @param max_grid_size + * Maximum grid size allowable (actual grid size may be less if not warranted by the the + * number of input items) + * + * @param tile_items + * Number of data items per input tile */ - __host__ __device__ __forceinline__ void DispatchInit( - OffsetT num_items_, ///< Total number of input items - int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) - int tile_items) ///< Number of data items per input tile + __host__ __device__ __forceinline__ void DispatchInit(OffsetT num_items_, + int max_grid_size, + int tile_items) { this->block_offset = num_items_; // Initialize past-the-end this->block_end = num_items_; // Initialize past-the-end @@ -141,16 +150,14 @@ public: this->big_share_items = normal_share_items + tile_items; } - /** - * \brief Initializes ranges for the specified thread block index. Specialized - * for a "raking" access pattern in which each thread block is assigned a - * consecutive sequence of input tiles. + * @brief Initializes ranges for the specified thread block index. Specialized + * for a "raking" access pattern in which each thread block is assigned a + * consecutive sequence of input tiles. */ template - __device__ __forceinline__ void BlockInit( - int block_id, - Int2Type /*strategy_tag*/) + __device__ __forceinline__ void BlockInit(int block_id, + Int2Type /*strategy_tag*/) { block_stride = TILE_ITEMS; if (block_id < big_shares) @@ -169,46 +176,44 @@ public: // Else default past-the-end } - /** - * \brief Block-initialization, specialized for a "raking" access - * pattern in which each thread block is assigned a consecutive sequence - * of input tiles. + * @brief Block-initialization, specialized for a "raking" access + * pattern in which each thread block is assigned a consecutive sequence + * of input tiles. */ template - __device__ __forceinline__ void BlockInit( - int block_id, - Int2Type /*strategy_tag*/) + __device__ __forceinline__ void BlockInit(int block_id, + Int2Type /*strategy_tag*/) { block_stride = grid_size * TILE_ITEMS; block_offset = (block_id * TILE_ITEMS); block_end = num_items; } - /** - * \brief Block-initialization, specialized for "strip mining" access - * pattern in which the input tiles assigned to each thread block are - * separated by a stride equal to the the extent of the grid. + * @brief Block-initialization, specialized for "strip mining" access + * pattern in which the input tiles assigned to each thread block are + * separated by a stride equal to the the extent of the grid. */ - template < - int TILE_ITEMS, - GridMappingStrategy STRATEGY> + template __device__ __forceinline__ void BlockInit() { BlockInit(blockIdx.x, Int2Type()); } - /** - * \brief Block-initialization, specialized for a "raking" access - * pattern in which each thread block is assigned a consecutive sequence - * of input tiles. + * @brief Block-initialization, specialized for a "raking" access + * pattern in which each thread block is assigned a consecutive sequence + * of input tiles. 
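Putting DispatchInit and BlockInit together: the host describes the work and launches even_share.grid_size thread blocks, and each block initializes its own [block_offset, block_end) range before stepping through tiles by block_stride. A device-side sketch under those assumptions (the kernel, tile size, and per-item work are illustrative, not part of the library):

#include <cub/cub.cuh>

constexpr int TILE_ITEMS = 128;

// Illustrative kernel: one thread per item within each TILE_ITEMS-wide tile.
__global__ void EvenShareKernel(const int *d_in,
                                cub::GridEvenShare<int> even_share)
{
  // Strip-mined mapping: a block's consecutive tiles are grid_size tiles apart.
  even_share.BlockInit<TILE_ITEMS, cub::GRID_MAPPING_STRIP_MINE>();

  for (int tile_offset = even_share.block_offset;
       tile_offset < even_share.block_end;
       tile_offset += even_share.block_stride)
  {
    int idx = tile_offset + threadIdx.x;
    if (idx < even_share.num_items) // guard the ragged last tile
    {
      int item = d_in[idx];
      (void)item; // ... process item ...
    }
  }
}

// Host side (sketch):
//   cub::GridEvenShare<int> even_share;
//   even_share.DispatchInit(num_items, max_grid_size, TILE_ITEMS);
//   EvenShareKernel<<<even_share.grid_size, TILE_ITEMS>>>(d_in, even_share);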
+ * + * @param[in] block_offset + * Threadblock begin offset (inclusive) + * + * @param[in] block_end + * Threadblock end offset (exclusive) */ template - __device__ __forceinline__ void BlockInit( - OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) - OffsetT block_end) ///< [in] Threadblock end offset (exclusive) + __device__ __forceinline__ void BlockInit(OffsetT block_offset, OffsetT block_end) { this->block_offset = block_offset; this->block_end = block_end; @@ -220,8 +225,6 @@ public: - - /** @} */ // end group GridModule CUB_NAMESPACE_END diff --git a/cub/cub/grid/grid_queue.cuh b/cub/cub/grid/grid_queue.cuh index 7a8f2887cce..be6d02ed2c7 100644 --- a/cub/cub/grid/grid_queue.cuh +++ b/cub/cub/grid/grid_queue.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * cub::GridQueue is a descriptor utility for dynamic queue management. */ @@ -49,26 +49,26 @@ CUB_NAMESPACE_BEGIN /** - * \addtogroup GridModule + * @addtogroup GridModule * @{ */ /** - * \brief GridQueue is a descriptor utility for dynamic queue management. + * @brief GridQueue is a descriptor utility for dynamic queue management. * - * \par Overview + * @par Overview * GridQueue descriptors provides abstractions for "filling" or * "draining" globally-shared vectors. * - * \par + * @par * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, * returning a unique offset for the calling thread to write its items. * The GridQueue maintains the total "fill-size". The fill counter must be reset * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that * will be filling. * - * \par + * @par * Similarly, a "draining" GridQueue works by works by atomically-incrementing a * zero-initialized counter, returning a unique offset for the calling thread to * read its items. Threads can safely drain until the array's logical fill-size is @@ -77,11 +77,11 @@ CUB_NAMESPACE_BEGIN * will be filling. (For dynamic work distribution of existing data, the corresponding fill-size * is simply the number of elements in the array.) * - * \par + * @par * Iterative work management can be implemented simply with a pair of flip-flopping * work buffers, each with an associated set of fill and drain GridQueue descriptors. * - * \tparam OffsetT Signed integer type for global offsets + * @tparam OffsetT Signed integer type for global offsets */ template class GridQueue @@ -114,16 +114,20 @@ public: d_counters(NULL) {} - - /// Constructs a GridQueue descriptor around the device storage allocation - __host__ __device__ __forceinline__ GridQueue( - void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). - : - d_counters((OffsetT*) d_storage) + /** + * @brief Constructs a GridQueue descriptor around the device storage allocation + * + * @param d_storage + * Device allocation to back the GridQueue. Must be at least as big as + * AllocationSize(). + */ + __host__ __device__ __forceinline__ GridQueue(void *d_storage) + : d_counters((OffsetT *)d_storage) {} - - /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. + /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for + /// draining in the next kernel instance. 
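The fill/drain protocol described above can be sketched end to end: the host sizes the backing allocation, sets the fill-size with FillAndResetDrain prior to the draining kernel, and each thread of that kernel atomically claims offsets with Drain until the logical fill-size is exhausted. The kernel and launch parameters below are an illustrative sketch, not library code:

#include <cub/cub.cuh>

// Illustrative draining kernel: threads repeatedly claim one work item each
// until the queue's logical fill-size is exhausted.
__global__ void DrainKernel(cub::GridQueue<int> queue,
                            const int *d_work,
                            int fill_size)
{
  while (true)
  {
    int offset = queue.Drain(1); // unique offset per claim
    if (offset >= fill_size)
    {
      break; // queue logically empty
    }
    int item = d_work[offset];
    (void)item; // ... process item ...
  }
}

// Host side (sketch):
//   void *d_queue_storage = nullptr;
//   cudaMalloc(&d_queue_storage, cub::GridQueue<int>::AllocationSize());
//   cub::GridQueue<int> queue(d_queue_storage);
//   queue.FillAndResetDrain(fill_size);  // before launching the draining kernel
//   DrainKernel<<<grid_size, block_threads>>>(queue, d_work, fill_size);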
To be called by the host or by a kernel prior to that + /// which will be draining. __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain( OffsetT fill_size, cudaStream_t stream = 0) @@ -146,8 +150,8 @@ public: return result; } - - /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining. + /// This operation resets the drain so that it may advance to meet the existing fill-size. + /// To be called by the host or by a kernel prior to that which will be draining. __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; @@ -165,7 +169,8 @@ public: } - /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. + /// This operation resets the fill counter. + /// To be called by the host or by a kernel prior to that which will be filling. __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0) { cudaError_t result = cudaErrorUnknown; @@ -203,14 +208,16 @@ public: } - /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from CUDA kernel. + /// Drain @p num_items from the queue. Returns offset from which to read items. + /// To be called from CUDA kernel. __device__ __forceinline__ OffsetT Drain(OffsetT num_items) { return atomicAdd(d_counters + DRAIN, num_items); } - /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from CUDA kernel. + /// Fill @p num_items into the queue. Returns offset from which to write items. + /// To be called from CUDA kernel. __device__ __forceinline__ OffsetT Fill(OffsetT num_items) { return atomicAdd(d_counters + FILL, num_items); diff --git a/cub/cub/iterator/arg_index_input_iterator.cuh b/cub/cub/iterator/arg_index_input_iterator.cuh index d895a53e72f..99ea0da98d4 100644 --- a/cub/cub/iterator/arg_index_input_iterator.cuh +++ b/cub/cub/iterator/arg_index_input_iterator.cuh @@ -58,29 +58,29 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ - /** - * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples). + * @brief A random-access input wrapper for pairing dereferenced values with their corresponding + * indices (forming \p KeyValuePair tuples). * - * \par Overview - * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT. - * Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose - * \p key field is \p i and whose \p value field is itr[i]. + * @par Overview + * - ArgIndexInputIteratorTwraps a random access input iterator @p itr of type @p InputIteratorT. + * Dereferencing an ArgIndexInputIteratorTat offset @p i produces a @p KeyValuePair value whose + * @p key field is @p i and whose @p value field is itr[i]. * - Can be used with any data type. * - Can be constructed, manipulated, and exchanged within and between host and device * functions. Wrapped host memory can only be dereferenced on the host, and wrapped * device memory can only be dereferenced on the device. * - Compatible with Thrust API v1.7 or newer. 
* - * \par Snippet - * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto + * @par Snippet + * The code snippet below illustrates the use of @p ArgIndexInputIteratorTto * dereference an array of doubles - * \par - * \code + * @par + * @code * #include // or equivalently * * // Declare, allocate, and initialize a device array @@ -102,11 +102,16 @@ CUB_NAMESPACE_BEGIN * item_offset_pair.value, * item_offset_pair.key); // 9.0 @ 6 * - * \endcode + * @endcode + * + * @tparam InputIteratorT + * The value type of the wrapped input iterator + * + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) * - * \tparam InputIteratorT The value type of the wrapped input iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - * \tparam OutputValueT The paired value type of the tuple (Default: value type of input iterator) + * @tparam OutputValueT + * The paired value type of the tuple (Default: value type of input iterator) */ template < typename InputIteratorT, @@ -117,22 +122,35 @@ class ArgIndexInputIterator public: // Required iterator traits - typedef ArgIndexInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef KeyValuePair value_type; ///< The type of the element the iterator can point to - typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to - typedef value_type reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef ArgIndexInputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef KeyValuePair value_type; + + /// The type of a pointer to an element the iterator can point to + typedef value_type *pointer; + + /// The type of a reference to an element the iterator can point to + typedef value_type reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: @@ -142,21 +160,25 @@ private: public: - /// Constructor - __host__ __device__ __forceinline__ ArgIndexInputIterator( - InputIteratorT itr, ///< Input iterator to wrap - difference_type offset = 0) ///< OffsetT (in items) from \p itr denoting the position of the iterator - : - itr(itr), - offset(offset) - {} - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - offset++; - return retval; + /** + * @param itr + * Input iterator to wrap + * + * @param offset + * OffsetT (in items) from @p itr denoting the position of the iterator + */ + __host__ __device__ __forceinline__ ArgIndexInputIterator(InputIteratorT itr, + difference_type offset = 0) + : itr(itr) + , offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + 
{ + self_type retval = *this; + offset++; + return retval; } /// Prefix increment diff --git a/cub/cub/iterator/cache_modified_input_iterator.cuh b/cub/cub/iterator/cache_modified_input_iterator.cuh index b42e5b3cb49..fab19a66b34 100644 --- a/cub/cub/iterator/cache_modified_input_iterator.cuh +++ b/cub/cub/iterator/cache_modified_input_iterator.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Random-access iterator types */ @@ -59,30 +59,30 @@ CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ - /** - * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. + * @brief A random-access input wrapper for dereferencing array values using a PTX cache load + * modifier. * - * \par Overview + * @par Overview * - CacheModifiedInputIterator is a random-access input iterator that wraps a native - * device pointer of type ValueType*. \p ValueType references are - * made by reading \p ValueType values through loads modified by \p MODIFIER. + * device pointer of type ValueType*. @p ValueType references are + * made by reading @p ValueType values through loads modified by @p MODIFIER. * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). * - Can be constructed, manipulated, and exchanged within and between host and device * functions, but can only be dereferenced within device functions. * - Compatible with Thrust API v1.7 or newer. * - * \par Snippet - * The code snippet below illustrates the use of \p CacheModifiedInputIterator to + * @par Snippet + * The code snippet below illustrates the use of @p CacheModifiedInputIterator to * dereference a device array of double using the "ldg" PTX load modifier * (i.e., load values through texture cache). 
- * \par - * \code + * @par + * @code * #include // or equivalently * * // Declare, allocate, and initialize a device array @@ -96,11 +96,16 @@ CUB_NAMESPACE_BEGIN * printf("%f\n", itr[1]); // 6.0 * printf("%f\n", itr[6]); // 9.0 * - * \endcode + * @endcode + * + * @tparam CacheLoadModifier + * The cub::CacheLoadModifier to use when accessing data + * + * @tparam ValueType + * The value type of this iterator * - * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data - * \tparam ValueType The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) */ template < CacheLoadModifier MODIFIER, @@ -111,22 +116,35 @@ class CacheModifiedInputIterator public: // Required iterator traits - typedef CacheModifiedInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to - typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef CacheModifiedInputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef ValueType value_type; + + /// The type of a pointer to an element the iterator can point to + typedef ValueType *pointer; + + /// The type of a reference to an element the iterator can point to + typedef ValueType reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::device_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION diff --git a/cub/cub/iterator/cache_modified_output_iterator.cuh b/cub/cub/iterator/cache_modified_output_iterator.cuh index 92024af04d8..0e0c1fc0c01 100644 --- a/cub/cub/iterator/cache_modified_output_iterator.cuh +++ b/cub/cub/iterator/cache_modified_output_iterator.cuh @@ -58,30 +58,30 @@ CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ /** - * \brief A random-access output wrapper for storing array values using a PTX cache-modifier. + * @brief A random-access output wrapper for storing array values using a PTX cache-modifier. * - * \par Overview + * @par Overview * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native - * device pointer of type ValueType*. \p ValueType references are - * made by writing \p ValueType values through stores modified by \p MODIFIER. + * device pointer of type ValueType*. @p ValueType references are + * made by writing @p ValueType values through stores modified by @p MODIFIER. 
* - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB", * "STORE_CG", "STORE_CS", "STORE_WT", etc.). * - Can be constructed, manipulated, and exchanged within and between host and device * functions, but can only be dereferenced within device functions. * - Compatible with Thrust API v1.7 or newer. * - * \par Snippet - * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to + * @par Snippet + * The code snippet below illustrates the use of @p CacheModifiedOutputIterator to * dereference a device array of doubles using the "wt" PTX load modifier * (i.e., write-through to system memory). - * \par - * \code + * @par + * @code * #include // or equivalently * * // Declare, allocate, and initialize a device array @@ -95,14 +95,19 @@ CUB_NAMESPACE_BEGIN * itr[1] = 66.0; * itr[55] = 24.0; * - * \endcode + * @endcode * - * \par Usage Considerations + * @par Usage Considerations * - Can only be dereferenced within device code * - * \tparam CacheStoreModifier The cub::CacheStoreModifier to use when accessing data - * \tparam ValueType The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * @tparam CacheStoreModifier + * The cub::CacheStoreModifier to use when accessing data + * + * @tparam ValueType + * The value type of this iterator + * + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) */ template < CacheStoreModifier MODIFIER, @@ -131,22 +136,35 @@ private: public: // Required iterator traits - typedef CacheModifiedOutputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef void value_type; ///< The type of the element the iterator can point to - typedef void pointer; ///< The type of a pointer to an element the iterator can point to - typedef Reference reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef CacheModifiedOutputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef void value_type; + + /// The type of a pointer to an element the iterator can point to + typedef void pointer; + + /// The type of a reference to an element the iterator can point to + typedef Reference reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::device_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: @@ -154,13 +172,13 @@ private: ValueType* ptr; public: - - /// Constructor + /** + * @param ptr + * Native pointer to wrap + */ template - __host__ __device__ __forceinline__ CacheModifiedOutputIterator( - QualifiedValueType* ptr) ///< Native pointer to wrap - : - ptr(const_cast::type *>(ptr)) + __host__ __device__ __forceinline__ CacheModifiedOutputIterator(QualifiedValueType *ptr) + : ptr(const_cast::type *>(ptr)) {} 
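As an editorial aside (not part of the patch itself), the two cache-modified wrappers are easiest to see side by side. The sketch below assumes hypothetical device buffers d_in/d_out and a hypothetical kernel name; it only illustrates that dereferencing these wrappers is valid in device code:

#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/iterator/cache_modified_output_iterator.cuh>

// Copy kernel sketch: loads go through the read-only (LDG) path, stores are written through.
__global__ void CopyThroughCache(const double *d_in, double *d_out, int num_items)
{
    cub::CacheModifiedInputIterator<cub::LOAD_LDG, double>  in(d_in);
    cub::CacheModifiedOutputIterator<cub::STORE_WT, double> out(d_out);

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < num_items)
    {
        out[i] = in[i]; // operator[] on these wrappers may only be used in device code
    }
}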
/// Postfix increment diff --git a/cub/cub/iterator/constant_input_iterator.cuh b/cub/cub/iterator/constant_input_iterator.cuh index 38816f5cbe1..86c1ce12753 100644 --- a/cub/cub/iterator/constant_input_iterator.cuh +++ b/cub/cub/iterator/constant_input_iterator.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Random-access iterator types */ @@ -58,27 +58,27 @@ CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ /** - * \brief A random-access input generator for dereferencing a sequence of homogeneous values + * @brief A random-access input generator for dereferencing a sequence of homogeneous values * - * \par Overview + * @par Overview * - Read references to a ConstantInputIteratorTiterator always return the supplied constant - * of type \p ValueType. + * of type @p ValueType. * - Can be used with any data type. * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device * functions. * - Compatible with Thrust API v1.7 or newer. * - * \par Snippet - * The code snippet below illustrates the use of \p ConstantInputIteratorTto + * @par Snippet + * The code snippet below illustrates the use of @p ConstantInputIteratorTto * dereference a sequence of homogeneous doubles. - * \par - * \code + * @par + * @code * #include // or equivalently * * cub::ConstantInputIterator itr(5.0); @@ -88,10 +88,13 @@ CUB_NAMESPACE_BEGIN * printf("%f\n", itr[2]); // 5.0 * printf("%f\n", itr[50]); // 5.0 * - * \endcode + * @endcode * - * \tparam ValueType The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * @tparam ValueType + * The value type of this iterator + * + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) */ template < typename ValueType, @@ -101,22 +104,35 @@ class ConstantInputIterator public: // Required iterator traits - typedef ConstantInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to - typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef ConstantInputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef ValueType value_type; + + /// The type of a pointer to an element the iterator can point to + typedef ValueType *pointer; + + /// The type of a reference to an element the iterator can point to + typedef ValueType reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: 
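A brief host-side sketch (an editorial addition; num_items, d_sum, and the allocation calls are assumptions, not part of the patch) showing how a ConstantInputIterator can stand in for a real input range in a device-wide algorithm, here summing num_items copies of 5.0 with cub::DeviceReduce::Sum:

#include <cub/cub.cuh>

cub::ConstantInputIterator<double> itr(5.0);

int     num_items          = 1000;
double *d_sum              = nullptr;   // device allocation holding the single result
void   *d_temp_storage     = nullptr;
size_t  temp_storage_bytes = 0;
cudaMalloc(&d_sum, sizeof(double));

// Usual two-phase CUB pattern: the first call sizes temporary storage,
// the second performs the reduction. Result: 5.0 * 1000 = 5000.0
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, itr, d_sum, num_items);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, itr, d_sum, num_items);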
@@ -124,26 +140,29 @@ private: ValueType val; OffsetT offset; #ifdef _WIN32 - OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) + // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) + OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; #endif public: - - /// Constructor - __host__ __device__ __forceinline__ ConstantInputIterator( - ValueType val, ///< Starting value for the iterator instance to report - OffsetT offset = 0) ///< Base offset - : - val(val), - offset(offset) + /** + * @param val + * Starting value for the iterator instance to report + * + * @param offset + * Base offset + */ + __host__ __device__ __forceinline__ ConstantInputIterator(ValueType val, OffsetT offset = 0) + : val(val) + , offset(offset) {} /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { - self_type retval = *this; - offset++; - return retval; + self_type retval = *this; + offset++; + return retval; } /// Prefix increment diff --git a/cub/cub/iterator/counting_input_iterator.cuh b/cub/cub/iterator/counting_input_iterator.cuh index dc0108ac6f8..25284df1cdc 100644 --- a/cub/cub/iterator/counting_input_iterator.cuh +++ b/cub/cub/iterator/counting_input_iterator.cuh @@ -57,25 +57,25 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ /** - * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. + * @brief A random-access input generator for dereferencing a sequence of incrementing integer values. * - * \par Overview - * - After initializing a CountingInputIteratorTto a certain integer \p base, read references - * at \p offset will return the value \p base + \p offset. + * @par Overview + * - After initializing a CountingInputIteratorTto a certain integer @p base, read references + * at @p offset will return the value @p base + @p offset. * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device * functions. * - Compatible with Thrust API v1.7 or newer. * - * \par Snippet - * The code snippet below illustrates the use of \p CountingInputIteratorTto + * @par Snippet + * The code snippet below illustrates the use of @p CountingInputIteratorTto * dereference a sequence of incrementing integers. 
- * \par - * \code + * @par + * @code * #include // or equivalently * * cub::CountingInputIterator itr(5); @@ -85,10 +85,13 @@ CUB_NAMESPACE_BEGIN * printf("%d\n", itr[2]); // 7 * printf("%d\n", itr[50]); // 55 * - * \endcode + * @endcode * - * \tparam ValueType The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * @tparam ValueType + * The value type of this iterator + * + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) */ template < typename ValueType, @@ -98,22 +101,35 @@ class CountingInputIterator public: // Required iterator traits - typedef CountingInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to - typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef CountingInputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef ValueType value_type; + + /// The type of a pointer to an element the iterator can point to + typedef ValueType *pointer; + + /// The type of a reference to an element the iterator can point to + typedef ValueType reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: @@ -121,20 +137,20 @@ private: ValueType val; public: - - /// Constructor - __host__ __device__ __forceinline__ CountingInputIterator( - const ValueType &val) ///< Starting value for the iterator instance to report - : - val(val) + /** + * @param val + * Starting value for the iterator instance to report + */ + __host__ __device__ __forceinline__ CountingInputIterator(const ValueType &val) + : val(val) {} /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { - self_type retval = *this; - val++; - return retval; + self_type retval = *this; + val++; + return retval; } /// Prefix increment diff --git a/cub/cub/iterator/discard_output_iterator.cuh b/cub/cub/iterator/discard_output_iterator.cuh index 66e764412b5..33048473595 100644 --- a/cub/cub/iterator/discard_output_iterator.cuh +++ b/cub/cub/iterator/discard_output_iterator.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Random-access iterator types */ @@ -55,13 +55,13 @@ CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ /** - * \brief A discard iterator + * @brief A discard iterator */ template class DiscardOutputIterator @@ -69,22 +69,35 @@ class DiscardOutputIterator public: // Required iterator traits - typedef 
DiscardOutputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef void value_type; ///< The type of the element the iterator can point to - typedef void pointer; ///< The type of a pointer to an element the iterator can point to - typedef void reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef DiscardOutputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef void value_type; + + /// The type of a pointer to an element the iterator can point to + typedef void pointer; + + /// The type of a reference to an element the iterator can point to + typedef void reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: @@ -97,20 +110,20 @@ private: #endif public: - - /// Constructor - __host__ __device__ __forceinline__ DiscardOutputIterator( - OffsetT offset = 0) ///< Base offset - : - offset(offset) + /** + * @param offset + * Base offset + */ + __host__ __device__ __forceinline__ DiscardOutputIterator(OffsetT offset = 0) + : offset(offset) {} /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { - self_type retval = *this; - offset++; - return retval; + self_type retval = *this; + offset++; + return retval; } /// Prefix increment diff --git a/cub/cub/iterator/tex_obj_input_iterator.cuh b/cub/cub/iterator/tex_obj_input_iterator.cuh index 24cdd165a34..df86618e849 100644 --- a/cub/cub/iterator/tex_obj_input_iterator.cuh +++ b/cub/cub/iterator/tex_obj_input_iterator.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Random-access iterator types */ @@ -60,16 +60,17 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ /** - * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses newer Kepler-style texture objects. + * @brief A random-access input wrapper for dereferencing array values through texture cache. + * Uses newer Kepler-style texture objects. * - * \par Overview + * @par Overview * - TexObjInputIterator wraps a native device pointer of type ValueType*. References * to elements are to be loaded through texture cache. * - Can be used to load any data type from memory through texture cache. @@ -80,11 +81,11 @@ CUB_NAMESPACE_BEGIN * created by the host thread, but can be used by any descendant kernel. * - Compatible with Thrust API v1.7 or newer. * - * \par Snippet - * The code snippet below illustrates the use of \p TexObjInputIterator to + * @par Snippet + * The code snippet below illustrates the use of @p TexObjInputIterator to * dereference a device array of doubles through texture cache. 
- * \par - * \code + * @par + * @code * #include // or equivalently * * // Declare, allocate, and initialize a device array @@ -104,10 +105,13 @@ CUB_NAMESPACE_BEGIN * ... * itr.UnbindTexture(); * - * \endcode + * @endcode * - * \tparam T The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * @tparam T + * The value type of this iterator + * + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) */ template < typename T, @@ -117,22 +121,35 @@ class TexObjInputIterator public: // Required iterator traits - typedef TexObjInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef T value_type; ///< The type of the element the iterator can point to - typedef T* pointer; ///< The type of a pointer to an element the iterator can point to - typedef T reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef TexObjInputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef T value_type; + + /// The type of a pointer to an element the iterator can point to + typedef T *pointer; + + /// The type of a reference to an element the iterator can point to + typedef T reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::device_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // THRUST_VERSION private: @@ -161,12 +178,20 @@ public: tex_obj(0) {} - /// Use this iterator to bind \p ptr with a texture reference + /** + * @brief Use this iterator to bind @p ptr with a texture reference + * + * @param ptr + * Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + * + * @param bytes + * Number of bytes in the range + * + * @param tex_offset + * OffsetT (in items) from @p ptr denoting the position of the iterator + */ template - cudaError_t BindTexture( - QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment - size_t bytes, ///< Number of bytes in the range - size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator + cudaError_t BindTexture(QualifiedT *ptr, size_t bytes, size_t tex_offset = 0) { this->ptr = const_cast::type *>(ptr); this->tex_offset = static_cast(tex_offset); diff --git a/cub/cub/iterator/tex_ref_input_iterator.cuh b/cub/cub/iterator/tex_ref_input_iterator.cuh index 67217a0f803..5a91fd2874f 100644 --- a/cub/cub/iterator/tex_ref_input_iterator.cuh +++ b/cub/cub/iterator/tex_ref_input_iterator.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Random-access iterator types */ @@ -48,37 +48,37 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ /** - * \brief A 
random-access input wrapper for dereferencing array values through texture cache. + * @brief A random-access input wrapper for dereferencing array values through texture cache. * - * \deprecated [Since 1.13.0] The CUDA texture management APIs used by + * @deprecated [Since 1.13.0] The CUDA texture management APIs used by * TexRefInputIterator are deprecated. Use cub::TexObjInputIterator instead. * - * \par Overview + * @par Overview * - TexRefInputIterator wraps a native device pointer of type ValueType*. References * to elements are to be loaded through texture cache. * - Can be used to load any data type from memory through texture cache. * - Can be manipulated and exchanged within and between host and device * functions, can only be constructed within host functions, and can only be * dereferenced within device functions. - * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture + * - The @p UNIQUE_ID template parameter is used to statically name the underlying texture * reference. Only one TexRefInputIterator instance can be bound at any given time for a - * specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host + * specific combination of (1) data type @p T, (2) @p UNIQUE_ID, (3) host * thread, and (4) compilation .o unit. * - With regard to nested/dynamic parallelism, TexRefInputIterator iterators may only be * created by the host thread and used by a top-level kernel (i.e. the one which is launched * from the host). * - Compatible with Thrust API v1.7 or newer. * - * \par Snippet - * The code snippet below illustrates the use of \p TexRefInputIterator to + * @par Snippet + * The code snippet below illustrates the use of @p TexRefInputIterator to * dereference a device array of doubles through texture cache. - * \par - * \code + * @par + * @code * #include // or equivalently * * // Declare, allocate, and initialize a device array @@ -98,11 +98,16 @@ CUB_NAMESPACE_BEGIN * ... * itr.UnbindTexture(); * - * \endcode + * @endcode * - * \tparam T The value type of this iterator - * \tparam UNIQUE_ID A globally-unique identifier (within the compilation unit) to name the underlying texture reference - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * @tparam T + * The value type of this iterator + * + * @tparam UNIQUE_ID + * A globally-unique identifier (within the compilation unit) to name the underlying texture reference + * + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) */ template < typename T, diff --git a/cub/cub/iterator/transform_input_iterator.cuh b/cub/cub/iterator/transform_input_iterator.cuh index de69a64ae76..f0396f53081 100644 --- a/cub/cub/iterator/transform_input_iterator.cuh +++ b/cub/cub/iterator/transform_input_iterator.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Random-access iterator types */ @@ -57,29 +57,29 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIterator + * @addtogroup UtilIterator * @{ */ /** - * \brief A random-access input wrapper for transforming dereferenced values. + * @brief A random-access input wrapper for transforming dereferenced values. * - * \par Overview - * - TransformInputIteratorTwraps a unary conversion functor of type \p - * ConversionOp and a random-access input iterator of type InputIteratorT, - * using the former to produce references of type \p ValueType from the latter. 
+ * @par Overview + * - TransformInputIteratorTwraps a unary conversion functor of type + * @p ConversionOp and a random-access input iterator of type InputIteratorT, + * using the former to produce references of type @p ValueType from the latter. * - Can be used with any data type. * - Can be constructed, manipulated, and exchanged within and between host and device * functions. Wrapped host memory can only be dereferenced on the host, and wrapped * device memory can only be dereferenced on the device. * - Compatible with Thrust API v1.7 or newer. * - * \par Snippet - * The code snippet below illustrates the use of \p TransformInputIteratorTto + * @par Snippet + * The code snippet below illustrates the use of @p TransformInputIteratorTto * dereference an array of integers, tripling the values and converting them to doubles. - * \par - * \code + * @par + * @code * #include // or equivalently * * // Functor for tripling integer values and converting to doubles @@ -103,13 +103,20 @@ CUB_NAMESPACE_BEGIN * printf("%f\n", itr[1]); // 18.0 * printf("%f\n", itr[6]); // 27.0 * - * \endcode + * @endcode * - * \tparam ValueType The value type of this iterator - * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p ValueType. Must have member ValueType operator()(const InputType &datum). - * \tparam InputIteratorT The type of the wrapped input iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * @tparam ValueType + * The value type of this iterator * + * @tparam ConversionOp + * Unary functor type for mapping objects of type @p InputType to type @p ValueType. + * Must have member ValueType operator()(const InputType &datum). + * + * @tparam InputIteratorT + * The type of the wrapped input iterator + * + * @tparam OffsetT + * The difference type of this iterator (Default: @p ptrdiff_t) */ template < typename ValueType, @@ -121,22 +128,35 @@ class TransformInputIterator public: // Required iterator traits - typedef TransformInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to - typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + + /// My own type + typedef TransformInputIterator self_type; + + /// Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; + + /// The type of the element the iterator can point to + typedef ValueType value_type; + + /// The type of a pointer to an element the iterator can point to + typedef ValueType *pointer; + + /// The type of a reference to an element the iterator can point to + typedef ValueType reference; #if (THRUST_VERSION >= 100700) // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + + /// The iterator category typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category< THRUST_NS_QUALIFIER::any_system_tag, THRUST_NS_QUALIFIER::random_access_traversal_tag, value_type, reference - >::type iterator_category; ///< The iterator category + >::type iterator_category; #else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category + /// The iterator category + typedef std::random_access_iterator_tag iterator_category; #endif // 
THRUST_VERSION private: @@ -145,22 +165,25 @@ private: InputIteratorT input_itr; public: - - /// Constructor - __host__ __device__ __forceinline__ TransformInputIterator( - InputIteratorT input_itr, ///< Input iterator to wrap - ConversionOp conversion_op) ///< Conversion functor to wrap - : - conversion_op(conversion_op), - input_itr(input_itr) + /** + * @param input_itr + * Input iterator to wrap + * + * @param conversion_op + * Conversion functor to wrap + */ + __host__ __device__ __forceinline__ TransformInputIterator(InputIteratorT input_itr, + ConversionOp conversion_op) + : conversion_op(conversion_op) + , input_itr(input_itr) {} /// Postfix increment __host__ __device__ __forceinline__ self_type operator++(int) { - self_type retval = *this; - input_itr++; - return retval; + self_type retval = *this; + input_itr++; + return retval; } /// Prefix increment diff --git a/cub/cub/thread/thread_load.cuh b/cub/cub/thread/thread_load.cuh index fc83bbe8c3a..8b4eafb6196 100644 --- a/cub/cub/thread/thread_load.cuh +++ b/cub/cub/thread/thread_load.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Thread utilities for reading memory using PTX cache modifiers. */ @@ -49,7 +49,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIo + * @addtogroup UtilIo * @{ */ @@ -58,30 +58,30 @@ CUB_NAMESPACE_BEGIN //----------------------------------------------------------------------------- /** - * \brief Enumeration of cache modifiers for memory load operations. + * @brief Enumeration of cache modifiers for memory load operations. */ enum CacheLoadModifier { - LOAD_DEFAULT, ///< Default (no modifier) - LOAD_CA, ///< Cache at all levels - LOAD_CG, ///< Cache at global level - LOAD_CS, ///< Cache streaming (likely to be accessed once) - LOAD_CV, ///< Cache as volatile (including cached system lines) - LOAD_LDG, ///< Cache as texture - LOAD_VOLATILE, ///< Volatile (any memory space) + LOAD_DEFAULT, ///< Default (no modifier) + LOAD_CA, ///< Cache at all levels + LOAD_CG, ///< Cache at global level + LOAD_CS, ///< Cache streaming (likely to be accessed once) + LOAD_CV, ///< Cache as volatile (including cached system lines) + LOAD_LDG, ///< Cache as texture + LOAD_VOLATILE, ///< Volatile (any memory space) }; - /** - * \name Thread I/O (cache modified) + * @name Thread I/O (cache modified) * @{ */ /** - * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. + * @brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. + * Can be used to load any data type. 
* - * \par Example - * \code + * @par Example + * @code * #include // or equivalently * * // 32-bit load using cache-global modifier: @@ -102,8 +102,11 @@ enum CacheLoadModifier * TestFoo val = cub::ThreadLoad(d_in + threadIdx.x); * \endcode * - * \tparam MODIFIER [inferred] CacheLoadModifier enumeration - * \tparam InputIteratorT [inferred] Input iterator type \iterator + * @tparam MODIFIER + * [inferred] CacheLoadModifier enumeration + * + * @tparam InputIteratorT + * [inferred] Input iterator type \iterator */ template @@ -339,7 +342,8 @@ __device__ __forceinline__ T ThreadLoadVolatilePointer( T *ptr, Int2Type /*is_primitive*/) { - typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying + // Word type for memcopying + typedef typename UnitWord::VolatileWord VolatileWord; constexpr int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); diff --git a/cub/cub/thread/thread_reduce.cuh b/cub/cub/thread/thread_reduce.cuh index 8384b0cc401..5e1ba043fd9 100644 --- a/cub/cub/thread/thread_reduce.cuh +++ b/cub/cub/thread/thread_reduce.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Thread utilities for sequential reduction over statically-sized array types */ @@ -50,19 +50,24 @@ CUB_NAMESPACE_BEGIN namespace internal { /** - * Sequential reduction over statically-sized array types + * @brief Sequential reduction over statically-sized array types + * + * @param[in] input + * Input array + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] prefix + * Prefix to seed reduction with */ -template < - int LENGTH, - typename T, - typename ReductionOp, - typename PrefixT, - typename AccumT = detail::accumulator_t> -__device__ __forceinline__ AccumT ThreadReduce( - T* input, ///< [in] Input array - ReductionOp reduction_op, ///< [in] Binary reduction operator - PrefixT prefix, ///< [in] Prefix to seed reduction with - Int2Type /*length*/) +template > +__device__ __forceinline__ AccumT +ThreadReduce(T *input, ReductionOp reduction_op, PrefixT prefix, Int2Type /*length*/) { AccumT retval = prefix; @@ -73,85 +78,122 @@ __device__ __forceinline__ AccumT ThreadReduce( return retval; } - /** - * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * @brief Perform a sequential reduction over @p LENGTH elements of the @p input array, + * seeded with the specified @p prefix. The aggregate is returned. + * + * @tparam LENGTH + * LengthT of input array + * + * @tparam T + * [inferred] The data type to be reduced. + * + * @tparam ReductionOp + * [inferred] Binary reduction operator type having member + * T operator()(const T &a, const T &b) * - * \tparam LENGTH LengthT of input array - * \tparam T [inferred] The data type to be reduced. 
- * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + * @param[in] input + * Input array + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] prefix + * Prefix to seed reduction with */ -template < - int LENGTH, - typename T, - typename ReductionOp, - typename PrefixT, - typename AccumT = detail::accumulator_t> -__device__ __forceinline__ AccumT ThreadReduce( - T* input, ///< [in] Input array - ReductionOp reduction_op, ///< [in] Binary reduction operator - PrefixT prefix) ///< [in] Prefix to seed reduction with +template > +__device__ __forceinline__ AccumT ThreadReduce(T *input, ReductionOp reduction_op, PrefixT prefix) { return ThreadReduce(input, reduction_op, prefix, Int2Type()); } - /** - * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. + * @brief Perform a sequential reduction over @p LENGTH elements of the @p input array. + * The aggregate is returned. + * + * @tparam LENGTH + * LengthT of input array + * + * @tparam T + * [inferred] The data type to be reduced. + * + * @tparam ReductionOp + * [inferred] Binary reduction operator type having member + * T operator()(const T &a, const T &b) * - * \tparam LENGTH LengthT of input array - * \tparam T [inferred] The data type to be reduced. - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + * @param[in] input + * Input array + * + * @param[in] reduction_op + * Binary reduction operator */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T* input, ///< [in] Input array - ReductionOp reduction_op) ///< [in] Binary reduction operator +template +__device__ __forceinline__ T ThreadReduce(T *input, ReductionOp reduction_op) { T prefix = input[0]; return ThreadReduce(input + 1, reduction_op, prefix); } - /** - * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * @brief Perform a sequential reduction over the statically-sized @p input array, + * seeded with the specified @p prefix. The aggregate is returned. + * + * @tparam LENGTH + * [inferred] LengthT of @p input array + * + * @tparam T + * [inferred] The data type to be reduced. + * + * @tparam ReductionOp + * [inferred] Binary reduction operator type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Input array * - * \tparam LENGTH [inferred] LengthT of \p input array - * \tparam T [inferred] The data type to be reduced. 
- * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] prefix + * Prefix to seed reduction with */ -template < - int LENGTH, - typename T, - typename ReductionOp, - typename PrefixT, - typename AccumT = detail::accumulator_t> -__device__ __forceinline__ AccumT ThreadReduce( - T (&input)[LENGTH], ///< [in] Input array - ReductionOp reduction_op, ///< [in] Binary reduction operator - PrefixT prefix) ///< [in] Prefix to seed reduction with +template > +__device__ __forceinline__ AccumT ThreadReduce(T (&input)[LENGTH], + ReductionOp reduction_op, + PrefixT prefix) { return ThreadReduce(input, reduction_op, prefix, Int2Type()); } - /** - * \brief Serial reduction with the specified operator + * @brief Serial reduction with the specified operator + * + * @tparam LENGTH + * [inferred] LengthT of @p input array + * + * @tparam T + * [inferred] The data type to be reduced. + * + * @tparam ReductionOp + * [inferred] Binary reduction operator type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Input array * - * \tparam LENGTH [inferred] LengthT of \p input array - * \tparam T [inferred] The data type to be reduced. - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + * @param[in] reduction_op + * Binary reduction operator */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T (&input)[LENGTH], ///< [in] Input array - ReductionOp reduction_op) ///< [in] Binary reduction operator +template +__device__ __forceinline__ T ThreadReduce(T (&input)[LENGTH], ReductionOp reduction_op) { return ThreadReduce((T*) input, reduction_op); } diff --git a/cub/cub/thread/thread_scan.cuh b/cub/cub/thread/thread_scan.cuh index bc3840ec9c8..c49777440db 100644 --- a/cub/cub/thread/thread_scan.cuh +++ b/cub/cub/thread/thread_scan.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Thread utilities for sequential prefix scan over statically-sized array types */ @@ -50,26 +50,32 @@ namespace internal { /** - * \addtogroup UtilModule + * @addtogroup UtilModule * @{ */ /** - * \name Sequential prefix scan over statically-sized array types + * @name Sequential prefix scan over statically-sized array types * @{ */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanExclusive( - T inclusive, - T exclusive, - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - Int2Type /*length*/) +/** + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan operator + */ +template +__device__ __forceinline__ T ThreadScanExclusive(T inclusive, + T exclusive, + T *input, + T *output, + ScanOp scan_op, + Int2Type /*length*/) { #pragma unroll for (int i = 0; i < LENGTH; ++i) @@ -82,25 +88,40 @@ __device__ __forceinline__ T ThreadScanExclusive( return inclusive; } - - /** - * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. 
+ * @brief Perform a sequential exclusive prefix scan over @p LENGTH elements of + * the @p input array, seeded with the specified @p prefix. The aggregate is returned. + * + * @tparam LENGTH + * LengthT of @p input and @p output arrays + * + * @tparam T + * [inferred] The data type to be scanned. * - * \tparam LENGTH LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + * @tparam ScanOp + * [inferred] Binary scan operator type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] prefix + * Prefix to seed scan with + * + * @param[in] apply_prefix + * Whether or not the calling thread should apply its prefix. + * If not, the first output element is undefined. + * (Handy for preventing thread-0 from applying a prefix.) */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanExclusive( - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.) +template +__device__ __forceinline__ T +ThreadScanExclusive(T *input, T *output, ScanOp scan_op, T prefix, bool apply_prefix = true) { T inclusive = input[0]; if (apply_prefix) @@ -113,46 +134,59 @@ __device__ __forceinline__ T ThreadScanExclusive( return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type()); } - /** - * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * @brief Perform a sequential exclusive prefix scan over the statically-sized + * @p input array, seeded with the specified @p prefix. The aggregate is returned. + * + * @tparam LENGTH + * [inferred] LengthT of @p input and @p output arrays + * + * @tparam T + * [inferred] The data type to be scanned. * - * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + * @tparam ScanOp + * [inferred] Binary scan operator type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] prefix + * Prefix to seed scan with + * + * @param[in] apply_prefix + * Whether or not the calling thread should apply its prefix. + * (Handy for preventing thread-0 from applying a prefix.) */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanExclusive( - T (&input)[LENGTH], ///< [in] Input array - T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. 
(Handy for preventing thread-0 from applying a prefix.) +template +__device__ __forceinline__ T ThreadScanExclusive(T (&input)[LENGTH], + T (&output)[LENGTH], + ScanOp scan_op, + T prefix, + bool apply_prefix = true) { - return ThreadScanExclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); + return ThreadScanExclusive((T *)input, (T *)output, scan_op, prefix, apply_prefix); } - - - - - - - - -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T inclusive, - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - Int2Type /*length*/) +/** + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan operator + */ +template +__device__ __forceinline__ T +ThreadScanInclusive(T inclusive, T *input, T *output, ScanOp scan_op, Int2Type /*length*/) { #pragma unroll for (int i = 0; i < LENGTH; ++i) @@ -164,22 +198,31 @@ __device__ __forceinline__ T ThreadScanInclusive( return inclusive; } - /** - * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned. + * @brief Perform a sequential inclusive prefix scan over + * @p LENGTH elements of the @p input array. The aggregate is returned. + * + * @tparam LENGTH + * LengthT of @p input and @p output arrays + * + * @tparam T + * [inferred] The data type to be scanned. + * + * @tparam ScanOp + * [inferred] Binary scan operator type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) * - * \tparam LENGTH LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + * @param[in] scan_op + * Binary scan operator */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator +template +__device__ __forceinline__ T ThreadScanInclusive(T *input, T *output, ScanOp scan_op) { T inclusive = input[0]; output[0] = inclusive; @@ -188,44 +231,71 @@ __device__ __forceinline__ T ThreadScanInclusive( return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); } - /** - * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned. + * @brief Perform a sequential inclusive prefix scan over the + * statically-sized @p input array. The aggregate is returned. + * + * @tparam LENGTH + * [inferred] LengthT of @p input and @p output arrays + * + * @tparam T + * [inferred] The data type to be scanned. + * + * @tparam ScanOp + * [inferred] Binary scan operator type having member + * T operator()(const T &a, const T &b) * - * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. 
- * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan operator */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T (&input)[LENGTH], ///< [in] Input array - T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator +template +__device__ __forceinline__ T ThreadScanInclusive(T (&input)[LENGTH], + T (&output)[LENGTH], + ScanOp scan_op) { return ThreadScanInclusive((T*) input, (T*) output, scan_op); } - /** - * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * @brief Perform a sequential inclusive prefix scan over + * @p LENGTH elements of the @p input array, seeded with the + * specified @p prefix. The aggregate is returned. * - * \tparam LENGTH LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + * @tparam LENGTH + * LengthT of @p input and @p output arrays + * + * @tparam T + * [inferred] The data type to be scanned. + * + * @tparam ScanOp + * [inferred] Binary scan operator type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] prefix + * Prefix to seed scan with + * + * @param[in] apply_prefix + * Whether or not the calling thread should apply its prefix. + * (Handy for preventing thread-0 from applying a prefix.) */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +template +__device__ __forceinline__ T +ThreadScanInclusive(T *input, T *output, ScanOp scan_op, T prefix, bool apply_prefix = true) { T inclusive = input[0]; if (apply_prefix) @@ -238,24 +308,43 @@ __device__ __forceinline__ T ThreadScanInclusive( return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); } - /** - * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * @brief Perform a sequential inclusive prefix scan over the + * statically-sized @p input array, seeded with the specified @p prefix. + * The aggregate is returned. + * + * @tparam LENGTH + * [inferred] LengthT of @p input and @p output arrays + * + * @tparam T + * [inferred] The data type to be scanned. 
+ * + * @tparam ScanOp + * [inferred] Binary scan operator type having member + * T operator()(const T &a, const T &b) + * + * @param[in] input + * Input array + * + * @param[out] output + * Output array (may be aliased to @p input) + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] prefix + * Prefix to seed scan with * - * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + * @param[in] apply_prefix + * Whether or not the calling thread should apply its prefix. + * (Handy for preventing thread-0 from applying a prefix.) */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T (&input)[LENGTH], ///< [in] Input array - T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +template +__device__ __forceinline__ T ThreadScanInclusive(T (&input)[LENGTH], + T (&output)[LENGTH], + ScanOp scan_op, + T prefix, + bool apply_prefix = true) { return ThreadScanInclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); } diff --git a/cub/cub/thread/thread_search.cuh b/cub/cub/thread/thread_search.cuh index e18caf09a09..8d259aadfbe 100644 --- a/cub/cub/thread/thread_search.cuh +++ b/cub/cub/thread/thread_search.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Thread utilities for sequential search */ @@ -91,19 +91,21 @@ __host__ __device__ __forceinline__ void MergePathSearch( path_coordinate.y = diagonal - split_min; } - - /** - * \brief Returns the offset of the first value within \p input which does not compare less than \p val + * @brief Returns the offset of the first value within @p input which does not compare + * less than @p val + * + * @param[in] input + * Input sequence + * + * @param[in] num_items + * Input sequence length + * + * @param[in] val + * Search key */ -template < - typename InputIteratorT, - typename OffsetT, - typename T> -__device__ __forceinline__ OffsetT LowerBound( - InputIteratorT input, ///< [in] Input sequence - OffsetT num_items, ///< [in] Input sequence length - T val) ///< [in] Search key +template +__device__ __forceinline__ OffsetT LowerBound(InputIteratorT input, OffsetT num_items, T val) { OffsetT retval = 0; while (num_items > 0) @@ -123,18 +125,21 @@ __device__ __forceinline__ OffsetT LowerBound( return retval; } - /** - * \brief Returns the offset of the first value within \p input which compares greater than \p val + * @brief Returns the offset of the first value within @p input which compares + * greater than @p val + * + * @param[in] input + * Input sequence + * + * @param[in] num_items + * Input sequence length + * + * @param[in] val + * Search key */ -template < - typename InputIteratorT, - typename OffsetT, - typename T> -__device__ __forceinline__ OffsetT UpperBound( - InputIteratorT input, ///< [in] Input sequence - OffsetT num_items, ///< [in] Input sequence length - T val) ///< [in] Search key +template +__device__ __forceinline__ OffsetT UpperBound(InputIteratorT input, OffsetT num_items, T val) { OffsetT retval = 0; while (num_items > 0) @@ -156,13 
+161,18 @@ __device__ __forceinline__ OffsetT UpperBound( #if defined(__CUDA_FP16_TYPES_EXIST__) -template < - typename InputIteratorT, - typename OffsetT> -__device__ __forceinline__ OffsetT UpperBound( - InputIteratorT input, ///< [in] Input sequence - OffsetT num_items, ///< [in] Input sequence length - __half val) ///< [in] Search key +/** + * @param[in] input + * Input sequence + * + * @param[in] num_items + * Input sequence length + * + * @param[in] val + * Search key + */ +template +__device__ __forceinline__ OffsetT UpperBound(InputIteratorT input, OffsetT num_items, __half val) { OffsetT retval = 0; while (num_items > 0) diff --git a/cub/cub/thread/thread_store.cuh b/cub/cub/thread/thread_store.cuh index 9d24aa54ad9..bb2b4675827 100644 --- a/cub/cub/thread/thread_store.cuh +++ b/cub/cub/thread/thread_store.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * Thread utilities for writing memory using PTX cache modifiers. */ @@ -47,7 +47,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilIo + * @addtogroup UtilIo * @{ */ @@ -57,29 +57,29 @@ CUB_NAMESPACE_BEGIN //----------------------------------------------------------------------------- /** - * \brief Enumeration of cache modifiers for memory store operations. + * @brief Enumeration of cache modifiers for memory store operations. */ enum CacheStoreModifier { - STORE_DEFAULT, ///< Default (no modifier) - STORE_WB, ///< Cache write-back all coherent levels - STORE_CG, ///< Cache at global level - STORE_CS, ///< Cache streaming (likely to be accessed once) - STORE_WT, ///< Cache write-through (to system memory) - STORE_VOLATILE, ///< Volatile shared (any memory space) + STORE_DEFAULT, ///< Default (no modifier) + STORE_WB, ///< Cache write-back all coherent levels + STORE_CG, ///< Cache at global level + STORE_CS, ///< Cache streaming (likely to be accessed once) + STORE_WT, ///< Cache write-through (to system memory) + STORE_VOLATILE, ///< Volatile shared (any memory space) }; - /** - * \name Thread I/O (cache modified) + * @name Thread I/O (cache modified) * @{ */ /** - * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. + * @brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. + * Can be used to store any data type. 
* - * \par Example - * \code + * @par Example + * @code * #include // or equivalently * * // 32-bit store using cache-global modifier: @@ -102,11 +102,16 @@ enum CacheStoreModifier * TestFoo *d_struct; * TestFoo val; * cub::ThreadStore(d_out + threadIdx.x, val); - * \endcode + * @endcode + * + * @tparam MODIFIER + * [inferred] CacheStoreModifier enumeration + * + * @tparam InputIteratorT + * [inferred] Output iterator type \iterator * - * \tparam MODIFIER [inferred] CacheStoreModifier enumeration - * \tparam InputIteratorT [inferred] Output iterator type \iterator - * \tparam T [inferred] Data type of output value + * @tparam T + * [inferred] Data type of output value */ template < CacheStoreModifier MODIFIER, diff --git a/cub/cub/util_allocator.cuh b/cub/cub/util_allocator.cuh index 8172dd919f8..8e3978f18ce 100644 --- a/cub/cub/util_allocator.cuh +++ b/cub/cub/util_allocator.cuh @@ -54,7 +54,7 @@ CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilMgmt + * @addtogroup UtilMgmt * @{ */ @@ -64,40 +64,40 @@ CUB_NAMESPACE_BEGIN ******************************************************************************/ /** - * \brief A simple caching allocator for device memory allocations. + * @brief A simple caching allocator for device memory allocations. * - * \par Overview + * @par Overview * The allocator is thread-safe and stream-safe and is capable of managing cached * device allocations on multiple devices. It behaves as follows: * - * \par - * - Allocations from the allocator are associated with an \p active_stream. Once freed, - * the allocation becomes available immediately for reuse within the \p active_stream + * @par + * - Allocations from the allocator are associated with an @p active_stream. Once freed, + * the allocation becomes available immediately for reuse within the @p active_stream * with which it was associated with during allocation, and it becomes available for - * reuse within other streams when all prior work submitted to \p active_stream has completed. - * - Allocations are categorized and cached by bin size. A new allocation request of + * reuse within other streams when all prior work submitted to @p active_stream has completed. + * - Allocations are categorized and cached by bin size. A new allocation request of * a given size will only consider cached allocations within the corresponding bin. * - Bin limits progress geometrically in accordance with the growth factor - * \p bin_growth provided during construction. Unused device allocations within + * @p bin_growth provided during construction. Unused device allocations within * a larger bin cache are not reused for allocation requests that categorize to * smaller bin sizes. - * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to - * (\p bin_growth ^ \p min_bin). - * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * - Allocation requests below ( @p bin_growth ^ @p min_bin ) are rounded up to + * ( @p bin_growth ^ @p min_bin ). + * - Allocations above ( @p bin_growth ^ @p max_bin ) are not rounded up to the nearest * bin and are simply freed when they are deallocated instead of being returned * to a bin-cache. * - If the total storage of cached allocations on a given device will exceed - * \p max_cached_bytes, allocations for that device are simply freed when they are + * @p max_cached_bytes, allocations for that device are simply freed when they are * deallocated instead of being returned to their bin-cache. 
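An editorial usage sketch (not part of the original documentation; the stream and sizes are hypothetical) of how allocations are requested from and returned to the bin-cache described above:

#include <cub/util_allocator.cuh>

cub::CachingDeviceAllocator allocator;   // default-constructed: bin_growth=8, min_bin=3, max_bin=7

void        *d_scratch = nullptr;
cudaStream_t stream    = 0;              // any stream; the block becomes associated with it

// 4096 bytes = 8^4, so the request maps exactly to the 4KB bin.
allocator.DeviceAllocate(&d_scratch, 4096, stream);
// ... launch kernels on `stream` that use d_scratch ...
allocator.DeviceFree(d_scratch);         // returned to the bin-cache: immediately reusable on `stream`,
                                         // and on other streams once prior work on `stream` completes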
* - * \par + * @par * For example, the default-constructed CachingDeviceAllocator is configured with: - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = 6MB - 1B + * - @p bin_growth = 8 + * - @p min_bin = 3 + * - @p max_bin = 7 + * - @p max_cached_bytes = 6MB - 1B * - * \par + * @par * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB * and sets a maximum of 6,291,455 cached bytes per device * @@ -129,40 +129,52 @@ struct CachingDeviceAllocator */ struct BlockDescriptor { - void* d_ptr; // Device pointer - size_t bytes; // Size of allocation in bytes - unsigned int bin; // Bin enumeration - int device; // device ordinal - cudaStream_t associated_stream; // Associated associated_stream - cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed - - // Constructor (suitable for searching maps for a specific block, given its pointer and device) - BlockDescriptor(void *d_ptr, int device) : - d_ptr(d_ptr), - bytes(0), - bin(INVALID_BIN), - device(device), - associated_stream(0), - ready_event(0) + // Device pointer + void *d_ptr; + + // Size of allocation in bytes + size_t bytes; + + // Bin enumeration + unsigned int bin; + + // device ordinal + int device; + + // Associated associated_stream + cudaStream_t associated_stream; + + // Signal when associated stream has run to the point at which this block was freed + cudaEvent_t ready_event; + + // Constructor (suitable for searching maps for a specific block, given its pointer and + // device) + BlockDescriptor(void *d_ptr, int device) + : d_ptr(d_ptr) + , bytes(0) + , bin(INVALID_BIN) + , device(device) + , associated_stream(0) + , ready_event(0) {} // Constructor (suitable for searching maps for a range of suitable blocks, given a device) - BlockDescriptor(int device) : - d_ptr(NULL), - bytes(0), - bin(INVALID_BIN), - device(device), - associated_stream(0), - ready_event(0) + BlockDescriptor(int device) + : d_ptr(NULL) + , bytes(0) + , bin(INVALID_BIN) + , device(device) + , associated_stream(0) + , ready_event(0) {} // Comparison functor for comparing device pointers static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { - if (a.device == b.device) - return (a.d_ptr < b.d_ptr); - else - return (a.device < b.device); + if (a.device == b.device) + return (a.d_ptr < b.d_ptr); + else + return (a.device < b.device); } // Comparison functor for comparing allocation sizes @@ -246,27 +258,46 @@ struct CachingDeviceAllocator } } - //--------------------------------------------------------------------- // Fields //--------------------------------------------------------------------- - std::mutex mutex; /// Mutex for thread-safety + /// Mutex for thread-safety + std::mutex mutex; + + /// Geometric growth factor for bin-sizes + unsigned int bin_growth; - unsigned int bin_growth; /// Geometric growth factor for bin-sizes - unsigned int min_bin; /// Minimum bin enumeration - unsigned int max_bin; /// Maximum bin enumeration + /// Minimum bin enumeration + unsigned int min_bin; - size_t min_bin_bytes; /// Minimum bin size - size_t max_bin_bytes; /// Maximum bin size - size_t max_cached_bytes; /// Maximum aggregate cached bytes per device + /// Maximum bin enumeration + unsigned int max_bin; - const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. 
(The CUDA runtime may have already shut down for statically declared allocators) - bool debug; /// Whether or not to print (de)allocation events to stdout + /// Minimum bin size + size_t min_bin_bytes; - GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device - CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse - BusyBlocks live_blocks; /// Set of live device allocations currently in use + /// Maximum bin size + size_t max_bin_bytes; + + /// Maximum aggregate cached bytes per device + size_t max_cached_bytes; + + /// Whether or not to skip a call to FreeAllCached() when destructor is called. + /// (The CUDA runtime may have already shut down for statically declared allocators) + const bool skip_cleanup; + + /// Whether or not to print (de)allocation events to stdout + bool debug; + + /// Map of device ordinal to aggregate cached bytes on that device + GpuCachedBytes cached_bytes; + + /// Set of cached device allocations available for reuse + CachedBlocks cached_blocks; + + /// Set of live device allocations currently in use + BusyBlocks live_blocks; #endif // DOXYGEN_SHOULD_SKIP_THIS @@ -275,38 +306,55 @@ struct CachingDeviceAllocator //--------------------------------------------------------------------- /** - * \brief Constructor. + * @brief Constructor. + * + * @param bin_growth + * Geometric growth factor for bin-sizes + * + * @param min_bin + * Minimum bin (default is bin_growth ^ 1) + * + * @param max_bin + * Maximum bin (default is no max bin) + * + * @param max_cached_bytes + * Maximum aggregate cached bytes per device (default is no limit) + * + * @param skip_cleanup + * Whether or not to skip a call to @p FreeAllCached() when the destructor is called (default + * is to deallocate) + * + * @param debug + * Whether or not to print (de)allocation events to stdout (default is no stderr output) */ - CachingDeviceAllocator( - unsigned int bin_growth, ///< Geometric growth factor for bin-sizes - unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) - unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) - size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) - bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) - bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) - : - bin_growth(bin_growth), - min_bin(min_bin), - max_bin(max_bin), - min_bin_bytes(IntPow(bin_growth, min_bin)), - max_bin_bytes(IntPow(bin_growth, max_bin)), - max_cached_bytes(max_cached_bytes), - skip_cleanup(skip_cleanup), - debug(debug), - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare) + CachingDeviceAllocator(unsigned int bin_growth, + unsigned int min_bin = 1, + unsigned int max_bin = INVALID_BIN, + size_t max_cached_bytes = INVALID_SIZE, + bool skip_cleanup = false, + bool debug = false) + : bin_growth(bin_growth) + , min_bin(min_bin) + , max_bin(max_bin) + , min_bin_bytes(IntPow(bin_growth, min_bin)) + , max_bin_bytes(IntPow(bin_growth, max_bin)) + , max_cached_bytes(max_cached_bytes) + , skip_cleanup(skip_cleanup) + , debug(debug) + , cached_blocks(BlockDescriptor::SizeCompare) + , live_blocks(BlockDescriptor::PtrCompare) {} /** - * \brief Default constructor. + * @brief Default constructor. 
* * Configured with: - * \par - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * @par + * - @p bin_growth = 8 + * - @p min_bin = 3 + * - @p max_bin = 7 + * - @p max_cached_bytes = ( @p bin_growth ^ @p max_bin) * 3 ) - 1 = 6,291,455 bytes * * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and * sets a maximum of 6,291,455 cached bytes per device @@ -329,7 +377,7 @@ struct CachingDeviceAllocator /** - * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. + * @brief Sets the limit on the number bytes this allocator is allowed to cache per device. * * Changing the ceiling of cached bytes does not cause any allocations (in-use or * cached-in-reserve) to be freed. See \p FreeAllCached(). @@ -349,19 +397,29 @@ struct CachingDeviceAllocator return cudaSuccess; } - /** - * \brief Provides a suitable allocation of device memory for the given size on the specified device. + * @brief Provides a suitable allocation of device memory for the given size on the specified + * device. * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. + * Once freed, the allocation becomes available immediately for reuse within the @p + * active_stream with which it was associated with during allocation, and it becomes available + * for reuse within other streams when all prior work submitted to @p active_stream has + * completed. + * + * @param[in] device + * Device on which to place the allocation + * + * @param[out] d_ptr + * Reference to pointer to the allocation + * + * @param[in] bytes + * Minimum number of bytes for the allocation + * + * @param[in] active_stream + * The stream to be associated with this allocation */ - cudaError_t DeviceAllocate( - int device, ///< [in] Device on which to place the allocation - void **d_ptr, ///< [out] Reference to pointer to the allocation - size_t bytes, ///< [in] Minimum number of bytes for the allocation - cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + cudaError_t + DeviceAllocate(int device, void **d_ptr, size_t bytes, cudaStream_t active_stream = 0) { *d_ptr = NULL; int entrypoint_device = INVALID_DEVICE_ORDINAL; @@ -571,29 +629,37 @@ struct CachingDeviceAllocator return error; } - /** - * \brief Provides a suitable allocation of device memory for the given size on the current device. + * @brief Provides a suitable allocation of device memory for the given size on the current + * device. * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. + * Once freed, the allocation becomes available immediately for reuse within the @p + * active_stream with which it was associated with during allocation, and it becomes available + * for reuse within other streams when all prior work submitted to @p active_stream has + * completed. 
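A minimal host-side sketch of the allocate/free cycle that the DeviceAllocate overloads and the constructor above describe (the bin parameters, sizes, and stream are illustrative; error checking is omitted):

    #include <cub/util_allocator.cuh>

    void Example(cudaStream_t stream)
    {
        // Bin sizes follow bin_growth^bin: 8^3 = 512B up to 8^7 = 2MB
        cub::CachingDeviceAllocator allocator(8, 3, 7);

        void *d_buf = nullptr;

        // Rounded up to the 4KB bin (8^4) and associated with `stream`
        allocator.DeviceAllocate(&d_buf, 3000, stream);

        // ... launch kernels on `stream` that read/write d_buf ...

        // Returns the block to the 4KB bin: immediately reusable on `stream`,
        // and reusable on other streams once prior work on `stream` completes
        allocator.DeviceFree(d_buf);

        // Optionally release every cached (unused) block back to the driver
        allocator.FreeAllCached();
    }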
+ * + * @param[out] d_ptr + * Reference to pointer to the allocation + * + * @param[in] bytes + * Minimum number of bytes for the allocation + * + * @param[in] active_stream + * The stream to be associated with this allocation */ - cudaError_t DeviceAllocate( - void **d_ptr, ///< [out] Reference to pointer to the allocation - size_t bytes, ///< [in] Minimum number of bytes for the allocation - cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + cudaError_t DeviceAllocate(void **d_ptr, size_t bytes, cudaStream_t active_stream = 0) { return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); } - /** - * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. + * @brief Frees a live allocation of device memory on the specified device, returning it to the + * allocator. * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. + * Once freed, the allocation becomes available immediately for reuse within the + * @p active_stream with which it was associated with during allocation, and it becomes + * available for reuse within other streams when all prior work submitted to @p active_stream + * has completed. */ cudaError_t DeviceFree( int device, @@ -701,13 +767,14 @@ struct CachingDeviceAllocator return error; } - /** - * \brief Frees a live allocation of device memory on the current device, returning it to the allocator. + * @brief Frees a live allocation of device memory on the current device, returning it to the + * allocator. * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. + * Once freed, the allocation becomes available immediately for reuse within the @p + * active_stream with which it was associated with during allocation, and it becomes available + * for reuse within other streams when all prior work submitted to @p active_stream has + * completed. */ cudaError_t DeviceFree( void* d_ptr) @@ -717,7 +784,7 @@ struct CachingDeviceAllocator /** - * \brief Frees all cached device allocations on all devices + * @brief Frees all cached device allocations on all devices */ cudaError_t FreeAllCached() { @@ -793,7 +860,7 @@ struct CachingDeviceAllocator /** - * \brief Destructor + * @brief Destructor */ virtual ~CachingDeviceAllocator() { diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index d8bb40cdcc4..37df1a5929c 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -566,12 +566,13 @@ CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva) } // namespace detail /** - * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block. + * @brief Computes maximum SM occupancy in thread blocks for executing the given kernel function + * pointer @p kernel_ptr on the current device with @p block_threads per thread block. * - * \par Snippet + * @par Snippet * The code snippet below illustrates the use of the MaxSmOccupancy function. 
- * \par - * \code + * @par + * @code * #include // or equivalently * * template @@ -593,16 +594,25 @@ CUB_RUNTIME_FUNCTION inline cudaError_t HasUVA(bool& has_uva) * // max_sm_occupancy <-- 8 on SM20 * // max_sm_occupancy <-- 12 on SM35 * - * \endcode + * @endcode * + * @param[out] max_sm_occupancy + * maximum number of thread blocks that can reside on a single SM + * + * @param[in] kernel_ptr + * Kernel pointer for which to compute SM occupancy + * + * @param[in] block_threads + * Number of threads per thread block + * + * @param[in] dynamic_smem_bytes + * Dynamically allocated shared memory in bytes. Default is 0. */ template -CUB_RUNTIME_FUNCTION inline -cudaError_t MaxSmOccupancy( - int& max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM - KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy - int block_threads, ///< [in] Number of threads per thread block - int dynamic_smem_bytes = 0) ///< [in] Dynamically allocated shared memory in bytes. Default is 0. +CUB_RUNTIME_FUNCTION inline cudaError_t MaxSmOccupancy(int &max_sm_occupancy, + KernelPtr kernel_ptr, + int block_threads, + int dynamic_smem_bytes = 0) { return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_sm_occupancy, diff --git a/cub/cub/util_ptx.cuh b/cub/cub/util_ptx.cuh index 4c5d01e5e54..5398adf50b4 100644 --- a/cub/cub/util_ptx.cuh +++ b/cub/cub/util_ptx.cuh @@ -514,24 +514,29 @@ __device__ __forceinline__ unsigned int LaneMaskGe() /** @} */ // end group UtilPtx - - - /** - * \brief Shuffle-up for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei-src_offset. For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png) - * \ingroup WarpModule + * @brief Shuffle-up for any data type. + * Each warp-lanei obtains the value @p input contributed by + * warp-lanei-src_offset. + * For thread lanes @e i < src_offset, the thread's own @p input is returned to the thread. + * ![](shfl_up_logo.png) * - * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. - * \tparam T [inferred] The input/output element type + * @ingroup WarpModule * - * \par + * @tparam LOGICAL_WARP_THREADS + * The number of threads per "logical" warp. Must be a power-of-two <= 32. + * + * @tparam T + * [inferred] The input/output element type + * + * @par * - Available only for SM3.0 or newer * - * \par Snippet + * @par Snippet * The code snippet below illustrates each thread obtaining a \p double value from the * predecessor of its predecessor. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -542,20 +547,27 @@ __device__ __forceinline__ unsigned int LaneMaskGe() * // Obtain item from two ranks below * double peer_data = ShuffleUp<32>(thread_data, 2, 0, 0xffffffff); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. - * The corresponding output \p peer_data will be {1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}. + * @endcode + * @par + * Suppose the set of input @p thread_data across the first warp of threads is + * {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. The corresponding output @p peer_data will be + * {1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}. 
+ * + * @param[in] input + * The value to broadcast * + * @param[in] src_offset + * The relative down-offset of the peer to read from + * + * @param[in] first_thread + * Index of first lane in logical warp (typically 0) + * + * @param[in] member_mask + * 32-bit mask of participating warp lanes */ -template < - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - typename T> -__device__ __forceinline__ T ShuffleUp( - T input, ///< [in] The value to broadcast - int src_offset, ///< [in] The relative down-offset of the peer to read from - int first_thread, ///< [in] Index of first lane in logical warp (typically 0) - unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +template +__device__ __forceinline__ T +ShuffleUp(T input, int src_offset, int first_thread, unsigned int member_mask) { /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up enum { @@ -584,22 +596,29 @@ __device__ __forceinline__ T ShuffleUp( return output; } - /** - * \brief Shuffle-down for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei+src_offset. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png) - * \ingroup WarpModule + * @brief Shuffle-down for any data type. + * Each warp-lanei obtains the value @p input contributed by + * warp-lanei+src_offset. + * For thread lanes @e i >= WARP_THREADS, the thread's own @p input is returned to the + * thread. ![](shfl_down_logo.png) * - * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. - * \tparam T [inferred] The input/output element type + * @ingroup WarpModule * - * \par + * @tparam LOGICAL_WARP_THREADS + * The number of threads per "logical" warp. Must be a power-of-two <= 32. + * + * @tparam T + * [inferred] The input/output element type + * + * @par * - Available only for SM3.0 or newer * - * \par Snippet - * The code snippet below illustrates each thread obtaining a \p double value from the + * @par Snippet + * The code snippet below illustrates each thread obtaining a @p double value from the * successor of its successor. - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -610,20 +629,28 @@ __device__ __forceinline__ T ShuffleUp( * // Obtain item from two ranks below * double peer_data = ShuffleDown<32>(thread_data, 2, 31, 0xffffffff); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. - * The corresponding output \p peer_data will be {3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}. + * @endcode + * @par + * Suppose the set of input @p thread_data across the first warp of threads is + * {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output @p peer_data will be + * {3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}. 
+ * + * @param[in] input + * The value to broadcast + * + * @param[in] src_offset + * The relative up-offset of the peer to read from * + * @param[in] last_thread + * Index of last thread in logical warp (typically 31 for a 32-thread warp) + * + * @param[in] member_mask + * 32-bit mask of participating warp lanes */ -template < - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - typename T> -__device__ __forceinline__ T ShuffleDown( - T input, ///< [in] The value to broadcast - int src_offset, ///< [in] The relative up-offset of the peer to read from - int last_thread, ///< [in] Index of last thread in logical warp (typically 31 for a 32-thread warp) - unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +template +__device__ __forceinline__ T +ShuffleDown(T input, int src_offset, int last_thread, unsigned int member_mask) { /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up enum { @@ -652,25 +679,31 @@ __device__ __forceinline__ T ShuffleDown( return output; } - /** - * \brief Shuffle-broadcast for any data type. Each warp-lanei obtains the value \p input - * contributed by warp-lanesrc_lane. For \p src_lane < 0 or \p src_lane >= WARP_THREADS, - * then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) + * @brief Shuffle-broadcast for any data type. + * Each warp-lanei obtains the value @p input + * contributed by warp-lanesrc_lane. + * For @p src_lane < 0 or @p src_lane >= WARP_THREADS, + * then the thread's own @p input is returned to the thread. + * ![](shfl_broadcast_logo.png) * - * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. - * \tparam T [inferred] The input/output element type + * @tparam LOGICAL_WARP_THREADS + * The number of threads per "logical" warp. Must be a power-of-two <= 32. * - * \ingroup WarpModule + * @tparam T + * [inferred] The input/output element type * - * \par + * @ingroup WarpModule + * + * @par * - Available only for SM3.0 or newer * - * \par Snippet - * The code snippet below illustrates each thread obtaining a \p double value from warp-lane0. + * @par Snippet + * The code snippet below illustrates each thread obtaining a @p double value from + * warp-lane0. * - * \par - * \code + * @par + * @code * #include // or equivalently * * __global__ void ExampleKernel(...) @@ -681,19 +714,24 @@ __device__ __forceinline__ T ShuffleDown( * // Obtain item from thread 0 * double peer_data = ShuffleIndex<32>(thread_data, 0, 0xffffffff); * - * \endcode - * \par - * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. - * The corresponding output \p peer_data will be {1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}. + * @endcode + * @par + * Suppose the set of input @p thread_data across the first warp of threads is + * {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output @p peer_data will be + * {1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}. 
+ * + * @param[in] input + * The value to broadcast + * + * @param[in] src_lane + * Which warp lane is to do the broadcasting * + * @param[in] member_mask + * 32-bit mask of participating warp lanes */ -template < - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - typename T> -__device__ __forceinline__ T ShuffleIndex( - T input, ///< [in] The value to broadcast - int src_lane, ///< [in] Which warp lane is to do the broadcasting - unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +template +__device__ __forceinline__ T ShuffleIndex(T input, int src_lane, unsigned int member_mask) { /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up enum { diff --git a/cub/cub/util_temporary_storage.cuh b/cub/cub/util_temporary_storage.cuh index 17548600a47..c10830c3457 100644 --- a/cub/cub/util_temporary_storage.cuh +++ b/cub/cub/util_temporary_storage.cuh @@ -48,22 +48,36 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \addtogroup UtilMgmt + * @addtogroup UtilMgmt * @{ */ #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** - * \brief Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). + * @brief Alias temporaries to externally-allocated device storage (or simply return the amount of + * storage needed). + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When NULL, the required allocation size is written to @p temp_storage_bytes and no work is + * done. + * + * @param[in,out] temp_storage_bytes + * Size in bytes of @p d_temp_storage allocation + * + * @param[in,out] allocations + * Pointers to device allocations needed + * + * @param[in] allocation_sizes + * Sizes in bytes of device allocations needed */ template -__host__ __device__ __forceinline__ -cudaError_t AliasTemporaries( - void *d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation - void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed - size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed +__host__ __device__ __forceinline__ cudaError_t +AliasTemporaries(void *d_temp_storage, + size_t &temp_storage_bytes, + void *(&allocations)[ALLOCATIONS], + size_t (&allocation_sizes)[ALLOCATIONS]) { constexpr int ALIGN_BYTES = 256; constexpr int ALIGN_MASK = ~(ALIGN_BYTES - 1); diff --git a/cub/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/cub/warp/specializations/warp_reduce_shfl.cuh index dd0e82bf0b2..965e0df9035 100644 --- a/cub/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/cub/warp/specializations/warp_reduce_shfl.cuh @@ -27,7 +27,7 @@ ******************************************************************************/ /** - * \file + * @file * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. */ @@ -82,16 +82,20 @@ struct reduce_max_exists : ::cu } - /** - * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. + * @brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned + * across a CUDA thread warp. 
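The AliasTemporaries helper in the util_temporary_storage.cuh hunk above is the workhorse behind CUB's usual two-phase temporary-storage convention; a minimal sketch of that convention, with hypothetical allocation sizes and names, looks like this:

    #include <cub/util_temporary_storage.cuh>

    // Hypothetical dispatch routine that needs two scratch arrays
    cudaError_t DispatchExample(void *d_temp_storage, size_t &temp_storage_bytes, int num_items)
    {
        void  *allocations[2]      = {nullptr, nullptr};
        size_t allocation_sizes[2] = {num_items * sizeof(int),    // e.g. per-tile aggregates
                                      num_items * sizeof(char)};  // e.g. per-item flags

        // Pass 1 (d_temp_storage == NULL): only writes the required temp_storage_bytes.
        // Pass 2: carves the caller's buffer into the two 256B-aligned allocations.
        cudaError_t error =
            cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);

        if (error != cudaSuccess || d_temp_storage == NULL)
        {
            return error;
        }

        int  *d_aggregates = static_cast<int *>(allocations[0]);
        char *d_flags      = static_cast<char *>(allocations[1]);
        // ... launch kernels that use d_aggregates / d_flags ...
        (void)d_aggregates;
        (void)d_flags;

        return cudaSuccess;
    }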
+ * + * @tparam T + * Data type being reduced * - * LOGICAL_WARP_THREADS must be a power-of-two + * @tparam LOGICAL_WARP_THREADS + * Number of threads per logical warp (must be a power-of-two) + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, ///< Data type being reduced - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct WarpReduceShfl { static_assert(PowerOfTwo::VALUE, @@ -167,12 +171,23 @@ struct WarpReduceShfl // Reduction steps //--------------------------------------------------------------------- - /// Reduction (specialized for summation across uint32 types) - __device__ __forceinline__ unsigned int ReduceStep( - unsigned int input, ///< [in] Calling thread's input item. - cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Reduction (specialized for summation across uint32 types) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ unsigned int + ReduceStep(unsigned int input, cub::Sum /*reduction_op*/, int last_lane, int offset) { unsigned int output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) @@ -191,13 +206,23 @@ struct WarpReduceShfl return output; } - - /// Reduction (specialized for summation across fp32 types) - __device__ __forceinline__ float ReduceStep( - float input, ///< [in] Calling thread's input item. - cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Reduction (specialized for summation across fp32 types) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ float + ReduceStep(float input, cub::Sum /*reduction_op*/, int last_lane, int offset) { float output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) @@ -216,13 +241,23 @@ struct WarpReduceShfl return output; } - - /// Reduction (specialized for summation across unsigned long long types) - __device__ __forceinline__ unsigned long long ReduceStep( - unsigned long long input, ///< [in] Calling thread's input item. 
- cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Reduction (specialized for summation across unsigned long long types) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ unsigned long long + ReduceStep(unsigned long long input, cub::Sum /*reduction_op*/, int last_lane, int offset) { unsigned long long output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) @@ -243,13 +278,23 @@ struct WarpReduceShfl return output; } - - /// Reduction (specialized for summation across long long types) - __device__ __forceinline__ long long ReduceStep( - long long input, ///< [in] Calling thread's input item. - cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Reduction (specialized for summation across long long types) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ long long + ReduceStep(long long input, cub::Sum /*reduction_op*/, int last_lane, int offset) { long long output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) @@ -271,13 +316,23 @@ struct WarpReduceShfl return output; } - - /// Reduction (specialized for summation across double types) - __device__ __forceinline__ double ReduceStep( - double input, ///< [in] Calling thread's input item. - cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Reduction (specialized for summation across double types) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ double + ReduceStep(double input, cub::Sum /*reduction_op*/, int last_lane, int offset) { double output; int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) @@ -301,14 +356,28 @@ struct WarpReduceShfl return output; } - - /// Reduction (specialized for swizzled ReduceByKeyOp across KeyValuePair types) + /** + * @brief Reduction (specialized for swizzled ReduceByKeyOp across + * KeyValuePair types) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ template - __device__ __forceinline__ KeyValuePair ReduceStep( - KeyValuePair input, ///< [in] Calling thread's input item. 
- SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + __device__ __forceinline__ KeyValuePair + ReduceStep(KeyValuePair input, + SwizzleScanOp> /*reduction_op*/, + int last_lane, + int offset) { KeyValuePair output; @@ -328,15 +397,28 @@ struct WarpReduceShfl return output; } - - - /// Reduction (specialized for swizzled ReduceBySegmentOp across KeyValuePair types) + /** + * @brief Reduction (specialized for swizzled ReduceBySegmentOp across + * KeyValuePair types) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ template - __device__ __forceinline__ KeyValuePair ReduceStep( - KeyValuePair input, ///< [in] Calling thread's input item. - SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + __device__ __forceinline__ KeyValuePair + ReduceStep(KeyValuePair input, + SwizzleScanOp> /*reduction_op*/, + int last_lane, + int offset) { KeyValuePair output; @@ -349,14 +431,24 @@ struct WarpReduceShfl return output; } - - /// Reduction step (generic) + /** + * @brief Reduction step (generic) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ template - __device__ __forceinline__ _T ReduceStep( - _T input, ///< [in] Calling thread's input item. - ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from + __device__ __forceinline__ _T + ReduceStep(_T input, ReductionOp reduction_op, int last_lane, int offset) { _T output = input; @@ -369,28 +461,59 @@ struct WarpReduceShfl return output; } - - /// Reduction step (specialized for small unsigned integers size 32b or less) + /** + * @brief Reduction step (specialized for small unsigned integers size 32b or less) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + * + * @param[in] is_small_unsigned + * Marker type indicating whether T is a small unsigned integer + */ template - __device__ __forceinline__ _T ReduceStep( - _T input, ///< [in] Calling thread's input item. - ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer + __device__ __forceinline__ _T ReduceStep(_T input, + ReductionOp reduction_op, + int last_lane, + int offset, + Int2Type /*is_small_unsigned*/) { return ReduceStep(input, reduction_op, last_lane, offset); } - - /// Reduction step (specialized for types other than small unsigned integers size 32b or less) + /** + * @brief Reduction step (specialized for types other than small unsigned integers size + * 32b or less) + * + * @param[in] input + * Calling thread's input item. 
+ * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + * + * @param[in] offset + * Up-offset to pull from + * + * @param[in] is_small_unsigned + * Marker type indicating whether T is a small unsigned integer + */ template - __device__ __forceinline__ _T ReduceStep( - _T input, ///< [in] Calling thread's input item. - ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer + __device__ __forceinline__ _T ReduceStep(_T input, + ReductionOp reduction_op, + int last_lane, + int offset, + Int2Type /*is_small_unsigned*/) { return ReduceStep(input, reduction_op, last_lane, offset); } @@ -400,36 +523,62 @@ struct WarpReduceShfl // Templated reduction iteration //--------------------------------------------------------------------- + /** + * @param[in] input + * Calling thread's input item. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + */ template - __device__ __forceinline__ void ReduceStep( - T& input, ///< [in] Calling thread's input item. - ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - Int2Type /*step*/) + __device__ __forceinline__ void + ReduceStep(T &input, ReductionOp reduction_op, int last_lane, Int2Type /*step*/) { input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); ReduceStep(input, reduction_op, last_lane, Int2Type()); } + /** + * @param[in] input + * Calling thread's input item. + * + * @param[in] reduction_op + * Binary reduction operator + * + * @param[in] last_lane + * Index of last lane in segment + */ template - __device__ __forceinline__ void ReduceStep( - T& /*input*/, ///< [in] Calling thread's input item. 
- ReductionOp /*reduction_op*/, ///< [in] Binary reduction operator - int /*last_lane*/, ///< [in] Index of last lane in segment - Int2Type /*step*/) + __device__ __forceinline__ void ReduceStep(T & /*input*/, + ReductionOp /*reduction_op*/, + int /*last_lane*/, + Int2Type /*step*/) {} //--------------------------------------------------------------------- // Reduction operations //--------------------------------------------------------------------- + + /** + * @param[in] input + * Calling thread's input + * + * @param[in] valid_items + * Total number of valid items across the logical warp + * + * @param[in] reduction_op + * Binary reduction operator + */ template - __device__ __forceinline__ T ReduceImpl( - Int2Type<0> /* all_lanes_valid */, - T input, ///< [in] Calling thread's input - int valid_items, ///< [in] Total number of valid items across the logical warp - ReductionOp reduction_op) ///< [in] Binary reduction operator + __device__ __forceinline__ T ReduceImpl(Int2Type<0> /* all_lanes_valid */, + T input, + int valid_items, + ReductionOp reduction_op) { int last_lane = valid_items - 1; @@ -441,12 +590,21 @@ struct WarpReduceShfl return output; } + /** + * @param[in] input + * Calling thread's input + * + * @param[in] valid_items + * Total number of valid items across the logical warp + * + * @param[in] reduction_op + * Binary reduction operator + */ template - __device__ __forceinline__ T ReduceImpl( - Int2Type<1> /* all_lanes_valid */, - T input, ///< [in] Calling thread's input - int /* valid_items */, ///< [in] Total number of valid items across the logical warp - ReductionOp reduction_op) ///< [in] Binary reduction operator + __device__ __forceinline__ T ReduceImpl(Int2Type<1> /* all_lanes_valid */, + T input, + int /* valid_items */, + ReductionOp reduction_op) { int last_lane = LOGICAL_WARP_THREADS - 1; @@ -524,29 +682,45 @@ struct WarpReduceShfl return output; } - /// Reduction - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - int valid_items, ///< [in] Total number of valid items across the logical warp - ReductionOp reduction_op) ///< [in] Binary reduction operator + /** + * @brief Reduction + * + * @tparam ALL_LANES_VALID + * Whether all lanes in each warp are contributing a valid fold of items + * + * @param[in] input + * Calling thread's input + * + * @param[in] valid_items + * Total number of valid items across the logical warp + * + * @param[in] reduction_op + * Binary reduction operator + */ + template + __device__ __forceinline__ T Reduce(T input, int valid_items, ReductionOp reduction_op) { return ReduceImpl( Int2Type{}, input, valid_items, reduction_op); } - - /// Segmented reduction - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op) ///< [in] Binary reduction operator + /** + * @brief Segmented reduction + * + * @tparam HEAD_SEGMENTED + * Whether flags indicate a segment-head or a segment-tail + * + * @param[in] input + * Calling thread's input + * + * @param[in] flag + * Whether or not the current lane is a segment head/tail + * + * @param[in] reduction_op + * Binary reduction operator + */ + template + 
__device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op) { // Get the start flags for each thread in the warp. int warp_flags = WARP_BALLOT(flag, member_mask); diff --git a/cub/cub/warp/specializations/warp_reduce_smem.cuh b/cub/cub/warp/specializations/warp_reduce_smem.cuh index a2077940c7b..242f35d64da 100644 --- a/cub/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/cub/warp/specializations/warp_reduce_smem.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. + * @file + * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned + * across a CUDA thread warp. */ #pragma once @@ -49,12 +50,19 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. + * @brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned + * across a CUDA thread warp. + * + * @tparam T + * Data type being reduced + * + * @tparam LOGICAL_WARP_THREADS + * Number of threads per logical warp + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, ///< Data type being reduced - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct WarpReduceSmem { /****************************************************************************** @@ -128,17 +136,23 @@ struct WarpReduceSmem //--------------------------------------------------------------------- /** - * Reduction step + * @brief Reduction step + * + * @tparam ALL_LANES_VALID + * Whether all lanes in each warp are contributing a valid fold of items + * + * @param[in] input + * Calling thread's input + * + * @param[in] valid_items + * Total number of valid items across the logical warp + * + * @param[in] reduction_op + * Reduction operator */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - typename ReductionOp, - int STEP> - __device__ __forceinline__ T ReduceStep( - T input, ///< [in] Calling thread's input - int valid_items, ///< [in] Total number of valid items across the logical warp - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type /*step*/) + template + __device__ __forceinline__ T + ReduceStep(T input, int valid_items, ReductionOp reduction_op, Int2Type /*step*/) { constexpr int OFFSET = 1 << STEP; @@ -159,18 +173,24 @@ struct WarpReduceSmem return ReduceStep(input, valid_items, reduction_op, Int2Type()); } - /** - * Reduction step (terminate) + * @brief Reduction step (terminate) + * + * @tparam ALL_LANES_VALID + * Whether all lanes in each warp are contributing a valid fold of items + * + * @param[in] input + * Calling thread's input + * + * @param[in] valid_items + * Total number of valid items across the logical warp + * + * @param[in] reduction_op + * Reduction operator */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - typename ReductionOp> - __device__ __forceinline__ T ReduceStep( - T input, ///< [in] Calling thread's input - int valid_items, ///< [in] Total number of valid items across the 
logical warp - ReductionOp /*reduction_op*/, ///< [in] Reduction operator - Int2Type /*step*/) + template + __device__ __forceinline__ T + ReduceStep(T input, int valid_items, ReductionOp /*reduction_op*/, Int2Type /*step*/) { return input; } @@ -180,19 +200,27 @@ struct WarpReduceSmem // Segmented reduction //--------------------------------------------------------------------- - /** - * Ballot-based segmented reduce + * @brief Ballot-based segmented reduce + * + * @tparam HEAD_SEGMENTED + * Whether flags indicate a segment-head or a segment-tail + * + * @param[in] input + * Calling thread's input + * + * @param[in] flag + * Whether or not the current lane is a segment head/tail + * + * @param[in] reduction_op + * Reduction operator + * + * @param[in] has_ballot + * Marker type for whether the target arch has ballot functionality */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + template + __device__ __forceinline__ T + SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type /*has_ballot*/) { // Get the start flags for each thread in the warp. int warp_flags = WARP_BALLOT(flag, member_mask); @@ -239,19 +267,27 @@ struct WarpReduceSmem return input; } - /** - * Smem-based segmented reduce + * @brief Smem-based segmented reduce + * + * @tparam HEAD_SEGMENTED + * Whether flags indicate a segment-head or a segment-tail + * + * @param[in] input + * Calling thread's input + * + * @param[in] flag + * Whether or not the current lane is a segment head/tail + * + * @param[in] reduction_op + * Reduction operator + * + * @param[in] has_ballot + * Marker type for whether the target arch has ballot functionality */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + template + __device__ __forceinline__ T + SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op, Int2Type /*has_ballot*/) { enum { @@ -331,31 +367,43 @@ struct WarpReduceSmem ******************************************************************************/ /** - * Reduction + * @brief Reduction + * + * @tparam ALL_LANES_VALID + * Whether all lanes in each warp are contributing a valid fold of items + * + * @param[in] input + * Calling thread's input + * + * @param[in] valid_items + * Total number of valid items across the logical warp + * + * @param[in] reduction_op + * Reduction operator */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - int valid_items, ///< [in] Total number of valid items across the logical warp - ReductionOp reduction_op) ///< [in] Reduction operator + template + __device__ __forceinline__ T Reduce(T 
input, int valid_items, ReductionOp reduction_op) { return ReduceStep(input, valid_items, reduction_op, Int2Type<0>()); } - /** - * Segmented reduction + * @brief Segmented reduction + * + * @tparam HEAD_SEGMENTED + * Whether flags indicate a segment-head or a segment-tail + * + * @param[in] input + * Calling thread's input + * + * @param[in] flag + * Whether or not the current lane is a segment head/tail + * + * @param[in] reduction_op + * Reduction operator */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op) ///< [in] Reduction operator + template + __device__ __forceinline__ T SegmentedReduce(T input, FlagT flag, ReductionOp reduction_op) { return SegmentedReduce(input, flag, reduction_op, Int2Type()); } diff --git a/cub/cub/warp/specializations/warp_scan_shfl.cuh b/cub/cub/warp/specializations/warp_scan_shfl.cuh index 81db566c184..85550412d9f 100644 --- a/cub/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/cub/warp/specializations/warp_scan_shfl.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + * @file + * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned + * across a CUDA thread warp. */ #pragma once @@ -48,14 +49,19 @@ _CCCL_IMPLICIT_SYSTEM_HEADER CUB_NAMESPACE_BEGIN /** - * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + * @brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned + * across a CUDA thread warp. * - * LOGICAL_WARP_THREADS must be a power-of-two + * @tparam T + * Data type being scanned + * + * @tparam LOGICAL_WARP_THREADS + * Number of threads per logical warp (must be a power-of-two) + * + * @tparam LEGACY_PTX_ARCH + * The PTX compute capability for which to to specialize this collective */ -template < - typename T, ///< Data type being scanned - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective +template struct WarpScanShfl { //--------------------------------------------------------------------- @@ -122,12 +128,23 @@ struct WarpScanShfl // Inclusive scan steps //--------------------------------------------------------------------- - /// Inclusive prefix scan step (specialized for summation across int32 types) - __device__ __forceinline__ int InclusiveScanStep( - int input, ///< [in] Calling thread's input item. - cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Inclusive prefix scan step (specialized for summation across int32 types) + * + * @param[in] input + * Calling thread's input item. 
+ * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ int + InclusiveScanStep(int input, cub::Sum /*scan_op*/, int first_lane, int offset) { int output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) @@ -146,12 +163,23 @@ struct WarpScanShfl return output; } - /// Inclusive prefix scan step (specialized for summation across uint32 types) - __device__ __forceinline__ unsigned int InclusiveScanStep( - unsigned int input, ///< [in] Calling thread's input item. - cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Inclusive prefix scan step (specialized for summation across uint32 types) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ unsigned int + InclusiveScanStep(unsigned int input, cub::Sum /*scan_op*/, int first_lane, int offset) { unsigned int output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) @@ -170,13 +198,23 @@ struct WarpScanShfl return output; } - - /// Inclusive prefix scan step (specialized for summation across fp32 types) - __device__ __forceinline__ float InclusiveScanStep( - float input, ///< [in] Calling thread's input item. - cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Inclusive prefix scan step (specialized for summation across fp32 types) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ float + InclusiveScanStep(float input, cub::Sum /*scan_op*/, int first_lane, int offset) { float output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) @@ -195,13 +233,23 @@ struct WarpScanShfl return output; } - - /// Inclusive prefix scan step (specialized for summation across unsigned long long types) - __device__ __forceinline__ unsigned long long InclusiveScanStep( - unsigned long long input, ///< [in] Calling thread's input item. 
- cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Inclusive prefix scan step (specialized for summation across unsigned long long types) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ unsigned long long + InclusiveScanStep(unsigned long long input, cub::Sum /*scan_op*/, int first_lane, int offset) { unsigned long long output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) @@ -225,13 +273,23 @@ struct WarpScanShfl return output; } - - /// Inclusive prefix scan step (specialized for summation across long long types) - __device__ __forceinline__ long long InclusiveScanStep( - long long input, ///< [in] Calling thread's input item. - cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Inclusive prefix scan step (specialized for summation across long long types) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ long long + InclusiveScanStep(long long input, cub::Sum /*scan_op*/, int first_lane, int offset) { long long output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) @@ -255,13 +313,23 @@ struct WarpScanShfl return output; } - - /// Inclusive prefix scan step (specialized for summation across fp64 types) - __device__ __forceinline__ double InclusiveScanStep( - double input, ///< [in] Calling thread's input item. - cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from + /** + * @brief Inclusive prefix scan step (specialized for summation across fp64 types) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ + __device__ __forceinline__ double + InclusiveScanStep(double input, cub::Sum /*scan_op*/, int first_lane, int offset) { double output; int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) @@ -307,13 +375,24 @@ struct WarpScanShfl } */ - /// Inclusive prefix scan step (generic) + /** + * @brief Inclusive prefix scan step (generic) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + */ template - __device__ __forceinline__ _T InclusiveScanStep( - _T input, ///< [in] Calling thread's input item. 
- ScanOpT scan_op, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from + __device__ __forceinline__ _T + InclusiveScanStep(_T input, ScanOpT scan_op, int first_lane, int offset) { _T temp = ShuffleUp(input, offset, first_lane, member_mask); @@ -325,28 +404,59 @@ struct WarpScanShfl return output; } - - /// Inclusive prefix scan step (specialized for small integers size 32b or less) + /** + * @brief Inclusive prefix scan step (specialized for small integers size 32b or less) + * + * @param[in] input + * Calling thread's input item + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + * + * @param[in] is_small_unsigned + * Marker type indicating whether T is a small integer + */ template - __device__ __forceinline__ _T InclusiveScanStep( - _T input, ///< [in] Calling thread's input item. - ScanOpT scan_op, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer + __device__ __forceinline__ _T InclusiveScanStep(_T input, + ScanOpT scan_op, + int first_lane, + int offset, + Int2Type /*is_small_unsigned*/) { return InclusiveScanStep(input, scan_op, first_lane, offset); } - - /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less) + /** + * @brief Inclusive prefix scan step (specialized for types other than small integers size + * 32b or less) + * + * @param[in] input + * Calling thread's input item. + * + * @param[in] scan_op + * Binary scan operator + * + * @param[in] first_lane + * Index of first lane in segment + * + * @param[in] offset + * Up-offset to pull from + * + * @param[in] is_small_unsigned + * Marker type indicating whether T is a small integer + */ template - __device__ __forceinline__ _T InclusiveScanStep( - _T input, ///< [in] Calling thread's input item. - ScanOpT scan_op, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer + __device__ __forceinline__ _T InclusiveScanStep(_T input, + ScanOpT scan_op, + int first_lane, + int offset, + Int2Type /*is_small_unsigned*/) { return InclusiveScanStep(input, scan_op, first_lane, offset); } @@ -360,10 +470,16 @@ struct WarpScanShfl // Broadcast //--------------------------------------------------------------------- - /// Broadcast - __device__ __forceinline__ T Broadcast( - T input, ///< [in] The value to broadcast - int src_lane) ///< [in] Which warp lane is to do the broadcasting + /** + * @brief Broadcast + * + * @param[in] input + * The value to broadcast + * + * @param[in] src_lane + * Which warp lane is to do the broadcasting + */ + __device__ __forceinline__ T Broadcast(T input, int src_lane) { return ShuffleIndex(input, src_lane, member_mask); } @@ -373,12 +489,20 @@ struct WarpScanShfl // Inclusive operations //--------------------------------------------------------------------- - /// Inclusive scan + /** + * @brief Inclusive scan + * + * @param[in] input + * Calling thread's input item + * + * @param[out] inclusive_output + * Calling thread's output item. 
May be aliased with @p input + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void InclusiveScan( - _T input, ///< [in] Calling thread's input item. - _T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOpT scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void InclusiveScan(_T input, _T &inclusive_output, ScanOpT scan_op) { inclusive_output = input; @@ -399,12 +523,22 @@ struct WarpScanShfl } - /// Inclusive scan, specialized for reduce-value-by-key + /** + * @brief Inclusive scan, specialized for reduce-value-by-key + * + * @param[in] input + * Calling thread's input item + * + * @param[out] inclusive_output + * Calling thread's output item. May be aliased with @p input + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void InclusiveScan( - KeyValuePair input, ///< [in] Calling thread's input item. - KeyValuePair &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ReduceByKeyOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void InclusiveScan(KeyValuePair input, + KeyValuePair &inclusive_output, + ReduceByKeyOp scan_op) { inclusive_output = input; @@ -431,14 +565,24 @@ struct WarpScanShfl } } - - /// Inclusive scan with aggregate + /** + * @brief Inclusive scan with aggregate + * + * @param[in] input + * Calling thread's input item + * + * @param[out] inclusive_output + * Calling thread's output item. May be aliased with @p input + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] warp_aggregate + * Warp-wide aggregate reduction of input items + */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOpT scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
+ __device__ __forceinline__ void + InclusiveScan(T input, T &inclusive_output, ScanOpT scan_op, T &warp_aggregate) { InclusiveScan(input, inclusive_output, scan_op); @@ -451,20 +595,31 @@ struct WarpScanShfl // Get exclusive from inclusive //--------------------------------------------------------------------- - /// Update inclusive and exclusive using input and inclusive + /** + * @brief Update inclusive and exclusive using input and inclusive + * + * @param[in] input + * + * @param[out] inclusive + * + * @param[out] exclusive + * + * @param[in] scan_op + * + * @param[in] is_integer + */ template - __device__ __forceinline__ void Update( - T /*input*/, ///< [in] - T &inclusive, ///< [in, out] - T &exclusive, ///< [out] - ScanOpT /*scan_op*/, ///< [in] - IsIntegerT /*is_integer*/) ///< [in] + __device__ __forceinline__ void + Update(T /*input*/, T &inclusive, T &exclusive, ScanOpT /*scan_op*/, IsIntegerT /*is_integer*/) { // initial value unknown exclusive = ShuffleUp(inclusive, 1, 0, member_mask); } - /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) + /** + * @brief Update inclusive and exclusive using input and inclusive (specialized for summation of + * integer types) + */ __device__ __forceinline__ void Update( T input, T &inclusive, @@ -476,7 +631,10 @@ struct WarpScanShfl exclusive = inclusive - input; } - /// Update inclusive and exclusive using initial value using input, inclusive, and initial value + /** + * @brief Update inclusive and exclusive using initial value using input, inclusive, and initial + * value + */ template __device__ __forceinline__ void Update ( T /*input*/, @@ -493,7 +651,10 @@ struct WarpScanShfl exclusive = initial_value; } - /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) + /** + * @brief Update inclusive and exclusive using initial value using input and inclusive + * (specialized for summation of integer types) + */ __device__ __forceinline__ void Update ( T input, T &inclusive, @@ -506,8 +667,9 @@ struct WarpScanShfl exclusive = inclusive - input; } - - /// Update inclusive, exclusive, and warp aggregate using input and inclusive + /** + * @brief Update inclusive, exclusive, and warp aggregate using input and inclusive + */ template __device__ __forceinline__ void Update ( T input, @@ -521,7 +683,10 @@ struct WarpScanShfl Update(input, inclusive, exclusive, scan_op, is_integer); } - /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value + /** + * @brief Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial + * value + */ template __device__ __forceinline__ void Update ( T input, diff --git a/cub/cub/warp/specializations/warp_scan_smem.cuh b/cub/cub/warp/specializations/warp_scan_smem.cuh index 8f76b3c6253..fb90fe06992 100644 --- a/cub/cub/warp/specializations/warp_scan_smem.cuh +++ b/cub/cub/warp/specializations/warp_scan_smem.cuh @@ -27,8 +27,9 @@ ******************************************************************************/ /** - * \file - * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + * @file + * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned + * across a CUDA thread warp. 
 */
 
 #pragma once
 
@@ -49,12 +50,19 @@ _CCCL_IMPLICIT_SYSTEM_HEADER
 CUB_NAMESPACE_BEGIN
 
 /**
- * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ * @brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned
+ * across a CUDA thread warp.
+ *
+ * @tparam T
+ * Data type being scanned
+ *
+ * @tparam LOGICAL_WARP_THREADS
+ * Number of threads per logical warp
+ *
+ * @tparam LEGACY_PTX_ARCH
+ * The PTX compute capability for which to specialize this collective
 */
-template <
- typename T, ///< Data type being scanned
- int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp
- int LEGACY_PTX_ARCH = 0> ///< The PTX compute capability for which to to specialize this collective
+template <typename T, int LOGICAL_WARP_THREADS, int LEGACY_PTX_ARCH = 0>
 struct WarpScanSmem
 {
 /******************************************************************************
@@ -157,13 +165,23 @@ struct WarpScanSmem
 Int2Type /*step*/)
 {}
 
-
- /// Inclusive prefix scan (specialized for summation across primitive types)
- __device__ __forceinline__ void InclusiveScan(
- T input, ///< [in] Calling thread's input item.
- T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
- Sum scan_op, ///< [in] Binary scan operator
- Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type
+ /**
+ * @brief Inclusive prefix scan (specialized for summation across primitive types)
+ *
+ * @param[in] input
+ * Calling thread's input item
+ *
+ * @param[out] output
+ * Calling thread's output item. May be aliased with @p input
+ *
+ * @param[in] scan_op
+ * Binary scan operator
+ *
+ * @param[in] is_primitive
+ * Marker type indicating whether T is primitive type
+ */
+ __device__ __forceinline__ void
+ InclusiveScan(T input, T &output, Sum scan_op, Int2Type /*is_primitive*/)
 {
 T identity = 0;
 ThreadStore(&temp_storage[lane_id], (CellT) identity);
@@ -175,14 +193,24 @@ struct WarpScanSmem
 ScanStep(output, scan_op, Int2Type<0>());
 }
 
-
- /// Inclusive prefix scan
+ /**
+ * @brief Inclusive prefix scan
+ *
+ * @param[in] input
+ * Calling thread's input item
+ *
+ * @param[out] output
+ * Calling thread's output item. May be aliased with @p input
+ *
+ * @param[in] scan_op
+ * Binary scan operator
+ *
+ * @param[in] is_primitive
+ * Marker type indicating whether T is primitive type
+ */
 template
- __device__ __forceinline__ void InclusiveScan(
- T input, ///< [in] Calling thread's input item.
- T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
- ScanOp scan_op, ///< [in] Binary scan operator - Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type + __device__ __forceinline__ void + InclusiveScan(T input, T &output, ScanOp scan_op, Int2Type /*is_primitive*/) { // Iterate scan steps output = input; @@ -198,10 +226,16 @@ struct WarpScanSmem // Broadcast //--------------------------------------------------------------------- - /// Broadcast - __device__ __forceinline__ T Broadcast( - T input, ///< [in] The value to broadcast - unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting + /** + * @brief Broadcast + * + * @param[in] input + * The value to broadcast + * + * @param[in] src_lane + * Which warp lane is to do the broadcasting + */ + __device__ __forceinline__ T Broadcast(T input, unsigned int src_lane) { if (lane_id == src_lane) { @@ -218,24 +252,42 @@ struct WarpScanSmem // Inclusive operations //--------------------------------------------------------------------- - /// Inclusive scan + /** + * @brief Inclusive scan + * + * @param[in] input + * Calling thread's input item. + * + * @param[out] inclusive_output + * Calling thread's output item. May be aliased with @p input + * + * @param[in] scan_op + * Binary scan operator + */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op) ///< [in] Binary scan operator + __device__ __forceinline__ void InclusiveScan(T input, T &inclusive_output, ScanOp scan_op) { InclusiveScan(input, inclusive_output, scan_op, Int2Type::PRIMITIVE>()); } - - /// Inclusive scan with aggregate + /** + * @brief Inclusive scan with aggregate + * + * @param[in] input + * Calling thread's input item + * + * @param[out] inclusive_output + * Calling thread's output item. May be aliased with @p input + * + * @param[in] scan_op + * Binary scan operator + * + * @param[out] warp_aggregate + * Warp-wide aggregate reduction of input items. + */ template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
+ __device__ __forceinline__ void + InclusiveScan(T input, T &inclusive_output, ScanOp scan_op, T &warp_aggregate) { InclusiveScan(input, inclusive_output, scan_op); @@ -254,14 +306,22 @@ struct WarpScanSmem // Get exclusive from inclusive //--------------------------------------------------------------------- - /// Update inclusive and exclusive using input and inclusive + /** + * @brief Update inclusive and exclusive using input and inclusive + * + * @param[in] input + * + * @param[in, out] inclusive + * + * @param[out] exclusive + * + * @param[in] scan_op + * + * @param[in] is_integer + */ template - __device__ __forceinline__ void Update( - T /*input*/, ///< [in] - T &inclusive, ///< [in, out] - T &exclusive, ///< [out] - ScanOpT /*scan_op*/, ///< [in] - IsIntegerT /*is_integer*/) ///< [in] + __device__ __forceinline__ void + Update(T /*input*/, T &inclusive, T &exclusive, ScanOpT /*scan_op*/, IsIntegerT /*is_integer*/) { // initial value unknown ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); @@ -271,7 +331,10 @@ struct WarpScanSmem exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); } - /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) + /** + * @brief Update inclusive and exclusive using input and inclusive (specialized for summation of + * integer types) + */ __device__ __forceinline__ void Update( T input, T &inclusive, @@ -283,7 +346,10 @@ struct WarpScanSmem exclusive = inclusive - input; } - /// Update inclusive and exclusive using initial value using input, inclusive, and initial value + /** + * @brief Update inclusive and exclusive using initial value using input, inclusive, and initial + * value + */ template __device__ __forceinline__ void Update ( T /*input*/, @@ -303,7 +369,10 @@ struct WarpScanSmem exclusive = initial_value; } - /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) + /** + * @brief Update inclusive and exclusive using initial value using input and inclusive + * (specialized for summation of integer types) + */ __device__ __forceinline__ void Update ( T input, T &inclusive, @@ -316,8 +385,9 @@ struct WarpScanSmem exclusive = inclusive - input; } - - /// Update inclusive, exclusive, and warp aggregate using input and inclusive + /** + * @brief Update inclusive, exclusive, and warp aggregate using input and inclusive + */ template __device__ __forceinline__ void Update ( T /*input*/, @@ -336,7 +406,10 @@ struct WarpScanSmem warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); } - /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types) + /** + * @brief Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized + * for summation of integer types) + */ __device__ __forceinline__ void Update ( T input, T &inclusive, @@ -354,7 +427,10 @@ struct WarpScanSmem exclusive = inclusive - input; } - /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value + /** + * @brief Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial + * value + */ template __device__ __forceinline__ void Update ( T /*input*/,
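For orientation while reviewing the comment changes above: the InclusiveScanStep overloads in cub::WarpScanShfl each document one Kogge-Stone step of a shuffle-based warp scan. The sketch below shows that pattern written with plain __shfl_up_sync; it is illustrative only and not part of this patch, the kernel and buffer names (warp_inclusive_sum, d_in, d_out) are made up, and integer addition stands in for a generic scan_op.

#include <cstdio>
#include <cuda_runtime.h>

// Inclusive prefix sum across one 32-thread warp (Kogge-Stone over shuffles).
__global__ void warp_inclusive_sum(const int *in, int *out)
{
    const unsigned full_mask = 0xffffffffu;   // all 32 lanes participate
    int lane  = threadIdx.x & 31;
    int value = in[threadIdx.x];

    // log2(32) = 5 steps; each step pulls the partial from 'offset' lanes below.
    for (int offset = 1; offset < 32; offset <<= 1)
    {
        int up = __shfl_up_sync(full_mask, value, offset);
        if (lane >= offset)   // lanes below 'offset' have no source lane
        {
            value += up;      // a generic scan_op would replace '+' here
        }
    }
    out[threadIdx.x] = value; // lane i now holds in[0] + ... + in[i]
}

int main()
{
    int h_in[32], h_out[32];
    for (int i = 0; i < 32; ++i) { h_in[i] = 1; }

    int *d_in = nullptr, *d_out = nullptr;
    cudaMalloc(&d_in,  sizeof(h_in));
    cudaMalloc(&d_out, sizeof(h_out));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

    warp_inclusive_sum<<<1, 32>>>(d_in, d_out);
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);

    printf("lane 31 prefix sum = %d (expected 32)\n", h_out[31]);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}

The per-type InclusiveScanStep specializations documented above reach the same result without the explicit lane guard: the shfl_c control word (first_lane | SHFL_C) lets the shuffle itself report whether a source lane exists within the segment, and that predicate guards the add in the inline PTX instead of the branch used in this sketch.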