Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Refactor inline comments
Browse files Browse the repository at this point in the history
gevtushenko committed Oct 21, 2023
1 parent e6375bc commit 97dcbe1
Showing 57 changed files with 9,832 additions and 5,067 deletions.
320 changes: 230 additions & 90 deletions cub/cub/agent/agent_histogram.cuh

Large diffs are not rendered by default.

104 changes: 75 additions & 29 deletions cub/cub/agent/agent_radix_sort_downsweep.cuh
Original file line number Diff line number Diff line change
@@ -63,30 +63,61 @@ CUB_NAMESPACE_BEGIN
******************************************************************************/

/**
* Parameterizable tuning policy type for AgentRadixSortDownsweep
* @brief Parameterizable tuning policy type for AgentRadixSortDownsweep
*
* @tparam NOMINAL_BLOCK_THREADS_4B
* Threads per thread block
*
* @tparam NOMINAL_ITEMS_PER_THREAD_4B
* Items per thread (per tile of input)
*
* @tparam ComputeT
* Dominant compute type
*
* @tparam _LOAD_ALGORITHM
* The BlockLoad algorithm to use
*
* @tparam _LOAD_MODIFIER
* Cache load modifier for reading keys (and values)
*
* @tparam _RANK_ALGORITHM
* The radix ranking algorithm to use
*
* @tparam _SCAN_ALGORITHM
* The block scan algorithm to use
*
* @tparam _RADIX_BITS
* The number of radix bits, i.e., log2(bins)
*/
template <
int NOMINAL_BLOCK_THREADS_4B, ///< Threads per thread block
int NOMINAL_ITEMS_PER_THREAD_4B, ///< Items per thread (per tile of input)
typename ComputeT, ///< Dominant compute type
BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values)
RadixRankAlgorithm _RANK_ALGORITHM, ///< The radix ranking algorithm to use
BlockScanAlgorithm _SCAN_ALGORITHM, ///< The block scan algorithm to use
int _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins)
typename ScalingType = RegBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT> >
struct AgentRadixSortDownsweepPolicy :
ScalingType
template <int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
BlockLoadAlgorithm _LOAD_ALGORITHM,
CacheLoadModifier _LOAD_MODIFIER,
RadixRankAlgorithm _RANK_ALGORITHM,
BlockScanAlgorithm _SCAN_ALGORITHM,
int _RADIX_BITS,
typename ScalingType =
RegBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>>
struct AgentRadixSortDownsweepPolicy : ScalingType
{
enum
{
RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins)
};
enum
{
/// The number of radix bits, i.e., log2(bins)
RADIX_BITS = _RADIX_BITS,
};

/// The BlockLoad algorithm to use
static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;

/// Cache load modifier for reading keys (and values)
static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;

static constexpr BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values)
static constexpr RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; ///< The radix ranking algorithm to use
static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use
/// The radix ranking algorithm to use
static constexpr RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM;

/// The BlockScan algorithm to use
static constexpr BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
};


@@ -99,15 +130,30 @@ struct AgentRadixSortDownsweepPolicy :


/**
* \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep .
* @brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in
* device-wide radix sort downsweep .
*
* @tparam AgentRadixSortDownsweepPolicy
* Parameterized AgentRadixSortDownsweepPolicy tuning policy type
*
* @tparam IS_DESCENDING
* Whether or not the sorted-order is high-to-low
*
* @tparam KeyT
* KeyT type
*
* @tparam ValueT
* ValueT type
*
* @tparam OffsetT
* Signed integer type for global offsets
*/
template <
typename AgentRadixSortDownsweepPolicy, ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type
bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low
typename KeyT, ///< KeyT type
typename ValueT, ///< ValueT type
typename OffsetT, ///< Signed integer type for global offsets
typename DecomposerT = detail::identity_decomposer_t>
template <typename AgentRadixSortDownsweepPolicy,
bool IS_DESCENDING,
typename KeyT,
typename ValueT,
typename OffsetT,
typename DecomposerT = detail::identity_decomposer_t>
struct AgentRadixSortDownsweep
{
//---------------------------------------------------------------------
79 changes: 53 additions & 26 deletions cub/cub/agent/agent_radix_sort_upsweep.cuh
Original file line number Diff line number Diff line change
@@ -56,39 +56,63 @@ CUB_NAMESPACE_BEGIN
******************************************************************************/

/**
* Parameterizable tuning policy type for AgentRadixSortUpsweep
* @brief Parameterizable tuning policy type for AgentRadixSortUpsweep
*
* @tparam NOMINAL_BLOCK_THREADS_4B
* Threads per thread block
*
* @tparam NOMINAL_ITEMS_PER_THREAD_4B
* Items per thread (per tile of input)
*
* @tparam ComputeT
* Dominant compute type
*
* @tparam _LOAD_MODIFIER
* Cache load modifier for reading keys
*
* @tparam _RADIX_BITS
* The number of radix bits, i.e., log2(bins)
*/
template <
int NOMINAL_BLOCK_THREADS_4B, ///< Threads per thread block
int NOMINAL_ITEMS_PER_THREAD_4B, ///< Items per thread (per tile of input)
typename ComputeT, ///< Dominant compute type
CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys
int _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins)
typename ScalingType = RegBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT> >
struct AgentRadixSortUpsweepPolicy :
ScalingType
template <int NOMINAL_BLOCK_THREADS_4B,
int NOMINAL_ITEMS_PER_THREAD_4B,
typename ComputeT,
CacheLoadModifier _LOAD_MODIFIER,
int _RADIX_BITS,
typename ScalingType =
RegBoundScaling<NOMINAL_BLOCK_THREADS_4B, NOMINAL_ITEMS_PER_THREAD_4B, ComputeT>>
struct AgentRadixSortUpsweepPolicy : ScalingType
{
enum
{
RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins)
};

static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys
enum
{
/// The number of radix bits, i.e., log2(bins)
RADIX_BITS = _RADIX_BITS,
};

/// Cache load modifier for reading keys
static constexpr CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;
};


/******************************************************************************
* Thread block abstractions
******************************************************************************/

/**
* \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep .
* @brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for
* participating in device-wide radix sort upsweep .
*
* @tparam AgentRadixSortUpsweepPolicy
* Parameterized AgentRadixSortUpsweepPolicy tuning policy type
*
* @tparam KeyT
* KeyT type
*
* @tparam DecomposerT = detail::identity_decomposer_t
* Signed integer type for global offsets
*/
template <
typename AgentRadixSortUpsweepPolicy, ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type
typename KeyT, ///< KeyT type
typename OffsetT,
typename DecomposerT = detail::identity_decomposer_t> ///< Signed integer type for global offsets
template <typename AgentRadixSortUpsweepPolicy,
typename KeyT,
typename OffsetT,
typename DecomposerT = detail::identity_decomposer_t>
struct AgentRadixSortUpsweep
{

@@ -483,11 +507,14 @@ struct AgentRadixSortUpsweep


/**
* Extract counts
* @brief Extract counts
*
* @param[out] bin_count
* The exclusive prefix sum for the digits
* [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
*/
template <int BINS_TRACKED_PER_THREAD>
__device__ __forceinline__ void ExtractCounts(
OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
__device__ __forceinline__ void ExtractCounts(OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD])
{
unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
unsigned int warp_tid = LaneId();
Loading

0 comments on commit 97dcbe1

Please sign in to comment.