Move implementation of _LIBCUDACXX_TEMPLATE to CCCL (#2832)
* Move implementation of `_LIBCUDACXX_TEMPLATE` to CCCL

We have an emulation layer for concepts in libcudacxx that was guarded behind C++14.

However, nothing about the template machinery itself requires C++14, and we want to use these macros universally throughout the codebase.

Consequently, move them to CCCL proper and enable them unconditionally. To ensure that we do not pick up any hidden dependencies, this also adds a barebones implementation of `enable_if_t` and a trailing `enable_if_t`.
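
For context, the technique behind these macros looks roughly like this — a minimal sketch with hypothetical `MY_*` names, not the actual definitions in `<cuda/std/__concepts/concept_macros.h>`, which handle many more cases:

```cpp
// Barebones enable_if, mirroring the commit's note about avoiding hidden
// dependencies on <type_traits>:
template <bool _Cond, class _Tp = void>
struct my_enable_if
{};
template <class _Tp>
struct my_enable_if<true, _Tp>
{
  using type = _Tp;
};
template <bool _Cond, class _Tp = void>
using my_enable_if_t = typename my_enable_if<_Cond, _Tp>::type;

#if defined(__cpp_concepts) && __cpp_concepts >= 201907L
// C++20: expand to a real requires-clause.
#  define MY_TEMPLATE(...) template <__VA_ARGS__>
#  define MY_REQUIRES(...) requires (__VA_ARGS__)
#else
// Pre-C++20: MY_TEMPLATE leaves the template parameter list open and
// MY_REQUIRES closes it with a defaulted SFINAE parameter.
#  define MY_TEMPLATE(...) template <__VA_ARGS__,
#  define MY_REQUIRES(...) my_enable_if_t<(__VA_ARGS__), int> = 0 >
#endif

// Both expansions accept the same spelling at the use site:
MY_TEMPLATE(class _Tp)
MY_REQUIRES(sizeof(_Tp) <= 8)
void __small_only(_Tp) {}
```

The trailing `enable_if_t` plays the analogous role for `_CCCL_TRAILING_REQUIRES`, where the constraint has to ride on the return type (see the test changes below).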
miscco authored Nov 21, 2024
1 parent d80ab66 commit a39a8a7
Showing 133 changed files with 1,875 additions and 2,077 deletions.
4 changes: 2 additions & 2 deletions .clang-format
@@ -128,8 +128,8 @@ IndentWidth: 2
 KeepEmptyLinesAtTheStartOfBlocks: false
 MaxEmptyLinesToKeep: 1
 Macros:
-- _LIBCUDACXX_TEMPLATE(...)=template<...>
-- _LIBCUDACXX_REQUIRES(...)=requires (...)
+- _CCCL_TEMPLATE(...)=template<...>
+- _CCCL_REQUIRES(...)=requires (...)
 WhitespaceSensitiveMacros:
 - _CCCL_HAS_INCLUDE
 NamespaceIndentation: None
3 changes: 1 addition & 2 deletions cub/cub/detail/type_traits.cuh
@@ -52,11 +52,10 @@ _CCCL_SUPPRESS_DEPRECATED_POP
 #if __cccl_lib_mdspan
 # include <cuda/std/mdspan>
 #endif // __cccl_lib_mdspan
+#include <cuda/std/__concepts/concept_macros.h> // IWYU pragma: keep
 #include <cuda/std/span>
 #include <cuda/std/type_traits>
 
-#define _CUB_TEMPLATE_REQUIRES(...) ::cuda::std::enable_if_t<(__VA_ARGS__)>* = nullptr
-
 CUB_NAMESPACE_BEGIN
 namespace detail
 {
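
For reference, the deleted `_CUB_TEMPLATE_REQUIRES` expanded its argument into a defaulted `enable_if_t<...>* = nullptr` template parameter, so a gated overload used to look like this (illustrative only; `__only_four_bytes` is a made-up name):

```cpp
#include <cuda/std/type_traits>

// What `template <typename _Tp, _CUB_TEMPLATE_REQUIRES(sizeof(_Tp) == 4)>`
// used to expand to:
template <typename _Tp, ::cuda::std::enable_if_t<(sizeof(_Tp) == 4)>* = nullptr>
void __only_four_bytes(_Tp) {}

// __only_four_bytes(1);   // OK where int is 4 bytes
// __only_four_bytes(1.0); // substitution fails; the overload is discarded
```

Every former use site is now spelled with `_CCCL_TEMPLATE`/`_CCCL_REQUIRES` instead, as the remaining hunks show.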
24 changes: 8 additions & 16 deletions cub/cub/device/dispatch/kernels/for_each_in_extents_kernel.cuh
@@ -55,10 +55,8 @@ namespace detail
 namespace for_each_in_extents
 {
 
-template <int Rank,
-          typename ExtendType,
-          typename FastDivModType,
-          _CUB_TEMPLATE_REQUIRES(ExtendType::static_extent(Rank) != ::cuda::std::dynamic_extent)>
+_CCCL_TEMPLATE(int Rank, typename ExtendType, typename FastDivModType)
+_CCCL_REQUIRES((ExtendType::static_extent(Rank) != ::cuda::std::dynamic_extent))
 _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extent_size(ExtendType ext, FastDivModType extent_size)
 {
   using extent_index_type = typename ExtendType::index_type;
@@ -67,28 +65,22 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extent_size(ExtendType ext, FastDivModTy
   return static_cast<unsigned_index_type>(ext.static_extent(Rank));
 }
 
-template <int Rank,
-          typename ExtendType,
-          typename FastDivModType,
-          _CUB_TEMPLATE_REQUIRES(ExtendType::static_extent(Rank) == ::cuda::std::dynamic_extent)>
+_CCCL_TEMPLATE(int Rank, typename ExtendType, typename FastDivModType)
+_CCCL_REQUIRES((ExtendType::static_extent(Rank) == ::cuda::std::dynamic_extent))
 _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extent_size(ExtendType ext, FastDivModType extent_size)
 {
   return extent_size;
 }
 
-template <int Rank,
-          typename ExtendType,
-          typename FastDivModType,
-          _CUB_TEMPLATE_REQUIRES(cub::detail::is_sub_size_static<Rank + 1, ExtendType>())>
+_CCCL_TEMPLATE(int Rank, typename ExtendType, typename FastDivModType)
+_CCCL_REQUIRES((cub::detail::is_sub_size_static<Rank + 1, ExtendType>()))
 _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extents_sub_size(ExtendType ext, FastDivModType extents_sub_size)
 {
   return sub_size<Rank + 1>(ext);
 }
 
-template <int Rank,
-          typename ExtendType,
-          typename FastDivModType,
-          _CUB_TEMPLATE_REQUIRES(!cub::detail::is_sub_size_static<Rank + 1, ExtendType>())>
+_CCCL_TEMPLATE(int Rank, typename ExtendType, typename FastDivModType)
+_CCCL_REQUIRES((!cub::detail::is_sub_size_static<Rank + 1, ExtendType>()))
 _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extents_sub_size(ExtendType ext, FastDivModType extents_sub_size)
 {
   return extents_sub_size;
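
The paired `get_extent_size` overloads carry mutually exclusive constraints, so exactly one participates in overload resolution for any given rank. The same dispatch can be sketched in standalone C++23 on the host (made-up names; assumes `std::extents` from `<mdspan>`):

```cpp
#include <mdspan> // std::extents (C++23)
#include <span>   // std::dynamic_extent

template <int _Rank, class _Ext>
  requires (_Ext::static_extent(_Rank) != std::dynamic_extent)
constexpr auto __extent_size(const _Ext& __ext)
{
  return __ext.static_extent(_Rank); // size known at compile time
}

template <int _Rank, class _Ext>
  requires (_Ext::static_extent(_Rank) == std::dynamic_extent)
constexpr auto __extent_size(const _Ext& __ext)
{
  return __ext.extent(_Rank); // size only known at run time
}

// For std::extents<int, 2, std::dynamic_extent>, __extent_size<0> picks the
// first overload and __extent_size<1> picks the second.
```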
25 changes: 12 additions & 13 deletions cub/cub/thread/thread_reduce.cuh
@@ -471,19 +471,17 @@ ThreadReduceTernaryTree(const Input& input, ReductionOp reduction_op)
 **********************************************************************************************************************/
 
 // never reached. Protect instantion of ThreadReduceSimd with arbitrary types and operators
-template <typename Input,
-          typename ReductionOp,
-          _CUB_TEMPLATE_REQUIRES(!cub::internal::enable_generic_simd_reduction<Input, ReductionOp>())>
+_CCCL_TEMPLATE(typename Input, typename ReductionOp)
+_CCCL_REQUIRES((!cub::internal::enable_generic_simd_reduction<Input, ReductionOp>()))
 _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto
 ThreadReduceSimd(const Input& input, ReductionOp) -> ::cuda::std::remove_cvref_t<decltype(input[0])>
 {
   assert(false);
   return input[0];
 }
 
-template <typename Input,
-          typename ReductionOp,
-          _CUB_TEMPLATE_REQUIRES(cub::internal::enable_generic_simd_reduction<Input, ReductionOp>())>
+_CCCL_TEMPLATE(typename Input, typename ReductionOp)
+_CCCL_REQUIRES((cub::internal::enable_generic_simd_reduction<Input, ReductionOp>()))
 _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto
 ThreadReduceSimd(const Input& input, ReductionOp reduction_op) -> ::cuda::std::remove_cvref_t<decltype(input[0])>
 {
@@ -675,12 +673,12 @@ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const T* inpu
  *
  * @return Aggregate of type <tt>cuda::std::__accumulator_t<ReductionOp, T, PrefixT></tt>
  */
-template <int Length,
-          typename T,
-          typename ReductionOp,
-          typename PrefixT,
-          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>,
-          _CUB_TEMPLATE_REQUIRES(Length > 0)>
+_CCCL_TEMPLATE(int Length,
+               typename T,
+               typename ReductionOp,
+               typename PrefixT,
+               typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>)
+_CCCL_REQUIRES((Length > 0))
 _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT
 ThreadReduce(const T* input, ReductionOp reduction_op, PrefixT prefix)
 {
@@ -690,7 +688,8 @@ ThreadReduce(const T* input, ReductionOp reduction_op, PrefixT prefix)
   return cub::ThreadReduce(*array, reduction_op, prefix);
 }
 
-template <int Length, typename T, typename ReductionOp, typename PrefixT, _CUB_TEMPLATE_REQUIRES(Length == 0)>
+_CCCL_TEMPLATE(int Length, typename T, typename ReductionOp, typename PrefixT)
+_CCCL_REQUIRES((Length == 0))
 _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(const T*, ReductionOp, PrefixT prefix)
 {
   return prefix;
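
The `Length > 0`/`Length == 0` pair above is a constrained-overload recursion: the base case is selected by constraint rather than by class-template specialization. The shape in isolation, using the macros this commit makes universally available (hypothetical `__reduce_n`, not CUB code):

```cpp
#include <cuda/std/__concepts/concept_macros.h>

_CCCL_TEMPLATE(int _Np, typename _Tp, typename _Op)
_CCCL_REQUIRES((_Np == 0))
constexpr _Tp __reduce_n(const _Tp*, _Op, _Tp __acc)
{
  return __acc; // base case: nothing left to fold
}

_CCCL_TEMPLATE(int _Np, typename _Tp, typename _Op)
_CCCL_REQUIRES((_Np > 0))
constexpr _Tp __reduce_n(const _Tp* __in, _Op __op, _Tp __acc)
{
  return __reduce_n<_Np - 1>(__in + 1, __op, __op(__acc, *__in));
}
```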
20 changes: 8 additions & 12 deletions cub/test/catch2_test_device_for_each_in_extents.cu
@@ -50,28 +50,24 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceFor::ForEachInExtents, device_for_each_in_exte
  * Host reference
 **********************************************************************************************************************/
 
-template <int Rank = 0,
-          typename T,
-          typename ExtentType,
-          _CUB_TEMPLATE_REQUIRES(Rank == ExtentType::rank()),
-          typename... IndicesType>
-static void fill_linear_impl(c2h::host_vector<T>& vector, const ExtentType&, size_t& pos, IndicesType... indices)
+template <int Rank = 0, typename T, typename ExtentType, typename... IndicesType>
+static auto fill_linear_impl(c2h::host_vector<T>& vector, const ExtentType&, size_t& pos, IndicesType... indices)
+  _CCCL_TRAILING_REQUIRES(void)((Rank == ExtentType::rank()))
 {
   vector[pos++] = {indices...};
+  return void(); // Intel and nvc++ require a return statement
 }
 
-template <int Rank = 0,
-          typename T,
-          typename ExtentType,
-          _CUB_TEMPLATE_REQUIRES(Rank < ExtentType::rank()),
-          typename... IndicesType>
-static void fill_linear_impl(c2h::host_vector<T>& vector, const ExtentType& ext, size_t& pos, IndicesType... indices)
+template <int Rank = 0, typename T, typename ExtentType, typename... IndicesType>
+static auto fill_linear_impl(c2h::host_vector<T>& vector, const ExtentType& ext, size_t& pos, IndicesType... indices)
+  _CCCL_TRAILING_REQUIRES(void)((Rank < ExtentType::rank()))
 {
   using IndexType = typename ExtentType::index_type;
   for (IndexType i = 0; i < ext.extent(Rank); ++i)
   {
     fill_linear_impl<Rank + 1>(vector, ext, pos, indices..., i);
   }
+  return void(); // Intel and nvc++ require a return statement
 }
 
 template <typename T, typename IndexType, size_t... Extents>
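
`_CCCL_TRAILING_REQUIRES(R)(C)` attaches the constraint to the trailing return type, which is why these functions switch from `void` to `auto`: because the templates end in a parameter pack, the emulated constraint cannot be appended as a defaulted template parameter, so it moves to the return type instead. As the added comments note, Intel and nvc++ then insist on an explicit `return void();`. A standalone sketch of the same shape (hypothetical `__pick`, not a CCCL API):

```cpp
#include <cuda/std/__concepts/concept_macros.h>

template <class _Tp>
auto __pick(_Tp) _CCCL_TRAILING_REQUIRES(int)((sizeof(_Tp) <= 4))
{
  return 0; // chosen for small types
}

template <class _Tp>
auto __pick(_Tp) _CCCL_TRAILING_REQUIRES(long)((sizeof(_Tp) > 4))
{
  return 0L; // chosen for large types
}
```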
6 changes: 4 additions & 2 deletions cub/test/thread_reduce/catch2_test_thread_reduce.cu
@@ -281,13 +281,15 @@ using cub_operator_fp_list =
  * Verify results and kernel launch
 **********************************************************************************************************************/
 
-template <typename T, _CUB_TEMPLATE_REQUIRES(::cuda::std::is_floating_point<T>::value)>
+_CCCL_TEMPLATE(typename T)
+_CCCL_REQUIRES((::cuda::std::is_floating_point<T>::value))
 void verify_results(const T& expected_data, const T& test_results)
 {
   REQUIRE(expected_data == Approx(test_results).epsilon(0.05));
 }
 
-template <typename T, _CUB_TEMPLATE_REQUIRES(!::cuda::std::is_floating_point<T>::value)>
+_CCCL_TEMPLATE(typename T)
+_CCCL_REQUIRES((!::cuda::std::is_floating_point<T>::value))
 void verify_results(const T& expected_data, const T& test_results)
 {
   REQUIRE(expected_data == test_results);
6 changes: 3 additions & 3 deletions cudax/include/cuda/experimental/__algorithm/copy.cuh
@@ -21,7 +21,7 @@
 # pragma system_header
 #endif // no system header
 
-#include <cuda/std/__concepts/__concept_macros.h>
+#include <cuda/std/__concepts/concept_macros.h>
 
 #include <cuda/experimental/__algorithm/common.cuh>
 #include <cuda/experimental/__stream/stream_ref.cuh>
@@ -62,8 +62,8 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD
 //! @param __stream Stream that the copy should be inserted into
 //! @param __src Source to copy from
 //! @param __dst Destination to copy into
-_LIBCUDACXX_TEMPLATE(typename _SrcTy, typename _DstTy)
-_LIBCUDACXX_REQUIRES(__valid_copy_fill_argument<_SrcTy> _LIBCUDACXX_AND __valid_copy_fill_argument<_DstTy>)
+_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
+_CCCL_REQUIRES(__valid_copy_fill_argument<_SrcTy> _CCCL_AND __valid_copy_fill_argument<_DstTy>)
 void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
 {
   __copy_bytes_impl(
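
`_CCCL_AND` is the conjunction spelling that works in both expansion modes: under C++20 concepts it behaves like `&&` inside the requires-clause, and under the pre-C++20 emulation it is intended to split the condition so that later operands need not be instantiated once an earlier one has already failed, mirroring the short-circuiting of a real requires-clause. A minimal usage sketch (made-up `__both_trivial`, not part of this commit):

```cpp
#include <cuda/std/__concepts/concept_macros.h>
#include <cuda/std/type_traits>

_CCCL_TEMPLATE(class _Ap, class _Bp)
_CCCL_REQUIRES(::cuda::std::is_trivial<_Ap>::value _CCCL_AND ::cuda::std::is_trivial<_Bp>::value)
void __both_trivial(_Ap, _Bp) {}

// __both_trivial(1, 2.0f);          // OK: both operands are trivial
// __both_trivial(1, std::string{}); // constraint fails on the second operand
```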
6 changes: 3 additions & 3 deletions cudax/include/cuda/experimental/__algorithm/fill.cuh
@@ -21,7 +21,7 @@
 # pragma system_header
 #endif // no system header
 
-#include <cuda/std/__concepts/__concept_macros.h>
+#include <cuda/std/__concepts/concept_macros.h>
 
 #include <cuda/experimental/__algorithm/common.cuh>
 #include <cuda/experimental/__stream/stream_ref.cuh>
@@ -49,8 +49,8 @@ void __fill_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_DstTy, _DstSize> _
 //! @param __stream Stream that the copy should be inserted into
 //! @param __dst Destination memory to fill
 //! @param __value Value to fill into every byte in the destination
-_LIBCUDACXX_TEMPLATE(typename _DstTy)
-_LIBCUDACXX_REQUIRES(__valid_copy_fill_argument<_DstTy>)
+_CCCL_TEMPLATE(typename _DstTy)
+_CCCL_REQUIRES(__valid_copy_fill_argument<_DstTy>)
 void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
 {
   __fill_bytes_impl(__stream,
uninitialized_async_buffer (file path and per-file change counts not captured in this view)
@@ -117,7 +117,7 @@ private:
   template <class _Tp2 = _Tp>
   _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto
   __cudax_launch_transform(::cuda::stream_ref, uninitialized_async_buffer& __self) noexcept
-    _LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)(
+    _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)(
       _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
   {
     // TODO add auto synchronization
@@ -129,7 +129,7 @@ private:
   template <class _Tp2 = _Tp>
   _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto
   __cudax_launch_transform(::cuda::stream_ref, const uninitialized_async_buffer& __self) noexcept
-    _LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<const _Tp>)(
+    _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span<const _Tp>)(
       _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
   {
     // TODO add auto synchronization
@@ -173,8 +173,8 @@ public:
   //! @brief Move-constructs a \c uninitialized_async_buffer from \p __other
   //! @param __other Another \c uninitialized_async_buffer with matching properties
   //! Takes ownership of the allocation in \p __other and resets it
-  _LIBCUDACXX_TEMPLATE(class... _OtherProperties)
-  _LIBCUDACXX_REQUIRES(__properties_match<_OtherProperties...>)
+  _CCCL_TEMPLATE(class... _OtherProperties)
+  _CCCL_REQUIRES(__properties_match<_OtherProperties...>)
   _CCCL_HIDE_FROM_ABI uninitialized_async_buffer(uninitialized_async_buffer<_Tp, _OtherProperties...>&& __other) noexcept
       : __mr_(_CUDA_VSTD::move(__other.__mr_))
       , __stream_(_CUDA_VSTD::exchange(__other.__stream_, {}))
@@ -275,9 +275,8 @@ public:
 
 # ifndef DOXYGEN_SHOULD_SKIP_THIS // friend functions are currently broken
   //! @brief Forwards the passed properties
-  _LIBCUDACXX_TEMPLATE(class _Property)
-  _LIBCUDACXX_REQUIRES(
-    (!property_with_value<_Property>) _LIBCUDACXX_AND _CUDA_VSTD::__is_included_in_v<_Property, _Properties...>)
+  _CCCL_TEMPLATE(class _Property)
+  _CCCL_REQUIRES((!property_with_value<_Property>) _CCCL_AND _CUDA_VSTD::__is_included_in_v<_Property, _Properties...>)
   _CCCL_HIDE_FROM_ABI friend constexpr void get_property(const uninitialized_async_buffer&, _Property) noexcept {}
 # endif // DOXYGEN_SHOULD_SKIP_THIS
 
uninitialized_buffer (file path and per-file change counts not captured in this view)
@@ -107,7 +107,7 @@ private:
   template <class _Tp2 = _Tp>
   _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto
   __cudax_launch_transform(::cuda::stream_ref, uninitialized_buffer& __self) noexcept
-    _LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)(
+    _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)(
       _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
   {
     return {__self.__get_data(), __self.size()};
@@ -118,7 +118,7 @@ private:
   template <class _Tp2 = _Tp>
   _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto
   __cudax_launch_transform(::cuda::stream_ref, const uninitialized_buffer& __self) noexcept
-    _LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<const _Tp>)(
+    _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span<const _Tp>)(
       _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
   {
     return {__self.__get_data(), __self.size()};
@@ -158,8 +158,8 @@ public:
   //! @brief Move-constructs a \c uninitialized_buffer from another \c uninitialized_buffer with matching properties
   //! @param __other Another \c uninitialized_buffer
   //! Takes ownership of the allocation in \p __other and resets it
-  _LIBCUDACXX_TEMPLATE(class... _OtherProperties)
-  _LIBCUDACXX_REQUIRES(__properties_match<_OtherProperties...>)
+  _CCCL_TEMPLATE(class... _OtherProperties)
+  _CCCL_REQUIRES(__properties_match<_OtherProperties...>)
   _CCCL_HIDE_FROM_ABI uninitialized_buffer(uninitialized_buffer<_Tp, _OtherProperties...>&& __other) noexcept
       : __mr_(_CUDA_VSTD::move(__other.__mr_))
      , __count_(_CUDA_VSTD::exchange(__other.__count_, 0))
@@ -240,9 +240,8 @@ public:
 
 # ifndef DOXYGEN_SHOULD_SKIP_THIS // friend functions are currently broken
   //! @brief Forwards the passed Properties
-  _LIBCUDACXX_TEMPLATE(class _Property)
-  _LIBCUDACXX_REQUIRES(
-    (!property_with_value<_Property>) _LIBCUDACXX_AND _CUDA_VSTD::__is_included_in_v<_Property, _Properties...>)
+  _CCCL_TEMPLATE(class _Property)
+  _CCCL_REQUIRES((!property_with_value<_Property>) _CCCL_AND _CUDA_VSTD::__is_included_in_v<_Property, _Properties...>)
   _CCCL_HIDE_FROM_ABI friend constexpr void get_property(const uninitialized_buffer&, _Property) noexcept {}
 # endif // DOXYGEN_SHOULD_SKIP_THIS
 
(Diff truncated — the remaining changed files were not rendered in this view.)