Move implementation of _LIBCUDACXX_TEMPLATE to CCCL (#2832)
* Move implementation of `_LIBCUDACXX_TEMPLATE` to CCCL

We have an emulation layer for concepts in libcudacxx that was guarded behind C++14.

However, nothing about the template machinery itself requires C++14, and we want to use these macros universally throughout the codebase.

Consequently, move them to CCCL proper and enable them unconditionally. To ensure that we do not pick up any hidden dependencies, this also adds a barebones implementation of `enable_if_t` and a trailing `enable_if_t`.
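
For context, the technique behind these macros looks roughly like this — a minimal sketch with hypothetical `MY_*` names, not the actual definitions in `<cuda/std/__concepts/concept_macros.h>`, which handle many more cases:

```cpp
// Barebones enable_if, mirroring the commit's note about avoiding hidden
// dependencies on <type_traits>:
template <bool _Cond, class _Tp = void>
struct my_enable_if
{};
template <class _Tp>
struct my_enable_if<true, _Tp>
{
  using type = _Tp;
};
template <bool _Cond, class _Tp = void>
using my_enable_if_t = typename my_enable_if<_Cond, _Tp>::type;

#if defined(__cpp_concepts) && __cpp_concepts >= 201907L
// C++20: expand to a real requires-clause.
#  define MY_TEMPLATE(...) template <__VA_ARGS__>
#  define MY_REQUIRES(...) requires (__VA_ARGS__)
#else
// Pre-C++20: MY_TEMPLATE leaves the template parameter list open and
// MY_REQUIRES closes it with a defaulted SFINAE parameter.
#  define MY_TEMPLATE(...) template <__VA_ARGS__,
#  define MY_REQUIRES(...) my_enable_if_t<(__VA_ARGS__), int> = 0 >
#endif

// Both expansions accept the same spelling at the use site:
MY_TEMPLATE(class _Tp)
MY_REQUIRES(sizeof(_Tp) <= 8)
void __small_only(_Tp) {}
```

The trailing `enable_if_t` plays the analogous role for `_CCCL_TRAILING_REQUIRES`, where the constraint has to ride on the return type (see the test changes below).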
miscco authored Nov 21, 2024
1 parent d80ab66 commit a39a8a7
Showing 133 changed files with 1,875 additions and 2,077 deletions.
4 changes: 2 additions & 2 deletions .clang-format
@@ -128,8 +128,8 @@ IndentWidth: 2
 KeepEmptyLinesAtTheStartOfBlocks: false
 MaxEmptyLinesToKeep: 1
 Macros:
-- _LIBCUDACXX_TEMPLATE(...)=template<...>
-- _LIBCUDACXX_REQUIRES(...)=requires (...)
+- _CCCL_TEMPLATE(...)=template<...>
+- _CCCL_REQUIRES(...)=requires (...)
 WhitespaceSensitiveMacros:
 - _CCCL_HAS_INCLUDE
 NamespaceIndentation: None
3 changes: 1 addition & 2 deletions cub/cub/detail/type_traits.cuh
@@ -52,11 +52,10 @@ _CCCL_SUPPRESS_DEPRECATED_POP
 #if __cccl_lib_mdspan
 # include <cuda/std/mdspan>
 #endif // __cccl_lib_mdspan
+#include <cuda/std/__concepts/concept_macros.h> // IWYU pragma: keep
 #include <cuda/std/span>
 #include <cuda/std/type_traits>
 
-#define _CUB_TEMPLATE_REQUIRES(...) ::cuda::std::enable_if_t<(__VA_ARGS__)>* = nullptr
-
 CUB_NAMESPACE_BEGIN
 namespace detail
 {
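
For reference, the deleted `_CUB_TEMPLATE_REQUIRES` expanded its argument into a defaulted `enable_if_t<...>* = nullptr` template parameter, so a gated overload used to look like this (illustrative only; `__only_four_bytes` is a made-up name):

```cpp
#include <cuda/std/type_traits>

// What `template <typename _Tp, _CUB_TEMPLATE_REQUIRES(sizeof(_Tp) == 4)>`
// used to expand to:
template <typename _Tp, ::cuda::std::enable_if_t<(sizeof(_Tp) == 4)>* = nullptr>
void __only_four_bytes(_Tp) {}

// __only_four_bytes(1);   // OK where int is 4 bytes
// __only_four_bytes(1.0); // substitution fails; the overload is discarded
```

Every former use site is now spelled with `_CCCL_TEMPLATE`/`_CCCL_REQUIRES` instead, as the remaining hunks show.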
24 changes: 8 additions & 16 deletions cub/cub/device/dispatch/kernels/for_each_in_extents_kernel.cuh
@@ -55,10 +55,8 @@ namespace detail
 namespace for_each_in_extents
 {
 
-template <int Rank,
-          typename ExtendType,
-          typename FastDivModType,
-          _CUB_TEMPLATE_REQUIRES(ExtendType::static_extent(Rank) != ::cuda::std::dynamic_extent)>
+_CCCL_TEMPLATE(int Rank, typename ExtendType, typename FastDivModType)
+_CCCL_REQUIRES((ExtendType::static_extent(Rank) != ::cuda::std::dynamic_extent))
 _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extent_size(ExtendType ext, FastDivModType extent_size)
 {
   using extent_index_type = typename ExtendType::index_type;
@@ -67,28 +65,22 @@ _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extent_size(ExtendType ext, FastDivModTy
   return static_cast<unsigned_index_type>(ext.static_extent(Rank));
 }
 
-template <int Rank,
-          typename ExtendType,
-          typename FastDivModType,
-          _CUB_TEMPLATE_REQUIRES(ExtendType::static_extent(Rank) == ::cuda::std::dynamic_extent)>
+_CCCL_TEMPLATE(int Rank, typename ExtendType, typename FastDivModType)
+_CCCL_REQUIRES((ExtendType::static_extent(Rank) == ::cuda::std::dynamic_extent))
 _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extent_size(ExtendType ext, FastDivModType extent_size)
 {
   return extent_size;
 }
 
-template <int Rank,
-          typename ExtendType,
-          typename FastDivModType,
-          _CUB_TEMPLATE_REQUIRES(cub::detail::is_sub_size_static<Rank + 1, ExtendType>())>
+_CCCL_TEMPLATE(int Rank, typename ExtendType, typename FastDivModType)
+_CCCL_REQUIRES((cub::detail::is_sub_size_static<Rank + 1, ExtendType>()))
 _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extents_sub_size(ExtendType ext, FastDivModType extents_sub_size)
 {
   return sub_size<Rank + 1>(ext);
 }
 
-template <int Rank,
-          typename ExtendType,
-          typename FastDivModType,
-          _CUB_TEMPLATE_REQUIRES(!cub::detail::is_sub_size_static<Rank + 1, ExtendType>())>
+_CCCL_TEMPLATE(int Rank, typename ExtendType, typename FastDivModType)
+_CCCL_REQUIRES((!cub::detail::is_sub_size_static<Rank + 1, ExtendType>()))
 _CCCL_DEVICE _CCCL_FORCEINLINE auto get_extents_sub_size(ExtendType ext, FastDivModType extents_sub_size)
 {
   return extents_sub_size;
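
The paired `get_extent_size` overloads carry mutually exclusive constraints, so exactly one participates in overload resolution for any given rank. The same dispatch can be sketched in standalone C++23 on the host (made-up names; assumes `std::extents` from `<mdspan>`):

```cpp
#include <mdspan> // std::extents (C++23)
#include <span>   // std::dynamic_extent

template <int _Rank, class _Ext>
  requires (_Ext::static_extent(_Rank) != std::dynamic_extent)
constexpr auto __extent_size(const _Ext& __ext)
{
  return __ext.static_extent(_Rank); // size known at compile time
}

template <int _Rank, class _Ext>
  requires (_Ext::static_extent(_Rank) == std::dynamic_extent)
constexpr auto __extent_size(const _Ext& __ext)
{
  return __ext.extent(_Rank); // size only known at run time
}

// For std::extents<int, 2, std::dynamic_extent>, __extent_size<0> picks the
// first overload and __extent_size<1> picks the second.
```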
25 changes: 12 additions & 13 deletions cub/cub/thread/thread_reduce.cuh
@@ -471,19 +471,17 @@ ThreadReduceTernaryTree(const Input& input, ReductionOp reduction_op)
 **********************************************************************************************************************/
 
 // never reached. Protect instantion of ThreadReduceSimd with arbitrary types and operators
-template <typename Input,
-          typename ReductionOp,
-          _CUB_TEMPLATE_REQUIRES(!cub::internal::enable_generic_simd_reduction<Input, ReductionOp>())>
+_CCCL_TEMPLATE(typename Input, typename ReductionOp)
+_CCCL_REQUIRES((!cub::internal::enable_generic_simd_reduction<Input, ReductionOp>()))
 _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto
 ThreadReduceSimd(const Input& input, ReductionOp) -> ::cuda::std::remove_cvref_t<decltype(input[0])>
 {
   assert(false);
   return input[0];
 }
 
-template <typename Input,
-          typename ReductionOp,
-          _CUB_TEMPLATE_REQUIRES(cub::internal::enable_generic_simd_reduction<Input, ReductionOp>())>
+_CCCL_TEMPLATE(typename Input, typename ReductionOp)
+_CCCL_REQUIRES((cub::internal::enable_generic_simd_reduction<Input, ReductionOp>()))
 _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE auto
 ThreadReduceSimd(const Input& input, ReductionOp reduction_op) -> ::cuda::std::remove_cvref_t<decltype(input[0])>
 {
@@ -675,12 +673,12 @@ _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT ThreadReduce(const T* inpu
  *
  * @return Aggregate of type <tt>cuda::std::__accumulator_t<ReductionOp, T, PrefixT></tt>
  */
-template <int Length,
-          typename T,
-          typename ReductionOp,
-          typename PrefixT,
-          typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>,
-          _CUB_TEMPLATE_REQUIRES(Length > 0)>
+_CCCL_TEMPLATE(int Length,
+               typename T,
+               typename ReductionOp,
+               typename PrefixT,
+               typename AccumT = ::cuda::std::__accumulator_t<ReductionOp, T, PrefixT>)
+_CCCL_REQUIRES((Length > 0))
 _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE AccumT
 ThreadReduce(const T* input, ReductionOp reduction_op, PrefixT prefix)
 {
@@ -690,7 +688,8 @@ ThreadReduce(const T* input, ReductionOp reduction_op, PrefixT prefix)
   return cub::ThreadReduce(*array, reduction_op, prefix);
 }
 
-template <int Length, typename T, typename ReductionOp, typename PrefixT, _CUB_TEMPLATE_REQUIRES(Length == 0)>
+_CCCL_TEMPLATE(int Length, typename T, typename ReductionOp, typename PrefixT)
+_CCCL_REQUIRES((Length == 0))
 _CCCL_NODISCARD _CCCL_DEVICE _CCCL_FORCEINLINE T ThreadReduce(const T*, ReductionOp, PrefixT prefix)
 {
   return prefix;
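
The `Length > 0`/`Length == 0` pair above is a constrained-overload recursion: the base case is selected by constraint rather than by class-template specialization. The shape in isolation, using the macros this commit makes universally available (hypothetical `__reduce_n`, not CUB code):

```cpp
#include <cuda/std/__concepts/concept_macros.h>

_CCCL_TEMPLATE(int _Np, typename _Tp, typename _Op)
_CCCL_REQUIRES((_Np == 0))
constexpr _Tp __reduce_n(const _Tp*, _Op, _Tp __acc)
{
  return __acc; // base case: nothing left to fold
}

_CCCL_TEMPLATE(int _Np, typename _Tp, typename _Op)
_CCCL_REQUIRES((_Np > 0))
constexpr _Tp __reduce_n(const _Tp* __in, _Op __op, _Tp __acc)
{
  return __reduce_n<_Np - 1>(__in + 1, __op, __op(__acc, *__in));
}
```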
20 changes: 8 additions & 12 deletions cub/test/catch2_test_device_for_each_in_extents.cu
@@ -50,28 +50,24 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceFor::ForEachInExtents, device_for_each_in_exte
  * Host reference
 **********************************************************************************************************************/
 
-template <int Rank = 0,
-          typename T,
-          typename ExtentType,
-          _CUB_TEMPLATE_REQUIRES(Rank == ExtentType::rank()),
-          typename... IndicesType>
-static void fill_linear_impl(c2h::host_vector<T>& vector, const ExtentType&, size_t& pos, IndicesType... indices)
+template <int Rank = 0, typename T, typename ExtentType, typename... IndicesType>
+static auto fill_linear_impl(c2h::host_vector<T>& vector, const ExtentType&, size_t& pos, IndicesType... indices)
+  _CCCL_TRAILING_REQUIRES(void)((Rank == ExtentType::rank()))
 {
   vector[pos++] = {indices...};
+  return void(); // Intel and nvc++ require a return statement
 }
 
-template <int Rank = 0,
-          typename T,
-          typename ExtentType,
-          _CUB_TEMPLATE_REQUIRES(Rank < ExtentType::rank()),
-          typename... IndicesType>
-static void fill_linear_impl(c2h::host_vector<T>& vector, const ExtentType& ext, size_t& pos, IndicesType... indices)
+template <int Rank = 0, typename T, typename ExtentType, typename... IndicesType>
+static auto fill_linear_impl(c2h::host_vector<T>& vector, const ExtentType& ext, size_t& pos, IndicesType... indices)
+  _CCCL_TRAILING_REQUIRES(void)((Rank < ExtentType::rank()))
 {
   using IndexType = typename ExtentType::index_type;
   for (IndexType i = 0; i < ext.extent(Rank); ++i)
   {
     fill_linear_impl<Rank + 1>(vector, ext, pos, indices..., i);
   }
+  return void(); // Intel and nvc++ require a return statement
 }
 
 template <typename T, typename IndexType, size_t... Extents>
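
`_CCCL_TRAILING_REQUIRES(R)(C)` attaches the constraint to the trailing return type, which is why these functions switch from `void` to `auto`: because the templates end in a parameter pack, the emulated constraint cannot be appended as a defaulted template parameter, so it moves to the return type instead. As the added comments note, Intel and nvc++ then insist on an explicit `return void();`. A standalone sketch of the same shape (hypothetical `__pick`, not a CCCL API):

```cpp
#include <cuda/std/__concepts/concept_macros.h>

template <class _Tp>
auto __pick(_Tp) _CCCL_TRAILING_REQUIRES(int)((sizeof(_Tp) <= 4))
{
  return 0; // chosen for small types
}

template <class _Tp>
auto __pick(_Tp) _CCCL_TRAILING_REQUIRES(long)((sizeof(_Tp) > 4))
{
  return 0L; // chosen for large types
}
```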
6 changes: 4 additions & 2 deletions cub/test/thread_reduce/catch2_test_thread_reduce.cu
@@ -281,13 +281,15 @@ using cub_operator_fp_list =
  * Verify results and kernel launch
 **********************************************************************************************************************/
 
-template <typename T, _CUB_TEMPLATE_REQUIRES(::cuda::std::is_floating_point<T>::value)>
+_CCCL_TEMPLATE(typename T)
+_CCCL_REQUIRES((::cuda::std::is_floating_point<T>::value))
 void verify_results(const T& expected_data, const T& test_results)
 {
   REQUIRE(expected_data == Approx(test_results).epsilon(0.05));
 }
 
-template <typename T, _CUB_TEMPLATE_REQUIRES(!::cuda::std::is_floating_point<T>::value)>
+_CCCL_TEMPLATE(typename T)
+_CCCL_REQUIRES((!::cuda::std::is_floating_point<T>::value))
 void verify_results(const T& expected_data, const T& test_results)
 {
   REQUIRE(expected_data == test_results);
6 changes: 3 additions & 3 deletions cudax/include/cuda/experimental/__algorithm/copy.cuh
@@ -21,7 +21,7 @@
 # pragma system_header
 #endif // no system header
 
-#include <cuda/std/__concepts/__concept_macros.h>
+#include <cuda/std/__concepts/concept_macros.h>
 
 #include <cuda/experimental/__algorithm/common.cuh>
 #include <cuda/experimental/__stream/stream_ref.cuh>
@@ -62,8 +62,8 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD
 //! @param __stream Stream that the copy should be inserted into
 //! @param __src Source to copy from
 //! @param __dst Destination to copy into
-_LIBCUDACXX_TEMPLATE(typename _SrcTy, typename _DstTy)
-_LIBCUDACXX_REQUIRES(__valid_copy_fill_argument<_SrcTy> _LIBCUDACXX_AND __valid_copy_fill_argument<_DstTy>)
+_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
+_CCCL_REQUIRES(__valid_copy_fill_argument<_SrcTy> _CCCL_AND __valid_copy_fill_argument<_DstTy>)
 void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
 {
   __copy_bytes_impl(
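
`_CCCL_AND` is the conjunction spelling that works in both expansion modes: under C++20 concepts it behaves like `&&` inside the requires-clause, and under the pre-C++20 emulation it is intended to split the condition so that later operands need not be instantiated once an earlier one has already failed, mirroring the short-circuiting of a real requires-clause. A minimal usage sketch (made-up `__both_trivial`, not part of this commit):

```cpp
#include <cuda/std/__concepts/concept_macros.h>
#include <cuda/std/type_traits>

_CCCL_TEMPLATE(class _Ap, class _Bp)
_CCCL_REQUIRES(::cuda::std::is_trivial<_Ap>::value _CCCL_AND ::cuda::std::is_trivial<_Bp>::value)
void __both_trivial(_Ap, _Bp) {}

// __both_trivial(1, 2.0f);          // OK: both operands are trivial
// __both_trivial(1, std::string{}); // constraint fails on the second operand
```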
6 changes: 3 additions & 3 deletions cudax/include/cuda/experimental/__algorithm/fill.cuh
@@ -21,7 +21,7 @@
 # pragma system_header
 #endif // no system header
 
-#include <cuda/std/__concepts/__concept_macros.h>
+#include <cuda/std/__concepts/concept_macros.h>
 
 #include <cuda/experimental/__algorithm/common.cuh>
 #include <cuda/experimental/__stream/stream_ref.cuh>
@@ -49,8 +49,8 @@ void __fill_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_DstTy, _DstSize> _
 //! @param __stream Stream that the copy should be inserted into
 //! @param __dst Destination memory to fill
 //! @param __value Value to fill into every byte in the destination
-_LIBCUDACXX_TEMPLATE(typename _DstTy)
-_LIBCUDACXX_REQUIRES(__valid_copy_fill_argument<_DstTy>)
+_CCCL_TEMPLATE(typename _DstTy)
+_CCCL_REQUIRES(__valid_copy_fill_argument<_DstTy>)
 void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
 {
   __fill_bytes_impl(__stream,
uninitialized_async_buffer (file path and per-file change counts not captured in this view)
@@ -117,7 +117,7 @@ private:
   template <class _Tp2 = _Tp>
   _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto
   __cudax_launch_transform(::cuda::stream_ref, uninitialized_async_buffer& __self) noexcept
-    _LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)(
+    _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)(
       _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
   {
     // TODO add auto synchronization
@@ -129,7 +129,7 @@ private:
   template <class _Tp2 = _Tp>
   _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto
   __cudax_launch_transform(::cuda::stream_ref, const uninitialized_async_buffer& __self) noexcept
-    _LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<const _Tp>)(
+    _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span<const _Tp>)(
       _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
   {
     // TODO add auto synchronization
@@ -173,8 +173,8 @@ public:
   //! @brief Move-constructs a \c uninitialized_async_buffer from \p __other
   //! @param __other Another \c uninitialized_async_buffer with matching properties
   //! Takes ownership of the allocation in \p __other and resets it
-  _LIBCUDACXX_TEMPLATE(class... _OtherProperties)
-  _LIBCUDACXX_REQUIRES(__properties_match<_OtherProperties...>)
+  _CCCL_TEMPLATE(class... _OtherProperties)
+  _CCCL_REQUIRES(__properties_match<_OtherProperties...>)
   _CCCL_HIDE_FROM_ABI uninitialized_async_buffer(uninitialized_async_buffer<_Tp, _OtherProperties...>&& __other) noexcept
       : __mr_(_CUDA_VSTD::move(__other.__mr_))
       , __stream_(_CUDA_VSTD::exchange(__other.__stream_, {}))
@@ -275,9 +275,8 @@ public:
 
 # ifndef DOXYGEN_SHOULD_SKIP_THIS // friend functions are currently broken
   //! @brief Forwards the passed properties
-  _LIBCUDACXX_TEMPLATE(class _Property)
-  _LIBCUDACXX_REQUIRES(
-    (!property_with_value<_Property>) _LIBCUDACXX_AND _CUDA_VSTD::__is_included_in_v<_Property, _Properties...>)
+  _CCCL_TEMPLATE(class _Property)
+  _CCCL_REQUIRES((!property_with_value<_Property>) _CCCL_AND _CUDA_VSTD::__is_included_in_v<_Property, _Properties...>)
   _CCCL_HIDE_FROM_ABI friend constexpr void get_property(const uninitialized_async_buffer&, _Property) noexcept {}
 # endif // DOXYGEN_SHOULD_SKIP_THIS
 
uninitialized_buffer (file path and per-file change counts not captured in this view)
@@ -107,7 +107,7 @@ private:
   template <class _Tp2 = _Tp>
   _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto
   __cudax_launch_transform(::cuda::stream_ref, uninitialized_buffer& __self) noexcept
-    _LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)(
+    _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span<_Tp>)(
       _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
   {
     return {__self.__get_data(), __self.size()};
@@ -118,7 +118,7 @@ private:
   template <class _Tp2 = _Tp>
   _CCCL_NODISCARD_FRIEND _CCCL_HIDE_FROM_ABI auto
   __cudax_launch_transform(::cuda::stream_ref, const uninitialized_buffer& __self) noexcept
-    _LIBCUDACXX_TRAILING_REQUIRES(_CUDA_VSTD::span<const _Tp>)(
+    _CCCL_TRAILING_REQUIRES(_CUDA_VSTD::span<const _Tp>)(
       _CUDA_VSTD::same_as<_Tp, _Tp2>&& _CUDA_VSTD::__is_included_in_v<_CUDA_VMR::device_accessible, _Properties...>)
   {
     return {__self.__get_data(), __self.size()};
@@ -158,8 +158,8 @@ public:
   //! @brief Move-constructs a \c uninitialized_buffer from another \c uninitialized_buffer with matching properties
   //! @param __other Another \c uninitialized_buffer
   //! Takes ownership of the allocation in \p __other and resets it
-  _LIBCUDACXX_TEMPLATE(class... _OtherProperties)
-  _LIBCUDACXX_REQUIRES(__properties_match<_OtherProperties...>)
+  _CCCL_TEMPLATE(class... _OtherProperties)
+  _CCCL_REQUIRES(__properties_match<_OtherProperties...>)
   _CCCL_HIDE_FROM_ABI uninitialized_buffer(uninitialized_buffer<_Tp, _OtherProperties...>&& __other) noexcept
       : __mr_(_CUDA_VSTD::move(__other.__mr_))
      , __count_(_CUDA_VSTD::exchange(__other.__count_, 0))
@@ -240,9 +240,8 @@ public:
 
 # ifndef DOXYGEN_SHOULD_SKIP_THIS // friend functions are currently broken
   //! @brief Forwards the passed Properties
-  _LIBCUDACXX_TEMPLATE(class _Property)
-  _LIBCUDACXX_REQUIRES(
-    (!property_with_value<_Property>) _LIBCUDACXX_AND _CUDA_VSTD::__is_included_in_v<_Property, _Properties...>)
+  _CCCL_TEMPLATE(class _Property)
+  _CCCL_REQUIRES((!property_with_value<_Property>) _CCCL_AND _CUDA_VSTD::__is_included_in_v<_Property, _Properties...>)
   _CCCL_HIDE_FROM_ABI friend constexpr void get_property(const uninitialized_buffer&, _Property) noexcept {}
 # endif // DOXYGEN_SHOULD_SKIP_THIS
 
(Diff truncated — the remaining changed files were not rendered in this view.)