Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow NVRTC to compile more of CUB #3951

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions cub/cub/agent/agent_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,9 @@
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>

#include <cuda/std/cstdint>
#include <cuda/std/type_traits>

#include <cstdint>

CUB_NAMESPACE_BEGIN

namespace detail
Expand Down
2 changes: 0 additions & 2 deletions cub/cub/agent/agent_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,6 @@

#include <cuda/std/type_traits>

#include <iterator>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down
2 changes: 0 additions & 2 deletions cub/cub/agent/agent_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@

#include <cuda/std/type_traits>

#include <iterator>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down
2 changes: 0 additions & 2 deletions cub/cub/agent/agent_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@
#include <cuda/ptx>
#include <cuda/std/type_traits>

#include <iterator>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down
2 changes: 0 additions & 2 deletions cub/cub/agent/agent_scan_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@

#include <cuda/std/type_traits>

#include <iterator>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down
2 changes: 0 additions & 2 deletions cub/cub/agent/agent_select_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@

#include <cuda/std/type_traits>

#include <iterator>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down
3 changes: 0 additions & 3 deletions cub/cub/agent/agent_three_way_partition.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,6 @@

#include <cuda/std/type_traits>

#include <iterator>
#include <type_traits>

CUB_NAMESPACE_BEGIN

/******************************************************************************
Expand Down
5 changes: 2 additions & 3 deletions cub/cub/block/block_run_length_decode.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,8 @@

#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h>

#include <limits>
#include <type_traits>
#include <cuda/std/limits>
#include <cuda/std/type_traits>

CUB_NAMESPACE_BEGIN

Expand Down
2 changes: 0 additions & 2 deletions cub/cub/detail/choose_offset.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,6 @@
#include <cuda/std/iterator>
#include <cuda/std/type_traits>

#include <cstdint>

CUB_NAMESPACE_BEGIN

namespace detail
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/detail/fast_modulo_division.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
#include <cub/util_type.cuh> // _CCCL_HAS_INT128()

#include <cuda/cmath> // cuda::std::ceil_div
#include <cuda/std/bit> // std::has_single_bit
#include <cuda/std/bit> // cuda::std::has_single_bit
#include <cuda/std/climits> // CHAR_BIT
#include <cuda/std/cstdint> // uint64_t
#include <cuda/std/limits> // numeric_limits
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/detail/mdspan_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@

# include <cub/detail/fast_modulo_division.cuh> // fast_div_mod

# include <cuda/std/array> // std::array
# include <cuda/std/array> // cuda::std::array
# include <cuda/std/cstddef> // size_t
# include <cuda/std/mdspan>
# include <cuda/std/type_traits> // make_unsigned_t
Expand Down
3 changes: 2 additions & 1 deletion cub/cub/detail/nvtx.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@
// * C++14 is available for cuda::std::optional
// * NVTX3 uses module as an identifier, which trips up NVHPC
#if _CCCL_HAS_INCLUDE(<nvtx3/nvToolsExt.h>) && !defined(CCCL_DISABLE_NVTX) && !defined(NVTX_DISABLE) \
&& (!_CCCL_COMPILER(NVHPC) || _CCCL_STD_VER == 2017)
&& (!_CCCL_COMPILER(NVHPC) || _CCCL_STD_VER == 2017) \
&& !_CCCL_COMPILER(NVRTC)
// Include our NVTX3 C++ wrapper if not available from the CTK
// TODO(bgruber): replace by a check for the first CTK version shipping the header
# if _CCCL_HAS_INCLUDE(<nvtx3/nvtx3.hpp>)
Expand Down
1 change: 0 additions & 1 deletion cub/cub/device/device_for.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@

#include <thrust/detail/raw_reference_cast.h>
#include <thrust/distance.h>
#include <thrust/system/cuda/detail/core/util.h>
#include <thrust/type_traits/is_contiguous_iterator.h>
#include <thrust/type_traits/unwrap_contiguous_iterator.h>

Expand Down
2 changes: 0 additions & 2 deletions cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,6 @@

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <iterator>

CUB_NAMESPACE_BEGIN

namespace detail::adjacent_difference
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@

# include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

# include <cuda/std/__type_traits/integral_constant.h> // std::integral_constant
# include <cuda/std/__utility/integer_sequence.h> // std::index_sequence
# include <cuda/std/array> // std::array
# include <cuda/std/__type_traits/integral_constant.h> // cuda::std::integral_constant
# include <cuda/std/__utility/integer_sequence.h> // cuda::std::index_sequence
# include <cuda/std/array> // cuda::std::array
# include <cuda/std/cstddef> // size_t

# define _CUB_RETURN_IF_ERROR(STATUS) \
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_merge_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -234,9 +234,9 @@ struct DispatchMergeSort
* Merge sort supports large types, which can lead to excessive shared memory size requirements. In these cases,
* merge sort allocates virtual shared memory that resides in global memory.
*/
const std::size_t block_sort_smem_size = num_tiles * vsmem_helper.BlockSortVSMemPerBlock();
const std::size_t merge_smem_size = num_tiles * vsmem_helper.MergeVSMemPerBlock();
const std::size_t virtual_shared_memory_size = (::cuda::std::max)(block_sort_smem_size, merge_smem_size);
const ::cuda::std::size_t block_sort_smem_size = num_tiles * vsmem_helper.BlockSortVSMemPerBlock();
const ::cuda::std::size_t merge_smem_size = num_tiles * vsmem_helper.MergeVSMemPerBlock();
const ::cuda::std::size_t virtual_shared_memory_size = (::cuda::std::max)(block_sort_smem_size, merge_smem_size);

void* allocations[4] = {nullptr, nullptr, nullptr, nullptr};
size_t allocation_sizes[4] = {
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -631,7 +631,7 @@ struct DeviceSegmentedReduceKernelSource
InitT,
AccumT>)

CUB_RUNTIME_FUNCTION static constexpr std::size_t AccumSize()
CUB_RUNTIME_FUNCTION static constexpr cuda::std::size_t AccumSize()
{
return sizeof(AccumT);
}
Expand Down
8 changes: 6 additions & 2 deletions cub/cub/iterator/cache_modified_output_iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@

#include <thrust/iterator/iterator_facade.h>

#include <iosfwd>
#if !_CCCL_COMPILER(NVRTC)
# include <iosfwd>
#endif // !_CCCL_COMPILER(NVRTC)

CUB_NAMESPACE_BEGIN

Expand Down Expand Up @@ -155,7 +157,7 @@ public:
*/
template <typename QualifiedValueType>
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE CacheModifiedOutputIterator(QualifiedValueType* ptr)
: ptr(const_cast<typename std::remove_cv<QualifiedValueType>::type*>(ptr))
: ptr(const_cast<typename ::cuda::std::remove_cv<QualifiedValueType>::type*>(ptr))
{}

/// Postfix increment
Expand Down Expand Up @@ -236,11 +238,13 @@ public:
return (ptr != rhs.ptr);
}

#if !_CCCL_COMPILER(NVRTC)
/// ostream operator
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
return os;
}
#endif // !_CCCL_COMPILER(NVRTC)
};

CUB_NAMESPACE_END
10 changes: 8 additions & 2 deletions cub/cub/iterator/tex_obj_input_iterator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,12 @@

#include <thrust/iterator/iterator_facade.h>

#include <ostream>

#include <nv/target>

#if !_CCCL_COMPILER(NVRTC)
# include <ostream>
#endif // !_CCCL_COMPILER(NVRTC)

CUB_NAMESPACE_BEGIN

/**
Expand Down Expand Up @@ -153,6 +155,7 @@ public:
, tex_obj(0)
{}

#if !_CCCL_COMPILER(NVRTC)
/**
* @brief Use this iterator to bind @p ptr with a texture reference
*
Expand Down Expand Up @@ -189,6 +192,7 @@ public:
{
return CubDebug(cudaDestroyTextureObject(tex_obj));
}
#endif // !_CCCL_COMPILER(NVRTC)

/// Postfix increment
_CCCL_HOST_DEVICE _CCCL_FORCEINLINE self_type operator++(int)
Expand Down Expand Up @@ -281,13 +285,15 @@ public:
return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj));
}

#if !_CCCL_COMPILER(NVRTC)
/// ostream operator
friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
{
os << "cub::TexObjInputIterator( ptr=" << itr.ptr << ", offset=" << itr.tex_offset << ", tex_obj=" << itr.tex_obj
<< " )";
return os;
}
#endif // !_CCCL_COMPILER(NVRTC)

private:
// This is hoisted out of operator* because #pragma can't be used inside of
Expand Down
2 changes: 0 additions & 2 deletions cub/cub/thread/thread_search.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@
#include <cub/util_namespace.cuh>
#include <cub/util_type.cuh>

#include <iterator>

#include <nv/target>

CUB_NAMESPACE_BEGIN
Expand Down
59 changes: 53 additions & 6 deletions cub/test/catch2_test_nvrtc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,27 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]")
nvrtcProgram prog{};

const char* src = R"asdf(
#include <cub/warp/warp_reduce.cuh>
#include <cub/warp/warp_scan.cuh>
#include <cub/warp/warp_exchange.cuh>
#include <cub/warp/warp_load.cuh>
#include <cub/warp/warp_store.cuh>
#include <cub/warp/warp_merge_sort.cuh>
#include <cub/agent/agent_adjacent_difference.cuh>
#include <cub/agent/agent_batch_memcpy.cuh>
#include <cub/agent/agent_for.cuh>
#include <cub/agent/agent_histogram.cuh>
#include <cub/agent/agent_merge.cuh>
#include <cub/agent/agent_merge_sort.cuh>
#include <cub/agent/agent_radix_sort_downsweep.cuh>
#include <cub/agent/agent_radix_sort_histogram.cuh>
#include <cub/agent/agent_radix_sort_onesweep.cuh>
#include <cub/agent/agent_radix_sort_upsweep.cuh>
#include <cub/agent/agent_reduce_by_key.cuh>
#include <cub/agent/agent_reduce.cuh>
#include <cub/agent/agent_rle.cuh>
#include <cub/agent/agent_scan_by_key.cuh>
#include <cub/agent/agent_scan.cuh>
#include <cub/agent/agent_segmented_radix_sort.cuh>
#include <cub/agent/agent_select_if.cuh>
#include <cub/agent/agent_sub_warp_merge_sort.cuh>
#include <cub/agent/agent_three_way_partition.cuh>
#include <cub/agent/agent_unique_by_key.cuh>
#include <cub/agent/single_pass_scan_operators.cuh>
#include <cub/block/block_adjacent_difference.cuh>
#include <cub/block/block_discontinuity.cuh>
#include <cub/block/block_exchange.cuh>
Expand All @@ -53,8 +68,13 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]")
#include <cub/block/block_merge_sort.cuh>
#include <cub/block/block_radix_rank.cuh>
#include <cub/block/block_radix_sort.cuh>
#include <cub/block/block_raking_layout.cuh>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_run_length_decode.cuh>
#include <cub/block/block_scan.cuh>
#include <cub/block/block_shuffle.cuh>
#include <cub/block/block_store.cuh>
#include <cub/block/radix_rank_sort_operations.cuh>
#include <cub/device/dispatch/kernels/reduce.cuh>
#include <cub/device/dispatch/kernels/for_each.cuh>
#include <cub/device/dispatch/kernels/scan.cuh>
Expand All @@ -63,7 +83,34 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]")
#include <cub/device/dispatch/kernels/radix_sort.cuh>
#include <cub/device/dispatch/kernels/unique_by_key.cuh>
#include <cub/device/dispatch/kernels/transform.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/iterator/cache_modified_output_iterator.cuh>
#include <cub/iterator/tex_obj_input_iterator.cuh>
#include <cub/thread/thread_load.cuh>
#include <cub/thread/thread_operators.cuh>
#include <cub/thread/thread_reduce.cuh>
#include <cub/thread/thread_scan.cuh>
#include <cub/thread/thread_sort.cuh>
#include <cub/thread/thread_store.cuh>
#include <cub/warp/warp_reduce.cuh>
#include <cub/warp/warp_scan.cuh>
#include <cub/warp/warp_exchange.cuh>
#include <cub/warp/warp_load.cuh>
#include <cub/warp/warp_store.cuh>
#include <cub/warp/warp_merge_sort.cuh>
#include <cub/util_arch.cuh>
#include <cub/util_cpp_dialect.cuh>
#include <cub/util_debug.cuh>
#include <cub/util_device.cuh>
#include <cub/util_macro.cuh>
#include <cub/util_math.cuh>
#include <cub/util_namespace.cuh>
#include <cub/util_policy_wrapper_t.cuh>
#include <cub/util_ptx.cuh>
#include <cub/util_temporary_storage.cuh>
#include <cub/util_type.cuh>
#include <cub/util_vsmem.cuh>

#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
Expand Down
Loading
Loading